In [1]:
# Reload the autoreload extension and reload imported modules automatically
# before each cell runs, so edits to local .py files are picked up live.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import os
# Configure GPU visibility via environment variables: devices are ordered by
# PCI bus ID and only device "0" is exposed to this process.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
})

# Sentence Pair Classification with *ktrain*

This notebook demonstrates sentence pair classification with *ktrain*. 

## Download a Sentence Pair Classification Dataset

In this notebook, we will use the Microsoft Research Paraphrase Corpus (MRPC) to build a model that can detect pairs of sentences that are paraphrases of one another.  The MRPC train and test datasets can be downloaded from here:
- [MRPC train dataset](https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt)
- [MRPC test dataset](https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt)

Once the files are downloaded (the code below expects them under `data/mrpc/`), we load them and prepare the datasets as arrays of sentence pairs.

In [2]:
import pandas as pd
import csv
# Locations of the downloaded MRPC files (tab-separated text).
TRAIN = 'data/mrpc/msr_paraphrase_train.txt'
TEST = 'data/mrpc/msr_paraphrase_test.txt'

# Columns read from each file: the two sentences of a pair and its label.
SENTENCE_COLS = ['#1 String', '#2 String']
LABEL_COL = 'Quality'


def load_sentence_pairs(path):
    """Read one MRPC file and split it into sentence pairs and labels.

    Parameters
    ----------
    path : str
        Path to a tab-separated file that contains the columns named in
        ``SENTENCE_COLS`` and ``LABEL_COL``.

    Returns
    -------
    df : pandas.DataFrame
        The full parsed table.
    x : numpy.ndarray
        Array of shape ``(n_rows, 2)`` holding the two sentences of each pair.
    y : numpy.ndarray
        Array of shape ``(n_rows,)`` holding the ``Quality`` label of each pair.
    """
    # QUOTE_NONE makes the parser treat quote characters as ordinary text
    # (presumably because the sentences contain literal quotes -- confirm).
    df = pd.read_csv(path, sep='\t', quoting=csv.QUOTE_NONE)
    return df, df[SENTENCE_COLS].to_numpy(), df[LABEL_COL].to_numpy()


# Same loading logic for both splits, so they cannot drift apart.
train_df, x_train, y_train = load_sentence_pairs(TRAIN)
test_df, x_test, y_test = load_sentence_pairs(TEST)

## Build and Train a `BERT` Model

For demonstration purposes, we train for only 3 epochs.

In [3]:
import ktrain
from ktrain import text
# Settings for this demonstration run, gathered in one place for easy tuning.
MODEL_NAME = 'bert-base-uncased'
MAX_SEQ_LEN = 128
CLASS_NAMES = ['not paraphrase', 'paraphrase']
BATCH_SIZE = 32  # lower this if an out-of-memory (OOM) error occurs
MAX_LR = 5e-5    # maximum learning rate for the onecycle policy
EPOCHS = 3

# Transformer preprocessor for the chosen pretrained model.
t = text.Transformer(MODEL_NAME, maxlen=MAX_SEQ_LEN, class_names=CLASS_NAMES)

# Preprocess the sentence-pair arrays; the test split is used as validation data.
trn = t.preprocess_train(x_train, y_train)
val = t.preprocess_test(x_test, y_test)

# Build the classifier and wrap it, with its data, in a ktrain learner.
model = t.get_classifier()
learner = ktrain.get_learner(
    model,
    train_data=trn,
    val_data=val,
    batch_size=BATCH_SIZE,
)

# Train with the onecycle learning-rate policy.
learner.fit_onecycle(MAX_LR, EPOCHS)

preprocessing train...
language: en


preprocessing test...
language: en




begin training using onecycle policy with max lr of 5e-05...
Train for 128 steps, validate for 54 steps
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7fad507f32b0>

## Make Predictions

In [4]:
# Bundle the trained model with the preprocessor `t` into a predictor object
# that can be called on raw sentence pairs.
predictor = ktrain.get_predictor(learner.model, t)

In [5]:
# Classify one sentence pair, passed as a (sentence_1, sentence_2) tuple.
# NOTE(review): "Presdient" is a typo in the input text; it is left unchanged
# because the recorded output was produced with this exact string.
predictor.predict(('Barack Obama was US President in 2011', 'In 2011, Barack Obama was US Presdient'))

'paraphrase'

In [6]:
# Same call with a pair whose sentences name different people and years;
# the recorded output for this pair is 'not paraphrase'.
predictor.predict(('Donald Trump was US President in 2016', 'In 2011, Barack Obama was US Presdient'))

'not paraphrase'

In [7]:
# Save the predictor under /tmp/mrpc_model so it can be reloaded later.
predictor.save('/tmp/mrpc_model')

In [8]:
# Reload the saved predictor from disk into a new object `p`.
p = ktrain.load_predictor('/tmp/mrpc_model')

In [9]:
# With return_proba=True the reloaded predictor returns an array of scores
# rather than a label (presumably one probability per class, in the order of
# the class names given to the preprocessor -- confirm).
p.predict(('Barack Obama was US President in 2011', 'In 2011, Barack Obama was US Presdient'), return_proba=True)

array([0.01205321, 0.9879468 ], dtype=float32)