In [1]:
import tensorflow as tf
import tensorflow_datasets
from transformers import *

In [2]:
# Load dataset, tokenizer, model from pretrained model/vocabulary
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
data = tensorflow_datasets.load('glue/mrpc')

INFO:absl:Overwrite dataset info from restored data version.
INFO:absl:Reusing dataset glue (/root/tensorflow_datasets/glue/mrpc/0.0.2)
INFO:absl:Constructing tf.data.Dataset for split None, from /root/tensorflow_datasets/glue/mrpc/0.0.2


In [5]:
# Prepare dataset for GLUE as a tf.data.Dataset instance
train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, max_length=128, task='mrpc')
valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, max_length=128, task='mrpc')

In [None]:
train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)
valid_dataset = valid_dataset.batch(64)

In [6]:
for x,y in train_dataset:
    print(x,y)

 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)>, 'token_type_ids': <tf.Tensor: id=54614, shape=(128,), dtype=int32, numpy=
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)>} tf.Tensor(1, shape=(), dtype=int64)
{'input_ids': <tf.Tensor: id=54617, shape=(128,), dtype=int32, numpy=
array([  101,  1247,  1132,  1178,   123,   117,  1288,  2264, 10687,
        1690,  1107, 18393,  3174, 14557,  1968,  1208,   119,   102,
        1247,  1132,  1198,   170,  8973,  1104, 10687,  1286,  1107,
       18393,  3174, 14557,  1968,   119,   102,     0,     0,     0,
           0,     0,     0,     0,     0,    

In [6]:
# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [7]:
# Train and evaluate using tf.keras.Model.fit()
history = model.fit(train_dataset, epochs=2, steps_per_epoch=115,
                    validation_data=valid_dataset, validation_steps=7)

Train for 115 steps, validate for 7 steps
Epoch 1/2
Epoch 2/2


In [18]:
# Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
sentence_0 = "This research was consistent with his findings."
sentence_1 = "His crap were compatible with this research."
sentence_2 = "His shit were not compatible with this research."
inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='tf')
inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='tf')

In [11]:
inputs_1

{'input_ids': <tf.Tensor: id=59172, shape=(1, 19), dtype=int32, numpy=
 array([[  101,  1188,  1844,  1108,  8080,  1114,  1117,  9505,   119,
           102,  1230,  9505,  1127, 12173,  1114,  1142,  1844,   119,
           102]], dtype=int32)>,
 'token_type_ids': <tf.Tensor: id=59173, shape=(1, 19), dtype=int32, numpy=
 array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       dtype=int32)>}

In [21]:
pred_1 = model(inputs_1)
pred_2 = model(inputs_2)

print(pred_1)
print(pred_2)

(<tf.Tensor: id=82355, shape=(1, 2), dtype=float32, numpy=array([[-2.179758 ,  1.5462991]], dtype=float32)>,)
(<tf.Tensor: id=84286, shape=(1, 2), dtype=float32, numpy=array([[-0.96036386,  0.80830616]], dtype=float32)>,)
