<a href="https://colab.research.google.com/github/axe76/Transformer-Stuff/blob/main/BertSentEqui.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install -q tf-models-official

[K     |████████████████████████████████| 1.1MB 5.7MB/s 
[K     |████████████████████████████████| 706kB 21.2MB/s 
[K     |████████████████████████████████| 358kB 30.4MB/s 
[K     |████████████████████████████████| 1.2MB 30.1MB/s 
[K     |████████████████████████████████| 37.6MB 118kB/s 
[K     |████████████████████████████████| 102kB 10.7MB/s 
[K     |████████████████████████████████| 174kB 48.4MB/s 
[K     |████████████████████████████████| 645kB 44.5MB/s 
[K     |████████████████████████████████| 51kB 4.9MB/s 
[?25h  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [2]:
import os
import numpy as np
import official.nlp.optimization
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_hub as hub
from official import nlp
from official.nlp.bert import tokenization

In [4]:
# Expose only GPU index 0 to CUDA-aware libraries used by this kernel.
os.environ.update(CUDA_VISIBLE_DEVICES='0')

In [3]:
def encode_sentence(s, tokenizer):
    """Tokenize one sentence and return its vocabulary ids, ending with [SEP].

    Args:
        s: the sentence to encode; handed to `tokenizer.tokenize` unchanged.
        tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.

    Returns:
        Whatever `tokenizer.convert_tokens_to_ids` returns for the sentence's
        tokens followed by the '[SEP]' token.
    """
    pieces = [*tokenizer.tokenize(s), '[SEP]']
    return tokenizer.convert_tokens_to_ids(pieces)


In [4]:
def bert_encode(glue_dict, tokenizer):
    """Convert a dict of sentence pairs into padded BERT input tensors.

    Args:
        glue_dict: mapping with "sentence1" and "sentence2" entries; each is
            wrapped in `np.array` and encoded one sentence at a time.
        tokenizer: object providing `tokenize` and `convert_tokens_to_ids`
            (forwarded to `encode_sentence`).

    Returns:
        Dict with dense 'input_word_ids', 'input_mask' and 'input_type_ids'
        tensors; every row is laid out as [CLS] + sentence1 + sentence2.
    """
    def _encode_column(key):
        # One ragged row of token ids (each ending in [SEP]) per sentence.
        return tf.ragged.constant(
            [encode_sentence(text, tokenizer) for text in np.array(glue_dict[key])])

    first = _encode_column("sentence1")
    second = _encode_column("sentence2")

    # A single-element [CLS] row for every example, prepended to each pair.
    cls_row = tokenizer.convert_tokens_to_ids(['[CLS]'])
    cls = [cls_row] * first.shape[0]
    word_ids = tf.concat([cls, first, second], axis=-1)

    # Ones for every real token; to_tensor() turns the ragged rows into a
    # dense, padded tensor.
    mask = tf.ones_like(word_ids).to_tensor()

    # Segment ids: 0 for [CLS] and sentence1, 1 for sentence2.
    segment_parts = [tf.zeros_like(cls), tf.zeros_like(first), tf.ones_like(second)]
    type_ids = tf.concat(segment_parts, axis=-1).to_tensor()

    return {
        'input_word_ids': word_ids.to_tensor(),
        'input_mask': mask,
        'input_type_ids': type_ids,
    }

In [5]:
# NOTE(review): max_seq_length is not referenced by any other visible cell.
max_seq_length = 128

# Three int32 Keras inputs with an unspecified sequence length (shape=(None,)),
# created in the same order as the names listed here.
input_word_ids, input_mask, input_type_ids = (
    tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name=feature_name)
    for feature_name in ("input_word_ids", "input_mask", "input_type_ids")
)
bert_inputs = dict(
    input_word_ids=input_word_ids,
    input_mask=input_mask,
    input_type_ids=input_type_ids,
)

# Trainable BERT encoder from TF-Hub, called with the inputs as a list.
# Only the first returned value (pooled_output) is used; the second is discarded.
bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2",
    trainable=True,
)
pooled_output, _ = bert_layer([input_word_ids, input_mask, input_type_ids])

In [6]:
# Classification head on top of the pooled BERT output: dropout followed by a
# 2-unit dense layer with no activation (the loss is configured for logits).
initializer = tf.keras.initializers.TruncatedNormal(stddev=0.02)
output = tf.keras.layers.Dropout(0.2)(pooled_output)
bert_output = tf.keras.layers.Dense(
    units=2,
    name='output',
    kernel_initializer=initializer,
)(output)

# Full model: the dict of three named inputs in, classification logits out.
model = tf.keras.Model(outputs=bert_output, inputs=bert_inputs)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, None)]       0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, None)]       0                                            
__________________________________________________________________________________________________
input_type_ids (InputLayer)     [(None, None)]       0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

In [7]:
# Build the tokenizer from the assets stored with the loaded hub module, so the
# vocabulary and casing come from the same object as the encoder.
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
tokenizer = tokenization.FullTokenizer(
    vocab_file=vocab_file,
    do_lower_case=do_lower_case,
)






In [None]:
# NOTE(review): `tfhub_handle_preprocess` is not defined in any visible cell and
# this cell has no execution count, so a fresh Restart & Run All would presumably
# raise NameError here. `bert_preprocess_model` is also not used by the visible
# cells (encoding goes through `tokenizer`) — define the handle or drop the cell.
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)


In [8]:
# Smoke test: tokenize one sentence by hand, append [SEP] and map to ids.
test_s = "Hello, my name is what"
test_tokens = list(tokenizer.tokenize(test_s))
test_tokens.append('[SEP]')
sent = tokenizer.convert_tokens_to_ids(test_tokens)

# The manual steps above must agree with the helper that bert_encode relies on.
assert sent == encode_sentence(test_s, tokenizer)

# Last expression of the cell, so the ids are displayed as its output.
sent


[7592, 1010, 2026, 2171, 2003, 2054, 102]

In [9]:
# Load GLUE/MRPC with batch_size=-1 and keep the dataset info alongside it.
glue, info = tfds.load('glue/mrpc', batch_size=-1, with_info=True)

# Encode the train and validation splits into BERT inputs and pull out their
# labels. Only these two splits are encoded in this cell.
glue_train, glue_train_labels = (
    bert_encode(glue['train'], tokenizer),
    glue['train']['label'],
)
glue_validation, glue_validation_labels = (
    bert_encode(glue['validation'], tokenizer),
    glue['validation']['label'],
)

[1mDownloading and preparing dataset glue/mrpc/1.0.0 (download: 1.43 MiB, generated: Unknown size, total: 1.43 MiB) to /root/tensorflow_datasets/glue/mrpc/1.0.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…







HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/glue/mrpc/1.0.0.incomplete3AUMM3/glue-train.tfrecord


HBox(children=(FloatProgress(value=0.0, max=3668.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/glue/mrpc/1.0.0.incomplete3AUMM3/glue-validation.tfrecord


HBox(children=(FloatProgress(value=0.0, max=408.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/glue/mrpc/1.0.0.incomplete3AUMM3/glue-test.tfrecord


HBox(children=(FloatProgress(value=0.0, max=1725.0), HTML(value='')))

[1mDataset glue downloaded and prepared to /root/tensorflow_datasets/glue/mrpc/1.0.0. Subsequent calls will reuse this data.[0m


In [None]:
# Training hyper-parameters.
epochs = 10
batch_size = 32
eval_batch_size = 32
train_data_size = len(glue_train_labels)

# model.fit also runs the final partial batch, so one epoch takes
# ceil(train_data_size / batch_size) optimizer steps. Rounding down made the
# learning-rate schedule reach its end before training actually finished.
steps_per_epoch = -(-train_data_size // batch_size)  # integer ceiling division
num_train_steps = steps_per_epoch * epochs
# Warm up over the first 10% of all optimizer steps.
warmup_steps = num_train_steps // 10

# Optimizer from the Model Garden helper with a 2e-5 initial learning rate and
# the warm-up / total step counts computed above.
optimizer = nlp.optimization.create_optimizer(
    2e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=warmup_steps,
)

# The model emits raw logits (the dense head has no activation), hence
# from_logits=True; labels are integer class ids, hence the sparse variants.
metrics = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy', dtype=tf.float32)]
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# Keep the returned History object instead of letting its repr become the
# cell output; it can be inspected after training.
history = model.fit(
    glue_train, glue_train_labels,
    validation_data=(glue_validation, glue_validation_labels),
    batch_size=batch_size,
    validation_batch_size=eval_batch_size,
    epochs=epochs)

# Optional: uncomment to persist the fine-tuned weights.
#model.save_weights("./weights.h5")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

In [24]:
# Inspect the shape of the encoded training segment-id tensor produced by
# bert_encode; the recorded output for this cell is (3668, 103).
glue_train["input_type_ids"].shape


(3668, 103)