In [1]:
# Mount Google Drive at /gdrive so files stored there can be read from this runtime
# (the saved model loaded later lives under /gdrive/MyDrive).
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [8]:
!pip install -q tf-models-official==2.4.0

[K     |████████████████████████████████| 1.1 MB 8.0 MB/s 
[K     |████████████████████████████████| 596 kB 24.3 MB/s 
[K     |████████████████████████████████| 47.8 MB 1.2 MB/s 
[K     |████████████████████████████████| 43 kB 1.8 MB/s 
[K     |████████████████████████████████| 1.1 MB 53.3 MB/s 
[K     |████████████████████████████████| 352 kB 66.9 MB/s 
[K     |████████████████████████████████| 1.2 MB 39.5 MB/s 
[K     |████████████████████████████████| 99 kB 6.6 MB/s 
[K     |████████████████████████████████| 237 kB 65.3 MB/s 
[K     |████████████████████████████████| 462 kB 59.3 MB/s 
[?25h  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [37]:
import os

import tensorflow as tf
from official.nlp import bert
import official.nlp.bert.tokenization
import tensorflow_datasets as tfds
import numpy as np


# Load the required submodules
import official.nlp.bert.bert_models
import official.nlp.bert.configs


In [11]:
# Cloud Storage folder with the pretrained BERT assets; "vocab.txt" and
# "bert_config.json" are read from this folder by later cells.
gs_folder_bert = "gs://cloud-tpu-checkpoints/bert/v3/uncased_L-12_H-768_A-12"

In [21]:
# GLUE, the General Language Understanding Evaluation benchmark (https://gluebenchmark.com/),
# is a collection of resources for training, evaluating, and analyzing natural language
# understanding systems. Only its 'glue/mrpc' dataset is fetched here.
glue, info = tfds.load(
    'glue/mrpc',
    with_info=True,
    batch_size=-1,  # It's small, load the whole dataset
)

[1mDownloading and preparing dataset glue/mrpc/1.0.0 (download: 1.43 MiB, generated: Unknown size, total: 1.43 MiB) to /root/tensorflow_datasets/glue/mrpc/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]





0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/glue/mrpc/1.0.0.incompleteLEHS2M/glue-train.tfrecord


  0%|          | 0/3668 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/glue/mrpc/1.0.0.incompleteLEHS2M/glue-validation.tfrecord


  0%|          | 0/408 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/glue/mrpc/1.0.0.incompleteLEHS2M/glue-test.tfrecord


  0%|          | 0/1725 [00:00<?, ? examples/s]

[1mDataset glue downloaded and prepared to /root/tensorflow_datasets/glue/mrpc/1.0.0. Subsequent calls will reuse this data.[0m
Instructions for updating:
Use `tf.data.Dataset.get_single_element()`.


Instructions for updating:
Use `tf.data.Dataset.get_single_element()`.


In [22]:
# Keep a handle on the 'train' split of the loaded GLUE/MRPC data.
glue_train = glue['train']

In [44]:
# Build the tokenizer from the vocabulary file stored next to the checkpoint.
tokenizer = bert.tokenization.FullTokenizer(
    do_lower_case=True,
    vocab_file=os.path.join(gs_folder_bert, "vocab.txt"),
)

# The returned ids are not used here; the call only looks up the two special tokens.
# NOTE(review): presumably a sanity check that they exist in the vocabulary — confirm.
tokenizer.convert_tokens_to_ids(['[CLS]', '[SEP]'])

print("Vocab size:", len(tokenizer.vocab))

Vocab size: 30522


In [45]:
def encode_sentence(s, tokenizer):
  """Turn one sentence into vocabulary ids, terminated by the '[SEP]' token.

  Args:
    s: The sentence to encode; passed straight to `tokenizer.tokenize`.
    tokenizer: Object exposing `tokenize` and `convert_tokens_to_ids`.

  Returns:
    Whatever `tokenizer.convert_tokens_to_ids` returns for the sentence's
    tokens followed by '[SEP]'.
  """
  # Materialise the tokens and add the separator in a single step.
  pieces = [*tokenizer.tokenize(s), '[SEP]']
  return tokenizer.convert_tokens_to_ids(pieces)

def bert_encode(glue_dict, tokenizer):
  """Encode sentence pairs into the three dense inputs passed to the BERT model.

  Each example is laid out as `[CLS] sentence1 [SEP] sentence2 [SEP]`; the
  separators come from `encode_sentence`.

  Args:
    glue_dict: Mapping with "sentence1" and "sentence2" entries, each holding
      the same number of sentences (anything `np.array` can iterate over).
    tokenizer: Object exposing `tokenize` and `convert_tokens_to_ids`.

  Returns:
    A dict of dense tensors, one row per example:
      'input_word_ids': token ids of the concatenated pair.
      'input_mask': 1 at every real token position.
      'input_type_ids': 0 for `[CLS]` and sentence1 positions, 1 for sentence2.
    Rows shorter than the longest one are filled by `to_tensor()` padding.

  NOTE(review): no truncation to a maximum sequence length happens here, so the
  output width is that of the longest pair in the batch — confirm the model
  accepts it.
  """
  sentence1 = tf.ragged.constant([
      encode_sentence(s, tokenizer)
      for s in np.array(glue_dict["sentence1"])])
  sentence2 = tf.ragged.constant([
      encode_sentence(s, tokenizer)
      for s in np.array(glue_dict["sentence2"])])

  # One single-id [CLS] row per example, prepended to every encoded pair.
  cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])]*sentence1.shape[0]
  input_word_ids = tf.concat([cls, sentence1, sentence2], axis=-1)

  # Ones over the ragged rows; padding added by to_tensor() stays at the default fill.
  input_mask = tf.ones_like(input_word_ids).to_tensor()

  # Segment ids: first segment ([CLS] + sentence1) is 0, second segment is 1.
  type_cls = tf.zeros_like(cls)
  type_s1 = tf.zeros_like(sentence1)
  type_s2 = tf.ones_like(sentence2)
  input_type_ids = tf.concat(
      [type_cls, type_s1, type_s2], axis=-1).to_tensor()

  inputs = {
      'input_word_ids': input_word_ids.to_tensor(),
      'input_mask': input_mask,
      'input_type_ids': input_type_ids}

  return inputs

In [46]:
# Encode two hand-written sentence pairs to try out bert_encode.
my_examples = bert_encode(
    tokenizer=tokenizer,
    glue_dict={
        'sentence1': [
            'The rain in Spain falls mainly on the plain.',
            'Look I fine tuned BERT.',
        ],
        'sentence2': [
            'It mostly rains on the flat lands of Spain.',
            'Is it working? This does not match.',
        ],
    },
)

In [56]:
import json

# Read the model configuration stored alongside the checkpoint.
bert_config_file = os.path.join(gs_folder_bert, "bert_config.json")
# Open through a context manager so the file handle is closed after reading.
with tf.io.gfile.GFile(bert_config_file) as config_file:
  config_dict = json.loads(config_file.read())

bert_config = bert.configs.BertConfig.from_dict(config_dict)

# classifier_model returns a pair; only the second element is kept, as bert_encoder.
_, bert_encoder = bert.bert_models.classifier_model(
    bert_config, num_labels=2)

In [47]:
# SavedModel directory on the mounted Drive.
# NOTE(review): presumably written by an earlier fine-tuning/export run — confirm it exists.
export_dir='/gdrive/MyDrive/bert/saved_model'

# Restore the model; later cells call `reloaded` directly on encoded inputs.
reloaded = tf.saved_model.load(export_dir)

In [60]:
# First batch: the same two hand-written sentence pairs encoded earlier.
my_examples = bert_encode(
    glue_dict = {
        'sentence1':[
            'The rain in Spain falls mainly on the plain.',
            'Look I fine tuned BERT.'],
        'sentence2':[
            'It mostly rains on the flat lands of Spain.',
            'Is it working? This does not match.']
    },
    tokenizer=tokenizer)

# The reloaded model takes the three encoded inputs as a list, in this order.
reloaded_result = reloaded([my_examples['input_word_ids'],
                            my_examples['input_mask'],
                            my_examples['input_type_ids']], training=False)

print(reloaded_result.numpy())
# Pick the highest-scoring class per example: reduce over the last (class) axis.
# The default axis=0 would reduce over the batch instead, which only looks right
# when the batch size happens to equal the number of classes.
print(tf.argmax(reloaded_result, axis=-1).numpy())


# Second batch: one sentence compared against two different counterparts.
my_examples2 = bert_encode(
    glue_dict = {
        'sentence1':[
            'i like you.',
            'i like you.'],
        'sentence2':[
            'i hate you.',
            'i love you.']
    },
    tokenizer=tokenizer)

reloaded_result2 = reloaded([my_examples2['input_word_ids'],
                            my_examples2['input_mask'],
                            my_examples2['input_type_ids']], training=False)

print(reloaded_result2.numpy())
# Per-example class index, again taken over the class axis.
print(tf.argmax(reloaded_result2, axis=-1).numpy())


[[-3.3459415  2.9583561]
 [ 2.0417385 -1.9247899]]
[1 0]
[[ 1.5650622 -1.4885354]
 [-1.4331851  1.4530655]]
[0 1]
