In [1]:
!pip install -q -U tensorflow-text==2.4.1
!pip install -q -U tf-models-official==2.4.0
!pip install -U tfds-nightly
!pip install -q -U tensorflow==2.4.1

Requirement already up-to-date: tfds-nightly in /usr/local/lib/python3.7/dist-packages (4.3.0.dev202106250106)


In [2]:
import os
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
import tensorflow_text as text  # A dependency of the preprocessing model
import tensorflow_addons as tfa
from official.nlp import optimization
import numpy as np

tf.get_logger().setLevel('ERROR')

In [3]:
import os

if 'COLAB_TPU_ADDR' in os.environ:
  cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
  tf.config.experimental_connect_to_cluster(cluster_resolver)
  tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
  strategy = tf.distribute.TPUStrategy(cluster_resolver)
  print('Using TPU')
elif tf.test.is_gpu_available():
  strategy = tf.distribute.MirroredStrategy()
  print('Using GPU')
else:
  raise ValueError('Running on CPU is not recommended.')

Using GPU


In [4]:
os.environ["TFHUB_MODEL_LOAD_FORMAT"]="UNCOMPRESSED"

In [21]:
#@title Choose a BERT model to fine-tune

bert_model_name = 'small_bert/bert_en_uncased_L-2_H-256_A-4'  #@param ["bert_en_uncased_L-12_H-768_A-12", "bert_en_uncased_L-24_H-1024_A-16", "bert_en_wwm_uncased_L-24_H-1024_A-16", "bert_en_cased_L-12_H-768_A-12", "bert_en_cased_L-24_H-1024_A-16", "bert_en_wwm_cased_L-24_H-1024_A-16", "bert_multi_cased_L-12_H-768_A-12", "small_bert/bert_en_uncased_L-2_H-128_A-2", "small_bert/bert_en_uncased_L-2_H-256_A-4", "small_bert/bert_en_uncased_L-2_H-512_A-8", "small_bert/bert_en_uncased_L-2_H-768_A-12", "small_bert/bert_en_uncased_L-4_H-128_A-2", "small_bert/bert_en_uncased_L-4_H-256_A-4", "small_bert/bert_en_uncased_L-4_H-512_A-8", "small_bert/bert_en_uncased_L-4_H-768_A-12", "small_bert/bert_en_uncased_L-6_H-128_A-2", "small_bert/bert_en_uncased_L-6_H-256_A-4", "small_bert/bert_en_uncased_L-6_H-512_A-8", "small_bert/bert_en_uncased_L-6_H-768_A-12", "small_bert/bert_en_uncased_L-8_H-128_A-2", "small_bert/bert_en_uncased_L-8_H-256_A-4", "small_bert/bert_en_uncased_L-8_H-512_A-8", "small_bert/bert_en_uncased_L-8_H-768_A-12", "small_bert/bert_en_uncased_L-10_H-128_A-2", "small_bert/bert_en_uncased_L-10_H-256_A-4", "small_bert/bert_en_uncased_L-10_H-512_A-8", "small_bert/bert_en_uncased_L-10_H-768_A-12", "small_bert/bert_en_uncased_L-12_H-128_A-2", "small_bert/bert_en_uncased_L-12_H-256_A-4", "small_bert/bert_en_uncased_L-12_H-512_A-8", "small_bert/bert_en_uncased_L-12_H-768_A-12", "albert_en_base", "albert_en_large", "albert_en_xlarge", "albert_en_xxlarge", "electra_small", "electra_base", "experts_pubmed", "experts_wiki_books", "talking-heads_base", "talking-heads_large"]

map_name_to_handle = {
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3',
    'bert_en_uncased_L-24_H-1024_A-16':
        'https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/3',
    'bert_en_wwm_uncased_L-24_H-1024_A-16':
        'https://tfhub.dev/tensorflow/bert_en_wwm_uncased_L-24_H-1024_A-16/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3',
    'bert_en_cased_L-24_H-1024_A-16':
        'https://tfhub.dev/tensorflow/bert_en_cased_L-24_H-1024_A-16/3',
    'bert_en_wwm_cased_L-24_H-1024_A-16':
        'https://tfhub.dev/tensorflow/bert_en_wwm_cased_L-24_H-1024_A-16/3',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3',
    'small_bert/bert_en_uncased_L-2_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-2_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-2_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-2_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-4_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-4_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-4_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-4_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-6_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-6_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-6_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-6_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-8_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-8_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-8_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-8_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-10_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-10_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-10_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-10_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-12_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-12_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-12_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-768_A-12/1',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_base/2',
    'albert_en_large':
        'https://tfhub.dev/tensorflow/albert_en_large/2',
    'albert_en_xlarge':
        'https://tfhub.dev/tensorflow/albert_en_xlarge/2',
    'albert_en_xxlarge':
        'https://tfhub.dev/tensorflow/albert_en_xxlarge/2',
    'electra_small':
        'https://tfhub.dev/google/electra_small/2',
    'electra_base':
        'https://tfhub.dev/google/electra_base/2',
    'experts_pubmed':
        'https://tfhub.dev/google/experts/bert/pubmed/2',
    'experts_wiki_books':
        'https://tfhub.dev/google/experts/bert/wiki_books/2',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1',
    'talking-heads_large':
        'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_large/1',
}

map_model_to_preprocess = {
    'bert_en_uncased_L-24_H-1024_A-16':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_en_wwm_cased_L-24_H-1024_A-16':
        'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3',
    'bert_en_cased_L-24_H-1024_A-16':
        'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3',
    'bert_en_wwm_uncased_L-24_H-1024_A-16':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_preprocess/3',
    'albert_en_large':
        'https://tfhub.dev/tensorflow/albert_en_preprocess/3',
    'albert_en_xlarge':
        'https://tfhub.dev/tensorflow/albert_en_preprocess/3',
    'albert_en_xxlarge':
        'https://tfhub.dev/tensorflow/albert_en_preprocess/3',
    'electra_small':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'electra_base':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'experts_pubmed':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'experts_wiki_books':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'talking-heads_large':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
}

tfhub_handle_encoder = map_name_to_handle[bert_model_name]
tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]

print('BERT model selected           :', tfhub_handle_encoder)
print('Preprocessing model auto-selected:', tfhub_handle_preprocess)

BERT model selected           : https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-256_A-4/1
Preprocessing model auto-selected: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3


In [5]:
def make_bert_preprocess_model(sentence_features, seq_length=256):
  """Returns Model mapping string features to BERT inputs.

  Args:
    sentence_features: a list with the names of string-valued features.
    seq_length: an integer that defines the sequence length of BERT inputs.

  Returns:
    A Keras Model that can be called on a list or dict of string Tensors
    (with the order or names, resp., given by sentence_features) and
    returns a dict of tensors for input to BERT.
  """

  input_segments = [
      tf.keras.layers.Input(shape=(), dtype=tf.string, name=ft)
      for ft in sentence_features]

  # Tokenize the text to word pieces.
  bert_preprocess = hub.load(tfhub_handle_preprocess)
  tokenizer = hub.KerasLayer(bert_preprocess.tokenize, name='tokenizer')
  segments = [tokenizer(s) for s in input_segments]

  # Optional: Trim segments in a smart way to fit seq_length.
  # Simple cases (like this example) can skip this step and let
  # the next step apply a default truncation to approximately equal lengths.
  truncated_segments = segments

  # Pack inputs. The details (start/end token ids, dict of output tensors)
  # are model-dependent, so this gets loaded from the SavedModel.
  packer = hub.KerasLayer(bert_preprocess.bert_pack_inputs,
                          arguments=dict(seq_length=seq_length),
                          name='packer')
  model_inputs = packer(truncated_segments)
  return tf.keras.Model(input_segments, model_inputs)

In [6]:
def build_classifier_model(num_classes = 2):
  inputs = dict(
      input_word_ids=tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_word_ids'),
      input_mask=tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_mask'),
      input_type_ids=tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_type_ids'),
  )

  encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='encoder')
  net = encoder(inputs)['pooled_output']
  net = tf.keras.layers.Dropout(rate=0.1)(net)
  net = tf.keras.layers.Dense(num_classes, activation='softmax', name='classifier')(net)
  return tf.keras.Model(inputs, net, name='prediction')

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
import pickle
with open("/content/drive/My Drive/name_matching_attempt/combined_negative_samples_argument_v1.pickle", "rb") as handle:
  combined_negative_samples_argument_v1 = pickle.load(handle)

In [9]:
import pickle
with open("/content/drive/My Drive/name_matching_attempt/combined_positive_samples_argument_v1_with_abbr.pickle", "rb") as handle:
  combined_positive_samples_argument_v1_with_abbr = pickle.load(handle)

In [10]:
len(combined_negative_samples_argument_v1)

370140

In [11]:
len(combined_positive_samples_argument_v1_with_abbr)

198053

In [12]:
import pandas as pd
pd.set_option('display.max_columns', 10000)
pd.set_option('max_colwidth', 10000)
pd.set_option("max_rows", 50000)

In [13]:
list_positive_left = combined_positive_samples_argument_v1_with_abbr['name_left'].tolist()
list_positive_right = combined_positive_samples_argument_v1_with_abbr['name_right'].tolist()
list_negative_left = combined_negative_samples_argument_v1['name_left'].tolist()
list_negative_right = combined_negative_samples_argument_v1['name_right'].tolist()

In [14]:
list_positive_pairs = [[left, right] for left, right in zip(list_positive_left, list_positive_right)]
list_negative_pairs = [[left, right] for left, right in zip(list_negative_left, list_negative_right)]

n_training_positive = int(len(list_positive_pairs) * 0.8)
n_training_negative = int(len(list_negative_pairs) * 0.8)

labels_training = [1 for _ in range(n_training_positive)] + [0 for _ in range(n_training_negative)]
samples_training = list_positive_pairs[: n_training_positive] + list_negative_pairs[:n_training_negative]

labels_validation = [1 for _ in range(len(list_positive_pairs[n_training_positive + 1:]))] + [0 for _ in range(len(list_negative_pairs[n_training_negative + 1:]))]
samples_validation = list_positive_pairs[n_training_positive + 1:] + list_negative_pairs[n_training_negative+1:]

In [15]:
len(samples_training)

454554

In [16]:
len(samples_validation)

113637

In [17]:
ds_training_dict = {
    'left_name': [sample[0] for sample in samples_training], 
    'right_name': [sample[1] for sample in samples_training],
    'label': labels_training,
    }
ds_validation_dict = {
    'left_name': [sample[0] for sample in samples_validation], 
    'right_name': [sample[1] for sample in samples_validation],
    'label': labels_validation,
    }

In [18]:
AUTOTUNE = tf.data.AUTOTUNE

def load_dataset(ds, is_training, batch_size, bert_preprocess_model):
  dataset = tf.data.Dataset.from_tensor_slices(ds)
  num_examples = len(ds['left_name'])
  
  dataset = dataset.shuffle(num_examples)
  if is_training:
    dataset = dataset.repeat()
  dataset = dataset.batch(batch_size)
  dataset = dataset.map(lambda ex: (bert_preprocess_model(ex), ex['label']))
  dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
  return dataset, num_examples

In [19]:
def get_configuration():
  loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
  metrics = tf.keras.metrics.SparseCategoricalAccuracy(
        'accuracy', dtype=tf.float32)

  return metrics, loss

In [24]:
epochs = 2
batch_size = 512
init_lr = 1e-5
print(f'Fine tuning {tfhub_handle_encoder} model')
bert_preprocess_model = make_bert_preprocess_model(['left_name', 'right_name'])

with strategy.scope():

  # metric have to be created inside the strategy scope
  metrics, loss = get_configuration()

  train_dataset, train_data_size = load_dataset(ds_training_dict, True, batch_size, bert_preprocess_model)
  steps_per_epoch = train_data_size // batch_size
  num_train_steps = steps_per_epoch * epochs
  num_warmup_steps = num_train_steps // 10

  validation_dataset, validation_data_size = load_dataset(ds_validation_dict, False, batch_size, bert_preprocess_model)
  validation_steps = validation_data_size // batch_size

  classifier_model = build_classifier_model()

  optimizer = optimization.create_optimizer(
      init_lr=init_lr,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      optimizer_type='adamw')

  classifier_model.compile(optimizer=optimizer, loss=loss, metrics=[metrics])
  classifier_model.summary()
  classifier_model.fit(
      x=train_dataset, 
      validation_data=validation_dataset,
      steps_per_epoch=steps_per_epoch,
      epochs=epochs,
      validation_steps=validation_steps)

Fine tuning https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-256_A-4/1 model


  [n for n in tensors.keys() if n not in ref_input_names])


Model: "prediction"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_mask (InputLayer)         [(None, None)]       0                                            
__________________________________________________________________________________________________
input_type_ids (InputLayer)     [(None, None)]       0                                            
__________________________________________________________________________________________________
input_word_ids (InputLayer)     [(None, None)]       0                                            
__________________________________________________________________________________________________
encoder (KerasLayer)            {'encoder_outputs':  9591041     input_mask[0][0]                 
                                                                 input_type_ids[0][0]    

In [25]:
main_save_path = "/content/drive/My Drive/name_matching_attempt/name_matching_model_v1_r6_with_data_augmentation_text"
bert_type = tfhub_handle_encoder.split('/')[-2]
saved_model_name = f'{"name_matching_v1"}_{bert_type}'

saved_model_path = os.path.join(main_save_path, saved_model_name)

#preprocess_inputs = bert_preprocess_model.inputs
#bert_encoder_inputs = bert_preprocess_model(preprocess_inputs)
#bert_outputs = classifier_model(bert_encoder_inputs)
#model_for_export = tf.keras.Model(preprocess_inputs, bert_outputs)
model_for_export = classifier_model

print('Saving', saved_model_path)
model_for_export.summary()


Saving /content/drive/My Drive/name_matching_attempt/name_matching_model_v1_r6_with_data_augmentation_text/name_matching_v1_bert_en_uncased_L-2_H-256_A-4
Model: "prediction"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_mask (InputLayer)         [(None, None)]       0                                            
__________________________________________________________________________________________________
input_type_ids (InputLayer)     [(None, None)]       0                                            
__________________________________________________________________________________________________
input_word_ids (InputLayer)     [(None, None)]       0                                            
__________________________________________________________________________________________________
encoder (KerasLayer)            {'

In [26]:
model_for_export.input

{'input_mask': <KerasTensor: shape=(None, None) dtype=int32 (created by layer 'input_mask')>,
 'input_type_ids': <KerasTensor: shape=(None, None) dtype=int32 (created by layer 'input_type_ids')>,
 'input_word_ids': <KerasTensor: shape=(None, None) dtype=int32 (created by layer 'input_word_ids')>}

In [27]:
model_for_export.output

<KerasTensor: shape=(None, 2) dtype=float32 (created by layer 'classifier')>

In [28]:
model_for_export.save(saved_model_path)



# Test model

In [29]:
!pip install bert-for-tf2 # Just for tokenization, do not use it for model layer
from bert import bert_tokenization



In [30]:
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-128_A-2/2", trainable=True)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

In [31]:
do_lower_case

True

In [32]:
import numpy

def _truncate_seq_pair(tokens_a, tokens_b, max_length):
  """Truncates a sequence pair in place to the maximum length."""

  # This is a simple heuristic which will always truncate the longer sequence
  # one token at a time. This makes more sense than truncating an equal percent
  # of tokens from each, since if one sequence is very short then each token
  # that's truncated likely contains more information than a longer sequence.
  while True:
    total_length = len(tokens_a) + len(tokens_b)
    if total_length <= max_length:
      break
    if len(tokens_a) > len(tokens_b):
      tokens_a.pop()
    else:
      tokens_b.pop()

def process_data(address_left, address_right, labels, max_seq_len = 256):
  assert len(address_left) == len(address_right)
  #assert len(address_right) == len(labels)
  input_id_list = []
  input_mask_list = []
  segment_id_list = []
  for example_index in range(len(address_left)):
      the_address_left_tokens = tokenizer.tokenize(address_left[example_index])
      the_address_right_tokens = tokenizer.tokenize(address_right[example_index])
      _truncate_seq_pair(the_address_left_tokens, the_address_right_tokens, max_seq_len - 3)
      tokens = ["[CLS]"] + the_address_left_tokens+ ["[SEP]"] + the_address_right_tokens + ["[SEP]"]
      segment_ids = ([0] * (len(the_address_left_tokens) + 2 )) + ([1] * (len(the_address_right_tokens) + 1 ))
      input_ids = tokenizer.convert_tokens_to_ids(tokens)
      input_mask = [1] * len(input_ids)
      padding = [0] * (max_seq_len - len(input_ids))
      input_ids += padding
      input_mask += padding
      segment_ids += padding
      assert len(input_ids) == max_seq_len
      assert len(input_mask) == max_seq_len
      assert len(segment_ids) == max_seq_len
      input_id_list.append(input_ids)
      input_mask_list.append(input_mask)
      segment_id_list.append(segment_ids)
  return input_id_list, input_mask_list, segment_id_list

In [34]:
# do_lower_case = true
import numpy as np
t_input_id_list, t_input_mask_list, t_segment_id_list = process_data(["RELIABLE SHIPPING"], ['GE CAPITAL EUROPE LIMITED'], None)
print(np.array(t_input_id_list), np.array(t_input_mask_list), np.array(t_segment_id_list))

[[  101 10539  7829   102 16216  3007  2885  3132   102     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0 

In [35]:
test_preprocess_model = make_bert_preprocess_model(['my_input1', 'my_input2'])
test_text = [np.array(["RELIABLE SHIPPING"]),
             np.array(['GE CAPITAL EUROPE LIMITED'])]
text_preprocessed = test_preprocess_model(test_text)

print('Keys           : ', list(text_preprocessed.keys()))
print('Shape Word Ids : ', text_preprocessed['input_word_ids'].shape)
print('Word Ids       : ', text_preprocessed['input_word_ids'][0, :16])
print('Shape Mask     : ', text_preprocessed['input_mask'].shape)
print('Input Mask     : ', text_preprocessed['input_mask'][0, :16])
print('Shape Type Ids : ', text_preprocessed['input_type_ids'].shape)
print('Type Ids       : ', text_preprocessed['input_type_ids'][0, :16])

Keys           :  ['input_word_ids', 'input_mask', 'input_type_ids']
Shape Word Ids :  (1, 256)
Word Ids       :  tf.Tensor(
[  101 10539  7829   102 16216  3007  2885  3132   102     0     0     0
     0     0     0     0], shape=(16,), dtype=int32)
Shape Mask     :  (1, 256)
Input Mask     :  tf.Tensor([1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0], shape=(16,), dtype=int32)
Shape Type Ids :  (1, 256)
Type Ids       :  tf.Tensor([0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0], shape=(16,), dtype=int32)


In [36]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [42]:
main_save_path = "/content/drive/My Drive/name_matching_attempt/name_matching_model_v1_r6_with_data_augmentation_text"
bert_type = tfhub_handle_encoder.split('/')[-2]
saved_model_name = f'{"name_matching_v1"}_{bert_type}'
saved_model_path = os.path.join(main_save_path, saved_model_name)

In [43]:
reloaded_model = tf.saved_model.load(saved_model_path)

In [44]:
reloaded_model.signatures

_SignatureMap({'serving_default': <ConcreteFunction signature_wrapper(*, input_word_ids, input_mask, input_type_ids) at 0x7F8001EDC650>})

In [45]:
type(reloaded_model)

tensorflow.python.saved_model.load.Loader._recreate_base_user_object.<locals>._UserObject

In [46]:
tinput_id, tinput_mask, tsegment_id = process_data(['RELIABLE SHIPPING'], ['GE CAPITAL EUROPE LIMITED'], None)
result = reloaded_model({'input_mask': tf.constant(tinput_mask, dtype=tf.int32), 'input_type_ids': tf.constant(tsegment_id, dtype=tf.int32), 'input_word_ids': tf.constant(tinput_id, dtype=tf.int32)})
print(result)
print(tf.argmax(result, axis=1)[0])

tf.Tensor([[0.9986733  0.00132668]], shape=(1, 2), dtype=float32)
tf.Tensor(0, shape=(), dtype=int64)


In [47]:
tinput_id, tinput_mask, tsegment_id = process_data(['VITESSE'], ['FOUR (HOLDINGS) LIMITED'], None)
result = reloaded_model({'input_mask': tf.constant(tinput_mask, dtype=tf.int32), 'input_type_ids': tf.constant(tsegment_id, dtype=tf.int32), 'input_word_ids': tf.constant(tinput_id, dtype=tf.int32)})
print(result)
print(tf.argmax(result, axis=1)[0])

tf.Tensor([[9.9977368e-01 2.2633336e-04]], shape=(1, 2), dtype=float32)
tf.Tensor(0, shape=(), dtype=int64)


In [48]:
tinput_id, tinput_mask, tsegment_id = process_data(['KGM UNDERWRITING', "test", "THE COOPER GROUP"], ['TESSUTI LIMITED', "test2", "UTMOST GROUP"], None)
result = reloaded_model({'input_mask': tf.constant(tinput_mask, dtype=tf.int32), 'input_type_ids': tf.constant(tsegment_id, dtype=tf.int32), 'input_word_ids': tf.constant(tinput_id, dtype=tf.int32)})
print(result)
print(tf.argmax(result, axis=1)[0])

tf.Tensor(
[[9.997526e-01 2.474208e-04]
 [4.181787e-03 9.958182e-01]
 [9.280080e-01 7.199199e-02]], shape=(3, 2), dtype=float32)
tf.Tensor(0, shape=(), dtype=int64)
