## Modules and models

In [1]:
!pip install tensorflow-gpu 
# !pip install tensorflow
!pip install tensorflow_hub matplotlib tokenizers
!pip install tensorflow_text
!pip install bert-for-tf2
!pip install transformers

Collecting tensorflow-gpu
[?25l  Downloading https://files.pythonhosted.org/packages/18/99/ac32fd13d56e40d4c3e6150030132519997c0bb1f06f448d970e81b177e5/tensorflow_gpu-2.3.1-cp36-cp36m-manylinux2010_x86_64.whl (320.4MB)
[K     |████████████████████████████████| 320.4MB 50kB/s 
Installing collected packages: tensorflow-gpu
Successfully installed tensorflow-gpu-2.3.1
Collecting tokenizers
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 9.0MB/s 
Installing collected packages: tokenizers
Successfully installed tokenizers-0.9.4
Collecting tensorflow_text
[?25l  Downloading https://files.pythonhosted.org/packages/28/b2/2dbd90b93913afd07e6101b8b84327c401c394e60141c1e98590038060b3/tensorflow_text-2.3.0-cp36-cp36m-manylinux1_x86_64.whl (2.6MB)
[K     |████████████████████████████████| 2.6MB 10.1MB/s 
Insta

In [2]:
# Create data folder
!mkdir /content/pre_model

In [3]:
!mkdir /content/pre_model/zh_roberta_wwm
!mkdir /content/pre_model/albert_base_zh
!mkdir /content/pre_model/multi_cased_bert

In [4]:
# Mount Google Drive under /content/drive; the pretrained-model archive is
# copied from there in the next step.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Copy the chinese_wwm_ext_L-12_H-768_A-12 archive from Drive to local disk.
!cp '/content/drive/MyDrive/Colab Notebooks/pretrained_models/chinese_wwm_ext_L-12_H-768_A-12.zip' /content/pre_model/

In [6]:
# Extract the archive into its own folder (`-o` overwrites existing files
# without prompting, `-d` selects the destination directory).
!unzip -o '/content/pre_model/chinese_wwm_ext_L-12_H-768_A-12.zip' -d '/content/pre_model/chinese_wwm_ext_L-12_H-768_A-12'

Archive:  /content/pre_model/chinese_wwm_ext_L-12_H-768_A-12.zip
  inflating: /content/pre_model/chinese_wwm_ext_L-12_H-768_A-12/bert_config.json  
  inflating: /content/pre_model/chinese_wwm_ext_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001  
  inflating: /content/pre_model/chinese_wwm_ext_L-12_H-768_A-12/bert_model.ckpt.index  
  inflating: /content/pre_model/chinese_wwm_ext_L-12_H-768_A-12/bert_model.ckpt.meta  
  inflating: /content/pre_model/chinese_wwm_ext_L-12_H-768_A-12/vocab.txt  


In [7]:
## Local model, take bert as example
## Download bert model
# https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/multi_cased_L-12_H-768_A-12.tar.gz
# !wget https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/multi_cased_L-12_H-768_A-12.tar.gz


In [8]:
## Unzip the file
# !tar -zxvf multi_cased_L-12_H-768_A-12.tar.gz -C /content/pre_model/multi_cased_bert

In [9]:
# !wget 'https://code.aliyun.com/qhduan/zh-roberta-wwm/raw/2c0d7fd709e4719a9ab2ca297f51b24e20586dbe/zh-roberta-wwm-L12.tar.gz'

In [10]:
# !tar -zxvf zh-roberta-wwm-L12.tar.gz -C /content/pre_model/zh_roberta_wwm

In [11]:
# !wget https://storage.googleapis.com/albert_models/albert_base_zh.tar.gz
## Unzip the file
# !tar -zxvf albert_base_zh.tar.gz -C /content/pre_model/albert_base_zh


## Import modules

In [12]:
import pandas as pd
import numpy as np
import os
import json

os.environ['TFHUB_DOWNLOAD_PROGRESS'] = '1'

from tqdm import tqdm
import tensorflow as tf
import tensorflow_hub as tf_hub
import tensorflow as tf
import tensorflow_text as tf_text
from tokenizers import BertWordPieceTokenizer

import bert as bert_4_tf2

import matplotlib.pyplot as plt
%matplotlib inline

### Methods

In [92]:
def _compose_token_data_input(data, batch_size=32):
  """Build a batched `tf.data.Dataset` of (token strings, label) pairs.

  Every record is read through `.get('q1')`, `.get('q2')` and `.get('label')`.
  The question pair is encoded with the module-level `tokenizer` and the
  token strings (not the ids) of the encoding are kept.

  Args:
    data: re-iterable sequence of dict-like records (it is traversed twice).
    batch_size: number of examples per batch.

  Returns:
    A dataset yielding `(tokens, labels)` per batch, where `tokens` is the
    ragged token batch converted with `to_tensor()`.
  """
  token_rows = []
  for record in tqdm(data):
    encoding = tokenizer.encode(record.get('q1'), record.get('q2'))
    token_rows.append(encoding.tokens)
  label_values = [int(record.get('label')) for record in data]

  token_ds = tf.data.Dataset.from_tensor_slices(
      tf.ragged.constant(token_rows, tf.string))
  label_ds = tf.data.Dataset.from_tensor_slices(
      tf.constant(label_values, tf.int32))

  @tf.autograph.experimental.do_not_convert
  def _densify(tokens, labels):
    # Rows differ in length, so each batch is ragged until converted here.
    return tokens.to_tensor(), labels

  batched = tf.data.Dataset.zip((token_ds, label_ds)).batch(batch_size)
  return batched.map(_densify)



def _compose_bert_data_input_by_tokenizer(data, batch_size=32, max_seq_length=128, by_batch=True):
  """Build a `tf.data.Dataset` of BERT-style feature dicts and labels.

  NOTE: This one is still not working. It throws different kinds of errors
  when the resulting dataset is fed into the model.

  Every record is read through `.get('q1')`, `.get('q2')` and `.get('label')`.
  The question pair is encoded with the module-level `tokenizer` and padded
  with `Encoding.pad(max_seq_length)`. No truncation is applied here; that is
  left to however `tokenizer` has been configured.

  Args:
    data: re-iterable sequence of dict-like records (it is traversed twice).
    batch_size: examples per batch when `by_batch` is true.
    max_seq_length: length every encoding is padded to.
    by_batch: when false, the dataset is batched with a batch size of 1.

  Returns:
    A dataset yielding `(features, labels)` where `features` is a dict with
    the keys "input_word_ids", "attention_mask" and "token_type_ids".
  """
  X_word_ids = []
  X_masks = []
  X_seq = []

  for x in tqdm(data):
    t_encoder = tokenizer.encode(x.get('q1'), x.get('q2'))
    t_encoder.pad(max_seq_length)

    X_word_ids.append(t_encoder.ids)
    X_masks.append(t_encoder.attention_mask)
    X_seq.append(t_encoder.type_ids)

  Y = [
       int(x.get('label')) for x in tqdm(data)
  ]

  X_word_ids = tf.ragged.constant(X_word_ids, tf.int32, name="input_word_ids")
  X_masks = tf.ragged.constant(X_masks, tf.int32, name="attention_mask")
  X_seq = tf.ragged.constant(X_seq, tf.int32, name="token_type_ids")
  Y = tf.constant(Y, tf.int32)

  def _to_tensor(x1, x2, x3, y):
    # Convert the ragged batch of each feature to a dense tensor.
    return {
        "input_word_ids": x1.to_tensor(),
        "attention_mask": x2.to_tensor(),
        "token_type_ids": x3.to_tensor()
        }, y

  # The two modes differ only in the batch size, so they share one pipeline.
  effective_batch_size = batch_size if by_batch else 1

  return tf.data.Dataset.zip((
      tf.data.Dataset.from_tensor_slices(X_word_ids),
      tf.data.Dataset.from_tensor_slices(X_masks),
      tf.data.Dataset.from_tensor_slices(X_seq),
      tf.data.Dataset.from_tensor_slices(Y),
  )).batch(effective_batch_size).map(_to_tensor)
  

def _compose_bert_data_input_by_tokenizer_to_dict(data, batch_size=32, max_seq_length=128):
  """Encode question pairs into a dict of NumPy arrays.

  Every record is read through `.get('q1')`, `.get('q2')` and `.get('label')`.
  The question pair is encoded with the module-level `tokenizer` and padded
  with `Encoding.pad(max_seq_length)`. No truncation is applied here; that is
  left to however `tokenizer` has been configured.

  Args:
    data: re-iterable sequence of dict-like records (it is traversed twice).
    batch_size: unused; kept so existing callers keep working.
    max_seq_length: length every encoding is padded to.

  Returns:
    A dict with the keys "input_word_ids", "attention_mask", "token_type_ids"
    and "label", each holding a NumPy array built from the per-record values.
    The feature arrays are 2-D only when all encodings end up with the same
    length after padding.
  """
  X_word_ids = []
  X_masks = []
  X_seq = []

  for x in tqdm(data):
    t_encoder = tokenizer.encode(x.get('q1'), x.get('q2'))
    t_encoder.pad(max_seq_length)

    X_word_ids.append(t_encoder.ids)
    X_masks.append(t_encoder.attention_mask)
    X_seq.append(t_encoder.type_ids)

  Y = [
       int(x.get('label')) for x in tqdm(data)
  ]

  return {
        "input_word_ids": np.array(X_word_ids),
        "attention_mask": np.array(X_masks),
        "token_type_ids": np.array(X_seq),
        "label": np.array(Y)}

## Data

### Mayi (Ant Financial) text semantic-similarity data

In [71]:
# Sequence length used for the shape of the model's Input layers.
max_seq_len = 128

In [14]:
!wget https://raw.githubusercontent.com/ccuulinay/texts_semantic_sim/master/data/train.txt
!wget https://raw.githubusercontent.com/ccuulinay/texts_semantic_sim/master/data/test.txt

--2020-12-13 16:54:02--  https://raw.githubusercontent.com/ccuulinay/texts_semantic_sim/master/data/train.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5174256 (4.9M) [text/plain]
Saving to: ‘train.txt’


2020-12-13 16:54:02 (26.3 MB/s) - ‘train.txt’ saved [5174256/5174256]

--2020-12-13 16:54:02--  https://raw.githubusercontent.com/ccuulinay/texts_semantic_sim/master/data/test.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2553277 (2.4M) [text/plain]
Saving to: ‘test.txt’


2020-12-13 16:54:03 (31.4 MB/s) - ‘test.txt’ saved [255

In [15]:
# Local file names of the downloaded splits.
train_file, test_file = "train.txt", "test.txt"

In [16]:
DATA_COLUMNS = ["q1", "q2"]
LABEL_COLUMN = 'label'

# Both files are read as tab-separated text without a header row; the column
# names are the two question columns followed by the label column.
_column_names = DATA_COLUMNS + [LABEL_COLUMN]
train, test = (
    pd.read_csv(path, sep="\t", header=None, names=_column_names)
    for path in (train_file, test_file)
)

In [17]:
N_PER_CLASS = 2500  # rows drawn from each label class
SAMPLE_SEED = 42    # fixed seed so the subsample is reproducible across runs


def _balanced_sample(frame, n_per_class=N_PER_CLASS, seed=SAMPLE_SEED):
  """Return `n_per_class` rows of label 0 and of label 1, shuffled.

  Args:
    frame: DataFrame with a `LABEL_COLUMN` column holding 0/1 labels.
    n_per_class: number of rows sampled (without replacement) per label.
    seed: random state passed to every `DataFrame.sample` call.

  Returns:
    A new DataFrame with `2 * n_per_class` rows in shuffled order.
  """
  per_class = [
      frame[frame[LABEL_COLUMN] == label].sample(n_per_class, random_state=seed)
      for label in (0, 1)
  ]
  # Shuffle so rows are not grouped by label: the tf.data pipelines built from
  # these frames batch rows in order and would otherwise get single-class batches.
  return pd.concat(per_class).sample(frac=1, random_state=seed)


train = _balanced_sample(train)
test = _balanced_sample(test)


In [90]:
# WordPiece tokenizer built from the vocabulary shipped with the checkpoint.
tokenizer = BertWordPieceTokenizer("/content/pre_model/chinese_wwm_ext_L-12_H-768_A-12/vocab.txt")
# Truncate to the same length the model inputs are declared with, instead of
# repeating the literal 128 here.
tokenizer.enable_truncation(max_seq_len)
# tokenizer = BertWordPieceTokenizer("/content/pre_model/multi_cased_bert/multi_cased_L-12_H-768_A-12/vocab.txt")
# tokenizer = BertWordPieceTokenizer("/content/pre_model/albert_base_zh/albert_base/vocab_chinese.txt")

In [93]:
# data_train = _compose_token_data_input(train.to_dict(orient='records'))
# data_test = _compose_token_data_input(test.to_dict(orient='records'))

# Convert each frame to a list of row dicts once and hand it to both builders.
train_records = train.to_dict(orient='records')
test_records = test.to_dict(orient='records')

# tf.data pipelines (feature dict, label).
data_train = _compose_bert_data_input_by_tokenizer(train_records)
data_test = _compose_bert_data_input_by_tokenizer(test_records)

# Plain dicts of NumPy arrays; these are what `fit` is called with below.
dict_train = _compose_bert_data_input_by_tokenizer_to_dict(train_records)
dict_test = _compose_bert_data_input_by_tokenizer_to_dict(test_records)

100%|██████████| 5000/5000 [00:00<00:00, 9861.36it/s]
100%|██████████| 5000/5000 [00:00<00:00, 995042.70it/s]
100%|██████████| 5000/5000 [00:00<00:00, 9922.46it/s]
100%|██████████| 5000/5000 [00:00<00:00, 930537.34it/s]
100%|██████████| 5000/5000 [00:00<00:00, 9955.32it/s] 
100%|██████████| 5000/5000 [00:00<00:00, 959838.89it/s]
100%|██████████| 5000/5000 [00:00<00:00, 9855.30it/s]
100%|██████████| 5000/5000 [00:00<00:00, 1320791.03it/s]


In [20]:
# test.to_dict(orient='records')[3000]

In [21]:
# for x, y in data_test.take(1):
#   print(x, y)

In [22]:
# Inspect how a single question pair is encoded. Both questions are taken
# from the same row (the earlier version paired a `test` q1 with a `train` q2).
sample_row = test.iloc[0]
t_obj = tokenizer.encode(sample_row['q1'], sample_row['q2'])
t_obj.pad(max_seq_len)
a = t_obj.tokens
# Map every token string back to its vocabulary id.
word_ids = [tokenizer.token_to_id(t) for t in a]
mask_list = t_obj.attention_mask
seq_list = t_obj.type_ids

print(f"sentence ({len(a)}): {a}")
print(f"word ids ({len(word_ids)}) {type(word_ids)}: {word_ids}")
print(f"mask ({len(mask_list)}): {mask_list}")
print(f"seq ({len(seq_list)}): {seq_list}")

sentence (128): ['[CLS]', '我', '想', '开', '通', '花', '呗', '。', '可', '以', '吗', '[SEP]', '能', '不', '能', '提', '升', '下', '我', '的', '花', '呗', '额', '度', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', 

## Build model

In [23]:
# bert_layer = tf_hub.KerasLayer(

#     # 'https://code.aliyun.com/qhduan/zh-roberta-wwm/raw/2c0d7fd709e4719a9ab2ca297f51b24e20586dbe/zh-roberta-wwm-L12.tar.gz'
#     # 'https://tfhub.dev/tensorflow/bert_zh_L-12_H-768_A-12/3'
#     , output_key='sequence_output'
#     , trainable=True
#     )

In [24]:
# google_bert_layer = tf_hub.KerasLayer("https://tfhub.dev/tensorflow/bert_zh_L-12_H-768_A-12/1", trainable=True)

In [27]:
# Folder the archive was extracted to, and the checkpoint prefix inside it
# (the extracted files are bert_model.ckpt.index / .meta / .data-*).
wwm_model_dir = '/content/pre_model/chinese_wwm_ext_L-12_H-768_A-12'
wwm_model_ckpt = os.path.join(wwm_model_dir, "bert_model.ckpt")

# multi_bert_dir = '/content/pre_model/multi_cased_bert/multi_cased_L-12_H-768_A-12'
# multi_bert_ckpt = os.path.join(multi_bert_dir, "bert_model.ckpt")

# albert_model_dir = '/content/pre_model/albert_base_zh/albert_base'
# albert_ckpt = os.path.join(albert_model_dir, "model.ckpt-best")

In [28]:
# Read the layer parameters from the checkpoint folder and build the BERT
# Keras layer from them. The pretrained weights are loaded separately, after
# the model has been built.
wwm_model_params = bert_4_tf2.params_from_pretrained_ckpt(wwm_model_dir)
wwm_cn_bert = bert_4_tf2.BertModelLayer.from_params(wwm_model_params, name='bert')

# multi_bert_params = bert_4_tf2.params_from_pretrained_ckpt(multi_bert_dir)
# multi_bert = bert_4_tf2.BertModelLayer.from_params(multi_bert_params, name='bert')

# albert_model_params = bert_4_tf2.albert_params(albert_model_dir)
# albert_cn = bert_4_tf2.BertModelLayer.from_params(albert_model_params, name="albert")

In [29]:
# Training hyper-parameters. `max_seq_len` is defined once in the Data section
# and used as-is; the former `max_seq_len = max_seq_len` line was a no-op.
BATCH_SIZE = 32
# LEARNING_RATE = 3e-5
LEARNING_RATE = 2e-5
# LEARNING_RATE = 1e-5
# LEARNING_RATE = 0.000015
NUM_TRAIN_EPOCHS = 5

In [30]:
# Select which pretrained layer/checkpoint pair the model below is built from.
bert_layer = wwm_cn_bert
bert_model_ckpt = wwm_model_ckpt

In [31]:

# # Way for tensorflow hub layer
# input_word_ids = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32,
#                                        name="input_word_ids")
# input_mask = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32,
#                                    name="attention_mask")
# segment_ids = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32,
#                                     name="token_type_ids")

# m_pooled_output, m_sequence_output = google_bert_layer([input_word_ids, input_mask, segment_ids])
# # m_pooled_output, m_sequence_output = wwm_cn_bert(
# #     input_word_ids, 
# #     token_type_ids=segment_ids, 
# #     attention_mask=input_mask
# # )

# cls_output = tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)(m_sequence_output)

In [32]:
# Model inputs, each a fixed-length int32 sequence of `max_seq_len` positions.
input_word_ids = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32,
                                       name="input_word_ids")
# Not wired into the model below; kept because the commented-out tf_hub
# variant takes it as an input.
input_mask = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32,
                                   name="attention_mask")
segment_ids = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32,
                                    name="token_type_ids")

# m_pooled_output, m_sequence_output = multi_bert([input_word_ids, segment_ids])
# output = multi_bert([input_word_ids, segment_ids])
# output = wwm_cn_bert([input_word_ids, segment_ids])
# The BERT layer is called with token ids and segment ids only.
output = bert_layer([input_word_ids, segment_ids])
# Average the per-position outputs over the sequence axis.
pooled = tf.keras.layers.GlobalAveragePooling1D()(output)
# Single-unit sigmoid head. Despite its name, `logits` holds the sigmoid
# output, which is what the 'binary_crossentropy' loss is applied to.
logits = tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)(pooled)

# bert_4_tf2.load_bert_weights(multi_bert, multi_bert_ckpt)
# cls = tf.keras.Model(inputs=inputs_list, outputs=logits)

In [33]:
# 3 dim input is for tf_hub to create bert keras layers
# embed_model = tf.keras.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=[m_pooled_output, m_sequence_output])
# cls = tf.keras.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=cls_output)

# Classifier model: two inputs (word ids, token-type ids) -> sigmoid output.
cls = tf.keras.Model(inputs=[input_word_ids, segment_ids], outputs=logits)

In [34]:
## !!! This step is important when using bert-for-tf2 (imported as bert_4_tf2):
## the layer built with `from_params` gets its pretrained weights only here. ###

# bert_4_tf2.load_albert_weights(bert_layer, bert_model_ckpt)
bert_4_tf2.load_bert_weights(bert_layer, bert_model_ckpt)

Done loading 197 BERT weights from: /content/pre_model/chinese_wwm_ext_L-12_H-768_A-12/bert_model.ckpt into <bert.model.BertModelLayer object at 0x7f56e899d198> (prefix:bert). Count of weights not found in the checkpoint was: [0]. Count of weights with mismatched shape: [0]
Unused weights from checkpoint: 
	bert/pooler/dense/bias
	bert/pooler/dense/kernel
	cls/predictions/output_bias
	cls/predictions/transform/LayerNorm/beta
	cls/predictions/transform/LayerNorm/gamma
	cls/predictions/transform/dense/bias
	cls/predictions/transform/dense/kernel
	cls/seq_relationship/output_bias
	cls/seq_relationship/output_weights
	global_step


[]

In [35]:
# Binary classification set-up: sigmoid output with binary cross-entropy.
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
cls.compile(
    loss='binary_crossentropy',
    optimizer=tf.optimizers.Adam(learning_rate=LEARNING_RATE),
    metrics=['accuracy']
)


In [36]:
# str_inputs = tf.keras.layers.Input(shape=(None,), dtype=tf.string)

# m = str_inputs
# m = bert_layer(m)
# m = tf.keras.layers.Masking()(m)
# m = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128))(m)
# m = tf.keras.layers.Dense(2, activation='softmax')(m)
# model = tf.keras.Model(inputs=inputs, outputs=m)

In [37]:
# Print the layer-by-layer structure and parameter counts of the classifier.
cls.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
token_type_ids (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
bert (BertModelLayer)           (None, 128, 768)     101677056   input_word_ids[0][0]             
                                                                 token_type_ids[0][0]             
__________________________________________________________________________________________________
global_average_pooling1d (Globa (None, 768)          0           bert[0][0]            

In [99]:
 # v = data_test.take(100)
# The model takes two inputs, in the order it was built with:
# word ids first, token-type ids second.
train_inputs = [dict_train["input_word_ids"], dict_train["token_type_ids"]]
val_inputs = [dict_test["input_word_ids"], dict_test["token_type_ids"]]

history = cls.fit(
    x=train_inputs,
    y=dict_train["label"],
    batch_size=BATCH_SIZE,
    epochs=NUM_TRAIN_EPOCHS,
    verbose=1,
    validation_data=(val_inputs, dict_test["label"]),
)

Epoch 1/5

KeyboardInterrupt: ignored

In [None]:
# history = cls.fit(
#     data_train, 
#     epochs=2,
#     validation_data=data_test
# )

In [None]:
# Evaluate with the same array inputs the model was fitted with. `data_test`
# yields a three-key feature dict (including "attention_mask") from a builder
# whose docstring flags it as not working with the model, while `cls` takes
# only word ids and token-type ids.
cls.evaluate(
    [dict_test["input_word_ids"], dict_test["token_type_ids"]],
    dict_test["label"],
    batch_size=BATCH_SIZE,
)

In [None]:
# Per-epoch curves recorded by `fit`.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

loss = history.history['loss']
val_loss = history.history['val_loss']

# (subplot position, training curve, validation curve, metric name,
#  legend location, y-axis label) for the two stacked panels.
_panels = (
    (1, acc, val_acc, 'Accuracy', 'lower right', 'Accuracy'),
    (2, loss, val_loss, 'Loss', 'upper right', 'Binary Cross Entropy'),
)

plt.figure(figsize=(8, 8))
for _position, _train_curve, _val_curve, _metric, _legend_loc, _ylabel in _panels:
  plt.subplot(2, 1, _position)
  plt.plot(_train_curve, label=f'Training {_metric}')
  plt.plot(_val_curve, label=f'Validation {_metric}')
  plt.legend(loc=_legend_loc)
  plt.ylabel(_ylabel)
  plt.title(f'Training and Validation {_metric}')
# Only the bottom panel carries the x-axis label.
plt.xlabel('epoch')
plt.show()

In [None]:
# Print the first (features, labels) batch of the tf.data training pipeline.
for first_batch in data_train.take(1):
  print(first_batch)