In [1]:
import os

# Restrict this process to GPU index 0. Set ahead of the TensorFlow import
# that follows in the next cell.
os.environ.update({'CUDA_VISIBLE_DEVICES': '0'})

In [2]:
import bert
from bert import optimization
from bert import tokenization
from bert import modeling
import numpy as np
import json
import tensorflow as tf
import itertools
import collections
import re
import random
import sentencepiece as spm
from unidecode import unidecode
from sklearn.utils import shuffle
from tqdm import tqdm
from prepro_utils import preprocess_text, encode_ids, encode_pieces
from malaya.text.function import transformer_textcleaning as cleaning
from tensorflow.python.estimator.run_config import RunConfig
import bert_utils as squad_utils




In [3]:
import tensorflow as tf
import logging

# Silence TensorFlow: only errors from the TF1 logging module and the TF
# Python logger, and autograph verbosity set to level 1.
tf_v1_logging = tf.compat.v1.logging
tf_v1_logging.set_verbosity(tf_v1_logging.ERROR)

tf_logger = tf.compat.v1.get_logger()
tf_logger.setLevel(logging.ERROR)

tf.compat.v1.autograph.set_verbosity(1)

In [4]:
# Load the SentencePiece model used for tokenization.
sp_model = spm.SentencePieceProcessor()
sp_model.Load('sp10m.cased.bert.model')

# Read the companion vocab file: one tab-separated entry per line, of which
# the first two fields are kept as key -> value.
# The encoding is pinned to UTF-8 so the non-ASCII pieces decode the same way
# regardless of the platform's default locale. Lines are read individually and
# blank ones skipped, so a file without a trailing newline no longer loses its
# last entry.
with open('sp10m.cased.bert.vocab', encoding = 'utf-8') as fopen:
    v = [line.rstrip('\n') for line in fopen]
v = [i.split('\t') for i in v if i]
v = {i[0]: i[1] for i in v}


class Tokenizer:
    """Thin adapter around a SentencePiece processor.

    Keeps the vocab mapping it is given and forwards all piece/id
    conversions to the wrapped `sp_model`.
    """

    def __init__(self, v, sp_model):
        """Store the vocab mapping `v` and the SentencePiece processor."""
        self.vocab = v
        self.sp_model = sp_model

    def tokenize(self, string):
        """Split `string` into pieces via `encode_pieces` (no sampling,
        `return_unicode` disabled)."""
        pieces = encode_pieces(
            self.sp_model, string, return_unicode = False, sample = False
        )
        return pieces

    def convert_tokens_to_ids(self, tokens):
        """Map every piece in `tokens` to its id, preserving order."""
        to_id = self.sp_model.PieceToId
        return list(map(to_id, tokens))

    def convert_ids_to_tokens(self, ids):
        """Map every id in `ids` back to its piece, preserving order."""
        to_piece = self.sp_model.IdToPiece
        return list(map(to_piece, ids))


# Notebook-wide tokenizer built from the vocab mapping and SentencePiece model loaded above.
tokenizer = Tokenizer(v = v, sp_model = sp_model)

In [5]:
import pickle

# The pickle holds a 2-item sequence: (train_features, train_examples).
# NOTE(review): pickle.load executes arbitrary code from the file, so only
# load a pickle you produced yourself or otherwise trust.
with open('bert-squad-train.pkl', 'rb') as pkl_file:
    loaded_train_data = pickle.load(pkl_file)

train_features, train_examples = loaded_train_data

In [6]:
# Fixed sequence length: used further down as the one-hot depth and tiling
# factor when building position features, so fed sequences are presumably
# padded/truncated to exactly this many tokens — TODO confirm against the
# featurization that produced bert-squad-train.pkl.
max_seq_length = 384
# Not referenced elsewhere in this notebook; presumably the sliding-window and
# query-length settings used when the training features were built — confirm.
doc_stride = 128
max_query_length = 64

In [7]:
# Encoder hyperparameters are read from the JSON config under tiny-bert-v1/.
bert_config_path = 'tiny-bert-v1/config.json'
bert_config = modeling.BertConfig.from_json_file(bert_config_path)

In [8]:
# Training hyperparameters.
epoch = 5
batch_size = 22
warmup_proportion = 0.1
n_best_size = 20

# The training loop walks `train_features` in strides of `batch_size` and also
# runs the final, possibly shorter, batch. Each epoch therefore performs
# ceil(len(train_features) / batch_size) optimizer steps. Use integer ceiling
# division so the step count handed to the optimizer's schedule covers every
# step the loop actually executes (the previous float-division form could come
# out a few steps short).
steps_per_epoch = -(-len(train_features) // batch_size)
num_train_steps = steps_per_epoch * epoch
num_warmup_steps = int(num_train_steps * warmup_proportion)

In [9]:
from tensorflow.contrib import layers as contrib_layers

class Model:
    """Input placeholders plus the BERT encoder built on top of them.

    Every input is an int32 placeholder with unconstrained dimensions. The
    encoder's sequence output is exposed as `self.output` for the heads that
    the following cells attach.
    """

    def __init__(self, is_training = True):
        """Create the placeholders and the encoder.

        `is_training` is forwarded unchanged to `modeling.BertModel`.
        """

        def int_input(*dims):
            # All inputs share the dtype; only the rank differs.
            return tf.compat.v1.placeholder(tf.compat.v1.int32, list(dims))

        # Rank-2 token-level inputs.
        self.X = int_input(None, None)
        self.segment_ids = int_input(None, None)
        self.input_masks = int_input(None, None)
        # Rank-1 per-example span labels.
        self.start_positions = int_input(None)
        self.end_positions = int_input(None)
        # Rank-2 mask and rank-1 answerability label.
        self.p_mask = int_input(None, None)
        self.is_impossible = int_input(None)

        encoder = modeling.BertModel(
            config = bert_config,
            is_training = is_training,
            input_ids = self.X,
            input_mask = self.input_masks,
            token_type_ids = self.segment_ids,
            use_one_hot_embeddings = False,
        )

        self.output = encoder.get_sequence_output()

In [10]:
# Run configuration for the graph built in the following cells.
learning_rate = 2e-5
is_training = True

# Beam widths read by the inference branch of the end-position head
# (taken when `is_training` is False). That branch references these names but
# nothing in the notebook assigned them, so switching to inference raised a
# NameError. 5/5 follows the defaults of the XLNet SQuAD reference script —
# TODO confirm these are the intended widths for this model.
start_n_top = 5
end_n_top = 5

# Start from an empty default graph so re-running this cell does not stack a
# second copy of the model on top of the first.
tf.compat.v1.reset_default_graph()
model = Model(is_training = is_training)

In [11]:
# Size of the encoder output's first axis, taken dynamically from the graph.
bsz = tf.compat.v1.shape(model.output)[0]

# Collects the tensors that later cells read back by key.
return_dict = dict()

# Swap the first two axes of the encoder output; the einsum calls below index
# the result as 'lbh'.
output = tf.compat.v1.transpose(model.output, perm = [1, 0, 2])

# Float copy of the invalid-position mask (query tokens and special symbols
# such as PAD, SEP, CLS), used to push those positions to -1e30 before softmax.
p_mask = tf.compat.v1.cast(model.p_mask, tf.compat.v1.float32)

# logit of the start position
with tf.compat.v1.variable_scope('start_logits'):
    # One unit per position: project every hidden vector to a scalar score.
    start_kernel_init = modeling.create_initializer(
        bert_config.initializer_range
    )
    start_logits = tf.layers.dense(
        output, 1, kernel_initializer = start_kernel_init
    )
    # Drop the size-1 unit axis, then swap the two remaining axes so the
    # result lines up with `p_mask`.
    start_logits = tf.compat.v1.squeeze(start_logits, -1)
    start_logits = tf.compat.v1.transpose(start_logits, [1, 0])
    # Masked-out positions (p_mask == 1) are replaced by -1e30 so they get
    # effectively zero probability.
    start_keep = 1 - p_mask
    start_logits_masked = start_logits * start_keep - 1e30 * p_mask
    start_log_probs = tf.compat.v1.nn.log_softmax(start_logits_masked, -1)

# logit of the end position
with tf.compat.v1.variable_scope('end_logits'):
    # End-position head. Both branches share the same layer stack
    # (dense_0 with tanh -> layer norm -> dense_1 with one unit) and differ
    # only in which start position(s) the end scores are conditioned on.
    # `output` is indexed as (l, b, h) by the einsum calls below.
    if is_training:
        # during training, compute the end logits based on the
        # ground truth of the start position
        start_positions = tf.compat.v1.reshape(model.start_positions, [-1])
        # one-hot over max_seq_length positions, so the einsum below picks out
        # the hidden vector at each example's labelled start position
        start_index = tf.compat.v1.one_hot(
            start_positions,
            depth = max_seq_length,
            axis = -1,
            dtype = tf.compat.v1.float32,
        )
        start_features = tf.compat.v1.einsum('lbh,bl->bh', output, start_index)
        # repeat the selected start vector along a new leading axis so it can
        # be concatenated with `output` on the last axis
        start_features = tf.compat.v1.tile(
            start_features[None], [max_seq_length, 1, 1]
        )
        end_logits = tf.layers.dense(
            tf.compat.v1.concat([output, start_features], axis = -1),
            bert_config.hidden_size,
            kernel_initializer = modeling.create_initializer(
                bert_config.initializer_range
            ),
            activation = tf.compat.v1.tanh,
            name = 'dense_0',
        )
        end_logits = contrib_layers.layer_norm(
            end_logits, begin_norm_axis = -1
        )

        end_logits = tf.layers.dense(
            end_logits,
            1,
            kernel_initializer = modeling.create_initializer(
                bert_config.initializer_range
            ),
            name = 'dense_1',
        )
        # drop the size-1 unit axis and swap the remaining two axes to match p_mask
        end_logits = tf.compat.v1.transpose(tf.compat.v1.squeeze(end_logits, -1), [1, 0])
        # positions with p_mask == 1 are replaced by -1e30 before the softmax
        end_logits_masked = end_logits * (1 - p_mask) - 1e30 * p_mask
        end_log_probs = tf.compat.v1.nn.log_softmax(end_logits_masked, -1)
    else:
        # during inference, compute the end logits based on beam search
        # NOTE(review): `start_n_top` and `end_n_top` are not assigned in this
        # cell; they must already exist in the notebook namespace before this
        # branch is built.

        # keep the start_n_top highest-scoring start positions per example
        start_top_log_probs, start_top_index = tf.compat.v1.nn.top_k(
            start_log_probs, k = start_n_top
        )
        start_index = tf.compat.v1.one_hot(
            start_top_index,
            depth = max_seq_length,
            axis = -1,
            dtype = tf.compat.v1.float32,
        )
        # hidden vector at each of the k candidate starts: (b, k, h)
        start_features = tf.compat.v1.einsum('lbh,bkl->bkh', output, start_index)
        # broadcast both operands to (l, b, k, h) so every position is paired
        # with every candidate start
        end_input = tf.compat.v1.tile(output[:, :, None], [1, 1, start_n_top, 1])
        start_features = tf.compat.v1.tile(
            start_features[None], [max_seq_length, 1, 1, 1]
        )
        end_input = tf.compat.v1.concat([end_input, start_features], axis = -1)
        end_logits = tf.layers.dense(
            end_input,
            bert_config.hidden_size,
            kernel_initializer = modeling.create_initializer(
                bert_config.initializer_range
            ),
            activation = tf.compat.v1.tanh,
            name = 'dense_0',
        )
        end_logits = contrib_layers.layer_norm(
            end_logits, begin_norm_axis = -1
        )
        end_logits = tf.layers.dense(
            end_logits,
            1,
            kernel_initializer = modeling.create_initializer(
                bert_config.initializer_range
            ),
            name = 'dense_1',
        )
        # collapse the unit axis, then reorder to (b, k, l) so the softmax and
        # top_k below run over positions
        end_logits = tf.compat.v1.reshape(
            end_logits, [max_seq_length, -1, start_n_top]
        )
        end_logits = tf.compat.v1.transpose(end_logits, [1, 2, 0])
        # p_mask gains a singleton axis so it broadcasts across the k candidates
        end_logits_masked = (
            end_logits * (1 - p_mask[:, None]) - 1e30 * p_mask[:, None]
        )
        end_log_probs = tf.compat.v1.nn.log_softmax(end_logits_masked, -1)
        # for every candidate start keep the end_n_top best end positions
        end_top_log_probs, end_top_index = tf.compat.v1.nn.top_k(
            end_log_probs, k = end_n_top
        )
        # flatten the (start, end) candidate grid to one axis per example
        end_top_log_probs = tf.compat.v1.reshape(
            end_top_log_probs, [-1, start_n_top * end_n_top]
        )
        end_top_index = tf.compat.v1.reshape(
            end_top_index, [-1, start_n_top * end_n_top]
        )
        
# Publish the tensors of whichever branch was built above: full log-prob
# distributions for training, beam candidates for inference.
if not is_training:
    return_dict.update(
        start_top_log_probs = start_top_log_probs,
        start_top_index = start_top_index,
        end_top_log_probs = end_top_log_probs,
        end_top_index = end_top_index,
    )
else:
    return_dict.update(
        start_log_probs = start_log_probs,
        end_log_probs = end_log_probs,
    )

# an additional layer to predict answerability
# an additional layer to predict answerability
with tf.compat.v1.variable_scope('answer_class'):
    # Produces one scalar logit per example (`cls_logits`) from two pooled
    # features: the hidden vector at position 0 and a start-probability-weighted
    # average of all hidden vectors. `output` is indexed as (l, b, h).

    # get the representation of CLS
    # (one-hot on index 0 for every example, so the einsum selects position 0)
    cls_index = tf.compat.v1.one_hot(
        tf.compat.v1.zeros([bsz], dtype = tf.compat.v1.int32),
        max_seq_length,
        axis = -1,
        dtype = tf.compat.v1.float32,
    )
    cls_feature = tf.compat.v1.einsum('lbh,bl->bh', output, cls_index)

    # get the representation of START
    # (softmax over the masked start logits, used as weights over positions)
    start_p = tf.compat.v1.nn.softmax(
        start_logits_masked, axis = -1, name = 'softmax_start'
    )
    start_feature = tf.compat.v1.einsum('lbh,bl->bh', output, start_p)

    # note(zhiliny): no dependency on end_feature so that we can obtain
    # one single `cls_logits` for each sample
    ans_feature = tf.compat.v1.concat([start_feature, cls_feature], -1)
    ans_feature = tf.layers.dense(
        ans_feature,
        bert_config.hidden_size,
        activation = tf.compat.v1.tanh,
        kernel_initializer = modeling.create_initializer(
            bert_config.initializer_range
        ),
        name = 'dense_0',
    )
    # dropout is only active when the graph was built with is_training = True
    ans_feature = tf.layers.dropout(
        ans_feature, bert_config.hidden_dropout_prob, training = is_training
    )
    # single-unit projection without bias; squeezed below to one value per example
    cls_logits = tf.layers.dense(
        ans_feature,
        1,
        kernel_initializer = modeling.create_initializer(
            bert_config.initializer_range
        ),
        name = 'dense_1',
        use_bias = False,
    )
    cls_logits = tf.compat.v1.squeeze(cls_logits, -1)
    
# exported in both training and inference mode
return_dict['cls_logits'] = cls_logits

In [12]:
# Dynamic length of the fed token-id batch along its second axis.
seq_length = tf.compat.v1.shape(model.X)[1]


def compute_loss(log_probs, positions):
    """Mean negative log-probability assigned to the labelled positions.

    `positions` is one-hot encoded with depth `seq_length`, multiplied into
    `log_probs`, summed over the last axis and averaged over the rest.
    """
    targets = tf.compat.v1.one_hot(
        positions, depth = seq_length, dtype = tf.compat.v1.float32
    )
    per_example_nll = -tf.compat.v1.reduce_sum(targets * log_probs, axis = -1)
    return tf.compat.v1.reduce_mean(per_example_nll)

# Span losses: one for the start distribution, one for the end distribution.
start_loss = compute_loss(return_dict['start_log_probs'], model.start_positions)
end_loss = compute_loss(return_dict['end_log_probs'], model.end_positions)
span_loss = (start_loss + end_loss) * 0.5

# Answerability loss: sigmoid cross-entropy between the per-example logit and
# the is_impossible label, averaged over the batch.
cls_logits = return_dict['cls_logits']
is_impossible = tf.compat.v1.reshape(model.is_impossible, [-1])
impossible_labels = tf.compat.v1.cast(is_impossible, dtype = tf.compat.v1.float32)
per_example_cls_loss = tf.compat.v1.nn.sigmoid_cross_entropy_with_logits(
    labels = impossible_labels, logits = cls_logits
)
regression_loss = tf.compat.v1.reduce_mean(per_example_cls_loss)

# note(zhiliny): by default multiply the loss by 0.5 so that the scale is
# comparable to start_loss and end_loss
total_loss = span_loss + regression_loss * 0.5

In [13]:
# Training op built by bert.optimization from the combined loss, the base
# learning rate and the step/warm-up counts computed earlier.
# NOTE(review): the trailing positional False is presumably the TPU switch —
# confirm against bert.optimization.create_optimizer.
optimizer = optimization.create_optimizer(
    total_loss,
    learning_rate,
    num_train_steps,
    num_warmup_steps,
    False,
)

In [14]:
# Open the session and give every variable an initial value first; the restore
# below then overwrites the trainable ones.
sess = tf.compat.v1.InteractiveSession()
init_op = tf.compat.v1.global_variables_initializer()
sess.run(init_op)

# Resume from the fine-tuned checkpoint, restoring all trainable variables.
# Alternative (currently disabled): restore only the trainable variables under
# scope 'bert' from 'tiny-bert-v1/model.ckpt' to start from that checkpoint
# instead.
restore_vars = tf.compat.v1.trainable_variables()
saver = tf.compat.v1.train.Saver(var_list = restore_vars)
saver.restore(sess, 'tiny-bert-squad/model.ckpt')

In [15]:
from tqdm import tqdm

In [18]:
# Training loop: for each epoch, run one optimizer step per minibatch and
# print the epoch means of the total, start, end and answerability losses.
# NOTE(review): batches are visited in the same order every epoch because
# `train_features` is sliced sequentially; consider shuffling between epochs.
for e in range(epoch):
    pbar = tqdm(
        range(0, len(train_features), batch_size), desc = 'train minibatch loop'
    )
    costs, start_losses, end_losses, regression_losses = [], [], [], []
    for i in pbar:
        batch = train_features[i: i + batch_size]
        # All per-batch lists carry a `batch_` prefix. The previous names
        # `p_mask` and `is_impossible` rebound the notebook-global graph
        # tensors of the same name to plain Python lists.
        batch_ids = [b.input_ids for b in batch]
        batch_masks = [b.input_mask for b in batch]
        batch_segment = [b.segment_ids for b in batch]
        batch_start = [b.start_position for b in batch]
        batch_end = [b.end_position for b in batch]
        batch_is_impossible = [b.is_impossible for b in batch]
        batch_p_mask = [b.p_mask for b in batch]
        # Fetching `optimizer` alongside the losses applies the update; its
        # own return value is discarded.
        cost, start_loss_, end_loss_, regression_loss_, _ = sess.run(
            [total_loss, start_loss, end_loss, regression_loss, optimizer],
            feed_dict = {
                model.start_positions: batch_start,
                model.end_positions: batch_end,
                model.X: batch_ids,
                model.segment_ids: batch_segment,
                model.input_masks: batch_masks,
                model.is_impossible: batch_is_impossible,
                model.p_mask: batch_p_mask,
            },
        )
        pbar.set_postfix(cost = cost, start_loss = start_loss_,
                        end_loss = end_loss_, regression_loss = regression_loss_)
        costs.append(cost)
        start_losses.append(start_loss_)
        end_losses.append(end_loss_)
        regression_losses.append(regression_loss_)

    print(f'epoch: {e}')
    print(np.mean(costs))
    print(np.mean(start_losses))
    print(np.mean(end_losses))
    print(np.mean(regression_losses))

train minibatch loop: 100%|██████████| 6023/6023 [12:51<00:00,  7.81it/s, cost=0.326, end_loss=0.000157, regression_loss=0.323, start_loss=0.328]   
train minibatch loop:   0%|          | 1/6023 [00:00<12:44,  7.88it/s, cost=1.92, end_loss=1.09, regression_loss=0.124, start_loss=2.62]

epoch: 0
1.2719319
1.5417314
0.5538086
0.44832382


train minibatch loop: 100%|██████████| 6023/6023 [12:46<00:00,  7.86it/s, cost=0.305, end_loss=0.000203, regression_loss=0.298, start_loss=0.311]
train minibatch loop:   0%|          | 1/6023 [00:00<13:35,  7.38it/s, cost=1.96, end_loss=0.978, regression_loss=0.164, start_loss=2.79]

epoch: 1
1.2726808
1.5423595
0.5542202
0.44878203


train minibatch loop: 100%|██████████| 6023/6023 [12:49<00:00,  7.83it/s, cost=0.449, end_loss=0.000205, regression_loss=0.427, start_loss=0.471]
train minibatch loop:   0%|          | 1/6023 [00:00<14:18,  7.01it/s, cost=1.96, end_loss=0.855, regression_loss=0.133, start_loss=2.94]

epoch: 2
1.2714281
1.5404748
0.5541636
0.44821802


train minibatch loop: 100%|██████████| 6023/6023 [12:51<00:00,  7.81it/s, cost=0.325, end_loss=0.000169, regression_loss=0.328, start_loss=0.322] 
train minibatch loop:   0%|          | 1/6023 [00:00<12:29,  8.04it/s, cost=1.95, end_loss=1.09, regression_loss=0.152, start_loss=2.65]

epoch: 3
1.2722083
1.5425301
0.5530732
0.44881335


train minibatch loop: 100%|██████████| 6023/6023 [12:45<00:00,  7.87it/s, cost=0.458, end_loss=0.000168, regression_loss=0.449, start_loss=0.467]

epoch: 4
1.2729954
1.5415951
0.554876
0.4495196





In [None]:
# Persist only the trainable variables, writing to the same checkpoint path
# the session was restored from.
vars_to_save = tf.compat.v1.trainable_variables()
saver = tf.compat.v1.train.Saver(vars_to_save)
saver.save(sess, 'tiny-bert-squad/model.ckpt')