In [1]:
import os

# Expose no CUDA devices to this process, so everything below runs on CPU.
# Keep this cell ahead of the TensorFlow import so the setting is in place when TF starts.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [2]:
import bert
from bert import optimization
from bert import tokenization
from bert import modeling
import numpy as np
import json
import tensorflow as tf
import itertools
import collections
import re
import random
import sentencepiece as spm
from unidecode import unidecode
from sklearn.utils import shuffle
from tqdm import tqdm
from prepro_utils import preprocess_text, encode_ids, encode_pieces
from malaya.text.function import transformer_textcleaning as cleaning
from tensorflow.python.estimator.run_config import RunConfig
import bert_utils as squad_utils




In [3]:
import tensorflow as tf
import logging

# Keep the notebook output quiet: raise both the TF logger level and the
# TF logging verbosity to ERROR.
tf.compat.v1.get_logger().setLevel(logging.ERROR)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
# AutoGraph conversion verbosity is configured separately from the logger.
tf.compat.v1.autograph.set_verbosity(1)

In [4]:
# Load the SentencePiece model that backs the tokenizer below.
sp_model = spm.SentencePieceProcessor()
sp_model.Load('sp10m.cased.bert.model')

# Read the tab-separated vocabulary file into {column 0: column 1} (both kept as strings).
# - The file is decoded explicitly as UTF-8 rather than with the platform default
#   encoding (assumes the vocab file is UTF-8 -- TODO confirm).
# - Records are split on '\n' only (not str.splitlines), so other Unicode line
#   separators that may appear inside a piece cannot break a record apart.
# - Empty lines are skipped instead of blindly dropping the last element, so the
#   final entry survives even when the file has no trailing newline.
with open('sp10m.cased.bert.vocab', encoding = 'utf-8') as fopen:
    v = {
        fields[0]: fields[1]
        for fields in (
            line.split('\t') for line in fopen.read().split('\n') if line
        )
    }


class Tokenizer:
    """Small adapter giving a SentencePiece processor a BERT-tokenizer-like API."""

    def __init__(self, v, sp_model):
        """Store the vocabulary mapping `v` and the SentencePiece processor."""
        self.vocab, self.sp_model = v, sp_model

    def tokenize(self, string):
        """Split `string` into pieces via `encode_pieces` (no unicode return, no sampling)."""
        options = {'return_unicode': False, 'sample': False}
        return encode_pieces(self.sp_model, string, **options)

    def convert_tokens_to_ids(self, tokens):
        """Map every piece in `tokens` to its id, preserving order."""
        ids = []
        for piece in tokens:
            ids.append(self.sp_model.PieceToId(piece))
        return ids

    def convert_ids_to_tokens(self, ids):
        """Map every id in `ids` back to its piece, preserving order."""
        pieces = []
        for piece_id in ids:
            pieces.append(self.sp_model.IdToPiece(piece_id))
        return pieces


# Module-level tokenizer built from the vocabulary dict and SentencePiece model loaded above.
tokenizer = Tokenizer(v, sp_model)

In [5]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code while deserialising --
# only load 'bert-squad-test.pkl' when it comes from a trusted source.
# The pickled object is a pair, unpacked into (test_features, test_examples).
with open('bert-squad-test.pkl', 'rb') as fopen:
    test_features, test_examples = pickle.load(fopen)

In [6]:
# Show the attribute dictionary of the first test example.
vars(test_examples[0])

{'qas_id': '56ddde6b9a695914005b9628',
 'question_text': 'Di negara manakah Normandy berada?',
 'paragraph_text': 'Orang Norman (Norman: Nourmands; Perancis: Normands; Latin: Normanni) ialah orang-orang yang pada abad ke-10 dan ke-11 memberikan nama mereka kepada Normandy, sebuah wilayah di Perancis. Mereka diturunkan daripada Norse ("Norman" berasal daripada penyerang "Norseman") dan lanun dari Denmark, Iceland dan Norway yang, di bawah pimpinan mereka Rollo, bersetuju untuk bersumpah fealty kepada Raja Charles III dari Francia Barat. Melalui generasi asimilasi dan percampuran dengan penduduk asli Frankish dan Roman-Gaulish, keturunan mereka akan beransur-ansur bergabung dengan budaya Carolingian yang berpusat di Francia Barat. Identiti budaya dan etnik yang berbeza dari orang Norman muncul pada mulanya pada separuh pertama abad ke-10, dan ia terus berkembang pada abad-abad yang berjaya.',
 'orig_answer_text': None,
 'start_position': None,
 'end_position': None,
 'is_impossible': Fal

In [7]:
# Sequence windowing settings. max_seq_length is also baked into the graph
# below (one-hot depth, tile and reshape sizes).
# NOTE(review): doc_stride and max_query_length are not read by any cell shown
# here; presumably they mirror how the pickled features were built -- confirm.
max_seq_length, doc_stride, max_query_length = 384, 128, 64

In [8]:
# Model hyper-parameters are read from the JSON config file of the tiny BERT checkpoint.
bert_config = modeling.BertConfig.from_json_file('tiny-bert-v1/config.json')

In [11]:
from tensorflow.contrib import layers as contrib_layers

class Model:
    """BERT encoder wrapped with the placeholders the SQuAD heads need.

    Attributes:
        X: int32 placeholder fed as BERT `input_ids`.
        segment_ids: int32 placeholder fed as BERT `token_type_ids`.
        input_masks: int32 placeholder fed as BERT `input_mask`.
        p_mask: int32 placeholder; not used by the encoder, read by the
            answer heads built in a later cell.
        output: the encoder's sequence output.

    NOTE(review): the training branch of the end-logits head reads
    `model.start_positions`, which this class does not define.
    """

    def __init__(self, is_training = True):
        """Create the placeholders and the BERT encoder in the current default graph.

        Args:
            is_training: forwarded to `modeling.BertModel`.
        """
        # The placeholders are unnamed, so their creation ORDER decides the
        # auto-generated node names ('Placeholder', 'Placeholder_1', ...).
        # Later cells address the graph inputs by exactly those names, so do
        # not reorder these four lines.
        self.X = tf.compat.v1.placeholder(tf.compat.v1.int32, [None, None])
        self.segment_ids = tf.compat.v1.placeholder(tf.compat.v1.int32, [None, None])
        self.input_masks = tf.compat.v1.placeholder(tf.compat.v1.int32, [None, None])
        self.p_mask = tf.compat.v1.placeholder(tf.compat.v1.int32, [None, None])
        
        model = modeling.BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=self.X,
            input_mask=self.input_masks,
            token_type_ids=self.segment_ids,
            use_one_hot_embeddings=False)
        
        final_hidden = model.get_sequence_output()
        self.output = final_hidden
        # The Python variable is unused; the identity op exists only to give the
        # encoder output the fixed node name 'logits_vectorize', which the
        # export cells list as a graph output.
        vectorize = tf.compat.v1.identity(final_hidden, name = 'logits_vectorize')

In [12]:
# NOTE(review): learning_rate is not read by any cell shown in this notebook.
learning_rate = 2e-5
# Number of start candidates kept per example, and end candidates kept per start.
start_n_top = 5
end_n_top = 5
# False selects the inference branch of the heads below and disables dropout there.
is_training = False

# Reset first so the model is built into an empty default graph (the unnamed
# placeholders then receive their default auto-generated names).
tf.compat.v1.reset_default_graph()
model = Model(is_training = is_training)

In [13]:
# Encoder output; axis 0 is treated as the batch axis (see bsz below).
output = model.output
bsz = tf.compat.v1.shape(output)[0]
return_dict = {}
# Swap the first two axes; the einsum specs below label the result 'lbh'
# (presumably length, batch, hidden).
output = tf.compat.v1.transpose(output, [1, 0, 2])

# invalid position mask such as query and special symbols (PAD, SEP, CLS)
# Cast to float32 so it can be used arithmetically when masking logits.
p_mask = tf.compat.v1.cast(model.p_mask, dtype = tf.compat.v1.float32)

# logit of the start position
# NOTE: the variable scope and default layer name determine the variable names
# looked up in the checkpoint -- do not rename them.
with tf.compat.v1.variable_scope('start_logits'):
    # One scalar score per position of `output`.
    start_logits = tf.layers.dense(
        output,
        1,
        kernel_initializer = modeling.create_initializer(
            bert_config.initializer_range
        ),
    )
    # Drop the trailing size-1 axis, then swap the two remaining axes back
    # (batch first again, given the 'lbh' layout of `output`).
    start_logits = tf.compat.v1.transpose(tf.compat.v1.squeeze(start_logits, -1), [1, 0])
    # Where p_mask is 1 the logit is replaced by -1e30, so the position gets
    # (numerically) zero probability after the softmax.
    start_logits_masked = start_logits * (1 - p_mask) - 1e30 * p_mask
    start_log_probs = tf.compat.v1.nn.log_softmax(start_logits_masked, -1)

# logit of the end position
# NOTE: scope name and the layer names 'dense_0' / 'dense_1' determine the
# variable names looked up in the checkpoint -- do not rename them.
with tf.compat.v1.variable_scope('end_logits'):
    if is_training:
        # during training, compute the end logits based on the
        # ground truth of the start position
        # NOTE(review): `Model` defines no `start_positions` attribute, so this
        # branch would raise AttributeError if is_training were True.
        start_positions = tf.compat.v1.reshape(model.start_positions, [-1])
        start_index = tf.compat.v1.one_hot(
            start_positions,
            depth = max_seq_length,
            axis = -1,
            dtype = tf.compat.v1.float32,
        )
        # One-hot contraction over the length axis: picks the hidden vector at
        # the ground-truth start position of every example.
        start_features = tf.compat.v1.einsum('lbh,bl->bh', output, start_index)
        start_features = tf.compat.v1.tile(
            start_features[None], [max_seq_length, 1, 1]
        )
        end_logits = tf.layers.dense(
            tf.compat.v1.concat([output, start_features], axis = -1),
            bert_config.hidden_size,
            kernel_initializer = modeling.create_initializer(
                bert_config.initializer_range
            ),
            activation = tf.compat.v1.tanh,
            name = 'dense_0',
        )
        end_logits = contrib_layers.layer_norm(
            end_logits, begin_norm_axis = -1
        )

        end_logits = tf.layers.dense(
            end_logits,
            1,
            kernel_initializer = modeling.create_initializer(
                bert_config.initializer_range
            ),
            name = 'dense_1',
        )
        end_logits = tf.compat.v1.transpose(tf.compat.v1.squeeze(end_logits, -1), [1, 0])
        end_logits_masked = end_logits * (1 - p_mask) - 1e30 * p_mask
        end_log_probs = tf.compat.v1.nn.log_softmax(end_logits_masked, -1)
    else:
        # during inference, compute the end logits based on beam search

        # Keep the start_n_top most likely start positions per example.
        start_top_log_probs, start_top_index = tf.compat.v1.nn.top_k(
            start_log_probs, k = start_n_top
        )
        # The one-hot depth, the tile multiples and the reshape below all use the
        # constant max_seq_length, so fed sequences must have exactly that length
        # for the shapes to line up with `output`.
        start_index = tf.compat.v1.one_hot(
            start_top_index,
            depth = max_seq_length,
            axis = -1,
            dtype = tf.compat.v1.float32,
        )
        # Hidden vector at each of the top-k start positions ('bkh').
        start_features = tf.compat.v1.einsum('lbh,bkl->bkh', output, start_index)
        # Repeat every position's hidden vector once per start candidate ...
        end_input = tf.compat.v1.tile(output[:, :, None], [1, 1, start_n_top, 1])
        # ... and every start candidate's vector once per position, so the two
        # can be concatenated on the last axis.
        start_features = tf.compat.v1.tile(
            start_features[None], [max_seq_length, 1, 1, 1]
        )
        end_input = tf.compat.v1.concat([end_input, start_features], axis = -1)
        end_logits = tf.layers.dense(
            end_input,
            bert_config.hidden_size,
            kernel_initializer = modeling.create_initializer(
                bert_config.initializer_range
            ),
            activation = tf.compat.v1.tanh,
            name = 'dense_0',
        )
        end_logits = contrib_layers.layer_norm(
            end_logits, begin_norm_axis = -1
        )
        end_logits = tf.layers.dense(
            end_logits,
            1,
            kernel_initializer = modeling.create_initializer(
                bert_config.initializer_range
            ),
            name = 'dense_1',
        )
        # Drop the trailing size-1 axis via reshape, then move the length axis last.
        end_logits = tf.compat.v1.reshape(
            end_logits, [max_seq_length, -1, start_n_top]
        )
        end_logits = tf.compat.v1.transpose(end_logits, [1, 2, 0])
        # Same masking as for the start logits, broadcast over the candidate axis.
        end_logits_masked = (
            end_logits * (1 - p_mask[:, None]) - 1e30 * p_mask[:, None]
        )
        end_log_probs = tf.compat.v1.nn.log_softmax(end_logits_masked, -1)
        # Keep the end_n_top most likely end positions for every start candidate.
        end_top_log_probs, end_top_index = tf.compat.v1.nn.top_k(
            end_log_probs, k = end_n_top
        )
        # Flatten (start candidate, end candidate) into one axis of size
        # start_n_top * end_n_top.
        end_top_log_probs = tf.compat.v1.reshape(
            end_top_log_probs, [-1, start_n_top * end_n_top]
        )
        end_top_index = tf.compat.v1.reshape(
            end_top_index, [-1, start_n_top * end_n_top]
        )
        
# Collect the tensors produced by whichever branch was built above.
if is_training:
    return_dict.update({
        'start_log_probs': start_log_probs,
        'end_log_probs': end_log_probs,
    })
else:
    return_dict.update({
        'start_top_log_probs': start_top_log_probs,
        'start_top_index': start_top_index,
        'end_top_log_probs': end_top_log_probs,
        'end_top_index': end_top_index,
    })

# an additional layer to predict answerability
# NOTE: scope name and the layer names 'dense_0' / 'dense_1' determine the
# variable names looked up in the checkpoint -- do not rename them.
with tf.compat.v1.variable_scope('answer_class'):
    # get the representation of CLS
    # One-hot on index 0 of every example, so the einsum below selects the
    # hidden vector at position 0.
    cls_index = tf.compat.v1.one_hot(
        tf.compat.v1.zeros([bsz], dtype = tf.compat.v1.int32),
        max_seq_length,
        axis = -1,
        dtype = tf.compat.v1.float32,
    )
    cls_feature = tf.compat.v1.einsum('lbh,bl->bh', output, cls_index)

    # get the representation of START
    # Probability-weighted sum of the hidden vectors under the masked start
    # distribution (not a hard top-1 pick).
    start_p = tf.compat.v1.nn.softmax(
        start_logits_masked, axis = -1, name = 'softmax_start'
    )
    start_feature = tf.compat.v1.einsum('lbh,bl->bh', output, start_p)

    # note(zhiliny): no dependency on end_feature so that we can obtain
    # one single `cls_logits` for each sample
    ans_feature = tf.compat.v1.concat([start_feature, cls_feature], -1)
    ans_feature = tf.layers.dense(
        ans_feature,
        bert_config.hidden_size,
        activation = tf.compat.v1.tanh,
        kernel_initializer = modeling.create_initializer(
            bert_config.initializer_range
        ),
        name = 'dense_0',
    )
    # Dropout is only applied when is_training is True.
    ans_feature = tf.layers.dropout(
        ans_feature, bert_config.hidden_dropout_prob, training = is_training
    )
    # Final projection to a single bias-free logit per example.
    cls_logits = tf.layers.dense(
        ans_feature,
        1,
        kernel_initializer = modeling.create_initializer(
            bert_config.initializer_range
        ),
        name = 'dense_1',
        use_bias = False,
    )
    cls_logits = tf.compat.v1.squeeze(cls_logits, -1)
    
return_dict['cls_logits'] = cls_logits

In [14]:
sess = tf.compat.v1.InteractiveSession()
# Initialise every variable first; the restore below then overwrites the
# trainable ones with the values stored in the fine-tuned checkpoint.
sess.run(tf.compat.v1.global_variables_initializer())
saver = tf.compat.v1.train.Saver(var_list = tf.compat.v1.trainable_variables())
saver.restore(sess, 'tiny-bert-squad/model.ckpt')

In [15]:
# Wrap every inference output in an identity op with a fixed, explicit name so
# it can be fetched by name ('<name>:0') from the frozen / quantized graph.
start_top_log_probs = tf.compat.v1.identity(start_top_log_probs, name = 'start_top_log_probs')
start_top_index = tf.compat.v1.identity(start_top_index, name = 'start_top_index')
end_top_log_probs = tf.compat.v1.identity(end_top_log_probs, name = 'end_top_log_probs')
end_top_index = tf.compat.v1.identity(end_top_index, name = 'end_top_index')
cls_logits = tf.compat.v1.identity(cls_logits, name = 'cls_logits')

In [16]:
# Smoke-test the restored model on the first `batch_size` test features.
i = 0
batch_size = 2
batch = test_features[i: i + batch_size]

# Pull one list per feature attribute out of the batch, in a single pass over
# the attribute names.
(
    batch_ids,
    batch_masks,
    batch_segment,
    batch_start,
    batch_end,
    is_impossible,
    p_mask,
) = (
    [getattr(feature, field) for feature in batch]
    for field in (
        'input_ids',
        'input_mask',
        'segment_ids',
        'start_position',
        'end_position',
        'is_impossible',
        'p_mask',
    )
)

# Only ids, segments, masks and p_mask are fed; the remaining lists are just
# kept around for inspection.
o = sess.run(
    [start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits],
    feed_dict = dict(
        zip(
            (model.X, model.segment_ids, model.input_masks, model.p_mask),
            (batch_ids, batch_segment, batch_masks, p_mask),
        )
    ),
)

In [17]:
# Write the trainable variables of the current graph (which now also contains
# the named output identities) to a new checkpoint prefix. freeze_graph later
# reads this directory and imports the '.meta' file next to the checkpoint.
saver = tf.compat.v1.train.Saver(tf.compat.v1.trainable_variables())
saver.save(sess, 'output-tiny-bert-squad/model.ckpt')

'output-tiny-bert-squad/model.ckpt'

In [18]:
# Comma-separated names of the graph nodes to keep when freezing:
#   keep    -> variable ops, or names mentioning a placeholder / logits /
#              start_ / end_ (this covers the named outputs and both head scopes)
#   exclude -> anything whose name mentions adam, beta or global_step
strings = ','.join(
    node.name
    for node in tf.compat.v1.get_default_graph().as_graph_def().node
    if (
        'Variable' in node.op
        or any(
            key in node.name
            for key in ('Placeholder', 'logits', 'start_', 'end_')
        )
    )
    and not any(key in node.name for key in ('adam', 'beta', 'global_step'))
)
strings.split(',')

['Placeholder',
 'Placeholder_1',
 'Placeholder_2',
 'Placeholder_3',
 'bert/embeddings/word_embeddings',
 'bert/embeddings/token_type_embeddings',
 'bert/embeddings/position_embeddings',
 'bert/embeddings/LayerNorm/gamma',
 'bert/encoder/layer_0/attention/self/query/kernel',
 'bert/encoder/layer_0/attention/self/query/bias',
 'bert/encoder/layer_0/attention/self/key/kernel',
 'bert/encoder/layer_0/attention/self/key/bias',
 'bert/encoder/layer_0/attention/self/value/kernel',
 'bert/encoder/layer_0/attention/self/value/bias',
 'bert/encoder/layer_0/attention/output/dense/kernel',
 'bert/encoder/layer_0/attention/output/dense/bias',
 'bert/encoder/layer_0/attention/output/LayerNorm/gamma',
 'bert/encoder/layer_0/intermediate/dense/kernel',
 'bert/encoder/layer_0/intermediate/dense/bias',
 'bert/encoder/layer_0/output/dense/kernel',
 'bert/encoder/layer_0/output/dense/bias',
 'bert/encoder/layer_0/output/LayerNorm/gamma',
 'bert/encoder/layer_1/attention/self/query/kernel',
 'bert/encode

In [19]:
def freeze_graph(model_dir, output_node_names):
    """Freeze the latest checkpoint found in `model_dir` into one GraphDef file.

    The checkpoint's '.meta' graph is imported into a fresh graph, the
    variables are restored and converted to constants, and the result is
    written as 'frozen_model.pb' in the checkpoint's directory. The number of
    ops in the frozen graph is printed.

    Args:
        model_dir: directory holding the checkpoint state and checkpoint files.
        output_node_names: comma-separated names of the nodes to keep.

    Raises:
        AssertionError: if `model_dir` does not exist.
        ValueError: if no checkpoint state can be read from `model_dir`
            (previously this surfaced as an opaque AttributeError on None).
    """
    if not tf.compat.v1.io.gfile.exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    # get_checkpoint_state returns None when the directory has no usable
    # checkpoint state; fail with a clear message instead of on attribute access.
    checkpoint = tf.compat.v1.train.get_checkpoint_state(model_dir)
    if checkpoint is None or not checkpoint.model_checkpoint_path:
        raise ValueError('No checkpoint found in directory: %s' % model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path

    # The frozen graph is written next to the checkpoint files.
    absolute_model_dir = '/'.join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + '/frozen_model.pb'

    with tf.compat.v1.Session(graph = tf.compat.v1.Graph()) as sess:
        # clear_devices drops device placements recorded in the meta graph.
        saver = tf.compat.v1.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = True
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.compat.v1.graph_util.convert_variables_to_constants(
            sess,
            tf.compat.v1.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.compat.v1.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [20]:
# Freeze the checkpoint saved in 'output-tiny-bert-squad', keeping every node named in `strings`.
freeze_graph('output-tiny-bert-squad', strings)

932 ops in the final graph.


In [21]:
def load_graph(frozen_graph_filename):
    """Load a serialized GraphDef from disk into a new `Graph`.

    Before importing, stateful ops are rewritten in place to stateless ones:
    RefSwitch -> Switch, AssignSub -> Sub, AssignAdd -> Add and
    Assign -> Identity (with the attributes those ops no longer accept removed).

    Args:
        frozen_graph_filename: path of the binary GraphDef file.

    Returns:
        A new graph containing the imported nodes. `import_graph_def` is called
        without a `name` argument, so the nodes live under its default prefix
        (later cells address them as 'import/<node>').
    """
    with tf.compat.v1.gfile.GFile(frozen_graph_filename, 'rb') as f:
        graph_def = tf.compat.v1.GraphDef()
        graph_def.ParseFromString(f.read())

    for node in graph_def.node:
        if node.op == 'RefSwitch':
            node.op = 'Switch'
            # `range`, not the Python 2-only `xrange`, which raised NameError
            # on Python 3 as soon as a RefSwitch node was encountered.
            for index in range(len(node.input)):
                if 'moving_' in node.input[index]:
                    node.input[index] = node.input[index] + '/read'
        elif node.op == 'AssignSub':
            node.op = 'Sub'
            if 'use_locking' in node.attr:
                del node.attr['use_locking']
        elif node.op == 'AssignAdd':
            node.op = 'Add'
            if 'use_locking' in node.attr:
                del node.attr['use_locking']
        elif node.op == 'Assign':
            node.op = 'Identity'
            if 'use_locking' in node.attr:
                del node.attr['use_locking']
            if 'validate_shape' in node.attr:
                del node.attr['validate_shape']
            # Assign takes (ref, value); the Identity only forwards the value.
            if len(node.input) == 2:
                node.input[0] = node.input[1]
                del node.input[1]

    with tf.compat.v1.Graph().as_default() as graph:
        tf.compat.v1.import_graph_def(graph_def)
    return graph

In [22]:
# Check that the frozen graph can be loaded; `g` is rebound to the quantized graph further down.
g = load_graph('output-tiny-bert-squad/frozen_model.pb')

In [23]:
# Graph-transform steps handed to TransformGraph, applied in list order.
# NOTE(review): the argument strings are passed through verbatim; whether each
# argument (e.g. op=Dropout, fallback_min/fallback_max) is honoured by the
# transform it is attached to should be confirmed against the transform docs.
transforms = [
    'add_default_attributes',
    'remove_nodes(op=Identity, op=CheckNumerics, op=Dropout)',
    'fold_batch_norms',
    'fold_old_batch_norms',
    'quantize_weights(fallback_min=-10, fallback_max=10)',
    'strip_unused_nodes',
    'sort_by_execution_order',
]

In [24]:
from tensorflow.tools.graph_transforms import TransformGraph
# Fix the graph-level random seed before running the graph transforms.
tf.compat.v1.set_random_seed(0)

In [25]:
pb = 'output-tiny-bert-squad/frozen_model.pb'

# Parse the frozen GraphDef. GFile is used here (as everywhere else in this
# notebook) in place of the deprecated FastGFile alias.
input_graph_def = tf.compat.v1.GraphDef()
with tf.compat.v1.gfile.GFile(pb, 'rb') as f:
    input_graph_def.ParseFromString(f.read())

# Graph inputs are the four unnamed placeholders, addressed by their
# auto-generated names; graph outputs are the explicitly named identity ops.
inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2', 'Placeholder_3']
outputs = [
    'start_top_log_probs',
    'start_top_index',
    'end_top_log_probs',
    'end_top_index',
    'cls_logits',
    'logits_vectorize',
]

transformed_graph_def = TransformGraph(
    input_graph_def, inputs, outputs, transforms
)

# The transformed graph is stored next to the frozen one with a '.quantized' suffix.
with tf.compat.v1.gfile.GFile(f'{pb}.quantized', 'wb') as f:
    f.write(transformed_graph_def.SerializeToString())

In [26]:
# Load the quantized graph written above; it replaces the graph previously bound to `g`.
g = load_graph('output-tiny-bert-squad/frozen_model.pb.quantized')

In [29]:
# Resolve each input / output node name to the first output tensor of that
# node inside the imported graph (nodes live under the 'import/' prefix).
input_nodes = dict((i, g.get_tensor_by_name(f'import/{i}:0')) for i in inputs)
output_nodes = dict((i, g.get_tensor_by_name(f'import/{i}:0')) for i in outputs)

In [31]:
# Session bound to the quantized graph.
# NOTE(review): `sess` from the earlier cell is still open, so this is a second
# InteractiveSession in the same process -- confirm this is intended.
test_sess = tf.compat.v1.InteractiveSession(graph = g)

In [32]:
# Display one resolved output tensor to confirm the node exists in the quantized graph.
output_nodes['start_top_log_probs']

<tf.compat.v1.Tensor 'import/start_top_log_probs:0' shape=(?, 5) dtype=float32>

In [33]:
# Feed dict for the quantized graph. The value order must follow `inputs`:
# ids, segment ids, input masks, p_mask.
b = {
    input_nodes[name]: values
    for name, values in zip(
        inputs, [batch_ids, batch_segment, batch_masks, p_mask]
    )
}

In [34]:
# Fetching the name -> tensor dict yields a dict of arrays with the same keys.
o = test_sess.run(output_nodes, feed_dict = b)

In [35]:
# Display the outputs of the quantized graph for the sample batch.
o

{'start_top_log_probs': array([[-0.06966875, -3.240222  , -4.473772  , -5.298744  , -5.6682377 ],
        [-0.25609264, -2.318633  , -2.9327104 , -3.522881  , -4.5987473 ]],
       dtype=float32), 'start_top_index': array([[56, 54, 22, 55,  0],
        [39, 38, 46,  0, 44]], dtype=int32), 'end_top_log_probs': array([[-6.5866276e-03, -5.1756473e+00, -7.5562983e+00, -9.1460800e+00,
         -9.4442930e+00, -4.5594490e-01, -1.7612027e+00, -2.4332027e+00,
         -2.4676924e+00, -4.4934702e+00, -1.8880528e-01, -2.3249977e+00,
         -3.7639778e+00, -3.7674901e+00, -4.8629518e+00, -3.7221590e-01,
         -1.5458037e+00, -2.7700737e+00, -3.8536804e+00, -5.2050881e+00,
         -3.6656349e-03, -6.7355733e+00, -8.3460884e+00, -9.1573057e+00,
         -9.2870998e+00],
        [-1.8938763e-01, -1.7753700e+00, -6.5651364e+00, -7.5663762e+00,
         -8.2851267e+00, -1.1159354e-01, -2.4293358e+00, -5.1858535e+00,
         -5.7707458e+00, -6.0979123e+00, -1.4081408e-02, -4.3420844e+00,
       