In [1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [2]:
import bert
from bert import optimization
from bert import tokenization
from bert import modeling
import numpy as np
import json
import tensorflow as tf
import itertools
import collections
import re
import random
import sentencepiece as spm
from unidecode import unidecode
from sklearn.utils import shuffle
from prepro_utils import preprocess_text, encode_ids, encode_pieces
from malaya.text.function import transformer_textcleaning as cleaning
from tensorflow.python.estimator.run_config import RunConfig




In [3]:
with open('/home/husein/alxlnet/topics.json') as fopen:
    topics = set(json.load(fopen).keys())

list_topics = list(topics)

sp_model = spm.SentencePieceProcessor()
sp_model.Load('sp10m.cased.bert.model')

with open('sp10m.cased.bert.vocab') as fopen:
    v = fopen.read().split('\n')[:-1]
v = [i.split('\t') for i in v]
v = {i[0]: i[1] for i in v}


class Tokenizer:
    def __init__(self, v):
        self.vocab = v
        pass

    def tokenize(self, string):
        return encode_pieces(
            sp_model, string, return_unicode = False, sample = False
        )

    def convert_tokens_to_ids(self, tokens):
        return [sp_model.PieceToId(piece) for piece in tokens]

    def convert_ids_to_tokens(self, ids):
        return [sp_model.IdToPiece(i) for i in ids]


tokenizer = Tokenizer(v)

In [4]:
def F(text):
    tokens_a = tokenizer.tokenize(text)
    tokens = ['[CLS]'] + tokens_a + ['[SEP]']
    input_id = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_id)
    return input_id, input_mask


def XY(data):

    if len(set(data[1]) & topics) and random.random() > 0.2:
        t = random.choice(data[1])
        label = 1
    else:
        s = set(data[1]) | set()
        t = random.choice(list(topics - s))
        label = 0
    X = F(cleaning(data[0]))
    Y = F(t)

    return X, Y, label

In [5]:
with open('/home/husein/alxlnet/testset-keyphrase.json') as fopen:
    data = json.load(fopen)

In [6]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [7]:
def create_initializer(initializer_range = 0.02):
    return tf.truncated_normal_initializer(stddev = initializer_range)


def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
    """Compute the union of the current variables and checkpoint variables."""
    assignment_map = {}
    initialized_variable_names = {}

    name_to_variable = collections.OrderedDict()
    for var in tvars:
        name = var.name
        m = re.match('^(.*):\\d+$', name)
        if m is not None:
            name = m.group(1)
        name_to_variable[name] = var

    init_vars = tf.train.list_variables(init_checkpoint)

    assignment_map = collections.OrderedDict()
    for x in init_vars:
        (name, var) = (x[0], x[1])
        if 'bert/' + name not in name_to_variable:
            continue
        assignment_map[name] = name_to_variable['bert/' + name]
        initialized_variable_names[name] = 1
        initialized_variable_names[name + ':0'] = 1

    return (assignment_map, initialized_variable_names)

In [8]:
batch_size = 60
warmup_proportion = 0.1
num_train_steps = 1000000
num_warmup_steps = int(num_train_steps * warmup_proportion)
learning_rate = 2e-5

In [9]:
bert_config = modeling.BertConfig.from_json_file(
    'tiny-bert-v1/config.json'
)

class Model:
    def __init__(
        self,
        dimension_output = 2,
    ):
        self.X = tf.compat.v1.placeholder(tf.int32, [None, None])
        self.input_masks = tf.compat.v1.placeholder(tf.float32, [None, None])
        
        self.X_b = tf.compat.v1.placeholder(tf.int32, [None, None])
        self.input_masks_b = tf.compat.v1.placeholder(tf.float32, [None, None])
        
        self.Y = tf.compat.v1.placeholder(tf.int32, [None])
        
        with tf.compat.v1.variable_scope('bert', reuse = False):
            model = modeling.BertModel(
                config = bert_config,
                is_training = True,
                input_ids = self.X,
                input_mask = self.input_masks,
                use_one_hot_embeddings = False,
            )

            summary = model.get_pooled_output()
            summary = tf.identity(summary, name = 'summary')
            self.summary = summary
            
        with tf.compat.v1.variable_scope('bert', reuse = True):
            model = modeling.BertModel(
                config = bert_config,
                is_training = True,
                input_ids = self.X_b,
                input_mask = self.input_masks_b,
                use_one_hot_embeddings = False,
            )

            summary_b = model.get_pooled_output()
        
        vectors_concat = [summary, summary_b, tf.abs(summary - summary_b)]
        vectors_concat = tf.concat(vectors_concat, axis = 1)
        
        self.logits = tf.layers.dense(vectors_concat, dimension_output)
        self.logits = tf.identity(self.logits, name = 'logits')
        
        self.cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )
        
        correct_pred = tf.equal(
            tf.argmax(self.logits, 1, output_type = tf.int32), self.Y
        )
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))




In [10]:
dimension_output = 2

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    dimension_output,
)

sess.run(tf.global_variables_initializer())




The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.


In [11]:
checkpoint = 'tiny-bert-keyphrase/model.ckpt-620000'
saver = tf.train.Saver(var_list = tf.trainable_variables())
saver.restore(sess, checkpoint)

INFO:tensorflow:Restoring parameters from tiny-bert-keyphrase/model.ckpt-620000


In [12]:
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'output-tiny-bert-keyphrase/model.ckpt')

'output-tiny-bert-keyphrase/model.ckpt'

In [13]:
strings = ','.join(
    [
        n.name
        for n in tf.get_default_graph().as_graph_def().node
        if ('Variable' in n.op
        or 'Placeholder' in n.name
        or 'logits' in n.name
        or 'alphas' in n.name
        or 'summary' in n.name
        or 'self/Softmax' in n.name)
        and 'Adam' not in n.name
        and 'beta' not in n.name
        and 'global_step' not in n.name
        and 'Identity' not in n.name
        and 'Assign' not in n.name
    ]
)
strings.split(',')

['Placeholder',
 'Placeholder_1',
 'Placeholder_2',
 'Placeholder_3',
 'Placeholder_4',
 'bert/bert/embeddings/word_embeddings',
 'bert/bert/embeddings/token_type_embeddings',
 'bert/bert/embeddings/position_embeddings',
 'bert/bert/embeddings/LayerNorm/gamma',
 'bert/bert/encoder/layer_0/attention/self/query/kernel',
 'bert/bert/encoder/layer_0/attention/self/query/bias',
 'bert/bert/encoder/layer_0/attention/self/key/kernel',
 'bert/bert/encoder/layer_0/attention/self/key/bias',
 'bert/bert/encoder/layer_0/attention/self/value/kernel',
 'bert/bert/encoder/layer_0/attention/self/value/bias',
 'bert/bert/encoder/layer_0/attention/self/Softmax',
 'bert/bert/encoder/layer_0/attention/output/dense/kernel',
 'bert/bert/encoder/layer_0/attention/output/dense/bias',
 'bert/bert/encoder/layer_0/attention/output/LayerNorm/gamma',
 'bert/bert/encoder/layer_0/intermediate/dense/kernel',
 'bert/bert/encoder/layer_0/intermediate/dense/bias',
 'bert/bert/encoder/layer_0/output/dense/kernel',
 'bert

In [14]:
def freeze_graph(model_dir, output_node_names):

    if not tf.io.gfile.exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path

    absolute_model_dir = '/'.join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + '/frozen_model.pb'
    clear_devices = True
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = clear_devices
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [15]:
freeze_graph('output-tiny-bert-keyphrase', strings)

INFO:tensorflow:Restoring parameters from output-tiny-bert-keyphrase/model.ckpt
Instructions for updating:
Use `tf.compat.v1.graph_util.convert_variables_to_constants`
Instructions for updating:
Use `tf.compat.v1.graph_util.extract_sub_graph`
INFO:tensorflow:Froze 73 variables.
INFO:tensorflow:Converted 73 variables to const ops.
1553 ops in the final graph.


In [16]:
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

In [17]:
g = load_graph('output-tiny-bert-keyphrase/frozen_model.pb')

In [18]:
transforms = ['add_default_attributes',
             'remove_nodes(op=Identity, op=CheckNumerics, op=Dropout)',
             'fold_batch_norms',
             'fold_old_batch_norms',
             'quantize_weights(fallback_min=-10, fallback_max=10)',
             'strip_unused_nodes',
             'sort_by_execution_order']

In [19]:
from tensorflow.tools.graph_transforms import TransformGraph
tf.set_random_seed(0)

In [20]:
pb = 'output-tiny-bert-keyphrase/frozen_model.pb'

input_graph_def = tf.GraphDef()
with tf.gfile.FastGFile(pb, 'rb') as f:
    input_graph_def.ParseFromString(f.read())
    
inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2', 'Placeholder_3',]
outputs = ['bert/summary', 'logits']

transformed_graph_def = TransformGraph(input_graph_def, 
                                           inputs,
                                           outputs, transforms)

with tf.gfile.GFile(f'{pb}.quantized', 'wb') as f:
    f.write(transformed_graph_def.SerializeToString())

Instructions for updating:
Use tf.gfile.GFile.
