In [1]:
# !wget https://f000.backblazeb2.com/file/malay-dataset/keyphrase/keyphrase-twitter-no-calon.json
# !wget https://raw.githubusercontent.com/huseinzol05/Malay-Dataset/master/keyphrase/twitter-bahasa/topics.json

In [2]:
import os
# Expose no CUDA devices to this process; set before TensorFlow is imported
# in a later cell.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [3]:
import json

# Topic vocabulary: the keys of the JSON object stored in topics.json.
with open('topics.json') as topics_file:
    topics = set(json.load(topics_file).keys())

# The same topics as a list, for callers that need a sequence.
list_topics = [*topics]

In [4]:
import xlnet
import numpy as np
import tensorflow as tf
from tqdm import tqdm
import model_utils
import random




In [7]:
import sentencepiece as spm
from prepro_utils import preprocess_text, encode_ids

# SentencePiece processor loaded from the local 'sp10m.cased.v9.model' file;
# tokenize_fn() uses it to turn text into ids.
sp_model = spm.SentencePieceProcessor()
sp_model.Load('sp10m.cased.v9.model')

True

In [6]:
from malaya.text.function import transformer_textcleaning as cleaning

In [8]:
from prepro_utils import preprocess_text, encode_ids

def tokenize_fn(text):
    """Preprocess `text` without lowercasing and encode it with `sp_model`.

    Returns whatever `encode_ids` produces for the preprocessed text
    (a list of integer ids in the example output shown in this notebook).
    """
    prepared = preprocess_text(text, lower = False)
    token_ids = encode_ids(sp_model, prepared)
    return token_ids

In [9]:
# Segment-id values. F() only uses SEG_ID_A (text and <sep> positions) and
# SEG_ID_CLS (<cls> position); the others are not referenced in this notebook.
SEG_ID_A   = 0
SEG_ID_B   = 1
SEG_ID_CLS = 2
SEG_ID_SEP = 3
SEG_ID_PAD = 4

# Ids of the reserved tokens.
# NOTE(review): presumably these match the ids in the SentencePiece
# vocabulary loaded above — confirm against the model file.
special_symbols = {
    "<unk>"  : 0,
    "<s>"    : 1,
    "</s>"   : 2,
    "<cls>"  : 3,
    "<sep>"  : 4,
    "<pad>"  : 5,
    "<mask>" : 6,
    "<eod>"  : 7,
    "<eop>"  : 8,
}

# Convenience aliases. Only SEP_ID and CLS_ID are used by F(); VOCAB_SIZE,
# UNK_ID, MASK_ID and EOD_ID are not referenced in the cells of this notebook.
VOCAB_SIZE = 32000
UNK_ID = special_symbols["<unk>"]
CLS_ID = special_symbols["<cls>"]
SEP_ID = special_symbols["<sep>"]
MASK_ID = special_symbols["<mask>"]
EOD_ID = special_symbols["<eod>"]

def F(left_train):
    """Encode one string into the three parallel lists the model is fed.

    The token list is ``tokenize_fn(left_train)`` followed by SEP_ID and
    CLS_ID. Segment ids are SEG_ID_A for every text token and for the
    separator, and SEG_ID_CLS for the final position. The input mask is all
    zeros and has the same length as the token list.

    Returns:
        (token ids, segment ids, input mask) as a tuple of three lists.
    """
    token_ids = tokenize_fn(left_train)
    text_length = len(token_ids)
    # Extend the tokenizer's list in place with the two trailing specials.
    token_ids.extend((SEP_ID, CLS_ID))

    segment_id = [SEG_ID_A] * (text_length + 1) + [SEG_ID_CLS]
    input_mask = [0] * (text_length + 2)
    return token_ids, segment_id, input_mask

def XY(data):
    """Build one (text, phrase, label) example from a record.

    ``data[0]`` is the text and ``data[1]`` the collection of phrases attached
    to it. When at least one of those phrases is a known topic, the record
    becomes a positive example (label 1, phrase drawn at random from
    ``data[1]``) whenever ``random.random() > 0.2``. Otherwise a topic that is
    not among the record's phrases is drawn and the label is 0.

    Returns:
        (F(cleaned text), F(chosen phrase), label)
    """
    phrases = data[1]
    own_phrases = set(phrases)

    # `and` short-circuits, so no random number is consumed when the record
    # has no phrase in `topics`.
    make_positive = len(own_phrases & topics) and random.random() > 0.2

    if make_positive:
        chosen = random.choice(phrases)
        label = 1
    else:
        other_topics = list(topics - own_phrases)
        chosen = random.choice(other_topics)
        label = 0

    X = F(cleaning(data[0]))
    Y = F(chosen)
    return X, Y, label

In [10]:
# Load the keyphrase test set; the first record (displayed in the next cell)
# has the form [text, phrase, label].
with open('testset-keyphrase.json') as fopen:
    data = json.load(fopen)

In [11]:
# Inspect the first test record.
data[0]

['Takdak gambar raya ', 'myburgerlab restaurant', 0]

In [12]:
# Sanity-check F() on the phrase of the first record: token ids end with
# <sep>, <cls>; the last segment id is SEG_ID_CLS; the mask is all zeros.
F(data[0][1])

([284, 8751, 194, 8038, 5556, 1356, 7200, 4, 3],
 [0, 0, 0, 0, 0, 0, 0, 0, 2],
 [0, 0, 0, 0, 0, 0, 0, 0, 0])

In [13]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [14]:
class Parameter:
    """Attribute bag for the optimizer / learning-rate schedule settings.

    Each named argument is stored unchanged as an attribute of the same name.
    Any additional keyword arguments are accepted and silently discarded, so
    a wider settings dict can be splatted into the constructor.
    """

    def __init__(
        self,
        decay_method,
        warmup_steps,
        weight_decay,
        adam_epsilon,
        num_core_per_host,
        lr_layer_decay_rate,
        use_tpu,
        learning_rate,
        train_steps,
        min_lr_ratio,
        clip,
        **kwargs
    ):
        # `kwargs` is deliberately ignored; only the named settings are kept.
        vars(self).update(
            decay_method = decay_method,
            warmup_steps = warmup_steps,
            weight_decay = weight_decay,
            adam_epsilon = adam_epsilon,
            num_core_per_host = num_core_per_host,
            lr_layer_decay_rate = lr_layer_decay_rate,
            use_tpu = use_tpu,
            learning_rate = learning_rate,
            train_steps = train_steps,
            min_lr_ratio = min_lr_ratio,
            clip = clip,
        )

# Schedule settings: warm-up covers the first 10% of the training steps.
num_train_steps = 300000
warmup_proportion = 0.1
num_warmup_steps = int(num_train_steps * warmup_proportion)
initial_learning_rate = 2e-5

# Keyword arguments for xlnet.RunConfig.
# NOTE(review): is_training = True although this notebook only restores a
# checkpoint and exports it — confirm whether this keeps dropout active in
# the exported graph and whether False is intended for export.
kwargs = dict(
    is_training = True,
    use_tpu = False,
    use_bfloat16 = False,
    dropout = 0.1,
    dropatt = 0.1,
    init = 'normal',
    init_range = 0.1,
    init_std = 0.05,
    clamp_len = -1,
)

xlnet_parameters = xlnet.RunConfig(**kwargs)
# Architecture settings are read from the pretrained model's config.json.
xlnet_config = xlnet.XLNetConfig(
    json_path = 'alxlnet-base-2020-04-10/config.json'
)
# Settings splatted into Parameter below. Keys that Parameter does not name
# (use_bfloat16, dropout, dropatt, init, init_range, init_std, clamp_len)
# are swallowed by its **kwargs and not stored.
training_parameters = dict(
    decay_method = 'poly',
    train_steps = num_train_steps,
    learning_rate = initial_learning_rate,
    warmup_steps = num_warmup_steps,
    min_lr_ratio = 0.0,
    weight_decay = 0.00,
    adam_epsilon = 1e-8,
    num_core_per_host = 1,
    lr_layer_decay_rate = 1,
    use_tpu = False,
    use_bfloat16 = False,
    dropout = 0.1,
    dropatt = 0.1,
    init = 'normal',
    init_range = 0.1,
    init_std = 0.05,
    clip = 1.0,
    clamp_len = -1,
)
# The name is rebound here: from this line on `training_parameters` is a
# Parameter instance, not a dict.
training_parameters = Parameter(**training_parameters)




In [15]:
class Model:
    """Two-input XLNet classifier over a pair of encoded sequences.

    Both inputs go through an XLNetModel built under the same 'xlnet'
    variable scope (the second with reuse = True, so the variables are
    shared). The two pooled vectors and their absolute difference are
    concatenated and passed to a dense layer producing `dimension_output`
    logits.

    The placeholders are created without explicit names, so their graph node
    names are auto-generated in creation order; the export cells later in
    this notebook refer to them as 'Placeholder', 'Placeholder_1', ... —
    do not reorder the placeholder definitions.
    """

    def __init__(
        self,
        dimension_output = 2,
    ):
        """Build the graph.

        Args:
            dimension_output: number of units of the final dense layer
                (number of classes).
        """
        # First input: token ids, segment ids and input mask, as produced
        # by F().
        self.X = tf.placeholder(tf.int32, [None, None])
        self.segment_ids = tf.placeholder(tf.int32, [None, None])
        self.input_masks = tf.placeholder(tf.float32, [None, None])
        
        # Second input, same layout as the first.
        self.X_b = tf.placeholder(tf.int32, [None, None])
        self.segment_ids_b = tf.placeholder(tf.int32, [None, None])
        self.input_masks_b = tf.placeholder(tf.float32, [None, None])
        
        # Integer class label per example.
        self.Y = tf.placeholder(tf.int32, [None])
        
        # First tower; creates the 'xlnet' variables.
        # The inputs are transposed before being handed to XLNetModel —
        # presumably it expects sequence-major tensors; TODO confirm.
        with tf.compat.v1.variable_scope('xlnet', reuse = False):
            xlnet_model = xlnet.XLNetModel(
                xlnet_config=xlnet_config,
                run_config=xlnet_parameters,
                input_ids=tf.transpose(self.X, [1, 0]),
                seg_ids=tf.transpose(self.segment_ids, [1, 0]),
                input_mask=tf.transpose(self.input_masks, [1, 0]))

            summary = xlnet_model.get_pooled_out("last", True)
            # Named inside the scope, so the node is 'xlnet/summary'
            # (see the printed tensor name).
            summary = tf.identity(summary, name = 'summary')
            self.summary = summary
            print(summary)
            
        # Second tower; reuses the variables created above.
        with tf.compat.v1.variable_scope('xlnet', reuse = True):
            xlnet_model = xlnet.XLNetModel(
                xlnet_config=xlnet_config,
                run_config=xlnet_parameters,
                input_ids=tf.transpose(self.X_b, [1, 0]),
                seg_ids=tf.transpose(self.segment_ids_b, [1, 0]),
                input_mask=tf.transpose(self.input_masks_b, [1, 0]))
            summary_b = xlnet_model.get_pooled_out("last", True)
        
        # Pair features: [u, v, |u - v|] concatenated along the feature axis.
        vectors_concat = [summary, summary_b, tf.abs(summary - summary_b)]
        vectors_concat = tf.concat(vectors_concat, axis = 1)
        
        self.logits = tf.layers.dense(vectors_concat, dimension_output)
        # Stable node name 'logits' for the frozen-graph export.
        self.logits = tf.identity(self.logits, name = 'logits')
        
        # Mean sparse softmax cross-entropy against the integer labels.
        self.cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )
        
        # Fraction of examples whose arg-max logit equals the label.
        correct_pred = tf.equal(
            tf.argmax(self.logits, 1, output_type = tf.int32), self.Y
        )
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [16]:
# Two output classes (labels 0 and 1).
dimension_output = 2

# Start from an empty default graph so the placeholder/variable names are
# the auto-generated ones the export cells expect, then build the model and
# initialise every variable (the checkpoint restore in a later cell
# overwrites the trainable ones).
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    dimension_output,
)

sess.run(tf.global_variables_initializer())




INFO:tensorflow:memory input None
INFO:tensorflow:Use float type <dtype: 'float32'>

Instructions for updating:
Use keras.layers.dropout instead.
Instructions for updating:
Please use `layer.__call__` method instead.
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.Dense instead.
Tensor("xlnet/summary:0", shape=(?, 768), dtype=float32)
INFO:tensorflow:memory input None
INFO:tensorflow:Use float type <dtype: 'float32'>


In [17]:
# `tvars` is not referenced again in the cells of this notebook; the Saver
# cells call tf.trainable_variables() directly.
tvars = tf.trainable_variables()
# Fine-tuned checkpoint to export.
checkpoint = 'alxlnet-base-keyphrase/model.ckpt-180000'

In [18]:
# Restore only the trainable variables from the fine-tuned checkpoint into
# the live session.
saver = tf.train.Saver(var_list = tf.trainable_variables())
saver.restore(sess, checkpoint)

INFO:tensorflow:Restoring parameters from alxlnet-base-keyphrase/model.ckpt-180000


In [19]:
from sklearn.metrics.pairwise import cosine_similarity

In [20]:
def _pooled_vector(encoded):
    """Run the first tower on one F()-encoded example and return model.summary."""
    token_ids, segment_ids, input_mask = encoded
    return sess.run(
        model.summary,
        feed_dict = {
            model.X: [token_ids],
            model.segment_ids: [segment_ids],
            model.input_masks: [input_mask],
        },
    )

# Encode a sentence and a candidate keyphrase, each as a batch of one.
X = F('Kementerian Pertanian dan Industri Makanan menggalakkan pemain industri pertanian menceburi tanaman penting bagi mengurangkan kebergantungan bahan import dari luar negara')
o1 = _pooled_vector(X)

Y = F('tanaman jagung')
o2 = _pooled_vector(Y)

# Pair logits for (sentence, keyphrase); displayed as the cell output.
pair_feed = {
    model.X: [X[0]],
    model.segment_ids: [X[1]],
    model.input_masks: [X[2]],
    model.X_b: [Y[0]],
    model.segment_ids_b: [Y[1]],
    model.input_masks_b: [Y[2]],
}
sess.run(model.logits, feed_dict = pair_feed)

array([[0.41821438, 0.02677895]], dtype=float32)

In [21]:
# Cosine similarity between the two pooled vectors computed in the previous
# cell.
cosine_similarity(o1, o2)

array([[0.28742066]], dtype=float32)

In [22]:
# Re-save the trainable variables under the export directory; freeze_graph()
# later reads the checkpoint and its .meta file from there.
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'output-alxlnet-base-keyphrase/model.ckpt')

'output-alxlnet-base-keyphrase/model.ckpt'

In [23]:
# A node is exported when its op mentions 'Variable' or its name contains
# one of the wanted fragments, and its name contains none of the unwanted
# fragments.
_WANTED_NAME_PARTS = ('Placeholder', 'logits', 'alphas', 'summary', 'self/Softmax')
_UNWANTED_NAME_PARTS = ('Adam', 'beta', 'global_step', 'Identity', 'Assign')


def _is_export_node(node):
    """Return True when `node` should be listed as an output of the frozen graph."""
    wanted = 'Variable' in node.op or any(
        part in node.name for part in _WANTED_NAME_PARTS
    )
    if not wanted:
        return False
    return not any(part in node.name for part in _UNWANTED_NAME_PARTS)


# Comma-separated node names, the format freeze_graph() takes.
strings = ','.join(
    node.name
    for node in tf.get_default_graph().as_graph_def().node
    if _is_export_node(node)
)
strings.split(',')

['Placeholder',
 'Placeholder_1',
 'Placeholder_2',
 'Placeholder_3',
 'Placeholder_4',
 'Placeholder_5',
 'Placeholder_6',
 'xlnet/model/transformer/r_w_bias',
 'xlnet/model/transformer/r_r_bias',
 'xlnet/model/transformer/word_embedding/lookup_table',
 'xlnet/model/transformer/word_embedding/lookup_table_2',
 'xlnet/model/transformer/r_s_bias',
 'xlnet/model/transformer/seg_embed',
 'xlnet/model/transformer/layer_shared/rel_attn/q/kernel',
 'xlnet/model/transformer/layer_shared/rel_attn/k/kernel',
 'xlnet/model/transformer/layer_shared/rel_attn/v/kernel',
 'xlnet/model/transformer/layer_shared/rel_attn/r/kernel',
 'xlnet/model/transformer/layer_shared/rel_attn/o/kernel',
 'xlnet/model/transformer/layer_shared/rel_attn/LayerNorm/gamma',
 'xlnet/model/transformer/layer_shared/ff/layer_1/kernel',
 'xlnet/model/transformer/layer_shared/ff/layer_1/bias',
 'xlnet/model/transformer/layer_shared/ff/layer_2/kernel',
 'xlnet/model/transformer/layer_shared/ff/layer_2/bias',
 'xlnet/model/transf

In [24]:
def freeze_graph(model_dir, output_node_names):
    """Freeze the latest checkpoint in `model_dir` into `frozen_model.pb`.

    The checkpoint's meta graph is imported into a fresh graph, its variables
    are restored and converted to constants, and the resulting GraphDef is
    written next to the checkpoint as ``frozen_model.pb``.

    Args:
        model_dir: directory containing the checkpoint state file, the
            checkpoint and its ``.meta`` file.
        output_node_names: comma-separated names of the graph nodes to keep
            as outputs. Surrounding whitespace and empty entries are ignored.

    Raises:
        AssertionError: if `model_dir` does not exist or holds no checkpoint.
    """
    if not tf.io.gfile.exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    # get_checkpoint_state returns None when the directory has no checkpoint
    # state; fail with a clear message instead of an AttributeError below.
    if checkpoint is None or not checkpoint.model_checkpoint_path:
        raise AssertionError(
            'No checkpoint found in directory: %s' % model_dir
        )
    input_checkpoint = checkpoint.model_checkpoint_path

    # Use os.path so a checkpoint path without a directory component maps to
    # 'frozen_model.pb' in the current directory rather than
    # '/frozen_model.pb' at the filesystem root.
    output_graph = os.path.join(
        os.path.dirname(input_checkpoint), 'frozen_model.pb'
    )

    # Drop blank entries (e.g. from a trailing comma) that would otherwise be
    # passed on as an invalid node name.
    node_names = [
        name.strip() for name in output_node_names.split(',') if name.strip()
    ]

    # Entering the session also makes its fresh graph the default graph, so
    # the meta graph is imported into it and not into the notebook's graph.
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = True
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            sess.graph.as_graph_def(),
            node_names,
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [25]:
# Freeze the checkpoint saved above, keeping the nodes selected in `strings`.
freeze_graph('output-alxlnet-base-keyphrase', strings)

INFO:tensorflow:Restoring parameters from output-alxlnet-base-keyphrase/model.ckpt
Instructions for updating:
Use `tf.compat.v1.graph_util.convert_variables_to_constants`
Instructions for updating:
Use `tf.compat.v1.graph_util.extract_sub_graph`
INFO:tensorflow:Froze 23 variables.
INFO:tensorflow:Converted 23 variables to const ops.
14734 ops in the final graph.


In [26]:
def load_graph(frozen_graph_filename):
    """Read a serialized GraphDef from disk and import it into a new tf.Graph.

    Args:
        frozen_graph_filename: path of the binary ``.pb`` file.

    Returns:
        The newly created graph containing the imported nodes.
    """
    parsed_def = tf.GraphDef()
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as pb_file:
        parsed_def.ParseFromString(pb_file.read())

    imported = tf.Graph()
    with imported.as_default():
        # No `name` argument is given, so TensorFlow's default name prefix
        # applies to the imported nodes.
        tf.import_graph_def(parsed_def)
    return imported

In [27]:
# Smoke-test that the frozen graph can be parsed and imported.
g = load_graph('output-alxlnet-base-keyphrase/frozen_model.pb')

In [28]:
# Graph Transform Tool passes, applied in this order by TransformGraph in the
# quantization cell.
transforms = [
    'add_default_attributes',
    'remove_nodes(op=Identity, op=CheckNumerics, op=Dropout)',
    'fold_batch_norms',
    'fold_old_batch_norms',
    'quantize_weights(fallback_min=-10, fallback_max=10)',
    'strip_unused_nodes',
    'sort_by_execution_order',
]

In [31]:
from tensorflow.tools.graph_transforms import TransformGraph
# Fix the graph-level random seed of the current default graph.
tf.set_random_seed(0)

In [32]:
# Quantize the frozen graph and write it next to the original as
# '<pb>.quantized'.
pb = 'output-alxlnet-base-keyphrase/frozen_model.pb'

input_graph_def = tf.GraphDef()
# tf.gfile.GFile replaces the deprecated tf.gfile.FastGFile and matches the
# file API used by the rest of this notebook.
with tf.gfile.GFile(pb, 'rb') as f:
    input_graph_def.ParseFromString(f.read())

# The six feature placeholders of the two inputs, in creation order.
# 'Placeholder_6' is left out.
# NOTE(review): presumably because it is the label placeholder, which
# inference does not need — confirm.
inputs = [
    'Placeholder',
    'Placeholder_1',
    'Placeholder_2',
    'Placeholder_3',
    'Placeholder_4',
    'Placeholder_5',
]
# Pooled vector of the first input and the pair logits.
outputs = ['xlnet/summary', 'logits']

transformed_graph_def = TransformGraph(
    input_graph_def, inputs, outputs, transforms
)

with tf.gfile.GFile(f'{pb}.quantized', 'wb') as f:
    f.write(transformed_graph_def.SerializeToString())