In [1]:
import os

# Expose only GPU 0 to CUDA. This runs in the first cell, before the cell
# that imports TensorFlow, so the restriction is already in the environment
# when the framework is loaded.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [2]:
import bert
from bert import optimization
from bert import tokenization
from bert import modeling
import numpy as np
import json
import tensorflow as tf
import itertools
import collections
import re
import random
import sentencepiece as spm
from unidecode import unidecode
from sklearn.utils import shuffle
from prepro_utils import preprocess_text, encode_ids, encode_pieces
from malaya.text.function import transformer_textcleaning as cleaning
from tensorflow.python.estimator.run_config import RunConfig




In [3]:
# Topic names are the keys of the topics JSON; kept both as a set (for
# membership / difference operations) and as a list.
with open('/home/husein/alxlnet/topics.json') as topics_file:
    topics = set(json.load(topics_file).keys())

list_topics = list(topics)

# SentencePiece model used by the Tokenizer defined below.
sp_model = spm.SentencePieceProcessor()
sp_model.Load('sp10m.cased.bert.model')

# The vocab file is tab-separated, one entry per line. The final element of
# the newline split is dropped, then column 0 is mapped to column 1.
with open('sp10m.cased.bert.vocab') as vocab_file:
    vocab_rows = vocab_file.read().split('\n')[:-1]
v = {
    columns[0]: columns[1]
    for columns in (row.split('\t') for row in vocab_rows)
}


class Tokenizer:
    """Thin tokenizer facade over the module-level SentencePiece model.

    All piece/id conversions are delegated to the global `sp_model`; the
    vocabulary mapping passed to the constructor is only stored on the
    instance as `vocab`.
    """

    def __init__(self, v):
        # Stored for reference; none of the methods below read it.
        self.vocab = v

    def tokenize(self, string):
        """Return the pieces `encode_pieces` produces for `string` (`sample = False`)."""
        pieces = encode_pieces(
            sp_model, string, return_unicode = False, sample = False
        )
        return pieces

    def convert_tokens_to_ids(self, tokens):
        """Map each piece in `tokens` to its id via `sp_model.PieceToId`."""
        ids = []
        for piece in tokens:
            ids.append(sp_model.PieceToId(piece))
        return ids

    def convert_ids_to_tokens(self, ids):
        """Map each id in `ids` back to its piece via `sp_model.IdToPiece`."""
        pieces = []
        for index in ids:
            pieces.append(sp_model.IdToPiece(index))
        return pieces


# Shared tokenizer instance used by F(); `v` is the vocab mapping built above.
tokenizer = Tokenizer(v)

In [4]:
def F(text):
    """Encode `text` as BERT inputs.

    The text is tokenized with the module-level `tokenizer`, wrapped in
    '[CLS]' ... '[SEP]' markers and converted to ids.

    Returns:
        (input_id, input_mask): the list of token ids and an all-ones mask
        of the same length.
    """
    wrapped = ['[CLS]'] + tokenizer.tokenize(text) + ['[SEP]']
    ids = tokenizer.convert_tokens_to_ids(wrapped)
    return ids, [1] * len(ids)


def XY(data):
    """Build one (text, keyphrase, label) example from `data`.

    `data[0]` is the raw text and `data[1]` a sequence of keyphrases.
    When at least one keyphrase is a known topic, a random draw above 0.2
    yields a positive example (label 1) using a random keyphrase from
    `data[1]`. Otherwise a random topic that is not among the keyphrases is
    used as a negative example (label 0).

    Returns:
        (X, Y, label): `F` encodings of the cleaned text and of the chosen
        phrase, plus the integer label.
    """
    keyphrases = data[1]

    # `and` short-circuits, so random.random() is only consumed when there
    # is an overlap with the known topics.
    if len(set(keyphrases) & topics) and random.random() > 0.2:
        t, label = random.choice(keyphrases), 1
    else:
        unrelated = topics - set(keyphrases)
        t, label = random.choice(list(unrelated)), 0

    return F(cleaning(data[0])), F(t), label

In [5]:
# Load the keyphrase test set. The evaluation loop further down indexes each
# record as [0] = text, [1] = keyphrase, [2] = label.
# NOTE(review): absolute, user-specific path; consider making it configurable.
with open('/home/husein/alxlnet/testset-keyphrase.json') as fopen:
    data = json.load(fopen)

In [6]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [7]:
def create_initializer(initializer_range = 0.02):
    """Return a truncated-normal initializer whose stddev is `initializer_range`."""
    make_initializer = tf.compat.v1.truncated_normal_initializer
    return make_initializer(stddev = initializer_range)


def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
    """Match checkpoint variables to graph variables living under ``bert/``.

    A checkpoint variable called ``name`` is matched when the graph contains
    a variable called ``'bert/' + name``; unmatched checkpoint entries are
    skipped.

    Args:
        tvars: iterable of graph variables; each must expose a ``.name``.
        init_checkpoint: checkpoint path passed to
            ``tf.compat.v1.train.list_variables``.

    Returns:
        ``(assignment_map, initialized_variable_names)`` where
        ``assignment_map`` is an ``OrderedDict`` from checkpoint variable name
        to the matching graph variable, and ``initialized_variable_names`` is
        a dict whose keys are every matched checkpoint name, both bare and
        with a ``':0'`` suffix (values are ``1``).
    """
    # Index graph variables by name with any trailing ":<digits>" removed.
    name_to_variable = collections.OrderedDict()
    for var in tvars:
        name = var.name
        m = re.match('^(.*):\\d+$', name)
        if m is not None:
            name = m.group(1)
        name_to_variable[name] = var

    assignment_map = collections.OrderedDict()
    initialized_variable_names = {}

    for entry in tf.compat.v1.train.list_variables(init_checkpoint):
        # Only the first field (the variable name) of each entry is needed.
        name = entry[0]
        scoped_name = 'bert/' + name
        if scoped_name not in name_to_variable:
            continue
        assignment_map[name] = name_to_variable[scoped_name]
        initialized_variable_names[name] = 1
        initialized_variable_names[name + ':0'] = 1

    return (assignment_map, initialized_variable_names)

In [8]:
# Training-style hyper-parameters.
# NOTE(review): nothing else in the visible notebook reads the warmup, step
# or learning-rate values, and `batch_size` is reassigned in the evaluation
# cell. They look like leftovers from the training notebook; confirm.
batch_size = 60
warmup_proportion = 0.1
num_train_steps = 1000000
# 10% of the training steps, truncated to an int.
num_warmup_steps = int(num_train_steps * warmup_proportion)
learning_rate = 2e-5

In [10]:
# BERT architecture configuration, read from the JSON config file. Model
# reads this module-level name when it builds its two BERT towers.
bert_config = modeling.BertConfig.from_json_file(
    'tiny-bert-v1/config.json'
)

class Model:
    """Siamese BERT classifier over a pair of token sequences.

    Two BERT towers sharing weights (variable scope ``bert``, the second one
    built with ``reuse = True``) encode inputs A and B. The pooled outputs
    ``u`` and ``v`` are combined as ``[u, v, |u - v|]`` and fed to a dense
    layer that produces ``dimension_output`` logits.

    Args:
        dimension_output: number of output classes (width of the logits).
        is_training: forwarded to both ``modeling.BertModel`` towers. It
            defaults to ``True`` for backward compatibility.
            NOTE(review): in the reference BERT implementation this flag
            enables dropout, so inference/evaluation code most likely wants
            ``is_training = False``; confirm against the installed package.

    Attributes:
        X, input_masks: rank-2 placeholders (token ids / mask) for input A.
        X_b, input_masks_b: rank-2 placeholders for input B.
        Y: rank-1 int32 placeholder with the class labels.
        summary: pooled output of tower A (wrapped in an identity op named
            ``summary``).
        logits: classifier output (wrapped in an identity op named ``logits``).
        cost: mean sparse softmax cross-entropy between ``logits`` and ``Y``.
        accuracy: mean of ``argmax(logits) == Y``.
    """

    def __init__(
        self,
        dimension_output = 2,
        is_training = True,
    ):
        self.X = tf.compat.v1.placeholder(tf.compat.v1.int32, [None, None])
        self.input_masks = tf.compat.v1.placeholder(tf.compat.v1.float32, [None, None])

        self.X_b = tf.compat.v1.placeholder(tf.compat.v1.int32, [None, None])
        self.input_masks_b = tf.compat.v1.placeholder(tf.compat.v1.float32, [None, None])

        self.Y = tf.compat.v1.placeholder(tf.compat.v1.int32, [None])

        # Tower A: creates the BERT variables under the 'bert' scope.
        with tf.compat.v1.variable_scope('bert', reuse = False):
            model = modeling.BertModel(
                config = bert_config,
                is_training = is_training,
                input_ids = self.X,
                input_mask = self.input_masks,
                use_one_hot_embeddings = False,
            )

            summary = model.get_pooled_output()
            summary = tf.compat.v1.identity(summary, name = 'summary')
            self.summary = summary

        # Tower B: re-enters the same scope with reuse = True so that it
        # shares the variables created by tower A.
        with tf.compat.v1.variable_scope('bert', reuse = True):
            model = modeling.BertModel(
                config = bert_config,
                is_training = is_training,
                input_ids = self.X_b,
                input_mask = self.input_masks_b,
                use_one_hot_embeddings = False,
            )

            summary_b = model.get_pooled_output()

        # Pair representation: both embeddings plus their absolute difference.
        vectors_concat = [summary, summary_b, tf.compat.v1.abs(summary - summary_b)]
        vectors_concat = tf.compat.v1.concat(vectors_concat, axis = 1)

        self.logits = tf.compat.v1.layers.dense(vectors_concat, dimension_output)
        self.logits = tf.compat.v1.identity(self.logits, name = 'logits')

        self.cost = tf.compat.v1.reduce_mean(
            tf.compat.v1.nn.sparse_softmax_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )

        correct_pred = tf.compat.v1.equal(
            tf.compat.v1.argmax(self.logits, 1, output_type = tf.compat.v1.int32), self.Y
        )
        self.accuracy = tf.compat.v1.reduce_mean(tf.compat.v1.cast(correct_pred, tf.compat.v1.float32))

In [11]:
# Two output classes; the report at the end of the notebook labels them
# 'not similar' and 'similar'.
dimension_output = 2

# Start from an empty default graph so that re-running this cell does not
# stack a second copy of the model on top of the first.
tf.compat.v1.reset_default_graph()
sess = tf.compat.v1.InteractiveSession()
model = Model(
    dimension_output,
)

# Give every variable an initial value; the trainable ones are overwritten
# by the checkpoint restore in the following cell.
sess.run(tf.compat.v1.global_variables_initializer())




The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.


In [12]:
# Restore the fine-tuned weights. The Saver is limited to the trainable
# variables, so only those are read from the checkpoint.
checkpoint = 'tiny-bert-keyphrase/model.ckpt-620000'
saver = tf.compat.v1.train.Saver(var_list = tf.compat.v1.trainable_variables())
saver.restore(sess, checkpoint)

INFO:tensorflow:Restoring parameters from tiny-bert-keyphrase/model.ckpt-620000


In [13]:
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
# Encode one document and one candidate keyphrase as (ids, mask) pairs.
X = F('Kementerian Pertanian dan Industri Makanan menggalakkan pemain industri pertanian menceburi tanaman penting bagi mengurangkan kebergantungan bahan import dari luar negara')
Y = F('tanaman jagung')

# Pooled embedding of each input, both computed through the first tower.
o1 = sess.run(
    model.summary,
    feed_dict = {model.X: [X[0]], model.input_masks: [X[1]]},
)
o2 = sess.run(
    model.summary,
    feed_dict = {model.X: [Y[0]], model.input_masks: [Y[1]]},
)

# Classifier logits for the (document, keyphrase) pair. Kept as the last
# expression so the notebook displays the result.
pair_feed = {
    model.X: [X[0]],
    model.input_masks: [X[1]],
    model.X_b: [Y[0]],
    model.input_masks_b: [Y[1]],
}
sess.run(model.logits, feed_dict = pair_feed)

array([[ 3.5177658, -4.316416 ]], dtype=float32)

In [15]:
# Cosine similarity between the two pooled embeddings computed above
# (document vs. candidate keyphrase).
cosine_similarity(o1, o2)

array([[0.24081138]], dtype=float32)

In [None]:
from tqdm import tqdm

batch_size = 32
real_Y, predict_Y = [], []

# Walk over the test set in fixed-size slices and collect, per example, the
# gold label and the arg-max class predicted by the model.
for start in tqdm(range(0, len(data), batch_size)):
    batch = data[start: start + batch_size]

    # Each record: [0] text, [1] keyphrase, [2] label. Encode both sides.
    encoded = [(F(record[0]), F(record[1])) for record in batch]

    # Right-pad every id / mask list in the batch to a common length.
    feed = {
        model.X: pad_sequences(
            [left[0] for left, _ in encoded], padding = 'post'
        ),
        model.input_masks: pad_sequences(
            [left[1] for left, _ in encoded], padding = 'post'
        ),
        model.X_b: pad_sequences(
            [right[0] for _, right in encoded], padding = 'post'
        ),
        model.input_masks_b: pad_sequences(
            [right[1] for _, right in encoded], padding = 'post'
        ),
    }

    batch_logits = sess.run(model.logits, feed_dict = feed)
    predict_Y.extend(np.argmax(batch_logits, 1).tolist())
    real_Y.extend([record[2] for record in batch])

 17%|█▋        | 105/625 [00:02<00:10, 49.84it/s]

In [None]:
from sklearn import metrics

# Per-class precision / recall / F1 for the collected predictions; label 0
# is reported as 'not similar' and label 1 as 'similar'.
report = metrics.classification_report(
    real_Y,
    predict_Y,
    target_names = ['not similar', 'similar'],
    digits = 5,
)
print(report)