In [1]:
# !wget https://f000.backblazeb2.com/file/malay-dataset/keyphrase/keyphrase-twitter-no-calon.json
# !wget https://raw.githubusercontent.com/huseinzol05/Malay-Dataset/master/keyphrase/twitter-bahasa/topics.json

In [2]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [3]:
import json

with open('topics.json') as fopen:
    topics = set(json.load(fopen).keys())
    
list_topics = list(topics)

In [4]:
import xlnet
import numpy as np
import tensorflow as tf
from tqdm import tqdm
import model_utils
import random




In [5]:
import sentencepiece as spm
from prepro_utils import preprocess_text, encode_ids

sp_model = spm.SentencePieceProcessor()
sp_model.Load('sp10m.cased.v9.model')

True

In [6]:
from malaya.text.function import transformer_textcleaning as cleaning

In [7]:
from prepro_utils import preprocess_text, encode_ids

def tokenize_fn(text):
    text = preprocess_text(text, lower= False)
    return encode_ids(sp_model, text)

In [8]:
SEG_ID_A   = 0
SEG_ID_B   = 1
SEG_ID_CLS = 2
SEG_ID_SEP = 3
SEG_ID_PAD = 4

special_symbols = {
    "<unk>"  : 0,
    "<s>"    : 1,
    "</s>"   : 2,
    "<cls>"  : 3,
    "<sep>"  : 4,
    "<pad>"  : 5,
    "<mask>" : 6,
    "<eod>"  : 7,
    "<eop>"  : 8,
}

VOCAB_SIZE = 32000
UNK_ID = special_symbols["<unk>"]
CLS_ID = special_symbols["<cls>"]
SEP_ID = special_symbols["<sep>"]
MASK_ID = special_symbols["<mask>"]
EOD_ID = special_symbols["<eod>"]

def F(left_train):
    tokens_a = tokenize_fn(left_train)
    segment_id = [SEG_ID_A] * len(tokens_a)
    tokens_a.append(SEP_ID)
    tokens_a.append(CLS_ID)
    segment_id.append(SEG_ID_A)
    segment_id.append(SEG_ID_CLS)
    input_mask = [0] * len(tokens_a)
    return tokens_a, segment_id, input_mask

def XY(data):
    
    if len(set(data[1]) & topics) and random.random() > 0.2:
        t = random.choice(data[1])
        label = 1
    else:
        s = (set(data[1]) | set())
        t = random.choice(list(topics - s))
        label = 0
    X = F(cleaning(data[0]))
    Y = F(t)
    
    return X, Y, label

In [9]:
with open('testset-keyphrase.json') as fopen:
    data = json.load(fopen)

In [10]:
data[0]

['Takdak gambar raya ', 'myburgerlab restaurant', 0]

In [11]:
F(data[0][1])

([284, 8751, 194, 8038, 5556, 1356, 7200, 4, 3],
 [0, 0, 0, 0, 0, 0, 0, 0, 2],
 [0, 0, 0, 0, 0, 0, 0, 0, 0])

In [12]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [13]:
class Parameter:
    def __init__(
        self,
        decay_method,
        warmup_steps,
        weight_decay,
        adam_epsilon,
        num_core_per_host,
        lr_layer_decay_rate,
        use_tpu,
        learning_rate,
        train_steps,
        min_lr_ratio,
        clip,
        **kwargs
    ):
        self.decay_method = decay_method
        self.warmup_steps = warmup_steps
        self.weight_decay = weight_decay
        self.adam_epsilon = adam_epsilon
        self.num_core_per_host = num_core_per_host
        self.lr_layer_decay_rate = lr_layer_decay_rate
        self.use_tpu = use_tpu
        self.learning_rate = learning_rate
        self.train_steps = train_steps
        self.min_lr_ratio = min_lr_ratio
        self.clip = clip

num_train_steps = 300000
warmup_proportion = 0.1
num_warmup_steps = int(num_train_steps * warmup_proportion)
initial_learning_rate = 2e-5

kwargs = dict(
    is_training = True,
    use_tpu = False,
    use_bfloat16 = False,
    dropout = 0.1,
    dropatt = 0.1,
    init = 'normal',
    init_range = 0.1,
    init_std = 0.05,
    clamp_len = -1,
)

xlnet_parameters = xlnet.RunConfig(**kwargs)
xlnet_config = xlnet.XLNetConfig(
    json_path = 'alxlnet-base-2020-04-10/config.json'
)
training_parameters = dict(
    decay_method = 'poly',
    train_steps = num_train_steps,
    learning_rate = initial_learning_rate,
    warmup_steps = num_warmup_steps,
    min_lr_ratio = 0.0,
    weight_decay = 0.00,
    adam_epsilon = 1e-8,
    num_core_per_host = 1,
    lr_layer_decay_rate = 1,
    use_tpu = False,
    use_bfloat16 = False,
    dropout = 0.1,
    dropatt = 0.1,
    init = 'normal',
    init_range = 0.1,
    init_std = 0.05,
    clip = 1.0,
    clamp_len = -1,
)
training_parameters = Parameter(**training_parameters)




In [14]:
class Model:
    def __init__(
        self,
        dimension_output = 2,
    ):
        self.X = tf.compat.v1.placeholder(tf.compat.v1.int32, [None, None])
        self.segment_ids = tf.compat.v1.placeholder(tf.compat.v1.int32, [None, None])
        self.input_masks = tf.compat.v1.placeholder(tf.compat.v1.float32, [None, None])
        
        self.X_b = tf.compat.v1.placeholder(tf.compat.v1.int32, [None, None])
        self.segment_ids_b = tf.compat.v1.placeholder(tf.compat.v1.int32, [None, None])
        self.input_masks_b = tf.compat.v1.placeholder(tf.compat.v1.float32, [None, None])
        
        self.Y = tf.compat.v1.placeholder(tf.compat.v1.int32, [None])
        
        with tf.compat.v1.variable_scope('xlnet', reuse = False):
            xlnet_model = xlnet.XLNetModel(
                xlnet_config=xlnet_config,
                run_config=xlnet_parameters,
                input_ids=tf.compat.v1.transpose(self.X, [1, 0]),
                seg_ids=tf.compat.v1.transpose(self.segment_ids, [1, 0]),
                input_mask=tf.compat.v1.transpose(self.input_masks, [1, 0]))

            summary = xlnet_model.get_pooled_out("last", True)
            summary = tf.compat.v1.identity(summary, name = 'summary')
            self.summary = summary
            print(summary)
            
        with tf.compat.v1.variable_scope('xlnet', reuse = True):
            xlnet_model = xlnet.XLNetModel(
                xlnet_config=xlnet_config,
                run_config=xlnet_parameters,
                input_ids=tf.compat.v1.transpose(self.X_b, [1, 0]),
                seg_ids=tf.compat.v1.transpose(self.segment_ids_b, [1, 0]),
                input_mask=tf.compat.v1.transpose(self.input_masks_b, [1, 0]))
            summary_b = xlnet_model.get_pooled_out("last", True)
        
        vectors_concat = [summary, summary_b, tf.compat.v1.abs(summary - summary_b)]
        vectors_concat = tf.compat.v1.concat(vectors_concat, axis = 1)
        
        self.logits = tf.layers.dense(vectors_concat, dimension_output)
        self.logits = tf.compat.v1.identity(self.logits, name = 'logits')
        
        self.cost = tf.compat.v1.reduce_mean(
            tf.compat.v1.nn.sparse_softmax_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )
        
        correct_pred = tf.compat.v1.equal(
            tf.compat.v1.argmax(self.logits, 1, output_type = tf.compat.v1.int32), self.Y
        )
        self.accuracy = tf.compat.v1.reduce_mean(tf.compat.v1.cast(correct_pred, tf.compat.v1.float32))

In [15]:
dimension_output = 2

tf.compat.v1.reset_default_graph()
sess = tf.compat.v1.InteractiveSession()
model = Model(
    dimension_output,
)

sess.run(tf.compat.v1.global_variables_initializer())




INFO:tensorflow:memory input None
INFO:tensorflow:Use float type <dtype: 'float32'>

Instructions for updating:
Use keras.layers.dropout instead.
Instructions for updating:
Please use `layer.__call__` method instead.
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.Dense instead.
Tensor("xlnet/summary:0", shape=(?, 768), dtype=float32)
INFO:tensorflow:memory input None
INFO:tensorflow:Use float type <dtype: 'float32'>


In [16]:
tvars = tf.compat.v1.trainable_variables()
checkpoint = 'alxlnet-base-keyphrase/model.ckpt-180000'

In [17]:
saver = tf.compat.v1.train.Saver(var_list = tf.compat.v1.trainable_variables())
saver.restore(sess, checkpoint)

INFO:tensorflow:Restoring parameters from alxlnet-base-keyphrase/model.ckpt-180000


In [18]:
from sklearn.metrics.pairwise import cosine_similarity

In [19]:
X = F('Kementerian Pertanian dan Industri Makanan menggalakkan pemain industri pertanian menceburi tanaman penting bagi mengurangkan kebergantungan bahan import dari luar negara')

o1 = sess.run(
    model.summary,
    feed_dict = {
        model.X: [X[0]],
        model.segment_ids: [X[1]],
        model.input_masks: [X[2]],
    },
)

Y = F('tanaman jagung')

o2 = sess.run(
    model.summary,
    feed_dict = {
        model.X: [Y[0]],
        model.segment_ids: [Y[1]],
        model.input_masks: [Y[2]],
    },
)

sess.run(
    model.logits,
    feed_dict = {
        model.X: [X[0]],
        model.segment_ids: [X[1]],
        model.input_masks: [X[2]],
        model.X_b: [Y[0]],
        model.segment_ids_b: [Y[1]],
        model.input_masks_b: [Y[2]],
    },
)

array([[ 2.6460671 , -0.13614686]], dtype=float32)

In [20]:
cosine_similarity(o1, o2)

array([[0.35625386]], dtype=float32)

In [21]:
from tqdm import tqdm

batch_size = 32
real_Y,predict_Y = [], []

for i in tqdm(range(0, len(data), batch_size)):
    batch = data[i: i + batch_size]
    X, segment, mask, X_b, segment_b, mask_b = [], [], [], [], [], []
    for k in range(len(batch)):
        x = F(batch[k][0])
        y = F(batch[k][1])
        X.append(x[0])
        segment.append(x[1])
        mask.append(x[2])
        X_b.append(y[0])
        segment_b.append(y[1])
        mask_b.append(y[2])
        
    X = pad_sequences(X, padding = 'post')
    segment = pad_sequences(segment, padding = 'post', value = 1)
    mask = pad_sequences(mask, padding = 'post', value = 4)
    X_b = pad_sequences(X_b, padding = 'post')
    segment_b = pad_sequences(segment_b, padding = 'post', value = 1)
    mask_b = pad_sequences(mask_b, padding = 'post', value = 4)
    
    batch_y = [b[2] for b in batch]
    
    predict_Y += np.argmax(sess.run(model.logits,
            feed_dict = {
                model.X: X,
                model.segment_ids: segment,
                model.input_masks: mask,
                model.X_b: X_b,
                model.segment_ids_b: segment_b,
                model.input_masks_b: mask_b,
            },
    ), 1, ).tolist()
    
    real_Y += batch_y

100%|██████████| 625/625 [01:38<00:00,  6.36it/s]


In [22]:
from sklearn import metrics

print(
    metrics.classification_report(
        real_Y, predict_Y, target_names = ['not similar', 'similar'],
        digits = 5
    )
)

              precision    recall  f1-score   support

 not similar    0.99846   0.98837   0.99339     15133
     similar    0.96494   0.99527   0.97987      4867

    accuracy                        0.99005     20000
   macro avg    0.98170   0.99182   0.98663     20000
weighted avg    0.99031   0.99005   0.99010     20000

