In [1]:
# !wget https://f000.backblazeb2.com/file/malaya-model/bert-bahasa/bert-base-2020-10-08.tar.gz
# !tar -zxf bert-base-2020-10-08.tar.gz

In [2]:
import json
import re
import sentencepiece as spm

In [3]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

In [4]:
from xlnet.prepro_utils import preprocess_text, encode_ids, encode_pieces

sp_model = spm.SentencePieceProcessor()
sp_model.Load('pretrained-model/preprocess/sp10m.cased.bert.model')

with open('pretrained-model/preprocess/sp10m.cased.bert.vocab') as fopen:
    v = fopen.read().split('\n')[:-1]
v = [i.split('\t') for i in v]
v = {i[0]: i[1] for i in v}

class Tokenizer:
    def __init__(self, v):
        self.vocab = v
        pass
    
    def tokenize(self, string):
        return encode_pieces(sp_model, string, return_unicode=False, sample=False)
    
    def convert_tokens_to_ids(self, tokens):
        return [sp_model.PieceToId(piece) for piece in tokens]
    
    def convert_ids_to_tokens(self, ids):
        return [sp_model.IdToPiece(i) for i in ids]
    
tokenizer = Tokenizer(v)

In [5]:
from malaya.train.model.bigbird import modeling, optimization
import numpy as np
import json
import tensorflow as tf
import itertools
from unidecode import unidecode
import re
import random




In [6]:
emotion_label = ['anger', 'fear', 'happy', 'love', 'sadness', 'surprise']

with open('/home/husein/sentiment/emotion-twitter-lexicon.json') as fopen:
    emotion = json.load(fopen)
    
emotion.keys()

dict_keys(['anger', 'fear', 'happy', 'love', 'sadness', 'surprise'])

In [7]:
texts, labels = [], []

for k, v in emotion.items():
    if len(v) > 30000:
        emotion[k] = random.sample(v, 30000)
    print(k, len(emotion[k]))
    texts.extend(emotion[k])
    labels.extend([emotion_label.index(k)] * len(emotion[k]))

anger 30000
fear 20316
happy 30000
love 20783
sadness 26468
surprise 13107


In [8]:
from malaya.text.rules import normalized_chars
import random

laughing = {
    'huhu',
    'haha',
    'gagaga',
    'hihi',
    'wkawka',
    'wkwk',
    'kiki',
    'keke',
    'huehue',
    'hshs',
    'hoho',
    'hewhew',
    'uwu',
    'sksk',
    'ksks',
    'gituu',
    'gitu',
    'mmeeooww',
    'meow',
    'alhamdulillah',
    'muah',
    'mmuahh',
    'hehe',
    'salamramadhan',
    'happywomensday',
    'jahagaha',
    'ahakss',
    'ahksk'
}

def make_cleaning(s, c_dict):
    s = s.translate(c_dict)
    return s

def cleaning(string):
    """
    use by any transformer model before tokenization
    """
    string = unidecode(string)
    
    string = ' '.join(
        [make_cleaning(w, normalized_chars) for w in string.split()]
    )
    string = re.sub('\(dot\)', '.', string)
    string = (
        re.sub(re.findall(r'\<a(.*?)\>', string)[0], '', string)
        if (len(re.findall(r'\<a (.*?)\>', string)) > 0)
        and ('href' in re.findall(r'\<a (.*?)\>', string)[0])
        else string
    )
    string = re.sub(
        r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ' ', string
    )
    
    chars = '.,/'
    for c in chars:
        string = string.replace(c, f' {c} ')
        
    string = re.sub(r'[ ]+', ' ', string).strip().split()
    string = [w for w in string if w[0] != '@']
    x = []
    for word in string:
        word = word.lower()
        if any([laugh in word for laugh in laughing]):
            if random.random() >= 0.5:
                x.append(word)
        else:
            x.append(word)
    string = [w.title() if w[0].isupper() else w for w in x]
    return ' '.join(string)

In [9]:
from tqdm import tqdm

for i in tqdm(range(len(texts))):
    texts[i] = cleaning(texts[i])

100%|██████████| 140674/140674 [00:17<00:00, 7914.78it/s]


In [10]:
actual_t, actual_l = [], []

for i in tqdm(range(len(texts))):
    if len(texts[i]) > 2:
        actual_t.append(texts[i])
        actual_l.append(labels[i])

100%|██████████| 140674/140674 [00:00<00:00, 843786.50it/s]


In [11]:
from tqdm import tqdm

input_ids, input_masks = [], []

for text in tqdm(actual_t):
    tokens_a = tokenizer.tokenize(text)
    tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
    input_id = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_id)
    
    input_ids.append(input_id)
    input_masks.append(input_mask)

100%|██████████| 140674/140674 [00:23<00:00, 5870.85it/s]


In [12]:
maxlen = 512

In [13]:
bert_config = {
    'attention_probs_dropout_prob': 0.2,
    'hidden_act': 'gelu',
    'hidden_dropout_prob': 0.2,
    'hidden_size': 768,
    'initializer_range': 0.02,
    'intermediate_size': 3072,
    'max_position_embeddings': 4096,
    'max_encoder_length': maxlen,
    'num_attention_heads': 12,
    'num_hidden_layers': 12,
    'type_vocab_size': 2,
    'scope': 'bert',
    'use_bias': True,
    'rescale_embedding': False,
    'vocab_model_file': None,
    'attention_type': 'block_sparse',
    'norm_type': 'postnorm',
    'block_size': 16,
    'num_rand_blocks': 3,
    'vocab_size': 32000,
}

In [14]:
epoch = 3
batch_size = 16
warmup_proportion = 0.1
num_train_steps = int(len(texts) / batch_size * epoch)
num_warmup_steps = int(num_train_steps * warmup_proportion)
dimension_output = 6

In [15]:
def create_initializer(initializer_range=0.02):
    return tf.truncated_normal_initializer(stddev=initializer_range)

class Model:
    def __init__(
        self,
        dimension_output,
        learning_rate = 2e-5,
        training = True,
    ):
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None])
        
        model = modeling.BertModel(bert_config)
        sequence_output, pooled_output = model(self.X)
        
        output_layer = sequence_output
        output_layer = tf.layers.dense(
            output_layer,
            bert_config['hidden_size'],
            activation=tf.tanh,
            kernel_initializer=create_initializer())
        self.logits_seq = tf.layers.dense(output_layer, dimension_output,
                                         kernel_initializer=create_initializer())
        self.logits_seq = tf.identity(self.logits_seq, name = 'logits_seq')
        self.logits = self.logits_seq[:, 0]
        self.logits = tf.identity(self.logits, name = 'logits')
        
        self.cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )
        
        self.optimizer = optimization.create_optimizer(self.cost, learning_rate, 
                                                       num_train_steps, num_warmup_steps, False)
        correct_pred = tf.equal(
            tf.argmax(self.logits, 1, output_type = tf.int32), self.Y
        )
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [16]:
dimension_output = 6
learning_rate = 2e-5

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    dimension_output,
    learning_rate
)
sess.run(tf.global_variables_initializer())

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Instructions for updating:
Use keras.layers.Dense instead.


Instructions for updating:
Use keras.layers.Dense instead.


Instructions for updating:
Please use `layer.__call__` method instead.


Instructions for updating:
Please use `layer.__call__` method instead.


























In [17]:
import collections
import re

def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
    """Compute the union of the current variables and checkpoint variables."""
    assignment_map = {}
    initialized_variable_names = {}

    name_to_variable = collections.OrderedDict()
    for var in tvars:
        name = var.name
        m = re.match('^(.*):\\d+$', name)
        if m is not None:
            name = m.group(1)
        name_to_variable[name] = var

    init_vars = tf.train.list_variables(init_checkpoint)

    assignment_map = collections.OrderedDict()
    for x in init_vars:
        (name, var) = (x[0], x[1])
        name_r = name.replace('bert/embeddings/LayerNorm', 'bert/encoder/LayerNorm')
        if name_r not in name_to_variable:
            continue
        if 'embeddings/position_embeddings' in name_r:
            continue
        assignment_map[name] = name_to_variable[name_r]
        initialized_variable_names[name_r] = 1
        initialized_variable_names[name_r + ':0'] = 1

    return (assignment_map, initialized_variable_names)

In [18]:
tvars = tf.trainable_variables()
checkpoint = 'bert-base/model.ckpt-1000000'
assignment_map, initialized_variable_names = get_assignment_map_from_checkpoint(tvars, 
                                                                                checkpoint)

In [19]:
saver = tf.train.Saver(var_list = assignment_map)
saver.restore(sess, checkpoint)

INFO:tensorflow:Restoring parameters from bert-base/model.ckpt-1000000


INFO:tensorflow:Restoring parameters from bert-base/model.ckpt-1000000


In [20]:
from sklearn.model_selection import train_test_split

train_input_ids, test_input_ids, train_Y, test_Y, train_mask, test_mask = train_test_split(
    input_ids, actual_l, input_masks, test_size = 0.2
)

In [21]:
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences

In [22]:
from tqdm import tqdm
import time

for EPOCH in range(epoch):

    train_acc, train_loss, test_acc, test_loss = [], [], [], []
    pbar = tqdm(
        range(0, len(train_input_ids), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        index = min(i + batch_size, len(train_input_ids))
        batch_x = train_input_ids[i: index]
        batch_x = pad_sequences(batch_x, padding='post', maxlen = maxlen)
        batch_y = train_Y[i: index]
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
            },
        )
        train_loss.append(cost)
        train_acc.append(acc)
        pbar.set_postfix(cost = cost, accuracy = acc)
        
    pbar = tqdm(range(0, len(test_input_ids), batch_size), desc = 'test minibatch loop')
    for i in pbar:
        index = min(i + batch_size, len(test_input_ids))
        batch_x = test_input_ids[i: index]
        batch_x = pad_sequences(batch_x, padding='post', maxlen = maxlen)
        batch_y = test_Y[i: index]
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
            },
        )
        test_loss.append(cost)
        test_acc.append(acc)
        pbar.set_postfix(cost = cost, accuracy = acc)
        
    train_loss = np.mean(train_loss)
    train_acc = np.mean(train_acc)
    test_loss = np.mean(test_loss)
    test_acc = np.mean(test_acc)
    
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )

train minibatch loop: 100%|██████████| 7034/7034 [1:29:53<00:00,  1.30it/s, accuracy=1, cost=0.00145]   
test minibatch loop: 100%|██████████| 1759/1759 [07:39<00:00,  3.83it/s, accuracy=1, cost=0.00173]   
train minibatch loop:   0%|          | 0/7034 [00:00<?, ?it/s]

epoch: 0, training loss: 0.200544, training acc: 0.943293, valid loss: 0.086680, valid acc: 0.983300



train minibatch loop: 100%|██████████| 7034/7034 [1:28:39<00:00,  1.32it/s, accuracy=1, cost=0.000388]  
test minibatch loop: 100%|██████████| 1759/1759 [07:34<00:00,  3.87it/s, accuracy=1, cost=0.000243]  
train minibatch loop:   0%|          | 0/7034 [00:00<?, ?it/s]

epoch: 1, training loss: 0.044958, training acc: 0.990422, valid loss: 0.046701, valid acc: 0.990655



train minibatch loop: 100%|██████████| 7034/7034 [1:28:37<00:00,  1.32it/s, accuracy=1, cost=0.00021]   
test minibatch loop: 100%|██████████| 1759/1759 [07:35<00:00,  3.86it/s, accuracy=1, cost=0.00011]   

epoch: 2, training loss: 0.018654, training acc: 0.996064, valid loss: 0.044817, valid acc: 0.992361






In [23]:
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'bigbird-base-emotion/model.ckpt')

'bigbird-base-emotion/model.ckpt'

In [24]:
strings = ','.join(
    [
        n.name
        for n in tf.get_default_graph().as_graph_def().node
        if ('Variable' in n.op
        or 'Placeholder' in n.name
        or 'logits' in n.name
        or 'alphas' in n.name
        or 'self/Softmax' in n.name)
        and 'adam' not in n.name
        and 'beta' not in n.name
        and 'global_step' not in n.name
    ]
)
strings.split(',')

['Placeholder',
 'Placeholder_1',
 'bert/embeddings/word_embeddings',
 'bert/embeddings/token_type_embeddings',
 'bert/embeddings/position_embeddings',
 'bert/encoder/LayerNorm/gamma',
 'bert/encoder/layer_0/attention/self/query/kernel',
 'bert/encoder/layer_0/attention/self/query/bias',
 'bert/encoder/layer_0/attention/self/key/kernel',
 'bert/encoder/layer_0/attention/self/key/bias',
 'bert/encoder/layer_0/attention/self/value/kernel',
 'bert/encoder/layer_0/attention/self/value/bias',
 'bert/encoder/layer_0/attention/self/Softmax',
 'bert/encoder/layer_0/attention/self/Softmax_1',
 'bert/encoder/layer_0/attention/self/Softmax_2',
 'bert/encoder/layer_0/attention/self/Softmax_3',
 'bert/encoder/layer_0/attention/self/Softmax_4',
 'bert/encoder/layer_0/attention/output/dense/kernel',
 'bert/encoder/layer_0/attention/output/dense/bias',
 'bert/encoder/layer_0/attention/output/LayerNorm/gamma',
 'bert/encoder/layer_0/intermediate/dense/kernel',
 'bert/encoder/layer_0/intermediate/dense/

In [26]:
real_Y, predict_Y = [], []

pbar = tqdm(
    range(0, len(test_input_ids), batch_size), desc = 'validation minibatch loop'
)
for i in pbar:
    index = min(i + batch_size, len(test_input_ids))
    batch_x = test_input_ids[i: index]
    batch_x = pad_sequences(batch_x, padding='post', maxlen=maxlen)
    batch_y = test_Y[i: index]
    
    predict_Y += np.argmax(sess.run(model.logits,
            feed_dict = {
            model.X: batch_x,
            },
    ), 1, ).tolist()
    real_Y += batch_y

validation minibatch loop: 100%|██████████| 1759/1759 [07:35<00:00,  3.86it/s]


In [27]:
from sklearn import metrics

print(
    metrics.classification_report(
        real_Y, predict_Y, target_names = ['anger', 'fear', 'happy', 'love', 'sadness', 'surprise'],
        digits = 5
    )
)

              precision    recall  f1-score   support

       anger    0.99158   0.98524   0.98840      6099
        fear    0.99004   0.99512   0.99257      4097
       happy    0.99441   0.99703   0.99572      6061
        love    0.99315   0.99097   0.99206      4099
     sadness    0.99245   0.99414   0.99329      5287
    surprise    0.99158   0.99238   0.99198      2492

    accuracy                        0.99236     28135
   macro avg    0.99220   0.99248   0.99234     28135
weighted avg    0.99236   0.99236   0.99235     28135



In [28]:
def freeze_graph(model_dir, output_node_names):

    if not tf.io.gfile.exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path

    absolute_model_dir = '/'.join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + '/frozen_model.pb'
    clear_devices = True
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = clear_devices
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [29]:
freeze_graph('bigbird-base-emotion', strings)

INFO:tensorflow:Restoring parameters from bigbird-base-emotion/model.ckpt


INFO:tensorflow:Restoring parameters from bigbird-base-emotion/model.ckpt


Instructions for updating:
Use `tf.compat.v1.graph_util.convert_variables_to_constants`


Instructions for updating:
Use `tf.compat.v1.graph_util.convert_variables_to_constants`


Instructions for updating:
Use `tf.compat.v1.graph_util.extract_sub_graph`


Instructions for updating:
Use `tf.compat.v1.graph_util.extract_sub_graph`


INFO:tensorflow:Froze 203 variables.


INFO:tensorflow:Froze 203 variables.


INFO:tensorflow:Converted 203 variables to const ops.


INFO:tensorflow:Converted 203 variables to const ops.


28727 ops in the final graph.


In [30]:
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

In [32]:
# g = load_graph('bigbird-base-emotion/frozen_model.pb')
# x = g.get_tensor_by_name('import/Placeholder:0')
# logits = g.get_tensor_by_name('import/logits:0')
# test_sess = tf.InteractiveSession(graph = g)
# result = test_sess.run(tf.nn.softmax(logits), feed_dict = {x: [input_id]})
# result