# Library 

In [1]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
pip install emoji

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
pip install vncorenlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
# Download the VnCoreNLP 1.1.1 jar and its model files into a local
# `vncorenlp/` directory tree. Every URL points at the `master` branch of the
# vncorenlp/VnCoreNLP GitHub repository (no commit pin), and each file is
# fetched into the working directory first and then moved into place.

# Word-segmenter models (plus the jar itself).
!mkdir -p vncorenlp/models/wordsegmenter
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
!mv VnCoreNLP-1.1.1.jar vncorenlp/

!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/wordsegmenter.rdr
!mv vi-vocab vncorenlp/models/wordsegmenter/
!mv wordsegmenter.rdr vncorenlp/models/wordsegmenter/


# POS-tagger model.
# NOTE(review): only the "wseg" annotator is requested when the client is
# created later in this notebook, so the postagger/ner files may be unused here.
!mkdir -p vncorenlp/models/postagger
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/postagger/vi-tagger 
!mv vi-tagger vncorenlp/models/postagger/


# NER models.
!mkdir -p vncorenlp/models/ner
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/ner/vi-500brownclusters.xz
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/ner/vi-ner.xz 
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/ner/vi-pretrainedembeddings.xz
!mv vi-500brownclusters.xz vncorenlp/models/ner/
!mv vi-ner.xz vncorenlp/models/ner/
!mv vi-pretrainedembeddings.xz vncorenlp/models/ner/

--2023-03-30 10:21:22--  https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 27412575 (26M) [application/octet-stream]
Saving to: ‘VnCoreNLP-1.1.1.jar’


2023-03-30 10:21:23 (280 MB/s) - ‘VnCoreNLP-1.1.1.jar’ saved [27412575/27412575]

--2023-03-30 10:21:23--  https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 526544 (514K) [application/octet-stream]
Saving to: ‘vi-voc

# Hyperparameters

In [5]:
# Sequence length in tokens: the tokenizer pads/truncates every text to this
# length and the model's input layers are declared with this width.
max_len = 50
# When True, make_data() passes each text through tokennize_vn() (VnCoreNLP)
# before it is returned; when False the raw text is only cast to str.
use_tokenizer = False
# Checkpoint name handed to AutoTokenizer.from_pretrained and
# TFAutoModel.from_pretrained.
MODEL = 'xlm-roberta-base'

# Read data

In [6]:
# Paths of the train/dev/test splits, relative to the working directory
# (presumably a mounted Google Drive — confirm).
# NOTE(review): the files carry an .xlsx extension but are loaded with
# pd.read_csv in the next cell — confirm the on-disk format really is CSV.
train_dataset = 'drive/MyDrive/CODE/ViTHSD/dataset/train.xlsx'
dev_dataset = 'drive/MyDrive/CODE/ViTHSD/dataset/dev.xlsx'
test_dataset = 'drive/MyDrive/CODE/ViTHSD/dataset/test.xlsx'

In [7]:
import pandas as pd

# Read the three splits, in train/dev/test order, with the same CSV reader.
train, dev, test = (
    pd.read_csv(split_path)
    for split_path in (train_dataset, dev_dataset, test_dataset)
)

In [8]:
# Number of rows loaded for the training split.
len(train)

7000

In [9]:
# Number of rows loaded for the dev (validation) split.
len(dev)

1201

In [10]:
# Number of rows loaded for the test split.
len(test)

1800

# Evaluation metrics

In [11]:
def _count_hits(pred, y):
    """Count positions where the prediction equals the label and is non-zero.

    Iterates over the positions of `pred`; `y` must be at least as long
    (an IndexError is raised otherwise, as before).
    """
    return sum(1 for i, p in enumerate(pred) if y[i] == p and p != 0)


def _count_non_zero(values):
    """Count the entries of `values` that differ from 0."""
    return sum(1 for v in values if v != 0)


def _harmonic_mean(p, r):
    """Return 2*p*r / (p+r), or 0 when both inputs are 0."""
    return 2 * p * r / (p + r) if (p + r) != 0 else 0


def _mean_over_samples(metric, preds, y):
    """Average `metric(preds[i], y[i])` over all samples; 0 for an empty batch."""
    n_samples = len(preds)
    if n_samples == 0:
        return 0
    return sum(metric(preds[i], y[i]) for i in range(n_samples)) / n_samples


def precision(pred, y):
    """Per-sample precision where label 0 is treated as "nothing predicted".

    Args:
        pred: sequence of predicted class indices for one sample.
        y: sequence of gold class indices, aligned with `pred`.

    Returns:
        (# positions with pred == y and pred != 0) / (# non-zero predictions),
        or 0 when there is no non-zero prediction.
    """
    predicted = _count_non_zero(pred)
    return _count_hits(pred, y) / predicted if predicted > 0 else 0


def recall(pred, y):
    """Per-sample recall where label 0 is treated as "nothing to find".

    Args:
        pred: sequence of predicted class indices for one sample.
        y: sequence of gold class indices, aligned with `pred`.

    Returns:
        (# positions with pred == y and pred != 0) / (# non-zero gold labels),
        or 0 when there is no non-zero gold label.
    """
    relevant = _count_non_zero(y)
    return _count_hits(pred, y) / relevant if relevant > 0 else 0


def f1(pred, y):
    """Per-sample F1: harmonic mean of `precision` and `recall` (0 if both are 0)."""
    return _harmonic_mean(precision(pred, y), recall(pred, y))


def precision_score(preds, y):
    """Mean of the per-sample `precision` over a batch; 0 for an empty batch."""
    return _mean_over_samples(precision, preds, y)


def recall_score(preds, y):
    """Mean of the per-sample `recall` over a batch; 0 for an empty batch."""
    return _mean_over_samples(recall, preds, y)


def f1_score(preds, y):
    """Harmonic mean of `precision_score` and `recall_score` for a batch.

    Each averaged score is computed once. Returns 0 instead of raising
    ZeroDivisionError when both averaged scores are 0.
    """
    return _harmonic_mean(precision_score(preds, y), recall_score(preds, y))

In [12]:
# Sanity check of the per-sample F1 on a toy pair: `a` is passed as the
# prediction and `b` as the gold labels.
a = [2, 1, 0, 2, 1]
b = [2, 2, 0, 2, 1]

f1(a, b)

0.75

# Feature extraction

In [13]:
from vncorenlp import VnCoreNLP
# Create the VnCoreNLP client from the jar fetched in the download cell,
# requesting only the "wseg" annotator.
# NOTE(review): "wseg" is presumably word segmentation — confirm against the
# vncorenlp wrapper documentation.
vncorenlp = VnCoreNLP("vncorenlp/VnCoreNLP-1.1.1.jar", annotators="wseg") 

def tokennize_vn(text):
    """Run `vncorenlp.tokenize` on `text` and flatten the result to one string.

    The tokenizer output is consumed as an iterable of token sequences. Tokens
    of each sequence are joined with single spaces and every sequence is
    followed by one space, so any non-empty result ends with a trailing space.
    """
    segmented = vncorenlp.tokenize(text)
    return ''.join(' '.join(tokens) + ' ' for tokens in segmented)

In [14]:
from keras.utils import to_categorical
import numpy as np

def make_data(data):
    """Split a dataset frame into texts, integer labels and one-hot labels.

    Args:
        data: DataFrame with a 'content' column; every column from position 2
            onwards is read as a label column.

    Returns:
        A 3-tuple (content, label, label_to_categorical):
        - content: list of str, word-segmented via tokennize_vn() when the
          module-level `use_tokenizer` flag is set;
        - label: the raw label array taken from `data.iloc[:, 2:].values`;
        - label_to_categorical: nested lists mirroring `label`, each entry
          replaced by a length-4 one-hot list.
    """
    def _one_hot(class_index):
        # Length-4 indicator vector with a single 1 at `class_index`.
        indicator = np.zeros(4, dtype=int)
        indicator[class_index] = 1
        return indicator.tolist()

    if use_tokenizer:
        content = [tokennize_vn(str(raw)) for raw in data['content']]
    else:
        content = [str(raw) for raw in data['content']]

    label = data.iloc[:, 2:].values
    label_to_categorical = [[_one_hot(value) for value in row] for row in label]

    return content, label, label_to_categorical

In [15]:
# For each split: *_text is the list of input strings, *_gt the raw integer
# label array (used later for evaluation) and *_label the one-hot labels
# (used as training targets).
train_text, train_gt,  train_label = make_data(train)
dev_text, dev_gt, dev_label = make_data(dev)
test_text, test_gt, test_label = make_data(test)

In [16]:
# from keras.utils import pad_sequences
# from keras.utils import to_categorical
# from keras.preprocessing.text import Tokenizer

# tokenizer = Tokenizer()
# tokenizer.fit_on_texts(train_text)
# vocab_size = len(tokenizer.word_index) + 1

# def encoding(X, y):
#     # if not use_tokenizer:
#     #     X = X.astype(str)
#     X = tokenizer.texts_to_sequences(X)
#     X = pad_sequences(X, maxlen=max_len, padding='post')
#     y = np.asarray([np.asarray(row, dtype=float) for row in y], dtype=int)
#     return (X,y)

In [17]:
# from transformers import AutoTokenizer
# from keras.utils import pad_sequences
# import numpy as np

# tokenizer = AutoTokenizer.from_pretrained(MODEL)

# def encoding(X, y):
#     X = [str(x) for x in X]
#     # X = pad_sequences(X, maxlen=max_len, padding='post')
#     X = np.array(tokenizer(X, max_length=max_len, padding='max_length', truncation=True)['input_ids'])
#     y = np.asarray([np.asarray(row, dtype=float) for row in y], dtype=int)
#     return (X,y)

In [18]:
# import pickle

# # saving
# with open('drive/MyDrive/CODE/ViTHSD/model/tokenizer2.pickle', 'wb') as handle:
#     pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [19]:
# train_features = encoding(train_text, train_label)
# dev_features = encoding(dev_text, dev_label)
# test_features = encoding(test_text, test_label)

# Model 

In [20]:
from transformers import AutoTokenizer
from keras.utils import pad_sequences
import numpy as np

# Load the tokenizer belonging to the checkpoint named by MODEL.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

def encoding(X, y):
    """Tokenize texts and convert labels into model-ready numpy arrays.

    Args:
        X: iterable of texts; every item is cast to str before tokenization.
        y: nested label sequences (one row per text).

    Returns:
        A tuple (features, labels) where `features` is a dict with the keys
        "input_ids" and "attention_mask" (numpy arrays padded/truncated to
        `max_len`) and `labels` is an int numpy array built from `y`.
    """
    texts = [str(x) for x in X]
    # Tokenize once and read both fields from the same result; the previous
    # version ran the tokenizer over the whole corpus once per returned key.
    encoded = tokenizer(texts, max_length=max_len, padding='max_length', truncation=True)
    # Only these two fields are returned (token_type_ids is deliberately left
    # out, matching the two inputs the model declares).
    features = {
        "input_ids": np.asarray(encoded['input_ids']),
        "attention_mask": np.asarray(encoded['attention_mask']),
    }
    labels = np.asarray([np.asarray(row, dtype=float) for row in y], dtype=int)
    return (features, labels)

In [21]:
# Each *_features value is a (inputs_dict, label_array) tuple as returned by
# encoding(): index 0 feeds the model, index 1 holds the one-hot targets.
train_features = encoding(train_text, train_label)
dev_features = encoding(dev_text, dev_label)
test_features = encoding(test_text, test_label)

## Build model

In [22]:
# from transformers import AutoModel

# model = AutoModel.from_pretrained(MODEL)
# embedding_matrix = model.embeddings.word_embeddings.weight.detach().numpy()
# # dim = model.embeddings.word_embeddings.embedding_dim
# # vocab = model.embeddings.word_embeddings.num_embeddings
# dim = embedding_matrix.shape[1]
# vocab = embedding_matrix.shape[0]

In [23]:
from transformers import TFAutoModel
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, concatenate
from tensorflow.keras.optimizers import Adam

from keras.layers import Dense, Concatenate, Bidirectional, GRU, Conv1D, GlobalAveragePooling1D, GlobalMaxPooling1D, SpatialDropout1D, Dropout, concatenate
from keras.layers import Flatten, LSTM, Input, Reshape

# --- Architecture hyperparameters ---
units = 112        # `units` of the GRU/LSTM layers wrapped in Bidirectional below
NUM_LAYER = 4      # how many of the last transformer hidden states are concatenated
filter_size = 5    # NOTE: passed as Conv1D's first argument, i.e. the number of filters

# Symbolic inputs; keys and names match the dict produced by encoding().
# token_type_ids is intentionally not declared.
inputs = {
    'input_ids': Input((max_len,), dtype='int32', name='input_ids'),
    'attention_mask': Input((max_len,), dtype='int32', name='attention_mask'),
}

# Pretrained encoder, converted from the PyTorch checkpoint, returning the
# hidden states of every layer.
pretrained_bert = TFAutoModel.from_pretrained(MODEL, output_hidden_states=True, from_pt=True)
hidden_states = pretrained_bert(inputs).hidden_states

# Concatenate the last NUM_LAYER hidden states along the feature axis, keeping
# the full token sequence. The layer name now follows NUM_LAYER (it is
# unchanged, 'last_4_hidden_states', for NUM_LAYER = 4).
pooled_output = concatenate(
    tuple([hidden_states[i] for i in range(-NUM_LAYER, 0)]),
    name=f'last_{NUM_LAYER}_hidden_states',
    axis=-1,
)
x = Dropout(0.2)(pooled_output)

# Two parallel recurrent branches (BiGRU and BiLSTM), each followed by a Conv1D.
l1 = Bidirectional(GRU(units, return_sequences=True))(x)
l1 = Conv1D(filter_size, kernel_size=5, padding="valid", kernel_initializer="he_uniform")(l1)

l2 = Bidirectional(LSTM(units, return_sequences=True))(x)
l2 = Conv1D(filter_size, kernel_size=5, padding="valid", kernel_initializer="he_uniform")(l2)

# Average- and max-pool each branch over the sequence axis, then merge.
avg_pool1 = GlobalAveragePooling1D()(l1)
max_pool1 = GlobalMaxPooling1D()(l1)

avg_pool2 = GlobalAveragePooling1D()(l2)
max_pool2 = GlobalMaxPooling1D()(l2)

x = Concatenate(axis=-1)([avg_pool1, max_pool1, avg_pool2, max_pool2])

# Five 4-way softmax heads named target1..target5, concatenated into a single
# 20-wide output vector.
outputs = concatenate(
    [
        Dense(units=4, activation='softmax', name=f"target{head}")(x)
        for head in range(1, 6)
    ],
    axis=-1,
)

model = Model(inputs=inputs, outputs=outputs)
# `metrics` is passed as a list, the documented form; a bare string is not
# accepted by every Keras version.
# NOTE(review): binary_crossentropy is applied to concatenated softmax heads;
# a per-head categorical cross-entropy may be the intended loss. Left unchanged
# so training behaviour stays the same — confirm.
model.compile(optimizer=Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=["binary_accuracy"])

model.summary()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFXLMRobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing TFXLMRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLMRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFXLMRobertaModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 attention_mask (InputLayer)    [(None, 50)]         0           []                               
                                                                                                  
 input_ids (InputLayer)         [(None, 50)]         0           []                               
                                                                                                  
 tfxlm_roberta_model (TFXLMRobe  TFBaseModelOutputWi  278043648  ['attention_mask[0][0]',         
 rtaModel)                      thPoolingAndCrossAt               'input_ids[0][0]']              
                                tentions(last_hidde                                               
                                n_state=(None, 50,                                            

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Early-stopping callback on validation loss (patience 3).
# NOTE(review): it is constructed but never passed to fit() — the `callbacks`
# argument below is commented out — so training runs for the full `epochs`.
early_stopping = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)

# Targets are flattened with reshape(-1, 20) so that each row lines up with the
# model's concatenated output (5 softmax heads x 4 units).
history = model.fit(train_features[0], train_features[1].reshape(-1, 20), 
                    validation_data=(dev_features[0], dev_features[1].reshape(-1, 20)), 
                    batch_size=64, epochs=30, verbose=True,
                    # callbacks=[early_stopping]
                )

Epoch 1/30






In [None]:
# Run the trained model on the encoded test inputs (index 0 of the tuple
# returned by encoding()).
y_pred = model.predict(test_features[0])

## Evaluation

In [None]:
import numpy as np

# Reshape the flat predictions to (-1, 5, 4) and take the argmax over the last
# axis, giving one predicted class index per output head.
pred = np.argmax(y_pred.reshape(-1, 5, 4), axis=-1)

In [None]:
# Sample-averaged scores on the full class indices: a position counts as a hit
# only when the predicted index equals the gold index and is non-zero.
print("Aspect + Polarity\n")
print("F1 = {}".format(f1_score(pred, test_gt)))
print("Precision = {}".format(precision_score(pred, test_gt)))
print("Recall = {}".format(recall_score(pred, test_gt)))

In [None]:
# Aspect only

def _to_aspect_only(rows):
    """Collapse class indices to presence flags.

    Every value greater than 0 becomes 1; any other value is kept unchanged.
    Returns a list of lists with the same layout as `rows`.
    """
    return [[1 if value > 0 else value for value in row] for row in rows]


# Same transformation for predictions and gold labels (previously two
# copy-pasted loops).
new_pred = _to_aspect_only(pred)
new_y = _to_aspect_only(test_gt)

In [None]:
# Same sample-averaged scores, computed on the collapsed labels in which every
# index > 0 has been replaced by 1.
print("Aspect\n")
print("F1 = {}".format(f1_score(new_pred, new_y)))
print("Precision = {}".format(precision_score(new_pred, new_y)))
print("Recall = {}".format(recall_score(new_pred, new_y)))

In [None]:
# Alternative kept for reference: name the file after the checkpoint.
# model.save("drive/MyDrive/CODE/ViTHSD/model/{}.h5".format(MODEL))
# Save under a fixed file name. The previous `.format(MODEL)` call was a no-op
# (the path has no `{}` placeholder) and wrongly suggested that the name
# depends on MODEL.
model.save("drive/MyDrive/CODE/ViTHSD/model/bigrulstmcnn_xlmr.h5")

In [None]:
# from sklearn.metrics import classification_report
# tg = 4
# print(classification_report(pred[:,tg], test_gt[:,tg]))

# Results

In [None]:
import matplotlib.pyplot as plt

def plot_metric(history, metric):
    """Plot the training and validation curves of one metric against epochs.

    Args:
        history: object exposing a `.history` mapping (e.g. the value returned
            by `model.fit`) that holds both `metric` and 'val_' + metric.
        metric: name of the series to plot, e.g. 'loss'.
    """
    val_key = 'val_' + metric
    recorded = history.history
    train_curve = recorded[metric]
    val_curve = recorded[val_key]
    # Epochs are numbered from 1 on the x axis.
    epoch_axis = range(1, len(train_curve) + 1)

    for curve in (train_curve, val_curve):
        plt.plot(epoch_axis, curve)

    plt.title('Training and validation ' + metric)
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend(["train_" + metric, val_key])
    plt.show()

In [None]:
# Training vs. validation loss per epoch, from the History returned by fit().
plot_metric(history, 'loss')