In [None]:
import os

from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import StratifiedShuffleSplit
import scikitplot as skplt
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import bert
import pickle
import keras as K
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical, plot_model

import Utils.data_utils as data

In [None]:
# Ask TensorFlow to allocate GPU memory on demand instead of reserving it all
# up front. Every visible GPU is configured because TensorFlow requires the
# memory-growth setting to be the same on all of them, and an empty device list
# (CPU-only machine) is tolerated instead of raising IndexError.
gpus = tf.config.experimental.list_physical_devices('GPU')
for _gpu_device in gpus:
    tf.config.experimental.set_memory_growth(_gpu_device, True)

# `gpu` keeps pointing at the first device for any later cell that refers to it;
# it is None when no GPU is visible.
gpu = gpus[0] if gpus else None

In [None]:
def createTokenizer(model_path):
    """Create the tokenizer that belongs to a BERT model directory.

    Args:
        model_path: Path of the BERT model directory. The file
            ``<model_path>/vocab.txt`` is handed to the tokenizer as its
            vocabulary.

    Returns:
        A ``bert.bert_tokenization.FullTokenizer`` built with
        ``do_lower_case=True``.
    """
    tokenizer_cls = bert.bert_tokenization.FullTokenizer
    vocab_file = model_path + '/vocab.txt'
    return tokenizer_cls(vocab_file, do_lower_case=True)
    
def format_text(text, tokenizer, max_len=None):
    """Encode ``text`` as a fixed-length int32 array of token ids.

    The text is tokenized, wrapped as ``[CLS] tokens... [SEP]`` and
    right-padded with zeros up to ``max_len`` entries. When the text has more
    tokens than fit, the *last* ``max_len - 2`` tokens are kept, so the end of
    the text is never silently dropped and every over-long input is truncated
    from the same side.

    Args:
        text: Raw string to encode.
        tokenizer: Object exposing ``tokenize(text) -> list`` and
            ``convert_tokens_to_ids(tokens) -> list``.
        max_len: Total output length including the two special tokens.
            Defaults to the notebook-level ``max_seq_len`` when omitted.

    Returns:
        ``numpy.ndarray`` of dtype int32 with exactly ``max_len`` elements.

    Raises:
        ValueError: If ``max_len`` is smaller than 2 (no room for
            ``[CLS]`` and ``[SEP]``).
    """
    if max_len is None:
        max_len = max_seq_len  # notebook-level default sequence length
    if max_len < 2:
        raise ValueError("max_len must be at least 2 to hold [CLS] and [SEP]")

    budget = max_len - 2  # slots left for real tokens
    tokens = tokenizer.tokenize(text)
    if len(tokens) > budget:
        # Explicit start index (rather than tokens[-budget:]) so that a
        # budget of 0 yields an empty list instead of the whole sequence.
        tokens = tokens[len(tokens) - budget:]

    input_sequence = ["[CLS]"] + tokens + ["[SEP]"]
    pad_len = max_len - len(input_sequence)
    token_ids = tokenizer.convert_tokens_to_ids(input_sequence)
    return np.asarray(token_ids + [0] * pad_len).astype('int32')

def save_processed(t_save_path, s_save_path, split=False):
    """Pickle the preprocessed arrays found in the notebook's global namespace.

    Args:
        t_save_path: Destination file for the full data, stored as the list
            ``[Xtrain, ytrain, Xdev, ydev, Xtest, ytest]``.
        s_save_path: Destination file for the split data, stored as the list
            ``[X_train, y_train, X_unl, y_unl]``. Only written when ``split``
            is true.
        split: Whether to also write the split file.
    """
    with open(t_save_path, 'wb') as total_file:
        pickle.dump([Xtrain, ytrain, Xdev, ydev, Xtest, ytest], total_file)

    if not split:
        return

    with open(s_save_path, 'wb') as split_file:
        pickle.dump([X_train, y_train, X_unl, y_unl], split_file)

def load_processed(t_save_path, s_save_path, split=False):
    """Read back the pickled arrays written by the preprocessing step.

    Args:
        t_save_path: Pickle file holding
            ``[Xtrain, ytrain, Xdev, ydev, Xtest, ytest]``.
        s_save_path: Pickle file holding ``[X_train, y_train, X_unl, y_unl]``.
            Only opened when ``split`` is true.
        split: Whether to load the split file as well.

    Returns:
        The unpickled content of ``t_save_path`` when ``split`` is false,
        otherwise the tuple ``(total_content, split_content)``.
    """
    def _unpickle(path):
        # pickle can execute arbitrary code while loading: only use trusted files.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    total = _unpickle(t_save_path)
    if not split:
        return total
    return total, _unpickle(s_save_path)

In [None]:
# --- Run configuration -------------------------------------------------------
# Length of every encoded sequence fed to the model (special tokens included).
max_seq_len = 250
# Directory holding the pretrained BERT files (vocab.txt, checkpoint, ...).
path_bert_model = "./BERT-models/BERT-Mini"

# Raw corpus files; note that they are read with a tab separator below.
path_train = "dataset/Raw/3c_Train.csv"
path_test = "dataset/Raw/3c_Test.csv"

# --- Raw data ----------------------------------------------------------------
# Dataset is a project helper (Utils.data_utils); presumably make_dev_split
# holds out 15% of the training data and classes_distribution returns a figure
# of the label counts -- confirm against Utils/data_utils.py.
dataset = data.Dataset()
dataset.load_csv(path_train, path_test, separator="\t", label_name="blabel")
dataset.make_dev_split(dev_split=0.15)
fig = dataset.classes_distribution()

## Load processed Data

In [None]:
# Restore previously pickled arrays (the "5percent" configuration) instead of
# re-running the preprocessing cells.
t, s = load_processed(
    "../GAN_BERT/dataset/splitedData/3c_BERT_processed_Data/5percent/Total",
    "../GAN_BERT/dataset/splitedData/3c_BERT_processed_Data/5percent/Split",
    split=True,
)
# t: full data, s: labelled / unlabelled partition of the training set.
(Xtrain, ytrain, Xdev, ydev, Xtest, ytest) = t
(X_train, y_train, X_unl, y_unl) = s

## Data preprocessing (Only if processed data is not available)

In [None]:
# Remove every double-quote character from the review text of each split.
dataset.get_train().review = dataset.get_train().review.replace('"','', regex=True)
dataset.get_dev().review = dataset.get_dev().review.replace('"','', regex=True)
dataset.get_test().review = dataset.get_test().review.replace('"','', regex=True)

tokenizer = createTokenizer(path_bert_model)

# Encode each review as a fixed-length row of token ids.
Xtrain = np.asarray([format_text(text, tokenizer) for text in dataset.get_train().review])
Xtest = np.asarray([format_text(text, tokenizer) for text in dataset.get_test().review])
Xdev = np.asarray([format_text(text, tokenizer) for text in dataset.get_dev().review])

# One-hot encode the labels. The offset and the number of classes are taken
# from the training labels and shared by all splits: subtracting each split's
# own minimum would shift the class indices (and change the one-hot width) of
# any split that happens to miss the lowest or highest class.
# Assumes the label containers expose .min()/.max() (numpy / pandas) -- TODO confirm.
label_offset = dataset.get_train_y().min()
num_classes = int(dataset.get_train_y().max() - label_offset) + 1

ytrain = to_categorical(dataset.get_train_y() - label_offset, num_classes=num_classes)
ytest = to_categorical(dataset.get_test_y() - label_offset, num_classes=num_classes)
ydev = to_categorical(dataset.get_dev_y() - label_offset, num_classes=num_classes)

In [None]:
# Share of Xtrain that goes to the unlabelled pool; the remaining
# (1 - unl_ratio) is kept as labelled training data.
unl_ratio = 0.80
sss = StratifiedShuffleSplit(n_splits=2, test_size=unl_ratio, random_state=0)

# Two splits are generated but only the last one is kept.
train_index, test_index = list(sss.split(Xtrain, ytrain))[-1]
X_train, y_train = Xtrain[train_index], ytrain[train_index]
X_unl, y_unl = Xtrain[test_index], ytrain[test_index]

In [None]:
# Persist the freshly built arrays (full data + labelled/unlabelled partition)
# under the "20percent" configuration.
save_processed(
    "../GAN_BERT/dataset/splitedData/3c_BERT_processed_Data/20percent/Total",
    "../GAN_BERT/dataset/splitedData/3c_BERT_processed_Data/20percent/Split",
    split=True,
)

In [None]:
# Load the pickled arrays of the "1percent" configuration.
# NOTE(review): this overwrites whatever the earlier load / preprocessing cells
# produced -- confirm which configuration is meant to be trained on.
t, s = load_processed(
    "dataset/splitedData/3c_BERT_processed_Data/1percent/Total",
    "dataset/splitedData/3c_BERT_processed_Data/1percent/Split",
    split=True,
)
(Xtrain, ytrain, Xdev, ydev, Xtest, ytest) = t
(X_train, y_train, X_unl, y_unl) = s

## Define Model

In [None]:
# Destination of the checkpoint written by the callback below.
checkpoint_filepath = "./model/5percent_3c_CLS_aBERT_Best_EMRS_model_f1_Mini"

# Keep only the best full model (not just weights), where "best" means the
# highest value of the 'val_f1_score' metric.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor="val_f1_score",
    mode="max",
    save_best_only=True,
    save_weights_only=False,
)

In [None]:
# Model input: one row of max_seq_len int32 token ids per example.
l_input_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32')

# Create the BERT layer from the parameters stored with the pretrained model,
# overriding the adapter settings (adapter_size=4, adapter_init_scale=1e-5).
bert_params = bert.params_from_pretrained_ckpt(path_bert_model)
bert_params.adapter_size = 4
bert_params.adapter_init_scale = 1e-5
l_bert = bert.BertModelLayer.from_params(bert_params, name="bert")

embedded_sequences = l_bert(l_input_ids) # output: [batch_size, max_seq_len, hidden_size]
drop_out_1 = keras.layers.Dropout(0.3, name='drop_out_1')(embedded_sequences)

# Bidirectional LSTM over the BERT outputs. LSTM(250) is created without
# return_sequences, so with the Keras default it yields one vector per example.
biLSTM_1 = keras.layers.Bidirectional(keras.layers.LSTM(250))(drop_out_1)

# Classification head: dropout -> Dense(128) with LeakyReLU -> softmax.
# NOTE(review): the output width (3) is hard-coded here and again in the
# F1Score metric below; both must match the number of label columns.
drop_out_2 = keras.layers.Dropout(0.3, name='drop_out_2')(biLSTM_1)
mp_dense = keras.layers.Dense(128, activation=tf.keras.layers.LeakyReLU(), name='mp_dense')(drop_out_2)
preds = keras.layers.Dense(3, activation='softmax', name='preds')(mp_dense)

model = keras.Model(inputs=l_input_ids, outputs=preds)
model.build(input_shape=(None,max_seq_len))

# Called after build() and before compile(); presumably freezes the original
# BERT weights so that mainly the adapter weights are trained -- confirm
# against the bert-for-tf2 documentation.
l_bert.apply_adapter_freeze()

# Metrics: accuracy plus macro-averaged F1 over 3 classes. The checkpoint
# callback defined in this notebook monitors 'val_f1_score'.
model.compile(loss='categorical_crossentropy',
              optimizer='Adam',
              metrics=['accuracy',tfa.metrics.F1Score(num_classes=3, average='macro')])

#callbacks_list = [metrics]
# l_bert.apply_adapter_freeze()
# The pretrained weights are loaded into the BERT layer only here, i.e. after
# the model has been built and compiled.
bert_ckpt_file = os.path.join(path_bert_model, "bert_model.ckpt")
bert.load_stock_weights(l_bert, bert_ckpt_file)

## Train model

In [None]:
# Fit on the labelled subset and validate on the dev split after every epoch.
# NOTE(review): the checkpoint callback is left disabled (see the commented
# option), so this run writes nothing to checkpoint_filepath; the same holds
# for class weighting.
model.fit(
    X_train,
    y_train,
    validation_data=(Xdev, ydev),
    epochs=200,
    batch_size=32,
    shuffle=True,
    # class_weight=class_weight_dict,
    # callbacks=[model_checkpoint_callback],
)

## Evaluate model on test dataset

In [None]:
# Turn one-hot targets and predicted class probabilities into class indices,
# then print per-class precision / recall / F1 with 4 decimals.
y_test = np.argmax(ytest, axis=1)
result = np.argmax(model.predict(Xtest), axis=-1)
print(classification_report(y_test, result, digits=4))

## Load last saved model

In [None]:
# Reload the model stored at checkpoint_filepath. The F1 metric comes from
# tensorflow_addons, so it is supplied through custom_objects.
last_model = keras.models.load_model(
    checkpoint_filepath,
    custom_objects={
        'F1Score': tfa.metrics.F1Score(num_classes=3, average='macro'),
    },
)

In [None]:
# Score the reloaded model on the test set; y_test comes from the earlier
# evaluation cell.
probas = last_model.predict(Xtest)
result = np.argmax(probas, axis=-1)
print(classification_report(y_test, result, digits=4))