In [None]:
!pip install transformers text-hammer pyreadstat

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn.functional as F
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup , AutoTokenizer, TFAutoModel, TFRobertaModel
import numpy as np
import seaborn as sns
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score, f1_score
import matplotlib.pyplot as plt
import os
import io
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, KFold
from sklearn import svm
from collections import defaultdict
import text_hammer as th
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import transformers
import random as rd
import keras.backend as K
from numpy.random import seed
from tensorflow.keras import layers
from keras.utils import plot_model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn import metrics
from tensorflow.keras.models import load_model, Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import multilabel_confusion_matrix
import warnings
# Silence every Python warning for the rest of the session
# (note: this also hides deprecation notices raised by the libraries imported above).
warnings.filterwarnings("ignore")
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Pick the PyTorch compute device: CUDA when a GPU is visible, CPU otherwise,
# and record how many CUDA devices PyTorch can see.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
n_gpu = torch.cuda.device_count()

### Functions

In [None]:
#------------------------STEP 1 --------------------------------#

# metric1: fraction of tweets for which every label is predicted correctly (exact match)

def metric1(y_true, y_pred):
    """Exact-match ratio: share of rows whose rounded predictions equal all targets.

    Args:
        y_true: 2-D batch of target labels (one row per sample).
        y_pred: 2-D batch of predicted scores, same shape as ``y_true``.

    Returns:
        Scalar float32 tensor: (number of rows where every rounded prediction
        equals its target) / (number of rows).
    """
    batch_rows = K.cast(K.shape(y_true)[0], 'float32')
    # A row counts as a success only if all of its labels match after rounding.
    row_is_exact = K.all(K.equal(y_true, K.round(y_pred)), axis=1)
    return K.sum(K.cast(row_is_exact, 'float32')) / batch_rows

#------------------------STEP 2 --------------------------------#

# metric2: fraction of individual labels predicted correctly, averaged over the labels
# (equivalent to binary accuracy)

def metric2(y_true, y_pred):
    """Per-label accuracy averaged over the label columns.

    Args:
        y_true: 2-D batch of target labels (one row per sample).
        y_pred: 2-D batch of predicted scores, same shape as ``y_true``.

    Returns:
        Scalar tensor: for each column, 1 - (rows where the rounded prediction
        differs from the target by exactly 1) / (number of rows); then the mean
        of those per-column values.
    """
    batch_rows = K.cast(K.shape(y_true)[0], 'float32')
    abs_error = abs(y_true - K.round(y_pred))
    # Count, column by column, the entries whose absolute error is exactly 1.
    wrong_per_label = K.sum(K.cast(K.equal(abs_error, 1), 'float32'), axis=0)
    return K.mean(1 - wrong_per_label / batch_rows)

In [None]:
def ConfusionMatrix(y_true , y_pred):
    """Return the per-label confusion matrices divided by the number of samples.

    Args:
        y_true: array of true labels; must expose ``.shape`` (rows = samples).
        y_pred: array of predicted labels, same layout as ``y_true``.

    Returns:
        The output of ``multilabel_confusion_matrix`` with every count divided
        by ``y_true.shape[0]``.
    """
    raw_counts = multilabel_confusion_matrix(y_true , y_pred)
    sample_count = y_true.shape[0]
    return raw_counts / sample_count

In [None]:
def metric2_per_labels(M):
    """Share of correct predictions for each label column.

    Args:
        M: 2-D matrix (rows = samples, columns = labels) holding the absolute
            difference between the true value and the prediction, i.e. 1 where
            the prediction is wrong and 0 where it is right.

    Returns:
        One value per column: 1 - (column sum / number of rows), rounded to
        3 decimals. A matrix with zero columns yields an empty result instead
        of raising.
    """
    # A single vectorised pass is enough: the column sums do not depend on any
    # loop index, so the former per-column loop only repeated the same work
    # (and left the result undefined when there were no columns).
    errors_per_label = np.sum(M, axis=0)
    return np.round(1 - errors_per_label / M.shape[0], 3)

In [None]:
def get_clean(x):
    """Normalise a raw tweet into lowercase ASCII text with noise tokens removed.

    The value is converted with ``str``, lowercased, stripped of backslashes and
    underscores, and non-ASCII runs are replaced by a space. It then goes through
    the ``text_hammer`` helpers (presumably: contraction expansion, e-mail / URL /
    HTML-tag / "RT" / accent / special-character removal — behaviour defined by
    that library, not visible here). ``@mentions`` and ``#hashtags`` are dropped,
    any character repeated three or more times is collapsed to one occurrence,
    and every token containing a digit is removed.

    Args:
        x: the raw text (any object; it is passed through ``str``).

    Returns:
        The cleaned string, with whitespace collapsed to single spaces and no
        leading/trailing whitespace.
    """
    mention_pattern = r'@\w+'
    hashtag_pattern = r'#\w+'  # named so it no longer shadows the builtin `hash`
    x = str(x).lower().replace('\\', '').replace('_', '')
    x = re.sub(r'[^\x00-\x7F]+', ' ', x)
    x = th.cont_exp(x)
    x = th.remove_emails(x)
    x = th.remove_urls(x)
    x = re.sub(mention_pattern, ' ', x)
    x = re.sub(hashtag_pattern, ' ', x)
    x = th.remove_html_tags(x)
    x = th.remove_rt(x)
    x = th.remove_accented_chars(x)
    x = th.remove_special_chars(x)
    x = re.sub("(.)\\1{2,}", "\\1", x)
    # Drop digit-bearing tokens BEFORE normalising whitespace: each removed token
    # is replaced by a space, so doing it afterwards left runs of spaces
    # (e.g. "a 123 b" -> "a   b").
    x = re.sub(r'\w*\d+\w*', ' ', x)
    x = re.sub(r'\s+', ' ', x).strip()
    return x

### Data

In [None]:
# Read the labelled tweets from the SPSS file.
# NOTE(review): the path points into a mounted Google Drive; the mount itself is not
# performed in this notebook — confirm it is done before running.
df = pd.read_spss("/content/drive/MyDrive/VA_EN_TU_2012-2020_3000_tweets_relevant_V03_labeled_1200_cleaned.sav")
# Work on an independent copy holding only the tweet text and the three label columns.
data = df.loc[:, ['text', 'Label_A2_negative', 'Label_A3_neutral','Label_A1_positive']].copy()
data.head()

In [None]:
# Add a cleaned version of every tweet next to the raw text.
# NOTE(review): the tokenisation cell further down iterates over data['text'], not
# data['cleaned_text'], so this column is not fed to the model — confirm that is intended.
data['cleaned_text'] = data['text'].apply(get_clean)

In [None]:
## Individual Label Distribution
labels = ['Label_A2_negative', 'Label_A3_neutral','Label_A1_positive']
label_sums = data[labels].sum(axis=0)  # per-label column sums

plt.figure(figsize=(10,6))
plt.bar(labels, label_sums)
plt.ylabel('Number of tweets')
plt.title('Individual Label Distribution')
plt.show()

## Label Combinations Distribution
df_train_labels = data[labels].copy()
# One row per distinct (negative, neutral, positive) combination with its frequency.
combinations = df_train_labels.groupby(labels).size().reset_index().rename(columns={0:'count'})
combinations = combinations.sort_values(by='count', ascending=False) # by count to get most frequent combinations
combinations['Label Combination'] = combinations[labels].astype(int).astype(str).agg(','.join, axis=1)

# DataFrame.plot opens its own figure unless it is handed an Axes, so the former
# plt.figure(figsize=(12,8)) call produced an empty figure and the bar chart was drawn
# at the default size. Create the Axes explicitly and pass it in.
fig, ax = plt.subplots(figsize=(12,8))
combinations.plot(x='Label Combination', y='count', kind='bar', legend=False, ax=ax)
ax.set_title('Label Combinations Distribution')
ax.set_ylabel('Number of tweets')
plt.xticks(rotation=45)
plt.show()


### RoBERTa model + 2 layers

In [None]:
# Constants: every tunable value used by the RoBERTa pipeline below.

nb_samples = len(data) # Number of tweets (rows of `data`)
proportion_training = 0.80 # 80/20 ratio : 80% of the data is used for k-fold training/validation, 20% is held out as test data
pretrained_model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest" # Hugging Face checkpoint name used for both the tokenizer and the model
seq_len = 128 # Token sequence length: tokenize() pads shorter inputs and truncates longer ones to this size
batch_size = 80 # Batch size passed to model.fit
learning_rate = 0.001 # Learning rate of the Adam optimizer
nb_epoch = 60 # Maximum number of epochs per fold (early stopping may end training sooner)
nb_fold = 3 # Number of K-fold splits; keep it odd, the test-time median over the folds' rounded predictions relies on it
earlyStopPatience = 10 # EarlyStopping patience: epochs without val_metric2 improvement before stopping

In [None]:
def tokenize(sentence):
    """Encode one sentence with the module-level ``tokenizer``.

    The text is truncated / padded to ``seq_len`` tokens, special tokens are
    added and the result is requested as TensorFlow tensors.

    Args:
        sentence: the text to encode.

    Returns:
        Tuple ``(input_ids, attention_mask)`` taken from the tokenizer output.
    """
    encoded = tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        max_length=seq_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='tf',
    )
    ids = encoded['input_ids']
    mask = encoded['attention_mask']
    return ids, mask

In [None]:
# Load the tokenizer that matches the pretrained checkpoint; tokenize() reads this global.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)

In [None]:
# Tokenise every tweet: one row of token ids and one row of attention mask per tweet.
# NOTE(review): the raw data['text'] column is encoded here, not data['cleaned_text'].
X_ids, X_mask = (np.zeros((len(data), seq_len)) for _ in range(2))

for i, sentence in enumerate(data['text']):
    ids_row, mask_row = tokenize(sentence)
    X_ids[i, :] = ids_row
    X_mask[i, :] = mask_row

In [None]:
# Label matrix with columns in the order negative, neutral, positive.
# (This rebinds `labels`, which earlier held the list of column names.)
labels = data[['Label_A2_negative', 'Label_A3_neutral','Label_A1_positive']].values
sep = round(proportion_training * len(data))  # index of the first test row

# Sequential split, no shuffling: rows [0, sep) are the training data used for the
# k-fold procedure, rows [sep, end) are the held-out test data.
X_ids_train, X_ids_test = X_ids[:sep], X_ids[sep:]
X_mask_train, X_mask_test = X_mask[:sep], X_mask[sep:]
Y_train, Y_test = labels[:sep], labels[sep:]

In [None]:
# Loading pretrained model (TensorFlow weights) once; get_model() reuses this single
# instance for every model it builds.
pretrained_model = TFAutoModel.from_pretrained(pretrained_model_name)


In [None]:
def get_model():
    """Build and compile the classifier: frozen pretrained encoder + dense head.

    Two int32 inputs of length ``seq_len`` (token ids and attention mask) are fed
    to the module-level ``pretrained_model``; element ``[1]`` of its output goes
    through two 128-unit ReLU layers, a dropout of 0.35 and a 3-unit sigmoid
    output layer (one independent probability per label).

    Returns:
        A compiled ``keras.Model`` using Adam(``learning_rate``), binary
        cross-entropy loss and the ``metric1`` / ``metric2`` metrics.
    """
    input_text_ids = keras.Input(shape=(seq_len,), dtype='int32', name='input_text_ids')
    text_mask = keras.Input(shape=(seq_len,), dtype='int32', name='attention_mask_text')

    # Freeze the pretrained encoder by reference. The previous code froze
    # `model.layers[2]`, which only works as long as the encoder happens to sit
    # at that position in the layer list.
    pretrained_model.trainable = False

    # NOTE(review): index [1] is presumably the pooled output of the encoder. If the
    # checkpoint ships without pooler weights they would be randomly initialised and
    # then frozen — check the warnings printed when the model is loaded.
    embeddings = pretrained_model(input_text_ids, attention_mask=text_mask)[1]

    x = layers.Dense(128, activation='relu')(embeddings)
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dropout(0.35)(x)

    predictions = layers.Dense(3, activation="sigmoid", name="predictions")(x)

    model = keras.Model(inputs=[input_text_ids,text_mask], outputs=predictions)

    model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
                loss="binary_crossentropy",
                metrics=[metric1,metric2])
    return model

In [None]:
# Build one model just to inspect its architecture; the k-fold loop below builds a
# fresh model for every fold.
model = get_model()
model.summary()

In [None]:
def fit_the_model(model, X_tr, y_tr, X_val, y_val, num):
    """Train ``model`` with early stopping and best-checkpoint saving.

    Args:
        model: compiled Keras model to train.
        X_tr, y_tr: training inputs and targets.
        X_val, y_val: validation inputs and targets.
        num: identifier inserted in the checkpoint file name
            (``MODEL/best_model<num>.hdf5``).

    Returns:
        The same ``model`` object after training (not the History object;
        the training history remains reachable through ``model.history``).
    """
    checkpoint_path = "MODEL/best_model" + str(num) + ".hdf5"
    # Both callbacks watch the validation value of metric2 and treat higher as better.
    stop_early = EarlyStopping(monitor="val_metric2", patience=earlyStopPatience, mode="max")
    keep_best = ModelCheckpoint(filepath=checkpoint_path, monitor="val_metric2", mode='max',
                                save_best_only=True, verbose=1)
    model.fit(
        X_tr, y_tr,
        validation_data=(X_val, y_val),
        epochs=nb_epoch,
        batch_size=batch_size,
        callbacks=[stop_early, keep_best],
        verbose=1,
    )
    return model

In [None]:
list_of_models = [] # One fitted model per fold
list_history = [] # One dict of per-epoch losses/metrics per fold

kfold = KFold(n_splits=nb_fold, shuffle=True, random_state=42)

# K-fold training: a fresh model is built and trained for every split.
k = 1
for train_index, val_index in kfold.split(X_ids_train,Y_train):
    print(f'\nTraining model {k}...')
    model = get_model() # Model construction
    train_inputs = [X_ids_train[train_index], X_mask_train[train_index]]
    val_inputs = [X_ids_train[val_index], X_mask_train[val_index]]
    # fit_the_model returns the fitted model itself, so despite its name `history`
    # is a model and the per-epoch values live in `history.history.history`.
    history = fit_the_model(model,
                            train_inputs, Y_train[train_index],
                            val_inputs, Y_train[val_index],
                            k) # Model training
    list_of_models.append(history)
    list_history.append(history.history.history)
    k += 1

print("---Finished---")

In [None]:
# For every fold, plot training vs validation curves for the loss, metric 1 and metric 2.
txtTitres = ["Training and validation loss","Training and validation metric 1","Training and validation metric 2"]
txtYLabel = ["Loss","Metric 1 ","Metric 2"]

for i in range(nb_fold):
    print(50*'-',f"MODEL PERFORMANCE {i+1}",50*'-')

    fold_history = list_history[i]
    L1 = [fold_history['loss'], fold_history['metric1'], fold_history['metric2']]
    L2 = [fold_history['val_loss'], fold_history['val_metric1'], fold_history['val_metric2']]

    # Early stopping can end training before nb_epoch; size the x axis on the
    # number of epochs actually recorded to avoid dimension errors when plotting.
    nb_epoch_plot = min(len(L1[0]), nb_epoch)
    Epochs = range(1, nb_epoch_plot+1)

    plt.figure(figsize=(16,4))
    for k, (train_curve, val_curve) in enumerate(zip(L1, L2)):
        plt.subplot(1,3,k+1)
        plt.plot(Epochs, train_curve, color='b', marker='o', label="training")
        plt.plot(Epochs, val_curve, color='r', marker='o', label="validation")
        plt.grid()
        plt.title(txtTitres[k])
        plt.xlabel("Epochs")
        plt.ylabel(txtYLabel[k])
        plt.legend()
    plt.show()

In [None]:
# Reload the best checkpoint saved for each fold; the final model combines all of them.
# NOTE(review): "TFRobertaModel" is mapped to the loaded model instance, not to a class —
# confirm this is what load_model expects.
custom_objects = {"TFRobertaModel": pretrained_model,
                  "metric1": metric1,
                  "metric2":metric2}
L_best_models = []
for i in range(nb_fold):
    path = "MODEL/best_model" + str(i+1) + ".hdf5"
    best_model = keras.models.load_model(path, custom_objects=custom_objects, compile=False)
    L_best_models.append(best_model)

In [None]:
# Score each fold's best model on that fold's own validation split. The results are
# collected per fold so they can be averaged afterwards.
L_predictions, L_cm, L_f1 = [], [], []
L_m1, L_m2, L_m3 = [], [], []

# KFold is rebuilt with the same n_splits / shuffle / random_state as in the training
# cell so that the same validation indices are produced again.
kfold = KFold(n_splits=nb_fold, shuffle=True, random_state=42)

k = 0
for train_index, val_index in kfold.split(X_ids_train,Y_train):
    y_val = Y_train[val_index]
    val_inputs = [X_ids_train[val_index], X_mask_train[val_index]]

    # Rounded (0/1) predictions of model k on its validation data.
    prediction = np.round(L_best_models[k].predict(val_inputs))

    L_predictions.append(prediction)
    L_cm.append(ConfusionMatrix(y_val , prediction))
    L_f1.append(f1_score(y_val , prediction, average=None))
    L_m1.append(metric1(y_val, prediction))
    L_m2.append(metric2(y_val, prediction))
    # Absolute error matrix -> per-label share of correct predictions.
    L_m3.append(metric2_per_labels(np.abs(prediction-y_val)))
    k += 1

In [None]:
# Report the validation scores averaged over the folds.
print(40*'-',f"SUMMARY OF MODEL PERFORMANCE ON VALIDATION DATA",20*'-',"\n")

# Fold-wise means of the three metrics.
m1, m2, m3 = (np.mean(fold_scores, axis=0) for fold_scores in (L_m1, L_m2, L_m3))

print("metric1 validation mean :", m1, "\n")
print("metric2 validation mean :", m2, "\n")
print("metric2 validation mean per labels :", m3, "\n")

# Per-label F1 averaged over the folds.
print("Average F1 score on negative/neutral/positive: ", np.mean(L_f1,axis=0))

# Element-wise mean of the folds' normalised confusion matrices, one heatmap per label.
avg_cm = np.mean(L_cm,axis=0)

L_titles = [ 'Confusion Matrix for Negative label', 'Confusion Matrix for Neutral label','Confusion Matrix for Positive label']
L_labels = [ 'Not Negative', 'Negative', 'Not Neutral', 'Neutral','Not Positive', 'Positive']

for i, title in enumerate(L_titles):
    tick_labels = L_labels[2*i:2*i+2]  # the (absent, present) pair belonging to label i
    plt.figure(figsize=(3,3))
    ax = sns.heatmap(avg_cm[i], annot=True, fmt='.2%' , cmap="Blues")
    ax.set_xlabel('\n Predicted Values')
    ax.set_ylabel('Actual Values ')
    ax.set_title(title)
    ax.xaxis.set_ticklabels(tick_labels)
    ax.yaxis.set_ticklabels(tick_labels)
    plt.show()

In [None]:
# Reload the best checkpoint of every fold before the test-set evaluation.
# (Same loading logic as the earlier "best model" cell; running it again simply
# rebuilds L_best_models from the files on disk.)
fold_numbers = range(1, nb_fold + 1)
L_best_models = []
for fold_number in fold_numbers:
    path = "MODEL/best_model" + str(fold_number) + ".hdf5"
    L_best_models.append(
        keras.models.load_model(
            path,
            custom_objects={"TFRobertaModel": pretrained_model,
                            "metric1": metric1,
                            "metric2":metric2},
            compile=False,
        )
    )

In [None]:
# Final model on the test data: every fold model votes with its rounded prediction and
# the element-wise median is kept. With an odd nb_fold the median of 0/1 votes is
# itself 0 or 1.
test_inputs = [X_ids_test, X_mask_test]
L_pred = []
for best_model in L_best_models:
    L_pred.append(np.round(best_model.predict(test_inputs)))
prediction = np.median(L_pred, axis=0)

# Scores of the combined prediction against the held-out labels.
cm = ConfusionMatrix(Y_test , prediction)
f1 = f1_score(Y_test , prediction, average=None)
m1 = float(metric1(Y_test, prediction))
m2 = float(metric2(Y_test, prediction))
diff = np.abs(prediction-Y_test)  # 1 where the prediction is wrong, 0 where it is right
m3 = metric2_per_labels(diff)

In [None]:
# Summary of the model's performances on testing data
print(40*'-',f"SUMMARY OF MODEL PERFORMANCE ON TEST DATA",20*'-',"\n")

# metrics
print("metric1 Test mean :", m1, "\n")
print("metric2 Test mean :", m2, "\n")
print("metric2 Test mean per labels :", m3, "\n")

# F1 score, one value per label. The label matrix columns are ordered
# negative, neutral, positive, so the printed order must say the same
# (it previously read "positive/negative/neutral").
print("F1 score on negative/neutral/positive: ", f1)

# Confusion matrix: one normalised 2x2 heatmap per label.
L_titles = [ 'Confusion Matrix for Negative label', 'Confusion Matrix for Neutral label','Confusion Matrix for Positive label']
L_labels = [ 'Not Negative', 'Negative', 'Not Neutral', 'Neutral','Not Positive', 'Positive']

k = 0  # offset of the (absent, present) tick-label pair for the current label
for i in range(3):
    plt.figure(figsize=(3,3))
    ax = sns.heatmap(cm[i], annot=True, fmt='.2%' , cmap="Blues")
    ax.set_xlabel('\n Predicted Values')
    ax.set_ylabel('Actual Values ')
    ax.set_title(L_titles[i])
    ax.xaxis.set_ticklabels(L_labels[k:k+2])
    ax.yaxis.set_ticklabels(L_labels[k:k+2])
    k += 2
    plt.show()

In [None]:
# Hamming loss of the combined test prediction (fraction of individual labels predicted wrongly).
# NOTE(review): this import sits mid-notebook; the other imports are grouped in the first code cell.
from sklearn.metrics import hamming_loss
print(f"Hamming loss : {hamming_loss(Y_test, prediction)}")

In [None]:
# Per-label precision / recall / F1 report on the test set; the names follow the
# column order of the label matrix (negative, neutral, positive).
label_names = ['Negative', 'Neutral','Positive']
print(classification_report(Y_test, prediction, target_names=label_names))