In [None]:
!pip install transformers text-hammer pyreadstat

In [None]:
# Standard library
import io
import os
import random as rd
import re
import warnings
from collections import defaultdict

# Third-party: numerics, data handling and plotting
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from numpy.random import seed

# Third-party: scikit-learn
from sklearn import metrics
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score, f1_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.svm import SVC

# Third-party: PyTorch / Hugging Face Transformers
import torch
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import transformers
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup , AutoTokenizer, TFAutoModel

# Third-party: TensorFlow / Keras
import tensorflow as tf , keras
import keras.backend as K
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.utils import plot_model
from tensorflow.keras import layers
from tensorflow.keras.models import load_model
# Names used by the LSTM cells below (tokenization, padding and model layers);
# without these imports those cells raise NameError on a fresh kernel.
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Third-party: text cleaning helpers
import text_hammer as th

warnings.filterwarnings("ignore")
%matplotlib inline

In [None]:
# Count the CUDA devices PyTorch can see, then pick the compute device:
# the GPU when CUDA is available, otherwise fall back to the CPU.
n_gpu = torch.cuda.device_count()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
def compute_validation_loss(model, validation_dataloader):
    """Return the average per-batch loss of ``model`` over a validation loader.

    The model is put into evaluation mode (and is NOT switched back to
    training mode afterwards) and gradients are disabled while the batches
    are processed. Every batch is unpacked as
    ``(input_ids, attention_mask, labels)``, moved to the module-level
    ``device`` and fed to the model; the ``loss`` attribute of the model
    output is accumulated.

    Args:
        model: Callable model providing ``eval()``; it is called as
            ``model(input_ids, attention_mask=..., labels=...)`` and its
            output must expose a scalar ``loss`` tensor.
        validation_dataloader: Sized iterable yielding 3-tuples of tensors.

    Returns:
        float: Sum of the batch losses divided by the number of batches.
        This is an unweighted mean of batch losses, so a smaller final
        batch counts as much as a full one.

    Raises:
        ValueError: If ``validation_dataloader`` contains no batches
            (previously this surfaced as a bare ``ZeroDivisionError``).
    """
    num_batches = len(validation_dataloader)
    if num_batches == 0:
        raise ValueError("validation_dataloader is empty; cannot compute an average loss")

    model.eval()
    total_val_loss = 0.0
    with torch.no_grad():
        for input_ids, attention_mask, labels in validation_dataloader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            # Passing labels makes the model compute the loss itself.
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            total_val_loss += outputs.loss.item()

    return total_val_loss / num_batches

def metric2(y_true, y_pred_classes):
    """Return 1 minus the share of predictions that miss by exactly one class.

    Only errors with ``|y_true - y_pred_classes| == 1`` are counted; exact
    hits and errors of two or more classes do not lower the score.

    Args:
        y_true: Tensor of true class indices; its first dimension gives the
            sample count used as the denominator.
        y_pred_classes: Tensor of predicted class indices, broadcastable
            against ``y_true``.

    Returns:
        float: ``1 - (#off-by-one errors) / n``. For an empty ``y_true`` the
        division yields NaN.
    """
    def _signed(t):
        # Unsigned tensors wrap around on subtraction (0 - 1 -> 255) and bool
        # tensors cannot be subtracted at all, so promote every
        # non-floating/non-complex dtype to int64 before taking differences.
        if torch.is_floating_point(t) or torch.is_complex(t):
            return t
        return t.to(torch.int64)

    y_true = _signed(y_true)
    y_pred_classes = _signed(y_pred_classes)

    n = y_true.size(0)
    # Calculate errors where prediction is off by 1 class
    res = torch.abs(y_true - y_pred_classes)
    count_error = torch.sum(res == 1, dtype=torch.float32)
    metric = 1 - count_error / n
    return metric.item()

def metric2_2(y_true, y_pred):
    """NumPy version of ``metric2``: 1 minus the share of off-by-one errors.

    Only predictions with ``|y_true - y_pred| == 1`` are counted as errors;
    exact hits and errors of two or more classes do not lower the score.

    Args:
        y_true: Array-like of true class indices.
        y_pred: Array-like of predicted class indices, broadcastable against
            ``y_true``.

    Returns:
        ``1 - (#off-by-one errors) / len(y_true)`` as a NumPy float.
    """
    # Convert to numpy arrays for easier manipulation
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Unsigned arrays wrap around on subtraction (0 - 1 -> 255 for uint8) and
    # bool arrays cannot be subtracted, so promote both kinds to int64 first.
    if y_true.dtype.kind in "ub":
        y_true = y_true.astype(np.int64)
    if y_pred.dtype.kind in "ub":
        y_pred = y_pred.astype(np.int64)
    # Calculate the number of predictions off by 1 class
    off_by_one = np.sum(np.abs(y_true - y_pred) == 1)
    # Calculate the metric
    return 1 - off_by_one / len(y_true)


In [None]:
def get_clean(x):
    """Normalize one tweet-like text for tokenization.

    Steps, in order: stringify and lowercase; drop backslashes and
    underscores; replace non-ASCII runs with a space; run the ``text_hammer``
    helpers (contraction expansion, e-mail/URL removal, HTML-tag, RT,
    accent and special-character removal); blank out @mentions and
    #hashtags; squeeze any character repeated three or more times down to
    one; blank out every token containing a digit; finally collapse all
    whitespace runs to a single space and trim.

    Args:
        x: Any value; it is converted with ``str()`` first.

    Returns:
        str: The cleaned text with single spaces between tokens.
    """
    mention_pattern = r'@\w+'
    hashtag_pattern = r'#\w+'  # renamed so the builtin `hash` is not shadowed
    x = str(x).lower().replace('\\', '').replace('_', '')
    # NOTE(review): non-ASCII characters are already replaced by spaces here,
    # so th.remove_accented_chars below presumably has nothing left to
    # transliterate - confirm whether accents should be folded first.
    x = re.sub(r'[^\x00-\x7F]+', ' ', x)
    x = th.cont_exp(x)
    x = th.remove_emails(x)
    x = th.remove_urls(x)
    x = re.sub(mention_pattern, ' ', x)
    x = re.sub(hashtag_pattern, ' ', x)
    x = th.remove_html_tags(x)
    x = th.remove_rt(x)
    x = th.remove_accented_chars(x)
    x = th.remove_special_chars(x)
    x = re.sub("(.)\\1{2,}", "\\1", x)
    # Remove digit-bearing tokens BEFORE normalizing whitespace: each removal
    # leaves a space behind, and doing it afterwards left runs of spaces in
    # the returned text.
    x = re.sub(r'\w*\d+\w*', ' ', x)
    x = re.sub(r'\s+', ' ', x).strip()
    return x

In [None]:
# Load the labeled tweets from the SPSS export.
df = pd.read_spss("/content/drive/MyDrive/VA_EN_TU_2012-2020_3000_tweets_relevant_V03_labeled_1200_cleaned.sav")

# Recode the sentiment labels to the scheme used below:
#   0 - negative, 1 - neutral, 2 - positive
# The mapping keys are 1, 2 and 3, which implies source codes
# 1 - positive, 2 - negative, 3 - neutral.
# NOTE(review): the previous comment listed the source codes as 0/1/2, which
# does not match these keys - confirm against the SPSS value labels.
label_mapping = {1: 2, 2: 0, 3: 1}
df['Label_B_emotion'] = df['Label_B_emotion'].replace(label_mapping).astype(int)

# Work on an independent copy holding only the text and the recoded label.
dff = df.loc[:, ['text', 'Label_B_emotion']].copy()
dff['Label_B_emotion'].unique()

In [None]:
# Add a 'cleaned_data' column by running get_clean on every entry of 'text';
# the raw 'text' column itself is left untouched.
dff['cleaned_data'] = dff['text'].apply(get_clean)

In [None]:
# Features are the raw (uncleaned) tweet texts; targets are the recoded labels.
X = dff['text'].tolist()
y = dff['Label_B_emotion'].tolist()

y_categorical = tf.keras.utils.to_categorical(y, num_classes=3)

# Fix the train/test partition on row positions first, so that the tokenizer
# vocabulary can be learned from the training texts only. Fitting it on all
# texts lets test-set word statistics leak into preprocessing. With the same
# test_size and random_state this selects the same rows as splitting the
# padded matrix directly.
train_idx, test_idx = train_test_split(np.arange(len(X)), test_size=0.2, random_state=42)

# Tokenization and padding
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts([X[i] for i in train_idx])
X_seq = tokenizer.texts_to_sequences(X)
X_pad = pad_sequences(X_seq, maxlen=70)

X_train, X_test = X_pad[train_idx], X_pad[test_idx]
y_train, y_test = y_categorical[train_idx], y_categorical[test_idx]

In [None]:
# 5-fold cross-validation on the training split: one LSTM is trained and saved
# per fold so the saved models can later be averaged as an ensemble.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
fold_no = 1
fold_scores = []  # one [loss, accuracy] pair per fold, in fold order
for train_index, val_index in kfold.split(X_train):
    # Keras keeps global state for every model built; when models are created
    # in a loop that state keeps growing, so reset it before each new model.
    tf.keras.backend.clear_session()

    X_kf_train, X_kf_val = X_train[train_index], X_train[val_index]
    y_kf_train, y_kf_val = y_train[train_index], y_train[val_index]

    # Build the LSTM model
    model = Sequential()
    model.add(Embedding(input_dim=5000, output_dim=128, input_length=70))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(3, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    print(f'Training for fold {fold_no} ...')
    model.fit(X_kf_train, y_kf_train, batch_size=64, epochs=5, validation_data=(X_kf_val, y_kf_val), verbose=1)

    # evaluate() returns [loss, accuracy] for the compile() settings above.
    scores = model.evaluate(X_kf_val, y_kf_val, verbose=0)
    fold_scores.append(scores)
    print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
    model.save(f'/content/LSTM/model_fold_{fold_no}.h5')
    fold_no += 1

# Summarize cross-validation: mean loss and accuracy over all folds.
mean_loss, mean_accuracy = np.mean(fold_scores, axis=0)
print(f'Average over {len(fold_scores)} folds: loss of {mean_loss}; accuracy of {mean_accuracy*100}%')


In [None]:
def ensemble_predictions(num_folds, X_test, model_dir='/content/LSTM'):
    """Average the class probabilities of the saved fold models and pick a class.

    Loads ``{model_dir}/model_fold_{k}.h5`` for ``k = 1 .. num_folds``, calls
    ``predict`` on each with ``X_test``, averages the predictions across
    models and returns the arg-max class per sample.

    Args:
        num_folds: Number of saved fold models to load (must be >= 1).
        X_test: Model input passed unchanged to every model's ``predict``.
        model_dir: Directory holding the ``model_fold_{k}.h5`` files.
            Defaults to the directory the raw-text training cell saves to.

    Returns:
        numpy.ndarray: Predicted class index for each row of ``X_test``.

    Raises:
        ValueError: If ``num_folds`` is smaller than 1.
    """
    if num_folds < 1:
        raise ValueError("num_folds must be at least 1")

    predictions = []
    for fold in range(1, num_folds + 1):
        model = load_model(f'{model_dir}/model_fold_{fold}.h5')
        predictions.append(model.predict(X_test))

    # Mean over the model axis, then the most probable class per sample.
    avg_pred = np.mean(predictions, axis=0)
    return np.argmax(avg_pred, axis=1)


In [None]:
# Predict the held-out test set with the average of the 5 fold models.
ensemble_pred_classes = ensemble_predictions(5, X_test)

# y_test is one-hot encoded; recover the integer class per row.
y_test_true_classes = np.argmax(y_test, axis=1)
ensemble_accuracy = accuracy_score(y_test_true_classes, ensemble_pred_classes)
print(f"Ensemble Accuracy: {ensemble_accuracy}")

ensemble_metric2_score = metric2_2(y_test_true_classes, ensemble_pred_classes)
print(f'Ensemble Metric2: {ensemble_metric2_score}')

# Pin the label set so the matrix is always 3x3 with rows/columns in class
# order 0, 1, 2, even if a class is absent from both the test labels and the
# predictions.
ensemble_conf_matrix = confusion_matrix(y_test_true_classes, ensemble_pred_classes, labels=[0, 1, 2])
print(f'Ensemble Confusion Matrix:\n{ensemble_conf_matrix}')

In [None]:
# Pass `labels` explicitly: with only `target_names`, the report raises when a
# class is missing from both the true and the predicted labels.
print("Classification Report: \n", classification_report(y_test_true_classes, ensemble_pred_classes, labels=[0, 1, 2], target_names=['Negative','Neutral','Positive']))

#### Cleaned Text

In [None]:
# Features are the cleaned tweet texts; targets are the recoded labels.
X = dff['cleaned_data'].tolist()
y = dff['Label_B_emotion'].tolist()

y_categorical = tf.keras.utils.to_categorical(y, num_classes=3)

# Fix the train/test partition on row positions first, so that the tokenizer
# vocabulary can be learned from the training texts only. Fitting it on all
# texts lets test-set word statistics leak into preprocessing. With the same
# test_size and random_state this selects the same rows as splitting the
# padded matrix directly.
train_idx, test_idx = train_test_split(np.arange(len(X)), test_size=0.2, random_state=42)

# Tokenization and padding
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts([X[i] for i in train_idx])
X_seq = tokenizer.texts_to_sequences(X)
X_pad = pad_sequences(X_seq, maxlen=70)

X_train, X_test = X_pad[train_idx], X_pad[test_idx]
y_train, y_test = y_categorical[train_idx], y_categorical[test_idx]

In [None]:
# 5-fold cross-validation on the cleaned-text training split: one LSTM is
# trained and saved per fold so the saved models can later be averaged.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
fold_no = 1
fold_scores = []  # one [loss, accuracy] pair per fold, in fold order
for train_index, val_index in kfold.split(X_train):
    # Keras keeps global state for every model built; when models are created
    # in a loop that state keeps growing, so reset it before each new model.
    tf.keras.backend.clear_session()

    X_kf_train, X_kf_val = X_train[train_index], X_train[val_index]
    y_kf_train, y_kf_val = y_train[train_index], y_train[val_index]

    # Build the LSTM model
    model = Sequential()
    model.add(Embedding(input_dim=5000, output_dim=128, input_length=70))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(3, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    print(f'Training for fold {fold_no} ...')
    model.fit(X_kf_train, y_kf_train, batch_size=64, epochs=5, validation_data=(X_kf_val, y_kf_val), verbose=1)

    # evaluate() returns [loss, accuracy] for the compile() settings above.
    scores = model.evaluate(X_kf_val, y_kf_val, verbose=0)
    fold_scores.append(scores)
    print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
    model.save(f'/content/LSTM_clean/model_fold_{fold_no}.h5')
    fold_no += 1

# Summarize cross-validation: mean loss and accuracy over all folds.
mean_loss, mean_accuracy = np.mean(fold_scores, axis=0)
print(f'Average over {len(fold_scores)} folds: loss of {mean_loss}; accuracy of {mean_accuracy*100}%')


In [None]:
def ensemble_predictions(num_folds, X_test, model_dir='/content/LSTM_clean'):
    """Average the class probabilities of the saved fold models and pick a class.

    Loads ``{model_dir}/model_fold_{k}.h5`` for ``k = 1 .. num_folds``, calls
    ``predict`` on each with ``X_test``, averages the predictions across
    models and returns the arg-max class per sample.

    NOTE: this redefines the ``ensemble_predictions`` from the earlier
    raw-text cell; from here on the default directory is the one holding the
    cleaned-text models.

    Args:
        num_folds: Number of saved fold models to load (must be >= 1).
        X_test: Model input passed unchanged to every model's ``predict``.
        model_dir: Directory holding the ``model_fold_{k}.h5`` files.
            Defaults to the directory the cleaned-text training cell saves to.

    Returns:
        numpy.ndarray: Predicted class index for each row of ``X_test``.

    Raises:
        ValueError: If ``num_folds`` is smaller than 1.
    """
    if num_folds < 1:
        raise ValueError("num_folds must be at least 1")

    predictions = []
    for fold in range(1, num_folds + 1):
        model = load_model(f'{model_dir}/model_fold_{fold}.h5')
        predictions.append(model.predict(X_test))

    # Mean over the model axis, then the most probable class per sample.
    avg_pred = np.mean(predictions, axis=0)
    return np.argmax(avg_pred, axis=1)


In [None]:
# Predict the held-out test set with the average of the 5 fold models.
ensemble_pred_classes = ensemble_predictions(5, X_test)

# y_test is one-hot encoded; recover the integer class per row.
y_test_true_classes = np.argmax(y_test, axis=1)
ensemble_accuracy = accuracy_score(y_test_true_classes, ensemble_pred_classes)
print(f"Ensemble Accuracy: {ensemble_accuracy}")

ensemble_metric2_score = metric2_2(y_test_true_classes, ensemble_pred_classes)
print(f'Ensemble Metric2: {ensemble_metric2_score}')

# Pin the label set so the matrix is always 3x3 with rows/columns in class
# order 0, 1, 2, even if a class is absent from both the test labels and the
# predictions.
ensemble_conf_matrix = confusion_matrix(y_test_true_classes, ensemble_pred_classes, labels=[0, 1, 2])
print(f'Ensemble Confusion Matrix:\n{ensemble_conf_matrix}')

In [None]:
# Pass `labels` explicitly: with only `target_names`, the report raises when a
# class is missing from both the true and the predicted labels.
print("Classification Report: \n", classification_report(y_test_true_classes, ensemble_pred_classes, labels=[0, 1, 2], target_names=['Negative','Neutral','Positive']))