In [None]:
!pip install transformers text-hammer pyreadstat

In [None]:
import os
import io
import re
import random as rd
import warnings
from collections import defaultdict

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn.functional as F
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup , AutoTokenizer, TFAutoModel, TFRobertaModel
import numpy as np
import seaborn as sns
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score, f1_score
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, KFold
from sklearn import svm
import text_hammer as th
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import transformers
import keras.backend as K
from numpy.random import seed
from tensorflow.keras import layers
from keras.utils import plot_model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn import metrics
from tensorflow.keras.models import load_model, Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import hamming_loss
warnings.filterwarnings("ignore")
%matplotlib inline

In [None]:
# Count the CUDA devices PyTorch can see, then select the GPU when CUDA is
# available and fall back to the CPU otherwise.
n_gpu = torch.cuda.device_count()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

### Functions

In [None]:
def metric2_tf(y_true, y_pred):
    """Return 1 minus the share of entries whose rounded prediction is off by exactly 1.

    Both inputs are cast to float32 before comparing, so integer targets can be
    scored against float predictions (and vice versa) without TensorFlow
    raising a dtype-mismatch error on the subtraction.

    Args:
        y_true: Target values (tensor or array-like of numbers).
        y_pred: Predicted values with a shape compatible with ``y_true``;
            they are rounded to the nearest integer before comparison.

    Returns:
        A scalar float32 tensor: ``1 - count(|y_true - round(y_pred)| == 1) / size(y_true)``.
        NOTE(review): for 0/1 label matrices this is presumably meant as the
        per-entry accuracy (1 - Hamming loss) — confirm the labels are binary.
    """
    y_true = tf.cast(y_true, tf.float32)
    y_pred_rounded = tf.round(tf.cast(y_pred, tf.float32))

    differences = tf.abs(y_true - y_pred_rounded)
    # Only differences of exactly 1 are counted, as in the original metric.
    off_by_one = tf.reduce_sum(tf.cast(tf.equal(differences, 1.0), tf.float32))
    total_elements = tf.cast(tf.size(y_true), tf.float32)
    metric = 1 - (off_by_one / total_elements)
    return metric

def metric1_tf(y_true, y_pred):
    """Return the fraction of rows whose rounded prediction matches the target in every column.

    Both inputs are cast to float32 before comparing, so integer targets can be
    scored against float predictions (and vice versa) without TensorFlow
    raising a dtype-mismatch error in ``tf.equal``.

    Args:
        y_true: Target values; must have at least two dimensions because rows
            are compared along axis 1.
        y_pred: Predicted values with the same shape as ``y_true``; they are
            rounded to the nearest integer before comparison.

    Returns:
        A scalar float32 tensor: number of fully matching rows divided by the
        size of the first dimension of ``y_true``.
    """
    y_true = tf.cast(y_true, tf.float32)
    y_pred_r = tf.round(tf.cast(y_pred, tf.float32))

    n = tf.cast(tf.shape(y_true)[0], tf.float32)
    # A row counts only when all of its entries agree with the target.
    res = tf.reduce_all(tf.equal(y_true, y_pred_r), axis=1)
    res = tf.cast(res, tf.float32)
    return tf.reduce_sum(res) / n

In [None]:
def get_clean(x):
    """Normalise one raw text value into a lower-case, whitespace-normalised string.

    Steps, in order: stringify and lower-case, drop backslashes and
    underscores, replace non-ASCII runs with a space, apply the text_hammer
    helpers (``cont_exp``, ``remove_emails``, ``remove_urls``), blank out
    ``@mentions`` and ``#hashtags``, apply further text_hammer helpers
    (``remove_html_tags``, ``remove_rt``, ``remove_accented_chars``,
    ``remove_special_chars``), collapse any character repeated three or more
    times to a single occurrence, blank out every token containing a digit,
    and finally collapse whitespace.

    Args:
        x: Any value; it is converted with ``str()`` first.

    Returns:
        The cleaned string, with single spaces between tokens and no leading
        or trailing whitespace.
    """
    mention_pattern = r'@\w+'
    # Named `hashtag_pattern` rather than `hash` to avoid shadowing the builtin.
    hashtag_pattern = r'#\w+'

    x = str(x).lower().replace('\\', '').replace('_', '')
    x = re.sub(r'[^\x00-\x7F]+', ' ', x)
    x = th.cont_exp(x)
    x = th.remove_emails(x)
    x = th.remove_urls(x)
    x = re.sub(mention_pattern, ' ', x)
    x = re.sub(hashtag_pattern, ' ', x)
    x = th.remove_html_tags(x)
    x = th.remove_rt(x)
    x = th.remove_accented_chars(x)
    x = th.remove_special_chars(x)
    # Collapse runs of three or more identical characters to one.
    x = re.sub("(.)\\1{2,}", "\\1", x)
    # Replace tokens containing digits with a space ...
    x = re.sub(r'\w*\d+\w*', ' ', x)
    # ... and normalise whitespace last, so the spaces inserted above do not
    # leave runs of blanks in the middle of the result.
    x = re.sub(r'\s+', ' ', x).strip()
    return x

### Data

In [None]:
# Read the SPSS export, then keep an independent copy of the text column and
# the three label columns for the rest of the notebook.
df = pd.read_spss("/content/drive/MyDrive/VA_EN_TU_2012-2020_3000_tweets_relevant_V03_labeled_1200_cleaned.sav")
data = df.loc[:, ['text', 'Label_A2_negative', 'Label_A3_neutral','Label_A1_positive']].copy()
data.head()

In [None]:
# Add a 'cleaned_text' column holding the result of get_clean for each 'text' entry.
data['cleaned_text'] = data['text'].apply(get_clean)

### LSTM model

#### raw data

In [None]:
# Seed Python, NumPy and TensorFlow before any stochastic step so that weight
# initialisation, dropout and batch shuffling are repeatable between runs
# (best effort: some GPU kernels may still be non-deterministic).
SEED = 42
rd.seed(SEED)
seed(SEED)
tf.random.set_seed(SEED)

# Inputs: the raw tweet text and the three label columns as a 2-D array.
tweets = data['text'].values
labels = data[['Label_A2_negative', 'Label_A3_neutral','Label_A1_positive']].values

# Tokenisation / model hyper-parameters.
vocab_size = 10000
embedding_dim = 64
max_length = 70
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"

# Tokenize and Pad sequences
# NOTE(review): the tokenizer is fitted on all tweets before the split, so the
# vocabulary also contains words that only occur in validation/test tweets.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(tweets)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(tweets)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Hold out 30% for testing, then split the remainder in half.
# NOTE(review): this yields roughly 35% train / 35% validation / 30% test —
# confirm that training on only half of the remaining data is intended.
X_temp, X_test, y_temp, y_test = train_test_split(padded, labels, test_size=0.3, random_state=SEED)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=SEED)


# LSTM Model: embedding -> two stacked bidirectional LSTMs -> one sigmoid
# output per label.
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(SpatialDropout1D(0.2))
model.add(Bidirectional(LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)))
model.add(Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(3, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[metric1_tf, metric2_tf])

model.fit(X_train, y_train, batch_size=32, epochs=5, validation_data=(X_val, y_val))


In [None]:
# Score the model on the validation split; the result is unpacked as the loss
# followed by the two custom metrics the model was compiled with.
val_results = model.evaluate(x=X_val, y=y_val)
loss, metric1_score, metric2_score = val_results
print(f"Validation Metric 1: {metric1_score*100:.2f}%")
print(f"Validation Metric 2: {metric2_score*100:.2f}%")

In [None]:
# Evaluate on the held-out test split: binarise the sigmoid outputs at the
# threshold and compare them against the integer-cast targets.
label_names = ['Negative', 'Neutral','Positive']
threshold = 0.5

y_test = y_test.astype(int)
y_pred = model.predict(X_test)
y_pred_thresholded = np.greater(y_pred, threshold).astype(int)

# Compute all scores first, then report them.
metr_1_score = metric1_tf(y_test, y_pred_thresholded)
metr_2_score = metric2_tf(y_test, y_pred_thresholded)
acc_test = accuracy_score(y_test, y_pred_thresholded)

print(classification_report(y_test, y_pred_thresholded, target_names=label_names))
print(f"Test Metric 2: {metr_2_score*100:.2f}%")
print(f"Test Metric 1: {metr_1_score*100:.2f}%")
print(f"Test Accuracy score: {acc_test*100:.2f}%")

In [None]:
# Report the Hamming loss between the test targets and the thresholded
# predictions (hamming_loss is imported in the notebook's import cell).
print(f"Hamming loss : {hamming_loss(y_test, y_pred_thresholded)}")

#### cleaned data

In [None]:
# Seed Python, NumPy and TensorFlow before any stochastic step so that weight
# initialisation, dropout and batch shuffling are repeatable between runs
# (best effort: some GPU kernels may still be non-deterministic).
SEED = 42
rd.seed(SEED)
seed(SEED)
tf.random.set_seed(SEED)

# Inputs: the cleaned tweet text and the three label columns as a 2-D array.
tweets = data['cleaned_text'].values
labels = data[['Label_A2_negative', 'Label_A3_neutral','Label_A1_positive']].values

# Tokenisation / model hyper-parameters.
vocab_size = 10000
embedding_dim = 64
max_length = 70
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"

# Tokenize and Pad sequences
# NOTE(review): the tokenizer is fitted on all tweets before the split, so the
# vocabulary also contains words that only occur in validation/test tweets.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(tweets)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(tweets)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Hold out 30% for testing, then split the remainder in half.
# NOTE(review): this yields roughly 35% train / 35% validation / 30% test —
# confirm that training on only half of the remaining data is intended.
X_temp, X_test, y_temp, y_test = train_test_split(padded, labels, test_size=0.3, random_state=SEED)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=SEED)


# LSTM Model: embedding -> two stacked bidirectional LSTMs -> one sigmoid
# output per label.
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(SpatialDropout1D(0.2))
model.add(Bidirectional(LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)))
model.add(Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(3, activation='sigmoid'))

# NOTE(review): the raw-data run tracks metric1_tf as its first metric, while
# this run tracks 'accuracy'; the first reported validation number is therefore
# not the same quantity in the two experiments.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', metric2_tf])

# Train the model
model.fit(X_train, y_train, batch_size=32, epochs=5, validation_data=(X_val, y_val))



In [None]:
# Score the model on the validation split; the result is unpacked as the loss
# followed by the two metrics the model was compiled with.
val_results = model.evaluate(x=X_val, y=y_val)
loss, accuracy, metric2_score = val_results
print(f"Validation Accuracy: {accuracy*100:.2f}%")
print(f"Validation Metric 2: {metric2_score*100:.2f}%")

In [None]:
# Evaluate on the held-out test split: binarise the sigmoid outputs at the
# threshold and compare them against the integer-cast targets.
label_names = ['Negative', 'Neutral', 'Positive']
threshold = 0.5

y_test = y_test.astype(int)
y_pred = model.predict(X_test)
y_pred_thresholded = np.greater(y_pred, threshold).astype(int)

# Compute the scores first, then report them.
metr_2_score = metric2_tf(y_test, y_pred_thresholded)
acc_test = accuracy_score(y_test, y_pred_thresholded)

print(classification_report(y_test, y_pred_thresholded, target_names=label_names))
print(f"Test Metric 2: {metr_2_score*100:.2f}%")
print(f"Test Accuracy score: {acc_test*100:.2f}%")

In [None]:
# Share of test rows whose thresholded prediction matches the target in every
# label column.
metr_1_score = metric1_tf(y_true=y_test, y_pred=y_pred_thresholded)
print(f"Test Metric 1: {metr_1_score*100:.2f}%")

In [None]:
# Report the Hamming loss between the test targets and the thresholded
# predictions (hamming_loss is imported in the notebook's import cell).
print(f"Hamming loss : {hamming_loss(y_test, y_pred_thresholded)}")