In [None]:
!pip install transformers text-hammer pyreadstat

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn.functional as F
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup , AutoTokenizer, TFAutoModel, TFRobertaModel
import numpy as np
import seaborn as sns
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score, f1_score
import matplotlib.pyplot as plt
import os
import io
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, KFold
from sklearn import svm
from collections import defaultdict
import text_hammer as th
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import transformers
import random as rd
import keras.backend as K
from numpy.random import seed
from tensorflow.keras import layers
from keras.utils import plot_model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn import metrics
from tensorflow.keras.models import load_model, Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import multilabel_confusion_matrix
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [None]:
# Count the visible CUDA devices and select the compute device:
# the GPU when CUDA is usable, otherwise the CPU.
n_gpu = torch.cuda.device_count()
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

### Functions

In [None]:
def compute_validation_loss(model, validation_dataloader):
    """Return the mean per-batch loss of `model` over `validation_dataloader`.

    The model is switched to eval mode and gradients are disabled while the
    batches are processed. Each batch must unpack into
    ``(input_ids, attention_mask, labels)``; the tensors are moved to the
    module-level ``device`` before the forward pass, and the loss is read from
    ``outputs.loss``.

    The sum of the batch losses is divided by ``len(validation_dataloader)``,
    i.e. by the number of batches, so an empty dataloader raises
    ``ZeroDivisionError``. The model is left in eval mode on return.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for input_ids, attention_mask, labels in validation_dataloader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            # Passing `labels` makes the model return the loss with its outputs.
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            running_loss += outputs.loss.item()

    return running_loss / len(validation_dataloader)

def metric2(y_true, y_pred_classes):
    """Return 1 minus the share of predictions that miss by exactly one class.

    Both arguments are torch tensors. Elements whose absolute difference
    equals 1 are counted and divided by ``y_true.size(0)`` (the length of the
    first dimension); the result is returned as a Python float.

    NOTE(review): for inputs with more than one dimension the count runs over
    all elements while the divisor is only the first dimension — presumably
    this is meant for 1-D class-index tensors; confirm against callers.
    """
    sample_count = y_true.size(0)
    # Boolean mask of the positions where the prediction is off by one class.
    off_by_one = (y_true - y_pred_classes).abs() == 1
    miss_total = off_by_one.sum(dtype=torch.float32)
    return (1 - miss_total / sample_count).item()

def metric2_tf(y_true, y_pred):
    """Return the share of label entries that are NOT off by exactly one.

    Parameters
    ----------
    y_true, y_pred : array-like
        Label arrays of the same (or broadcastable) shape. Lists, tuples and
        NumPy arrays are accepted; inputs are compared element-wise by
        position.

    Returns
    -------
    float
        ``1 - (#entries with |y_true - y_pred| == 1) / (total entries in y_true)``.
        For empty input the division is 0/0 and the result is ``nan``.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Boolean arrays do not support `-`, and unsigned integers wrap around
    # (0 - 1 -> 255), which would hide off-by-one errors. Promote both kinds
    # to a signed integer type before taking the difference.
    if y_true.dtype.kind in "bu":
        y_true = y_true.astype(np.int64)
    if y_pred.dtype.kind in "bu":
        y_pred = y_pred.astype(np.int64)

    differences = np.abs(y_true - y_pred)
    # Count the entries that are off by exactly one across all samples and labels.
    off_by_one = np.sum(differences == 1)
    return 1 - (off_by_one / y_true.size)


def metric2_2(y_true, y_pred):
    """Return the share of samples in which NO label is off by exactly one.

    Parameters
    ----------
    y_true, y_pred : array-like, 2-D
        Label matrices with one row per sample. Nested lists and NumPy arrays
        are accepted; rows are compared by position. The reduction uses
        ``axis=1``, so 1-D input is rejected by NumPy.

    Returns
    -------
    float
        ``1 - (#rows containing at least one |y_true - y_pred| == 1) / (#rows)``.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Boolean arrays do not support `-`, and unsigned integers wrap around
    # (0 - 1 -> 255), which would hide off-by-one errors. Promote both kinds
    # to a signed integer type before taking the difference.
    if y_true.dtype.kind in "bu":
        y_true = y_true.astype(np.int64)
    if y_pred.dtype.kind in "bu":
        y_pred = y_pred.astype(np.int64)

    differences = np.abs(y_true - y_pred)
    # A sample is flagged when at least one of its labels is off by exactly one.
    sample_has_off_by_one = np.any(differences == 1, axis=1)
    off_by_one = np.sum(sample_has_off_by_one)
    return 1 - (off_by_one / len(y_true))

def metric1_tf(y_true, y_pred):
    """Return the fraction of rows whose rounded predictions match every label.

    ``y_pred`` is rounded element-wise, compared with ``y_true``, and a row
    counts as correct only when all of its entries are equal (``axis=1``).
    The number of correct rows is divided by the size of the first dimension
    of ``y_true``. The result is a scalar float32 tensor.

    ``tf.equal`` requires both operands to share a dtype, so ``y_true`` is
    cast to the dtype of the rounded predictions; integer labels can then be
    compared against float model outputs (and vice versa) without an error.
    """
    y_pred_r = tf.round(tf.convert_to_tensor(y_pred))
    y_true = tf.cast(y_true, y_pred_r.dtype)
    n = tf.cast(tf.shape(y_true)[0], tf.float32)
    # True for the rows in which every label matches the rounded prediction.
    res = tf.reduce_all(tf.equal(y_true, y_pred_r), axis=1)
    res = tf.cast(res, tf.float32)
    return tf.reduce_sum(res) / n

In [None]:
def get_clean(x):
    """Normalise one tweet into lower-case, ASCII-only, whitespace-collapsed text.

    The input is converted with ``str(x)``, so non-string values are cleaned
    as their string form (e.g. a missing value ``nan`` becomes the text
    ``"nan"``). Steps, in order:

    1. lower-case; drop backslashes and underscores;
    2. replace runs of non-ASCII characters with a space;
    3. apply the ``text_hammer`` helpers ``cont_exp``, ``remove_emails`` and
       ``remove_urls`` (behaviour as provided by that library — presumably
       contraction expansion and e-mail/URL removal, per their names);
    4. replace ``@mentions`` and ``#hashtags`` with a space;
    5. apply ``remove_html_tags``, ``remove_rt``, ``remove_accented_chars``
       and ``remove_special_chars`` from ``text_hammer``;
    6. collapse any character repeated three or more times to one occurrence;
    7. replace every token that contains a digit with a space;
    8. collapse whitespace runs to a single space and strip the ends.

    Returns the cleaned string.
    """
    mention_pattern = r'@\w+'
    hashtag_pattern = r'#\w+'  # named to avoid shadowing the builtin `hash`
    x = str(x).lower().replace('\\', '').replace('_', '')
    x = re.sub(r'[^\x00-\x7F]+', ' ', x)
    x = th.cont_exp(x)
    x = th.remove_emails(x)
    x = th.remove_urls(x)
    x = re.sub(mention_pattern, ' ', x)
    x = re.sub(hashtag_pattern, ' ', x)
    x = th.remove_html_tags(x)
    x = th.remove_rt(x)
    # NOTE(review): non-ASCII characters were already replaced by spaces above,
    # so this call probably has nothing left to convert — confirm whether
    # accent folding was meant to run before the non-ASCII filter.
    x = th.remove_accented_chars(x)
    x = th.remove_special_chars(x)
    x = re.sub("(.)\\1{2,}", "\\1", x)
    # Remove digit-bearing tokens first, THEN normalise whitespace, so the
    # spaces inserted here cannot leave runs of blanks in the result.
    x = re.sub(r'\w*\d+\w*', ' ', x)
    x = re.sub(r'\s+', ' ', x).strip()
    return x

### Data

In [None]:
# Read the labelled tweets from the SPSS (.sav) file and keep an independent
# copy of just the tweet text and the three sentiment label columns.
# NOTE(review): the path looks like a Colab Google Drive mount — it presumably
# requires Drive to be mounted first; confirm before running elsewhere.
df = pd.read_spss("/content/drive/MyDrive/VA_EN_TU_2012-2020_3000_tweets_relevant_V03_labeled_1200_cleaned.sav")
data = df.loc[:, ['text', 'Label_A2_negative', 'Label_A3_neutral', 'Label_A1_positive']].copy()
data.head()

In [None]:
# Add a cleaned version of every tweet next to the raw text (raw column is kept).
data['cleaned_text'] = data['text'].map(get_clean)

In [None]:
## Individual Label Distribution
labels = ['Label_A2_negative', 'Label_A3_neutral', 'Label_A1_positive']
# Column-wise sums: number of tweets carrying each label.
label_sums = data[labels].sum(axis=0)

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(labels, label_sums)
ax.set_ylabel('Number of tweets')
ax.set_title('Individual Label Distribution')
plt.show()

## Label Combinations Distribution
df_train_labels = data[labels].copy()
# One row per distinct (negative, neutral, positive) combination with its
# frequency, most frequent combination first.
combinations = (
    df_train_labels.groupby(labels)
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
combinations['Label Combination'] = combinations[labels].astype(int).astype(str).agg(','.join, axis=1)

# Draw on an explicit Axes: DataFrame.plot() opens its own figure unless `ax`
# is passed, which would leave an empty 12x8 figure and ignore the size.
fig, ax = plt.subplots(figsize=(12, 8))
combinations.plot(x='Label Combination', y='count', kind='bar', legend=False, rot=45, ax=ax)
ax.set_title('Label Combinations Distribution')
ax.set_ylabel('Number of tweets')
plt.show()


### SVM model

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score


# Raw (uncleaned) tweet text and the three binary sentiment indicators.
texts = data['text'].tolist()
labels = data[['Label_A2_negative', 'Label_A3_neutral','Label_A1_positive']].values.astype(int)

# Multi-label indicator matrix: one row per tweet, one column per label.
y = np.array(labels)

# Split the texts BEFORE vectorizing so the TF-IDF vocabulary and IDF weights
# are learned from the training tweets only (fitting on all texts leaks
# test-set information into the features). Same test_size / random_state as
# before, so the train/test partition itself is unchanged.
texts_train, texts_test, y_train, y_test = train_test_split(texts, y, test_size=0.2, random_state=42)

# Unigram TF-IDF features (ngram_range=(1, 1)), capped at 5000 terms.
vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=5000)
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)
# Full-corpus matrix kept only so the name stays defined; not used for training.
X_tfidf = vectorizer.transform(texts)

# One linear SVM per label, with class weights balancing each label's positives/negatives.
svm_model = OneVsRestClassifier(SVC(kernel='linear', class_weight='balanced'))

# Hyperparameter grid: regularisation strength C of the per-label SVMs.
param_grid = {
    'estimator__C': [0.1, 1, 10],
}

grid_search = GridSearchCV(svm_model, param_grid, scoring='f1_micro', cv=3)
grid_search.fit(X_train, y_train)

best_svm_model = grid_search.best_estimator_

y_pred = best_svm_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='micro')
metric2_score = metric2_tf(y_test, y_pred)
# metric1_tf returns a TensorFlow scalar; convert it so a plain number is printed.
metric1_score = float(metric1_tf(y_test, y_pred))

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Metric 2: {metric2_score}")
print(f"Metric 1: {metric1_score}")


In [None]:
# Per-label precision / recall / F1 for the most recent predictions.
# The display names are passed in the order negative, neutral, positive.
label_names = ['negative', 'neutral', 'positive']
print(classification_report(y_true=y_test, y_pred=y_pred, target_names=label_names))

In [None]:
from sklearn.metrics import hamming_loss

# Hamming loss of the most recent predictions against the held-out labels.
print(f"Hamming loss : {hamming_loss(y_true=y_test, y_pred=y_pred)}")

In [None]:
# Cleaned tweet text and the three binary sentiment indicators.
texts = data['cleaned_text'].tolist()
labels = data[['Label_A2_negative', 'Label_A3_neutral','Label_A1_positive']].values.astype(int)

# Multi-label indicator matrix: one row per tweet, one column per label.
y = np.array(labels)

# Split the texts BEFORE vectorizing so the TF-IDF vocabulary and IDF weights
# are learned from the training tweets only (fitting on all texts leaks
# test-set information into the features). Same test_size / random_state as
# before, so the train/test partition itself is unchanged.
texts_train, texts_test, y_train, y_test = train_test_split(texts, y, test_size=0.2, random_state=42)

# Unigram TF-IDF features (ngram_range=(1, 1)), capped at 5000 terms.
vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=5000)
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)
# Full-corpus matrix kept only so the name stays defined; not used for training.
X_tfidf = vectorizer.transform(texts)

# One linear SVM per label, with class weights balancing each label's positives/negatives.
svm_model = OneVsRestClassifier(SVC(kernel='linear', class_weight='balanced'))

# Hyperparameter grid: regularisation strength C of the per-label SVMs.
param_grid = {
    'estimator__C': [0.1, 1, 10],
}

grid_search = GridSearchCV(svm_model, param_grid, scoring='f1_micro', cv=3)
grid_search.fit(X_train, y_train)


best_svm_model = grid_search.best_estimator_

y_pred = best_svm_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='micro')
metric2_score = metric2_tf(y_test, y_pred)
# metric1_tf returns a TensorFlow scalar; convert it so a plain number is printed.
metric1_score = float(metric1_tf(y_test, y_pred))
# results
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Metric 2: {metric2_score}")
print(f"Metric 1: {metric1_score}")


In [None]:
# Text report of per-label precision / recall / F1 for the predictions above,
# using the display names negative, neutral, positive (in that order).
label_names = ['negative', 'neutral', 'positive']
print(classification_report(y_true=y_test, y_pred=y_pred, target_names=label_names))

#### Raw data

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Raw (uncleaned) tweet text and the three binary sentiment indicators.
texts = data['text'].tolist()
labels = data[['Label_A2_negative', 'Label_A3_neutral','Label_A1_positive']].values.astype(int)

# Multi-label indicator matrix: one row per tweet, one column per label.
y = np.array(labels)

# Split the texts BEFORE vectorizing so the TF-IDF vocabulary and IDF weights
# are learned from the training tweets only (fitting on all texts leaks
# test-set information into the features). Same test_size / random_state as
# before, so the train/test partition itself is unchanged.
texts_train, texts_test, y_train, y_test = train_test_split(texts, y, test_size=0.2, random_state=42)

# Unigram TF-IDF features (ngram_range=(1, 1)), capped at 5000 terms.
vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=5000)
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)
# Full-corpus matrix kept only so the name stays defined; not used for training.
X_tfidf = vectorizer.transform(texts)

# Train a multi-label SVM model: one linear SVM per label, default settings.
svm_model = OneVsRestClassifier(SVC(kernel='linear'))
svm_model.fit(X_train, y_train)

# Predict on the test set
y_pred = svm_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='micro')  # Micro-average is often used in multi-label classification
metric2_score = metric2_tf(y_test, y_pred)
# metric1_tf returns a TensorFlow scalar; convert it so a plain number is printed.
metric1_score = float(metric1_tf(y_test, y_pred))

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Metric 2: {metric2_score}")
print(f"Metric 1: {metric1_score}")


In [None]:
# Summarise precision / recall / F1 per label for the raw-text SVM predictions;
# display names are given in the order negative, neutral, positive.
label_names = ['negative', 'neutral', 'positive']
print(classification_report(y_true=y_test, y_pred=y_pred, target_names=label_names))

#### Cleaned data

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Cleaned tweet text and the three binary sentiment indicators.
texts = data['cleaned_text'].tolist()
labels = data[['Label_A2_negative', 'Label_A3_neutral','Label_A1_positive']].values.astype(int)

# Multi-label indicator matrix: one row per tweet, one column per label.
y = np.array(labels)

# Split the texts BEFORE vectorizing so the TF-IDF vocabulary and IDF weights
# are learned from the training tweets only (fitting on all texts leaks
# test-set information into the features). Same test_size / random_state as
# before, so the train/test partition itself is unchanged.
texts_train, texts_test, y_train, y_test = train_test_split(texts, y, test_size=0.2, random_state=42)

# Unigram TF-IDF features (ngram_range=(1, 1)), capped at 5000 terms.
vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=5000)
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)
# Full-corpus matrix kept only so the name stays defined; not used for training.
X_tfidf = vectorizer.transform(texts)

# Train a multi-label SVM model: one linear SVM per label, default settings.
svm_model = OneVsRestClassifier(SVC(kernel='linear'))
svm_model.fit(X_train, y_train)

# Predict on the test set
y_pred = svm_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='micro')  # Micro-average is often used in multi-label classification
metric2_score = metric2_tf(y_test, y_pred)
# metric1_tf returns a TensorFlow scalar; convert it so a plain number is printed.
metric1_score = float(metric1_tf(y_test, y_pred))

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Metric 2: {metric2_score}")
print(f"Metric 1: {metric1_score}")


In [None]:
# Summarise precision / recall / F1 per label for the cleaned-text SVM
# predictions; display names are given in the order negative, neutral, positive.
label_names = ['negative', 'neutral', 'positive']
print(classification_report(y_true=y_test, y_pred=y_pred, target_names=label_names))