In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pack_sequence
import nltk
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# Run on the first CUDA GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Dataset locations, relative to the notebook's working directory.
# Forward slashes are used because they resolve on Windows, Linux and macOS;
# backslash-separated paths are treated as a single file name on POSIX systems.
olid_path = '../OLID_Tain_ATUSER_URL_EmojiRemoved_Pedro.txt'
hasoc_path = './additional_datasets/HASOCData'
oeval_path = './additional_datasets/OffenseEval'

In [None]:
# OLID dataset (preprocessed)
def load_dataset_1():
    """Load the preprocessed OLID file and build train/validation/test loaders.

    The tab-separated file at ``olid_path`` is read, only the ``sentence`` and
    ``label1`` columns are kept, labels are encoded as integers (``'OFF'`` -> 0,
    anything else -> 1), sentences are tokenised with NLTK, mapped to integer
    ids and right-padded with 0 to the length of the longest sentence.

    The data is split 80/20 into training/test, and the training part is split
    80/20 again into train/validation. Both splits are seeded so that repeated
    runs produce the same partitions.

    Returns:
        tuple: ``(vocab, train_loader, val_loader, test_loader)`` where
        ``vocab`` maps each token to an id starting at 1 (0 is reserved for
        padding) and each loader yields ``(token_ids, labels)`` batches of 10.
    """
    batch_size = 10
    split_seed = 42

    data = pd.read_csv(olid_path, sep='\t', names=['id', 'sentence', 'label1', 'label2', 'label3'])
    # Explicit column names are supplied, so the first row of the file is read
    # as data; it is discarded here together with the unused columns.
    data = data.drop(columns=['id', 'label2', 'label3']).drop(index=0)

    review_counts = data['label1'].value_counts()
    print(f'Count of reviews by sentiment: {review_counts}')

    # Vectorised label encoding: 'OFF' -> 0, every other value -> 1.
    data['label1'] = (data['label1'] != 'OFF').astype(int)

    review_counts = data['label1'].value_counts()
    print(f'Count of reviews by sentiment: {review_counts}')

    nltk.download('punkt')
    data['tokens'] = data['sentence'].apply(word_tokenize)

    # Sort the unique tokens before numbering them: iterating a raw set of
    # strings yields a different order in every Python process, which would
    # give every run a different token -> id mapping.
    unique_words = sorted({word for tokens in data['tokens'] for word in tokens})
    vocab = {word: idx for idx, word in enumerate(unique_words, start=1)}

    indexed = [[vocab[word] for word in tokens] for tokens in data['tokens']]
    max_len = max(len(sequence) for sequence in indexed)
    # Right-pad with 0, the id reserved for padding.
    padded = [sequence + [0] * (max_len - len(sequence)) for sequence in indexed]

    features = torch.tensor(padded, dtype=torch.long)
    labels = torch.tensor(data['label1'].tolist(), dtype=torch.long)

    training_features, test_features, training_labels, test_labels = train_test_split(
        features, labels, test_size=0.2, random_state=split_seed
    )
    # Seeded as well, so the validation set is identical across runs.
    train_features, val_features, train_labels, val_labels = train_test_split(
        training_features, training_labels, test_size=0.2, random_state=split_seed
    )

    train_loader = DataLoader(TensorDataset(train_features, train_labels), batch_size=batch_size)
    val_loader = DataLoader(TensorDataset(val_features, val_labels), batch_size=batch_size)
    test_loader = DataLoader(TensorDataset(test_features, test_labels), batch_size=batch_size)

    return vocab, train_loader, val_loader, test_loader


In [None]:
# HASOC dataset
def load_dataset_2():
    """Placeholder for the HASOC loader; not implemented yet, so it returns ``None``."""
    return None

In [None]:
# OffenceEval dataset
def load_dataset_3():
    """Placeholder for the OffenseEval loader; not implemented yet, so it returns ``None``."""
    return None

In [None]:
# Model (using self-attention)
class SelfAttention(nn.Module):
    """Attention pooling over the sequence dimension.

    Every time step is scored by a small two-layer MLP, the scores are
    normalised with a softmax over the sequence, and the weighted sum of the
    inputs is returned.

    Args:
        hidden_dim: Size of the last dimension of the tensors given to
            ``forward``.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.projection = nn.Sequential(
            nn.Linear(hidden_dim, 32),
            nn.ReLU(inplace=True),
            nn.Linear(32, 1),
        )

    def forward(self, encoder_outputs, mask=None):
        """Pool ``encoder_outputs`` into one vector per batch element.

        Args:
            encoder_outputs: Tensor of shape ``[batch_size, seq_len, hidden_dim]``.
            mask: Optional tensor of shape ``[batch_size, seq_len]``; truthy
                entries mark positions that may receive attention. ``None``
                (the default) attends over every position.

        Returns:
            Tensor of shape ``[batch_size, hidden_dim]``.
        """
        energy = self.projection(encoder_outputs).squeeze(-1)  # [batch_size, seq_len]
        if mask is not None:
            # Use the most negative finite value rather than -inf so that a row
            # with every position masked yields uniform weights instead of NaN.
            blocked = ~mask.to(torch.bool)
            energy = energy.masked_fill(blocked, torch.finfo(energy.dtype).min)
        weights = torch.softmax(energy, dim=1)  # [batch_size, seq_len]
        return (encoder_outputs * weights.unsqueeze(-1)).sum(dim=1)  # [batch_size, hidden_dim]


class BiLSTMWithSelfAttention(nn.Module):
    """Embedding -> bidirectional LSTM -> attention pooling -> linear classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        hidden_dim: LSTM hidden size per direction; the attention and the
            classifier operate on ``hidden_dim * 2`` features.
        output_dim: Number of output logits.
        num_layers: Number of stacked LSTM layers.
        padding_idx: Token id used for padding. Its embedding is kept at zero
            and its positions are excluded from the attention. Pass ``None``
            to disable both.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers=1, padding_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=True, batch_first=True)
        self.attention = SelfAttention(hidden_dim * 2)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        """Return logits of shape ``[batch_size, output_dim]`` for token ids ``x`` of shape ``[batch_size, seq_len]``."""
        embedded = self.embedding(x)
        # Only the per-step outputs are needed; the final hidden/cell states are not used.
        output, _ = self.lstm(embedded)
        # Read the padding id from the embedding layer so the forward pass works
        # whether or not a padding id is configured there.
        padding_idx = self.embedding.padding_idx
        mask = None if padding_idx is None else x.ne(padding_idx)
        attended_output = self.attention(output, mask=mask)
        return self.fc(attended_output)


In [None]:
# Hyperparameters
embedding_dim = 50   # width of each token embedding vector
hidden_dim = 128     # LSTM hidden size per direction
output_dim = 2       # number of logits produced by the classifier
global_epochs = 2    # epoch count passed to every training run

In [None]:
def train_model(path, vocab, train_loader, val_loader, epochs=5):
    """Train a ``BiLSTMWithSelfAttention`` classifier and checkpoint it after every epoch.

    The model is built from the module-level ``embedding_dim``, ``hidden_dim``
    and ``output_dim`` settings and trained on the module-level ``device``
    with Adam and cross-entropy loss.

    Args:
        path: Destination passed to ``torch.save``. When it is a filesystem
            path, missing parent directories are created first.
        vocab: Token -> id mapping; its size (plus one for the padding id)
            determines the embedding table size.
        train_loader: Iterable of ``(token_ids, labels)`` training batches.
        val_loader: Iterable of ``(token_ids, labels)`` validation batches.
        epochs: Number of passes over ``train_loader``.

    Returns:
        The trained model, in the state reached after the last epoch.
    """
    import os  # local import: only needed to prepare the checkpoint directory

    def _mean_loss(total, n_samples):
        # Per-sample average; NaN instead of a ZeroDivisionError for an empty dataset.
        return total / n_samples if n_samples else float("nan")

    vocab_size = len(vocab) + 1  # plus one for padding index
    model = BiLSTMWithSelfAttention(vocab_size, embedding_dim, hidden_dim, output_dim)
    model.to(device)  # move parameters before the optimizer captures them
    optimizer = optim.Adam(model.parameters(), lr=3e-3, weight_decay=1e-4)
    loss_fn = nn.CrossEntropyLoss()

    # torch.save does not create directories, so make sure the target exists.
    if isinstance(path, (str, os.PathLike)):
        checkpoint_dir = os.path.dirname(os.fspath(path))
        if checkpoint_dir:
            os.makedirs(checkpoint_dir, exist_ok=True)

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        for texts, labels in train_loader:
            texts, labels = texts.to(device), labels.to(device)
            optimizer.zero_grad()
            predictions = model(texts)
            loss = loss_fn(predictions, labels)
            # The loss is a batch mean; weight it by the batch size so the
            # epoch average is exact even when the last batch is smaller.
            train_loss += loss.item() * texts.size(0)
            loss.backward()
            optimizer.step()
        avg_train_loss = _mean_loss(train_loss, len(train_loader.dataset))

        model.eval()  # Set model to evaluation mode
        val_loss = 0.0
        with torch.no_grad():
            for texts, labels in val_loader:
                texts, labels = texts.to(device), labels.to(device)
                predictions = model(texts)
                val_loss += loss_fn(predictions, labels).item() * texts.size(0)
        avg_val_loss = _mean_loss(val_loss, len(val_loader.dataset))

        # Overwrite the checkpoint with the full model after every epoch.
        torch.save(model, path)

        print(f'Epoch {epoch+1}, Training Loss: {avg_train_loss}, Validation Loss: {avg_val_loss}')

    return model

In [None]:
from sklearn.metrics import classification_report, f1_score, roc_auc_score, roc_curve, confusion_matrix
import matplotlib.pyplot as plt

def evaluate_model(path, test_loader):
    """Load a saved model and report classification metrics on ``test_loader``.

    Prints a classification report and a confusion matrix. When the
    module-level ``output_dim`` is 2, it also prints the ROC AUC and plots the
    ROC curve, using the softmax probability of class index 1 as the score.

    Args:
        path: Checkpoint written with ``torch.save(model, path)``.
        test_loader: Iterable of ``(token_ids, labels)`` batches.
    """
    # SECURITY: a full-model checkpoint is a pickle and unpickling can execute
    # arbitrary code, so only load checkpoints produced by this notebook.
    # ``weights_only=False`` is required because recent PyTorch releases default
    # to weights-only loading, which rejects pickled modules. ``map_location``
    # lets a checkpoint saved on a GPU be evaluated on a CPU-only machine.
    try:
        model = torch.load(path, map_location=device, weights_only=False)
    except TypeError:
        # Older PyTorch versions do not accept the ``weights_only`` argument.
        model = torch.load(path, map_location=device)
    model.to(device)
    model.eval()

    all_labels = []
    all_preds = []
    all_probs = []  # softmax probability of class index 1 for every sample

    with torch.no_grad():
        for texts, labels in test_loader:
            texts = texts.to(device)
            outputs = model(texts)
            _, predicted = torch.max(outputs, 1)
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(predicted.cpu().numpy())
            all_probs.extend(torch.softmax(outputs, dim=1)[:, 1].cpu().numpy())

    # Calculate metrics
    print("Classification Report:")
    print(classification_report(all_labels, all_preds))
    print("Confusion Matrix")
    print(confusion_matrix(all_labels, all_preds))

    # ROC AUC and ROC Curve are applicable only for binary classification
    if output_dim != 2:
        return
    if len(set(all_labels)) < 2:
        # roc_auc_score raises when only one class is present in the labels.
        print("ROC AUC is undefined: the test labels contain a single class.")
        return

    roc_auc = roc_auc_score(all_labels, all_probs)
    fpr, tpr, _ = roc_curve(all_labels, all_probs)

    print("ROC AUC Score:")
    print(roc_auc)

    # Plot ROC curve
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()

# Running the different datasets

In [None]:
# Test set 1, OLID: build the loaders, train, then score the saved checkpoint.
path_ds1 = "./models/model_olid.pth"
v1, tr1, vl1, te1 = load_dataset_1()
model = train_model(
    path=path_ds1,
    vocab=v1,
    train_loader=tr1,
    val_loader=vl1,
    epochs=global_epochs,
)
evaluate_model(path_ds1, te1)

In [None]:
# Test set 2, HASOC
# NOTE(review): this cell calls load_dataset_1(), i.e. the OLID loader, so the
# checkpoint saved under the HASOC name is trained on OLID data. load_dataset_2()
# is only a stub in this notebook; switch the call once it is implemented.
path_ds2 = "./models/model_hasoc.pth"
v2, tr2, vl2, te2 = load_dataset_1()
model = train_model(
    path=path_ds2,
    vocab=v2,
    train_loader=tr2,
    val_loader=vl2,
    epochs=global_epochs,
)
evaluate_model(path_ds2, te2)

In [None]:
# Test set 3, OffenceEval
# NOTE(review): this cell calls load_dataset_1(), i.e. the OLID loader, so the
# checkpoint saved under the OffenseEval name is trained on OLID data.
# load_dataset_3() is only a stub in this notebook; switch the call once it is
# implemented.
path_ds3 = "./models/model_offeval.pth"
v3, tr3, vl3, te3 = load_dataset_1()
model = train_model(
    path=path_ds3,
    vocab=v3,
    train_loader=tr3,
    val_loader=vl3,
    epochs=global_epochs,
)
evaluate_model(path_ds3, te3)