In [112]:
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pack_sequence
import nltk
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score




#TRAIN LOADER


data = pd.read_csv(r'..\OLID_Tain_ATUSER_URL_EmojiRemoved_Pedro.txt', sep='\t', names=['id','sentence', 'label1','label2', 'label3'])  #Francesco esecution
data = data.drop(axis = 1, labels = ['id','label2','label3'])
data = data.drop(axis=0, index = 0 )

# checking how many pos and neg reviews
review_counts = data['label1'].value_counts()
print(f'Count of reviews by sentiment: {review_counts}')

print()
for index, row in data.iterrows():
    if row['label1'] == 'OFF':
        data.at[index, 'label1'] = 0
    else:
        data.at[index, 'label1'] = 1

# checking how many pos and neg reviews
review_counts = data['label1'].value_counts()
print(f'Count of reviews by sentiment: {review_counts}')
        


nltk.download('punkt')
data['tokens'] = data['sentence'].apply(word_tokenize)


# create a vocabulary and map tokens to indices
vocab = {word: idx for idx, word in enumerate(set(word for sentence in data['tokens'] for word in sentence), 1)}
data['indexed'] = data['tokens'].apply(lambda x: [vocab[word] for word in x])


# pad sequences to a maximum length
max_len = max(len(sentence) for sentence in data['indexed'])
data['padded'] = data['indexed'].apply(lambda x: x + [0]*(max_len - len(x)))

# convert to tensord and split dataset
features = torch.tensor(data['padded'].tolist())
labels = torch.tensor(data['label1'].tolist())


train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.2)

print(torch.count_nonzero(test_labels))



train_data = TensorDataset(train_features, train_labels)
test_data = TensorDataset(test_features, test_labels)
train_loader = DataLoader(train_data, batch_size=32,shuffle=True)
test_loader = DataLoader(test_data, batch_size=32)


Count of reviews by sentiment: label1
NOT    8840
OFF    4400
Name: count, dtype: int64

Count of reviews by sentiment: label1
1    8840
0    4400
Name: count, dtype: int64


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\39393\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


tensor(1774)


In [127]:
max_token_value = max(len(tokens) for tokens in data['indexed'])



class LSTM(nn.Module):
    def __init__(self, hidden_size=100, num_layers=1):
        super(LSTM, self).__init__()
        self.lstm = nn.LSTM(input_size=max_token_value, hidden_size=hidden_size, num_layers=num_layers, bidirectional=False)
        self.l_out = nn.Linear(in_features=hidden_size, out_features=2, bias=False)

    def forward(self, x):
        x, _ = self.lstm(x)
        x = x.view(-1, self.lstm.hidden_size)
        x = self.l_out(x)
        return x

# Initialise our network
model = LSTM()
print(model)

# Store training and validation loss
training_loss, validation_loss = [], []

class_weights = torch.tensor([1.0, 2])  # Adjust imbalance_ratio based on the actual class distribution
criterion = nn.BCEWithLogitsLoss(weight=class_weights)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
# A way to get learning rate decay
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.9)


LSTM(
  (lstm): LSTM(109, 100)
  (l_out): Linear(in_features=100, out_features=2, bias=False)
)


In [128]:
import matplotlib.pyplot as plt
%matplotlib inline

epochs = 10

# Training loop
for epoch in range(epochs) : 

    train_loss = 0.0
    t_loss = 0.0
    for batch_nr, (inputs, labels) in enumerate(train_loader):
        #print(labels)
        images = inputs.float()
        labels_one_hot = torch.nn.functional.one_hot(labels, num_classes=2).float()

        # Forward pass
        prediction = model(inputs.float())
        prediction = torch.sigmoid(prediction)

        # Compute loss
        loss = criterion(prediction, labels_one_hot)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        t_loss += loss.item()
        
    # Calculate average losses
    train_loss = t_loss / len(train_loader.dataset)
        
    # Update the learning rate scheduler
    scheduler.step()
        
    # Store the training and validation losses
    training_loss.append(train_loss)
        
    # Print the epoch statistics
    print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.6f}')


#test
correct = 0
total = 0
predicted = []
actual = []
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(torch.float)  # Convert input to float type
        outputs = model(inputs)
        _, preds = torch.max(outputs.data, 1)  # Correct variable naming here

        # Convert Tensor to list and then extend
        predicted.extend(preds.tolist())
        actual.extend(labels.tolist())

print('Test Accuracy:', accuracy_score(actual, predicted)*100)
print('Classification Report:\n', classification_report(actual, predicted))
print('Confusion Matrix:\n', confusion_matrix(actual, predicted))
try:
    print('ROC AUC Score:', roc_auc_score(actual, predicted))
except ValueError as e:
    print(e)

Epoch 1/10, Train Loss: 0.030998
Epoch 2/10, Train Loss: 0.030717
Epoch 3/10, Train Loss: 0.030706
Epoch 4/10, Train Loss: 0.030695
Epoch 5/10, Train Loss: 0.030693
Epoch 6/10, Train Loss: 0.030688
Epoch 7/10, Train Loss: 0.030683
Epoch 8/10, Train Loss: 0.030679
Epoch 9/10, Train Loss: 0.030702
Epoch 10/10, Train Loss: 0.030707
Test Accuracy: 66.99395770392749
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       874
           1       0.67      1.00      0.80      1774

    accuracy                           0.67      2648
   macro avg       0.33      0.50      0.40      2648
weighted avg       0.45      0.67      0.54      2648

Confusion Matrix:
 [[   0  874]
 [   0 1774]]
ROC AUC Score: 0.5


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
