In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install necessary libraries

# %%bash
# pip install torch
# pip install datasets
# pip install nltk

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 480.6/480.6 kB 26.1 MB/s eta 0:00:00
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.3/116.3 kB 12.0 MB/s eta 0:00:00
Downloading fsspec-2024.9.0-py3-none-any.whl (179 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 179.3/179.3 kB 18.5 MB/s eta 0:00:00
Do

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.


In [None]:
# Import requisite libraries

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import pandas as pd
from datasets import load_dataset
import pickle
from nltk.tokenize import word_tokenize
import nltk
import os
import random

nltk.download('punkt')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
os.environ['CUDA_LAUNCH_BLOCKING'] = '1' # for debugging

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Set seed

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

In [None]:
# Load augmented train dataset

train = pd.read_csv('../augmented_combined_train_dataset.csv')
train

Unnamed: 0,text,label
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1
...,...,...
12930,"Snoots will no doubt rally to his cause, stepp...",0
12931,"[The film's] taste for ""shock humor"" will fade...",0
12932,A portrait of a young woman's depression is no...,0
12933,"endlessly dreary, not to mention boredom.",0


In [None]:
# Load validation and test datasets
dataset = load_dataset('rotten_tomatoes')
validation_dataset = dataset['validation']
test_dataset = dataset['test']

val, test = validation_dataset.to_pandas(), test_dataset.to_pandas()
max_length = max(0, train['text'].apply(lambda x: len(x)).max())
max_length = max(max_length, val['text'].apply(lambda x: len(x)).max())
max_length = max(max_length, test['text'].apply(lambda x: len(x)).max())
max_length += 5

# Load pretrained embeddings
with open('../augmented_embedding_matrix_50d.pkl', 'rb') as f:
    embedding_matrix = pickle.load(f)

# Load word-to-index mapping
with open('../augmented_vocab_word_to_index_50d.pkl', 'rb') as f:
    word_to_idx = pickle.load(f)

vocab_size, embedding_dim = embedding_matrix.shape
padding_embedding = np.zeros((50, embedding_dim))
embedding_matrix = np.vstack((embedding_matrix, padding_embedding))

README.md:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

train.parquet:   0%|          | 0.00/699k [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [None]:
# Dataset tokenisation and wrapping as an iterable for easy access

class CustomDataset(Dataset):
    def __init__(self, texts, labels, word_to_idx):

        inputs = []
        lengths = []

        for text in texts:
            tokens = word_tokenize(text.lower())
            indexed_tokens = [word_to_idx.get(word, len(word_to_idx) - 1) for word in tokens]
            sequence_length = len(indexed_tokens)
            lengths.append(min(sequence_length, max_length))

            if sequence_length > max_length:
                indexed_tokens = indexed_tokens[:max_length]
            else:
                indexed_tokens += [len(word_to_idx)] * (max_length - sequence_length)  # Padding

            inputs.append(indexed_tokens)

        self.inputs = torch.tensor(inputs).to(device)
        self.labels = torch.tensor(labels.tolist()).to(device)
        self.lengths = torch.tensor(lengths).to(device)

    def __len__(self):
        return self.inputs.shape[0]

    def __getitem__(self,idx):
        return self.inputs[idx], self.labels[idx], self.lengths[idx]

train_data = CustomDataset(train['text'], train['label'], word_to_idx)
val_data = CustomDataset(val['text'], val['label'], word_to_idx)
test_data = CustomDataset(test['text'], test['label'], word_to_idx)

def worker_init_fn(worker_id):
    np.random.seed(42 + worker_id)
    random.seed(42 + worker_id)

batch_size = 128
train_loader = DataLoader(train_data, shuffle = True, batch_size = batch_size, worker_init_fn = worker_init_fn)
val_loader = DataLoader(val_data, batch_size = batch_size)
test_loader = DataLoader(test_data, batch_size = batch_size)

In [None]:
# Define Model with biLSTM and biGRU options

class SentimentModel(nn.Module):
    def __init__(self, embedding_matrix, hidden_size, num_layers, output_size = 1, model_type = 'lstm'):
        super(SentimentModel, self).__init__()
        vocab_size, embedding_dim = embedding_matrix.shape
        self.embedding = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype = torch.float32), freeze = False, padding_idx = len(word_to_idx))

        if model_type == 'lstm':
            self.rnn = nn.LSTM(embedding_dim, hidden_size, num_layers = num_layers, bidirectional = True, batch_first = True)
        elif model_type == 'gru':
            self.rnn = nn.GRU(embedding_dim, hidden_size, num_layers = num_layers, bidirectional = True, batch_first = True)

        self.fc = nn.Linear(hidden_size * 2, 1)  # 2 for bidirectional and 1 output class
        self.init_weights()

    def init_weights(self):
        # Initialize Embedding Layer
        nn.init.uniform_(self.embedding.weight, -0.01, 0.01)

        # Initialize RNN (LSTM/GRU) weights and biases
        for name, param in self.rnn.named_parameters():
            if 'weight_ih' in name:  # Input to hidden weights
                nn.init.xavier_uniform_(param.data)  # Xavier initialization
            elif 'weight_hh' in name:  # Hidden to hidden weights
                nn.init.orthogonal_(param.data)  # Orthogonal initialization
            elif 'bias' in name:
                nn.init.zeros_(param.data)  # Zero bias initialization

        # Initialize Linear (Fully connected) layer
        nn.init.xavier_uniform_(self.fc.weight)
        nn.init.zeros_(self.fc.bias)

    def forward(self, x, lengths):
        embedded = self.embedding(x)
        packed_embedded = pack_padded_sequence(embedded, lengths.cpu(), batch_first = True, enforce_sorted = False)
        packed_rnn_out, _ = self.rnn(packed_embedded)
        rnn_out, _ = pad_packed_sequence(packed_rnn_out, batch_first = True)
        final_feature_map = rnn_out[torch.arange(rnn_out.size(0)), lengths - 1]
        final_out = self.fc(final_feature_map)
        return final_out

In [1]:
# Training loop
# This section has been commented as training is not required if you wish to evaluate the presaved checkpoint for convenience.
# Please uncomment if you wish to train the model again.

# def train_model(model, train_loader, val_loader, epochs, lr, model_type):
#     criterion = nn.BCEWithLogitsLoss()
#     optimizer = optim.Adam(model.parameters(), lr = lr)

#     best_val_acc = 0
#     patience = 5
#     counter = 0

#     model = model.to(device)

#     for epoch in range(epochs):
#         model.train()
#         running_loss = 0.0

#         for inputs, labels, lengths in train_loader:
#             labels = labels.float().unsqueeze(1)
#             optimizer.zero_grad()
#             outputs = model(inputs, lengths)
#             loss = criterion(outputs, labels)
#             loss.backward()
#             torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm = 4.0)
#             optimizer.step()
#             running_loss += loss.item()

#         _, val_acc = evaluate_model(model, val_loader)
#         print(f'Epoch {epoch + 1}/{epochs}, Train Loss: {(running_loss / len(train_loader)):.4f}, Val Accuracy: {val_acc:.4f}')

#         # Training until validation accuracy is stable (stops increasing)
#         if val_acc > best_val_acc:
#             best_val_acc = val_acc
#             counter = 0
#             torch.save(model.state_dict(), f'./model checkpoints - augmented dataset/best_{model_type}.pth')
#         else:
#             counter += 1

#         if counter >= patience:
#             print("Early stopping")
#             break

#     print('Training complete')

# Evaluation function
def evaluate_model(model, loader):
    model.eval()
    correct = 0
    total = 0
    all_predictions = []
    with torch.no_grad():
        for inputs, labels, lengths in loader:
            outputs = model(inputs, lengths)
            predictions = (outputs >= 0.5).float()
            all_predictions.extend(predictions.cpu().numpy())
            total += labels.size(0)
            correct += (predictions == labels.float().unsqueeze(1)).sum().item()

    accuracy = correct / total
    return all_predictions, accuracy

In [None]:
# biGRU training and evaluation
# The training portion is commented as the presaved checkpoint can be loaded and evaluated for convenience.
# Please uncomment to train again.

hidden_size = 64
num_layers = 3
learning_rate = 0.01
epochs = 30

# print('Training biGRU Model')
biGRU_model = SentimentModel(embedding_matrix, hidden_size, num_layers, model_type = 'gru')
# train_model(biGRU_model, train_loader, val_loader, epochs, learning_rate, model_type = 'biGRU')

best_biGRU_path = './model checkpoints - augmented dataset/best_biGRU.pth' # Modify path according to device
biGRU_model.load_state_dict(torch.load(best_biGRU_path))
biGRU_model = biGRU_model.to(device)

print('Evaluating biGRU Model on Test Set')
predictions_biGRU, test_accuracy_biGRU = evaluate_model(biGRU_model, test_loader)
print(f'Test Accuracy biGRU: {test_accuracy_biGRU:.4f}')

Training biGRU Model
Epoch 1/30, Train Loss: 0.5442, Val Accuracy: 0.7448
Epoch 2/30, Train Loss: 0.1972, Val Accuracy: 0.7533
Epoch 3/30, Train Loss: 0.0660, Val Accuracy: 0.7420
Epoch 4/30, Train Loss: 0.0340, Val Accuracy: 0.7552
Epoch 5/30, Train Loss: 0.0205, Val Accuracy: 0.7439
Epoch 6/30, Train Loss: 0.0121, Val Accuracy: 0.7392
Epoch 7/30, Train Loss: 0.0176, Val Accuracy: 0.7495
Epoch 8/30, Train Loss: 0.0171, Val Accuracy: 0.7392
Epoch 9/30, Train Loss: 0.0118, Val Accuracy: 0.7439
Early stopping
Training complete
Evaluating biGRU Model on Test Set
Test Accuracy biGRU: 0.7786


  biGRU_model.load_state_dict(torch.load(best_biGRU_path))


In [None]:
# biLSTM training and evaluation

hidden_size = 64
num_layers = 3
learning_rate = 0.01
epochs = 30

# print('Training biLSTM Model')
biLSTM_model = SentimentModel(embedding_matrix, hidden_size, num_layers, model_type = 'lstm')
# train_model(biLSTM_model, train_loader, val_loader, epochs, learning_rate, model_type = 'biLSTM')

# Comment the above code annd run the following to load best model checkpoint and obtain test accuracy
best_biLSTM_path = './model checkpoints - augmented dataset/best_biLSTM.pth' # Modify path according to device
biLSTM_model.load_state_dict(torch.load(best_biLSTM_path))
biLSTM_model = biLSTM_model.to(device)

print('Evaluating biLSTM Model on Test Set')
predictions_biLSTM, test_accuracy_biLSTM = evaluate_model(biLSTM_model, test_loader)
print(f'Test Accuracy biLSTM: {test_accuracy_biLSTM:.4f}')

Training biLSTM Model
Epoch 1/30, Train Loss: 0.6255, Val Accuracy: 0.7261
Epoch 2/30, Train Loss: 0.3170, Val Accuracy: 0.7392
Epoch 3/30, Train Loss: 0.1504, Val Accuracy: 0.7448
Epoch 4/30, Train Loss: 0.0846, Val Accuracy: 0.7242
Epoch 5/30, Train Loss: 0.0538, Val Accuracy: 0.7120
Epoch 6/30, Train Loss: 0.0506, Val Accuracy: 0.7176
Epoch 7/30, Train Loss: 0.0294, Val Accuracy: 0.7308
Epoch 8/30, Train Loss: 0.0337, Val Accuracy: 0.7176
Early stopping
Training complete
Evaluating biLSTM Model on Test Set
Test Accuracy biLSTM: 0.7692


  biLSTM_model.load_state_dict(torch.load(best_biLSTM_path))


In [None]:
# class EnsembleModel(nn.Module):
#     def __init__(self, models):
#         super(EnsembleModel, self).__init__()
#         self.models = nn.ModuleList(models)

#     def forward(self, x, *args, **kwargs):
#         model_outputs = [model(x, *args, **kwargs) for model in self.models]
#         averaged_output = torch.mean(torch.stack(model_outputs), dim = 0)

#         return averaged_output

In [None]:
# Ensembling

# hidden_size = 64
# num_layers = 3
# learning_rate = 0.01
# epochs = 30

# biLSTM_model = SentimentModel(embedding_matrix, hidden_size, num_layers, model_type = 'lstm')
# biGRU_model = SentimentModel(embedding_matrix, hidden_size, num_layers, model_type = 'gru')

# ensemble_model = EnsembleModel([biLSTM_model, biGRU_model])

# print("Training Ensemble Model")
# train_model(ensemble_model, train_loader, val_loader, epochs, learning_rate, model_type = 'ensemble')

# best_ensemble_path = 'drive/MyDrive/SC4002/augmented_128/best_ensemble.pth'
# ensemble_model.load_state_dict(torch.load(best_ensemble_path))
# ensemble_model = ensemble_model.to(device)

# print("Evaluating Ensemble Model on Test Set")
# test_accuracy_ensemble = evaluate_model(ensemble_model, test_loader)
# print(f'Test Accuracy Ensemble: {test_accuracy_ensemble[-1]:.4f}')