In [None]:
# Mount Google Drive inside the Colab VM; this prompts for authorisation.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
%%bash
pip install torch
pip install datasets
pip install nltk

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 472.7/472.7 kB 6.0 MB/s eta 0:00:00
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.3/116.3 kB 8.3 MB/s eta 0:00:00
Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 134.8/134.8 kB 7.4 MB/s eta 0:00:00
Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 194.1/194.1 kB 

In [None]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Dataset
import numpy as np
import pandas as pd
from datasets import load_dataset
import pickle
from nltk.tokenize import word_tokenize
import nltk
import os
import random

# Fetch the Punkt models that nltk's word_tokenize relies on.
nltk.download('punkt')

# Seed every RNG in play so weight initialisation and DataLoader shuffling are
# repeatable across Restart & Run All.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Debugging aid: forces synchronous CUDA kernel launches so errors surface at
# the offending line. It slows training; drop it once the notebook is stable.
# Set before any CUDA query so that it reliably takes effect.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Load dataset (Rotten Tomatoes) and grab its three splits
dataset = load_dataset('rotten_tomatoes')
train_dataset = dataset['train']
validation_dataset = dataset['validation']
test_dataset = dataset['test']

train, val, test = train_dataset.to_pandas(), validation_dataset.to_pandas(), test_dataset.to_pandas()

# max_length is consumed by CustomDataset as a padding/truncation bound counted
# in *tokens*, so measure it in tokens (same lower-casing + word_tokenize as
# CustomDataset) rather than in characters. Measuring characters made every
# padded row several times wider than any real sentence.
max_length = 0
for frame in (train, val, test):
    longest = int(frame['text'].apply(lambda x: len(word_tokenize(x.lower()))).max())
    max_length = max(max_length, longest)
max_length += 5  # small safety margin on top of the longest sentence

# Load pretrained embeddings
# NOTE: pickle.load can execute arbitrary code; only load files you created or trust.
with open('drive/MyDrive/SC4002/embedding_matrix_new.pkl', 'rb') as f:
    embedding_matrix = pickle.load(f)

# Load word-to-index mapping
with open('drive/MyDrive/SC4002/vocab_word_to_index_new.pkl', 'rb') as f:
    word_to_idx = pickle.load(f)

# Append 50 all-zero rows after the pretrained vectors. CustomDataset pads with
# index len(word_to_idx), which lands on the first appended row provided
# len(word_to_idx) == vocab_size -- TODO confirm against the pickled files.
vocab_size, embedding_dim = embedding_matrix.shape
padding_embedding = np.zeros((50, embedding_dim))
embedding_matrix = np.vstack((embedding_matrix, padding_embedding))

README.md:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

train.parquet:   0%|          | 0.00/699k [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [None]:
class CustomDataset(Dataset):
    """Tokenised, index-encoded and right-padded view of a list of sentences.

    Every sentence is lower-cased, split with ``word_tokenize`` and mapped to
    integer ids through ``word_to_idx``. Tokens missing from the mapping get
    the id ``len(word_to_idx) - 1``; padding uses the id ``len(word_to_idx)``.
    Rows are truncated or padded to the module-level ``max_length`` and all
    three tensors (inputs, labels, lengths) are moved to the module-level
    ``device`` at construction time.
    """

    def __init__(self, texts, labels, word_to_idx):
        fallback_index = len(word_to_idx) - 1  # id used for out-of-vocabulary tokens
        pad_index = len(word_to_idx)           # id used to fill up short rows

        encoded_rows = []
        true_lengths = []

        for sentence in texts:
            token_ids = [
                word_to_idx.get(token, fallback_index)
                for token in word_tokenize(sentence.lower())
            ]
            n_tokens = len(token_ids)

            # The stored length never exceeds the padded width.
            true_lengths.append(min(n_tokens, max_length))

            if n_tokens > max_length:
                row = token_ids[:max_length]
            else:
                row = token_ids + [pad_index] * (max_length - n_tokens)

            encoded_rows.append(row)

        self.inputs = torch.tensor(encoded_rows).to(device)
        self.labels = torch.tensor(labels).to(device)
        self.lengths = torch.tensor(true_lengths).to(device)

    def __len__(self):
        """Number of examples held by the dataset."""
        return self.inputs.shape[0]

    def __getitem__(self, idx):
        """Return the ``(token_ids, label, unpadded_length)`` triple at ``idx``."""
        return self.inputs[idx], self.labels[idx], self.lengths[idx]

# Mini-batch size shared by all three loaders
batch_size = 32

# Encode each split once up front
train_data = CustomDataset(
    texts = train_dataset['text'],
    labels = train_dataset['label'],
    word_to_idx = word_to_idx,
)
val_data = CustomDataset(
    texts = validation_dataset['text'],
    labels = validation_dataset['label'],
    word_to_idx = word_to_idx,
)
test_data = CustomDataset(
    texts = test_dataset['text'],
    labels = test_dataset['label'],
    word_to_idx = word_to_idx,
)

# Only the training loader shuffles; validation/test keep dataset order
train_loader = DataLoader(train_data, batch_size = batch_size, shuffle = True)
val_loader = DataLoader(val_data, batch_size = batch_size)
test_loader = DataLoader(test_data, batch_size = batch_size)

In [None]:
# Sanity check: look at a single batch instead of printing every batch tensor
# of the training set, which floods the notebook output.
sample_inputs, sample_labels, sample_lengths = next(iter(train_loader))
print(f'inputs {tuple(sample_inputs.shape)}, labels {tuple(sample_labels.shape)}, lengths {tuple(sample_lengths.shape)}')
print(sample_inputs[:2], sample_labels[:2], sample_lengths[:2])

# len() of a DataLoader is its number of batches, so no counting loop is needed.
c = len(train_loader)
print(f'{c} batches')

[tensor([[ 5872,  5214, 13710,  ..., 16536, 16536, 16536],
        [12983,  6483,  1979,  ..., 16536, 16536, 16536],
        [13940,  4838, 11599,  ..., 16536, 16536, 16536],
        ...,
        [11444, 16535,   946,  ..., 16536, 16536, 16536],
        [ 8004,  4235,  7296,  ..., 16536, 16536, 16536],
        [ 8346, 11999, 10265,  ..., 16536, 16536, 16536]], device='cuda:0'), tensor([1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0,
        0, 1, 0, 0, 1, 1, 0, 0], device='cuda:0'), tensor([12, 26, 14, 27, 33, 25, 22, 36, 21, 34, 18, 32, 11, 33, 30, 21, 16, 18,
        10, 14, 15, 21, 30, 46, 17, 34, 13, 25, 14, 20, 18, 40],
       device='cuda:0')]
[tensor([[16535, 10606,   989,  ..., 16536, 16536, 16536],
        [16533,  1865, 15303,  ..., 16536, 16536, 16536],
        [13220,  8264, 11552,  ..., 16536, 16536, 16536],
        ...,
        [ 1818, 13266, 15450,  ..., 16536, 16536, 16536],
        [ 9418, 12550, 12753,  ..., 16536, 16536, 16536],
        [ 9398

In [None]:
# Define Model with biLSTM and biGRU options
class SentimentModel(nn.Module):
    """Bidirectional LSTM/GRU classifier on top of pretrained word embeddings.

    Args:
        embedding_matrix: 2-D array of shape (num_embeddings, embedding_dim)
            holding the pretrained vectors; they are fine-tuned during training.
        hidden_size: hidden units per direction of the recurrent layer.
        num_layers: number of stacked recurrent layers.
        output_size: number of output logits (default 1, binary classification).
        model_type: ``'lstm'`` or ``'gru'``.
        padding_idx: id of the padding token. Defaults to ``len(word_to_idx)``
            read from the module-level vocabulary, as before.

    Raises:
        ValueError: if ``model_type`` is neither ``'lstm'`` nor ``'gru'``.
    """

    def __init__(self, embedding_matrix, hidden_size, num_layers, output_size = 1, model_type = 'lstm', padding_idx = None):
        super(SentimentModel, self).__init__()
        vocab_size, embedding_dim = embedding_matrix.shape
        if padding_idx is None:
            padding_idx = len(word_to_idx)
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype = torch.float32), freeze = False, padding_idx = padding_idx)

        rnn_classes = {'lstm': nn.LSTM, 'gru': nn.GRU}
        if model_type not in rnn_classes:
            # Previously an unknown type left self.rnn undefined and failed later.
            raise ValueError(f"model_type must be 'lstm' or 'gru', got {model_type!r}")
        self.rnn = rnn_classes[model_type](embedding_dim, hidden_size, num_layers = num_layers, bidirectional = True, batch_first = True)

        # hidden_size * 2 because forward and backward features are concatenated
        self.fc = nn.Linear(hidden_size * 2, output_size)
        self.init_weights()

    def init_weights(self):
        """Initialise the recurrent and linear layers.

        The embedding layer is deliberately left alone: re-initialising it here
        would throw away the pretrained vectors (and the padding row) that were
        just loaded by ``from_pretrained``.
        """
        # Initialize RNN (LSTM/GRU) weights and biases
        for name, param in self.rnn.named_parameters():
            if 'weight_ih' in name:  # Input to hidden weights
                nn.init.xavier_uniform_(param.data)  # Xavier initialization
            elif 'weight_hh' in name:  # Hidden to hidden weights
                nn.init.orthogonal_(param.data)  # Orthogonal initialization
            elif 'bias' in name:
                nn.init.zeros_(param.data)  # Zero bias initialization

        # Initialize Linear (Fully connected) layer
        nn.init.xavier_uniform_(self.fc.weight)
        nn.init.zeros_(self.fc.bias)

    def forward(self, x, lengths):
        """Return one row of logits per sequence.

        Args:
            x: integer tensor of token ids, (batch, padded_length).
            lengths: integer tensor with the unpadded length of every row;
                each entry must be at least 1 (pack_padded_sequence requires it).

        Returns:
            Tensor of shape (batch, output_size) with raw (pre-sigmoid) logits.
        """
        embedded = self.embedding(x)
        packed_embedded = pack_padded_sequence(embedded, lengths.cpu(), batch_first = True, enforce_sorted = False)
        packed_rnn_out, _ = self.rnn(packed_embedded)
        rnn_out, _ = pad_packed_sequence(packed_rnn_out, batch_first = True)

        batch_index = torch.arange(rnn_out.size(0), device = rnn_out.device)
        last_step = lengths.to(rnn_out.device) - 1

        # The last feature axis is [forward | backward]. The forward direction has
        # read the whole sentence at its last real token, whereas the backward
        # direction has read the whole sentence at step 0 (at the last token it
        # has seen a single word only).
        forward_summary = rnn_out[batch_index, last_step, :self.hidden_size]
        backward_summary = rnn_out[:, 0, self.hidden_size:]
        final_feature_map = torch.cat((forward_summary, backward_summary), dim = 1)

        final_out = self.fc(final_feature_map)
        return final_out

In [None]:
# Training loop
def train_model(model, train_loader, val_loader, epochs, lr):
    """Train ``model`` with Adam and binary cross-entropy on logits.

    After every epoch the mean training loss and the validation accuracy
    (via ``evaluate_model``) are printed. Mixed precision is used only when
    the module-level ``device`` is a CUDA device.

    Args:
        model: module called as ``model(inputs, lengths)`` returning logits.
        train_loader: iterable of ``(inputs, labels, lengths)`` training batches.
        val_loader: iterable of validation batches, passed to ``evaluate_model``.
        epochs: number of passes over ``train_loader``.
        lr: Adam learning rate.
    """
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr = lr)

    model = model.to(device)

    # Autocast/loss scaling only make sense on CUDA; on CPU both are disabled.
    use_amp = device.type == 'cuda'
    # One scaler for the whole run so its adaptive scale factor is not reset
    # at the start of every epoch.
    scaler = torch.amp.GradScaler(enabled = use_amp)

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0

        for inputs, labels, lengths in train_loader:
            labels = labels.float().unsqueeze(1)
            optimizer.zero_grad()
            with torch.amp.autocast(device_type = device.type, enabled = use_amp):
                outputs = model(inputs, lengths)
                loss = criterion(outputs, labels)
            scaler.scale(loss).backward()
            # Clip the true (unscaled) gradients *before* the optimizer step;
            # clipping after the step, or on scaled gradients, has no useful effect.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm = 4.0)
            scaler.step(optimizer)
            scaler.update()
            running_loss += loss.item()

        _, val_acc = evaluate_model(model, val_loader)
        # max(..., 1) keeps the report from dividing by zero on an empty loader
        print(f'Epoch {epoch + 1}/{epochs}, Train Loss: {(running_loss / max(len(train_loader), 1)):.4f}, Val Accuracy: {val_acc:.4f}')

    print('Training complete')

# Evaluation function
def evaluate_model(model, loader):
    """Compute binary predictions and accuracy of ``model`` over ``loader``.

    Args:
        model: module called as ``model(inputs, lengths)`` returning one raw
            logit per example, shaped (batch, 1).
        loader: iterable of ``(inputs, labels, lengths)`` batches.

    Returns:
        ``(all_predictions, accuracy)`` where ``all_predictions`` is a list with
        one numpy row per example (values 0.0 / 1.0) and ``accuracy`` is the
        fraction of correct predictions (0.0 when ``loader`` yields nothing).
    """
    model.eval()
    correct = 0
    total = 0
    all_predictions = []

    with torch.no_grad():
        for inputs, labels, lengths in loader:
            outputs = model(inputs, lengths)
            # The model emits logits (it is trained with BCEWithLogitsLoss), so
            # probability 0.5 corresponds to logit 0, not to logit 0.5.
            predictions = (outputs >= 0).float()
            all_predictions.extend(predictions.cpu().numpy())
            total += labels.size(0)
            correct += (predictions == labels.float().unsqueeze(1)).sum().item()

    accuracy = correct / total if total else 0.0
    return all_predictions, accuracy

In [None]:
# biGRU experiment: build, train, then score on the held-out test split

# Hyperparameters for this run
hidden_size, num_layers = 64, 3
learning_rate, epochs = 0.01, 10

print('Training biGRU Model')
biGRU_model = SentimentModel(
    embedding_matrix = embedding_matrix,
    hidden_size = hidden_size,
    num_layers = num_layers,
    model_type = 'gru',
)
train_model(
    model = biGRU_model,
    train_loader = train_loader,
    val_loader = val_loader,
    epochs = epochs,
    lr = learning_rate,
)

print('Evaluating biGRU Model on Test Set')
predictions_biGRU, test_accuracy_biGRU = evaluate_model(model = biGRU_model, loader = test_loader)
print(f'Test Accuracy biGRU: {test_accuracy_biGRU:.4f}')

Training biGRU Model
Epoch 1/10, Train Loss: 0.5942, Val Accuracy: 0.7092
Epoch 2/10, Train Loss: 0.2865, Val Accuracy: 0.7448
Epoch 3/10, Train Loss: 0.1253, Val Accuracy: 0.7495
Epoch 4/10, Train Loss: 0.0773, Val Accuracy: 0.7402
Epoch 5/10, Train Loss: 0.0448, Val Accuracy: 0.7439
Epoch 6/10, Train Loss: 0.0319, Val Accuracy: 0.7477
Epoch 7/10, Train Loss: 0.0277, Val Accuracy: 0.7317
Epoch 8/10, Train Loss: 0.0186, Val Accuracy: 0.7308
Epoch 9/10, Train Loss: 0.0209, Val Accuracy: 0.7430
Epoch 10/10, Train Loss: 0.0197, Val Accuracy: 0.7448
Training complete
Evaluating biGRU Model on Test Set
Test Accuracy biGRU: 0.7486


In [None]:
# biLSTM experiment: build, train, then score on the held-out test split

# Hyperparameters for this run
hidden_size, num_layers = 64, 3
learning_rate, epochs = 0.01, 10

print('Training biLSTM Model')
biLSTM_model = SentimentModel(
    embedding_matrix = embedding_matrix,
    hidden_size = hidden_size,
    num_layers = num_layers,
    model_type = 'lstm',
)
train_model(
    model = biLSTM_model,
    train_loader = train_loader,
    val_loader = val_loader,
    epochs = epochs,
    lr = learning_rate,
)

print('Evaluating biLSTM Model on Test Set')
predictions_biLSTM, test_accuracy_biLSTM = evaluate_model(model = biLSTM_model, loader = test_loader)
print(f'Test Accuracy biLSTM: {test_accuracy_biLSTM:.4f}')

Training biLSTM Model
Epoch 1/10, Train Loss: 0.6719, Val Accuracy: 0.6679
Epoch 2/10, Train Loss: 0.4695, Val Accuracy: 0.6904
Epoch 3/10, Train Loss: 0.3066, Val Accuracy: 0.7345
Epoch 4/10, Train Loss: 0.2029, Val Accuracy: 0.7392
Epoch 5/10, Train Loss: 0.1426, Val Accuracy: 0.7439
Epoch 6/10, Train Loss: 0.1043, Val Accuracy: 0.7308
Epoch 7/10, Train Loss: 0.0889, Val Accuracy: 0.7233
Epoch 8/10, Train Loss: 0.0588, Val Accuracy: 0.7223
Epoch 9/10, Train Loss: 0.0432, Val Accuracy: 0.7308
Epoch 10/10, Train Loss: 0.0288, Val Accuracy: 0.7148
Training complete
Evaluating biLSTM Model on Test Set
Test Accuracy biLSTM: 0.7589
