In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%%bash
pip install torch
pip install datasets
pip install nltk

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 472.7/472.7 kB 30.0 MB/s eta 0:00:00
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.3/116.3 kB 11.6 MB/s eta 0:00:00
Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 134.8/134.8 kB 10.6 MB/s eta 0:00:00
Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 194.1/194.1 

In [3]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Dataset
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd
from datasets import load_dataset
import pickle
from nltk.tokenize import word_tokenize
import nltk
import os
import random

nltk.download('punkt')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

In [5]:
# Load dataset
dataset = load_dataset('rotten_tomatoes')
train_dataset = dataset['train']
validation_dataset = dataset['validation']
test_dataset = dataset['test']

train, val, test = train_dataset.to_pandas(), validation_dataset.to_pandas(), test_dataset.to_pandas()
max_length = max(0, train['text'].apply(lambda x: len(x)).max())
max_length = max(max_length, val['text'].apply(lambda x: len(x)).max())
max_length = max(max_length, test['text'].apply(lambda x: len(x)).max())
max_length += 5

# Load pretrained embeddings
with open('drive/MyDrive/SC4002/embedding_matrix_new.pkl', 'rb') as f:
    embedding_matrix = pickle.load(f)

# Load word-to-index mapping
with open('drive/MyDrive/SC4002/vocab_word_to_index_new.pkl', 'rb') as f:
    word_to_idx = pickle.load(f)

vocab_size, embedding_dim = embedding_matrix.shape
padding_embedding = np.zeros((50, embedding_dim))
embedding_matrix = np.vstack((embedding_matrix, padding_embedding))

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


README.md:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

train.parquet:   0%|          | 0.00/699k [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [6]:
class CustomDataset(Dataset):
    def __init__(self, texts, labels, word_to_idx):

        inputs = []
        lengths = []

        for text in texts:
            tokens = word_tokenize(text.lower())
            indexed_tokens = [word_to_idx.get(word, len(word_to_idx) - 1) for word in tokens]
            sequence_length = len(indexed_tokens)
            lengths.append(min(sequence_length, max_length))

            if sequence_length > max_length:
                indexed_tokens = indexed_tokens[:max_length]
            else:
                indexed_tokens += [len(word_to_idx)] * (max_length - sequence_length)  # Padding

            inputs.append(indexed_tokens)

        self.inputs = torch.tensor(inputs).to(device)
        self.labels = torch.tensor(labels).to(device)
        self.lengths = torch.tensor(lengths).to(device)

    def __len__(self):
        return self.inputs.shape[0]

    def __getitem__(self,idx):
        return self.inputs[idx], self.labels[idx], self.lengths[idx]

train_data = CustomDataset(train_dataset['text'], train_dataset['label'], word_to_idx)
val_data = CustomDataset(validation_dataset['text'], validation_dataset['label'], word_to_idx)
test_data = CustomDataset(test_dataset['text'], test_dataset['label'], word_to_idx)

def worker_init_fn(worker_id):
    np.random.seed(42 + worker_id)
    random.seed(42 + worker_id)

batch_size = 32
train_loader = DataLoader(train_data, shuffle = True, batch_size = batch_size, worker_init_fn = worker_init_fn)
val_loader = DataLoader(val_data, batch_size = batch_size)
test_loader = DataLoader(test_data, batch_size = batch_size)

In [7]:
# Define Model with biLSTM and biGRU options
class SentimentModel(nn.Module):
    def __init__(self, embedding_matrix, hidden_size, num_layers, output_size = 1, model_type = 'lstm'):
        super(SentimentModel, self).__init__()
        vocab_size, embedding_dim = embedding_matrix.shape
        self.embedding = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype = torch.float32), freeze = False, padding_idx = len(word_to_idx))

        if model_type == 'lstm':
            self.rnn = nn.LSTM(embedding_dim, hidden_size, num_layers = num_layers, bidirectional = True, batch_first = True)
        elif model_type == 'gru':
            self.rnn = nn.GRU(embedding_dim, hidden_size, num_layers = num_layers, bidirectional = True, batch_first = True)

        self.fc = nn.Linear(hidden_size * 2, 1)  # 2 for bidirectional and 1 output class
        self.init_weights()

    def init_weights(self):
        # Initialize Embedding Layer
        nn.init.uniform_(self.embedding.weight, -0.01, 0.01)

        # Initialize RNN (LSTM/GRU) weights and biases
        for name, param in self.rnn.named_parameters():
            if 'weight_ih' in name:  # Input to hidden weights
                nn.init.xavier_uniform_(param.data)  # Xavier initialization
            elif 'weight_hh' in name:  # Hidden to hidden weights
                nn.init.orthogonal_(param.data)  # Orthogonal initialization
            elif 'bias' in name:
                nn.init.zeros_(param.data)  # Zero bias initialization

        # Initialize Linear (Fully connected) layer
        nn.init.xavier_uniform_(self.fc.weight)
        nn.init.zeros_(self.fc.bias)

    def forward(self, x, lengths):
        embedded = self.embedding(x)
        packed_embedded = pack_padded_sequence(embedded, lengths.cpu(), batch_first = True, enforce_sorted = False)
        packed_rnn_out, _ = self.rnn(packed_embedded)
        rnn_out, _ = pad_packed_sequence(packed_rnn_out, batch_first = True)
        final_feature_map = rnn_out[torch.arange(rnn_out.size(0)), lengths - 1]
        final_out = self.fc(final_feature_map)
        return final_out

In [8]:
# Training loop
def train_model(model, train_loader, val_loader, epochs, lr):
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr = lr)

    best_val_acc = 0
    patience = 5
    counter = 0

    model = model.to(device)

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        scaler = torch.amp.GradScaler()

        for inputs, labels, lengths in train_loader:
            labels = labels.float().unsqueeze(1)
            optimizer.zero_grad()
            with torch.amp.autocast(device_type = 'cuda'):
                outputs = model(inputs, lengths)
                loss = criterion(outputs, labels)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            # loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm = 4.0)
            # optimizer.step()
            running_loss += loss.item()

        _, val_acc = evaluate_model(model, val_loader)
        print(f'Epoch {epoch + 1}/{epochs}, Train Loss: {(running_loss / len(train_loader)):.4f}, Val Accuracy: {val_acc:.4f}')

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            counter = 0
            # torch.save(model.state_dict(), 'best_model.pth')
        else:
            counter += 1

        if counter >= patience:
            print("Early stopping")
            break

    print('Training complete')

# Evaluation function
def evaluate_model(model, loader):
    model.eval()
    correct = 0
    total = 0
    all_predictions = []
    # all_logits = []
    with torch.no_grad():
        for inputs, labels, lengths in loader:
            # inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs, lengths)
            # print(outputs)
            predictions = (outputs >= 0.5).float()
            # logits, predictions = torch.max(outputs, 1)
            all_predictions.extend(predictions.cpu().numpy())
            # all_logits.extend(logits.cpu().numpy())
            total += labels.size(0)
            correct += (predictions == labels.float().unsqueeze(1)).sum().item()

    accuracy = correct / total
    return all_predictions, accuracy

In [9]:
def get_predictions(model, loader):
    model.eval()
    all_preds = []
    with torch.no_grad():
        for inputs, labels, lengths in loader:
            outputs = model(inputs, lengths)
            all_preds.append(outputs.cpu().numpy())
    return np.concatenate(all_preds)

def calculate_accuracy(predictions, labels):
    return np.mean(predictions == labels)

In [10]:
hidden_size = 64
num_layers = 3
learning_rate = 0.01
epochs = 30

print('Training biGRU Model')
biGRU_model = SentimentModel(embedding_matrix, hidden_size, num_layers, model_type = 'gru')
train_model(biGRU_model, train_loader, val_loader, epochs, learning_rate)

print('Training biLSTM Model')
biLSTM_model = SentimentModel(embedding_matrix, hidden_size, num_layers, model_type = 'lstm')
train_model(biLSTM_model, train_loader, val_loader, epochs, learning_rate)

Training biGRU Model
Epoch 1/30, Train Loss: 0.6198, Val Accuracy: 0.7036
Epoch 2/30, Train Loss: 0.3637, Val Accuracy: 0.7467
Epoch 3/30, Train Loss: 0.1879, Val Accuracy: 0.7364
Epoch 4/30, Train Loss: 0.1084, Val Accuracy: 0.7430
Epoch 5/30, Train Loss: 0.0720, Val Accuracy: 0.7308
Epoch 6/30, Train Loss: 0.0446, Val Accuracy: 0.7326
Epoch 7/30, Train Loss: 0.0340, Val Accuracy: 0.7242
Early stopping
Training complete
Training biLSTM Model
Epoch 1/30, Train Loss: 0.6440, Val Accuracy: 0.6567
Epoch 2/30, Train Loss: 0.4322, Val Accuracy: 0.7167
Epoch 3/30, Train Loss: 0.2628, Val Accuracy: 0.7345
Epoch 4/30, Train Loss: 0.1385, Val Accuracy: 0.7205
Epoch 5/30, Train Loss: 0.0837, Val Accuracy: 0.7280
Epoch 6/30, Train Loss: 0.0573, Val Accuracy: 0.7111
Epoch 7/30, Train Loss: 0.0412, Val Accuracy: 0.7298
Epoch 8/30, Train Loss: 0.0293, Val Accuracy: 0.7392
Epoch 9/30, Train Loss: 0.0224, Val Accuracy: 0.7317
Epoch 10/30, Train Loss: 0.0283, Val Accuracy: 0.7373
Epoch 11/30, Train Los

In [14]:
val_preds_biGRU = get_predictions(biGRU_model, val_loader)
val_preds_biLSTM = get_predictions(biLSTM_model, val_loader)

test_preds_biGRU = get_predictions(biGRU_model, test_loader)
test_preds_biLSTM = get_predictions(biLSTM_model, test_loader)

val_labels = []
for _, labels, _ in val_loader:
    val_labels.append(labels.cpu().numpy())
val_labels = np.concatenate(val_labels)

val_predictions_stack = np.column_stack((val_preds_biGRU, val_preds_biLSTM))

In [24]:
# Simple averaging of predictions
test_labels = []
for _, labels, _ in test_loader:
    test_labels.append(labels.cpu().numpy())
test_labels = np.concatenate(test_labels)

def get_final_predictions_binary(logits):
    probs = torch.sigmoid(torch.tensor(logits))  # Apply sigmoid to convert logits to probabilities
    return (probs > 0.5).int().numpy()

final_test_predictions = (test_preds_biGRU + test_preds_biLSTM) / 2
final_test_labels = get_final_predictions_binary(final_test_predictions)

test_accuracy = calculate_accuracy(final_test_labels, test_labels)
print(f'Ensemble Test Accuracy (Simple Averaging): {test_accuracy:.4f}')

Ensemble Test Accuracy (Simple Averaging): 0.5000


In [26]:
meta_model = LinearRegression()
meta_model.fit(val_predictions_stack, val_labels)

test_predictions_stack = np.column_stack((test_preds_biGRU, test_preds_biLSTM))
final_test_predictions = meta_model.predict(test_predictions_stack)
final_test_labels = np.round(final_test_predictions)

test_accuracy = calculate_accuracy(final_test_labels, test_labels)
print(f'Ensemble Test Accuracy (Weighted Averaging): {test_accuracy:.4f}')

Ensemble Test Accuracy (Weighted Averaging): 0.7561
