In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%%bash
pip install torch
pip install datasets
pip install nltk

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 471.6/471.6 kB 13.6 MB/s eta 0:00:00
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.3/116.3 kB 12.0 MB/s eta 0:00:00
Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
   ━━━━━━━━━━━━━━

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Dataset
import numpy as np
import pandas as pd
from datasets import load_dataset
import pickle
from nltk.tokenize import word_tokenize
import nltk
import os

nltk.download('punkt')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
# Load dataset
dataset = load_dataset('rotten_tomatoes')
train_dataset = dataset['train']
validation_dataset = dataset['validation']
test_dataset = dataset['test']

# Load pretrained embeddings
with open('drive/MyDrive/SC4002/embedding_matrix.pkl', 'rb') as f:
    embedding_matrix = pickle.load(f)

# Load word-to-index mapping
with open('drive/MyDrive/SC4002/vocab_word_to_index.pkl', 'rb') as f:
    word_to_idx = pickle.load(f)

vocab_size, embedding_dim = embedding_matrix.shape
padding_embedding = np.zeros((50, embedding_dim))
embedding_matrix = np.vstack((embedding_matrix, padding_embedding))

README.md:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

train.parquet:   0%|          | 0.00/699k [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [5]:
print(len(embedding_matrix))
print(len(word_to_idx))

17892
17842


In [6]:
list(word_to_idx.keys())[-1]

'<UNK>'

In [7]:
# Tokenize data using NLTK
# def tokenize_data(dataset, word_to_idx, max_length = 100):
#     tokenized_texts = []
#     labels = []

#     for text in dataset['text']:
#         tokens = word_tokenize(text.lower())
#         indexed_tokens = [word_to_idx.get(word, len(word_to_idx) - 1) for word in tokens]
#         if len(indexed_tokens) > max_length:
#             indexed_tokens = indexed_tokens[:max_length]
#         else:
#             indexed_tokens += [len(word_to_idx)] * (max_length - len(indexed_tokens))  # Padding

#         tokenized_texts.append(indexed_tokens)

#     labels = dataset['label']
#     return torch.tensor(tokenized_texts), torch.tensor(labels)

# train = train_dataset.to_pandas()
# max_seq_len = max(0, train['text'].apply(lambda x: len(x)).max())
# train_inputs, train_labels = tokenize_data(train_dataset, word_to_idx, max_seq_len)
# val_inputs, val_labels = tokenize_data(validation_dataset, word_to_idx, max_seq_len)
# test_inputs, test_labels = tokenize_data(test_dataset, word_to_idx, max_seq_len)

train, val, test = train_dataset.to_pandas(), validation_dataset.to_pandas(), test_dataset.to_pandas()
max_length = max(0, train['text'].apply(lambda x: len(x)).max())
max_length = max(max_length, val['text'].apply(lambda x: len(x)).max())
max_length = max(max_length, test['text'].apply(lambda x: len(x)).max())
max_length += 5

class CustomDataset(Dataset):
    def __init__(self, texts, labels, word_to_idx):

        inputs = []

        for text in texts:
            indexed_tokens = [word_to_idx.get(word, len(word_to_idx) - 1) for word in text]
            if len(indexed_tokens) > max_length:
                indexed_tokens = indexed_tokens[:max_length]
            else:
                indexed_tokens += [len(word_to_idx)] * (max_length - len(indexed_tokens))  # Padding

            inputs.append(indexed_tokens)

        self.inputs = torch.tensor(inputs).to(device)
        self.labels = torch.tensor(labels).to(device)

    def __len__(self):
        return self.inputs.shape[0]

    def __getitem__(self,idx):
        return self.inputs[idx], self.labels[idx]

# train_data = TensorDataset(train_inputs, train_labels)
# val_data = TensorDataset(val_inputs, val_labels)
# test_data = TensorDataset(test_inputs, test_labels)

# train_data = CustomDataset(train_inputs, train_labels)
# val_data = CustomDataset(val_inputs, val_labels)
# test_data = CustomDataset(test_inputs, test_labels)

train_data = CustomDataset(train_dataset['text'], train_dataset['label'], word_to_idx)
val_data = CustomDataset(validation_dataset['text'], validation_dataset['label'], word_to_idx)
test_data = CustomDataset(test_dataset['text'], test_dataset['label'], word_to_idx)

batch_size = 32
train_loader = DataLoader(train_data, shuffle = True, batch_size = batch_size)
val_loader = DataLoader(val_data, batch_size = batch_size)
test_loader = DataLoader(test_data, batch_size = batch_size)

In [13]:
# Define Model with biLSTM and biGRU options
class SentimentModel(nn.Module):
    def __init__(self, embedding_matrix, hidden_size, num_layers, model_type = 'lstm'):
        super(SentimentModel, self).__init__()
        vocab_size, embedding_dim = embedding_matrix.shape
        self.embedding = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype = torch.float32), freeze = False, padding_idx = len(word_to_idx))

        if model_type == 'lstm':
            self.rnn = nn.LSTM(embedding_dim, hidden_size, num_layers = num_layers, bidirectional = True, batch_first = True)
        elif model_type == 'gru':
            self.rnn = nn.GRU(embedding_dim, hidden_size, num_layers = num_layers, bidirectional = True, batch_first = True)

        self.fc = nn.Linear(hidden_size * 2, 2)  # 2 for bidirectional and 2 output classes
        # self.softmax = nn.Softmax(dim = 1)

    def forward(self, x):
        embedded = self.embedding(x)
        rnn_out, _ = self.rnn(embedded)
        final_feature_map = rnn_out[:, -1, :]
        final_out = self.fc(final_feature_map)
        return final_out

In [14]:
# Training loop
def train_model(model, train_loader, val_loader, epochs, lr):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr = lr)

    best_val_acc = 0
    # patience = 3
    # counter = 0

    model = model.to(device)

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            # inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        val_acc = evaluate_model(model, val_loader)
        print(f'Epoch {epoch + 1}/{epochs}, Train Loss: {(running_loss / len(train_loader)):.4f}, Val Accuracy: {val_acc:.4f}')

        # if val_acc > best_val_acc:
        #     best_val_acc = val_acc
        #     counter = 0
        # else:
        #     counter += 1

        # if counter >= patience:
        #     print('Early stopping')
        #     break

    print('Training complete')

# Evaluation function
def evaluate_model(model, loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in loader:
            # inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return correct / total

In [18]:
# Initialize and train the models
hidden_size = 128
num_layers = 2
learning_rate = 0.0002
epochs = 10

# Train biLSTM
print('Training biLSTM Model')
biLSTM_model = SentimentModel(embedding_matrix, hidden_size, num_layers, model_type = 'lstm')
train_model(biLSTM_model, train_loader, val_loader, epochs, learning_rate)

# Train biGRU
print('Training biGRU Model')
biGRU_model = SentimentModel(embedding_matrix, hidden_size, num_layers, model_type = 'gru')
train_model(biGRU_model, train_loader, val_loader, epochs, learning_rate)

# Evaluate the final model on the test set
print('Evaluating biLSTM Model on Test Set')
test_accuracy_biLSTM = evaluate_model(biLSTM_model, test_loader)
print(f'Test Accuracy biLSTM: {test_accuracy_biLSTM:.4f}')

print('Evaluating biGRU Model on Test Set')
test_accuracy_biGRU = evaluate_model(biGRU_model, test_loader)
print(f'Test Accuracy biGRU: {test_accuracy_biGRU:.4f}')

Training biLSTM Model
Epoch 1/10, Train Loss: 0.6934, Val Accuracy: 0.5000
Epoch 2/10, Train Loss: 0.6933, Val Accuracy: 0.5000
Epoch 3/10, Train Loss: 0.6935, Val Accuracy: 0.5000
Epoch 4/10, Train Loss: 0.6933, Val Accuracy: 0.5000
Epoch 5/10, Train Loss: 0.6933, Val Accuracy: 0.5000
Epoch 6/10, Train Loss: 0.6933, Val Accuracy: 0.5000
Epoch 7/10, Train Loss: 0.6933, Val Accuracy: 0.5000
Epoch 8/10, Train Loss: 0.6933, Val Accuracy: 0.5000
Epoch 9/10, Train Loss: 0.6933, Val Accuracy: 0.5000
Epoch 10/10, Train Loss: 0.6933, Val Accuracy: 0.5009
Training complete
Training biGRU Model
Epoch 1/10, Train Loss: 0.6939, Val Accuracy: 0.5000
Epoch 2/10, Train Loss: 0.6936, Val Accuracy: 0.5000
Epoch 3/10, Train Loss: 0.6935, Val Accuracy: 0.5000
Epoch 4/10, Train Loss: 0.6934, Val Accuracy: 0.5000
Epoch 5/10, Train Loss: 0.6933, Val Accuracy: 0.5000
Epoch 6/10, Train Loss: 0.6934, Val Accuracy: 0.5000
Epoch 7/10, Train Loss: 0.6934, Val Accuracy: 0.5000
Epoch 8/10, Train Loss: 0.6933, Val A