In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [4]:
# !pip install datasets

Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K 

In [1]:
from datasets import load_dataset
import nltk
from nltk.tokenize import word_tokenize
import re

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
#Load the dataset as a dataframe
dataset = load_dataset("imdb")
train = pd.DataFrame(dataset['train'])
test = pd.DataFrame(dataset['test'])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [5]:
# Convert sentiment labels to binary format (positive=1, negative=0)
train['label'] = train['label'].apply(lambda x: 1 if x == 1 else 0)
test['label'] = test['label'].apply(lambda x: 1 if x == 1 else 0)

# Function to preprocess text by removing HTML tags and tokenizing
def preprocess(text):
    text = re.sub(r'<[^>]+>', '', text)  # Strip out HTML tags
    tokens = word_tokenize(text.lower())  # Convert text to lowercase and tokenize
    return tokens

# Apply preprocessing to both train and test sets
train['text'] = train['text'].apply(preprocess)
test['text'] = test['text'].apply(preprocess)

In [6]:
import urllib.request
import io
import zipfile

def load_glove_embeddings_from_file(file_like, embedding_dim):
    embeddings = {}
    for line in file_like:
        values = line.decode('utf-8').split()  # Split the line into word and vector
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')  # Convert vector values to float32
        embeddings[word] = vector  # Store in dictionary
    return embeddings

# Download the GloVe embedding file and extract it
url = 'https://nlp.stanford.edu/data/glove.6B.zip'
response = urllib.request.urlopen(url)
with io.BytesIO(response.read()) as file_stream:
    with zipfile.ZipFile(file_stream) as zip_file:
        with zip_file.open('glove.6B.100d.txt') as glove_file:
            embedding_dim = 100
            glove_embeddings = load_glove_embeddings_from_file(glove_file, embedding_dim)

In [8]:
# Build the vocabulary from the training data
vocab = set([word for review in train['text'] for word in review])
word_to_idx = {word: i+1 for i, word in enumerate(vocab)}  # Map words to indices (start from 1)
idx_to_word = {i: word for word, i in word_to_idx.items()}

# Initialize the embedding matrix using GloVe embeddings
embedding_matrix = np.zeros((len(vocab) + 1, embedding_dim))  # +1 to handle padding index
for word, i in word_to_idx.items():
    embedding_vector = glove_embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [9]:
# Custom dataset class to handle IMDB data
class IMDBDataset(Dataset):
    def __init__(self, data, word_to_idx, max_length=100):
        self.data = data
        self.word_to_idx = word_to_idx
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        review = self.data.iloc[idx]['text']
        label = self.data.iloc[idx]['label']
        # Convert words to indices, and pad/truncate to max_length
        encoded_review = [self.word_to_idx.get(word, 0) for word in review]
        padded_review = encoded_review[:self.max_length] + [0] * (self.max_length - len(encoded_review))
        return torch.tensor(padded_review), torch.tensor(label)

In [10]:
train_dataset = IMDBDataset(train, word_to_idx)
test_dataset = IMDBDataset(test, word_to_idx)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [11]:
# Define RNN model with pretrained GloVe embeddings
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, embedding_matrix):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix), freeze=False)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded)
        return self.fc(hidden[-1])

# Define LSTM model with pretrained GloVe embeddings
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, embedding_matrix):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix), freeze=False)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        output, (hidden, cell) = self.lstm(embedded)
        return self.fc(hidden[-1])

In [12]:
# Training function
def train_model(model, train_loader, optimizer, criterion, n_epochs=5):
    model.train()
    for epoch in range(n_epochs):
        epoch_loss = 0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), labels.float())
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f'Epoch {epoch+1}/{n_epochs}, Loss: {epoch_loss/len(train_loader):.4f}')

# Initialize and train RNN and LSTM models with GloVe embeddings
rnn_model = RNNModel(len(vocab) + 1, embedding_dim, 128, 1, embedding_matrix)
lstm_model = LSTMModel(len(vocab) + 1, embedding_dim, 128, 1, embedding_matrix)

optimizer_rnn = optim.Adam(rnn_model.parameters(), lr=0.001)
optimizer_lstm = optim.Adam(lstm_model.parameters(), lr=0.001)

criterion = nn.BCEWithLogitsLoss()

train_model(rnn_model, train_loader, optimizer_rnn, criterion)
train_model(lstm_model, train_loader, optimizer_lstm, criterion)


Epoch 1/5, Loss: 0.6905
Epoch 2/5, Loss: 0.6826
Epoch 3/5, Loss: 0.6302
Epoch 4/5, Loss: 0.6821
Epoch 5/5, Loss: 0.6431
Epoch 1/5, Loss: 0.6660
Epoch 2/5, Loss: 0.5272
Epoch 3/5, Loss: 0.3124
Epoch 4/5, Loss: 0.1760
Epoch 5/5, Loss: 0.0848


In [13]:
# Define models with on-the-fly (trainable) embeddings
class RNNModelOnTheFly(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(RNNModelOnTheFly, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)  # Trainable embeddings
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded)
        return self.fc(hidden[-1])

class LSTMModelOnTheFly(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMModelOnTheFly, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)  # Trainable embeddings
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        output, (hidden, cell) = self.lstm(embedded)
        return self.fc(hidden[-1])


In [14]:
# Initialize and train on-the-fly RNN and LSTM models
rnn_model_on_the_fly = RNNModelOnTheFly(len(vocab) + 1, embedding_dim, 128, 1)
lstm_model_on_the_fly = LSTMModelOnTheFly(len(vocab) + 1, embedding_dim, 128, 1)

optimizer_rnn_fly = optim.Adam(rnn_model_on_the_fly.parameters(), lr=0.001)
optimizer_lstm_fly = optim.Adam(lstm_model_on_the_fly.parameters(), lr=0.001)

train_model(rnn_model_on_the_fly, train_loader, optimizer_rnn_fly, criterion)
train_model(lstm_model_on_the_fly, train_loader, optimizer_lstm_fly, criterion)


Epoch 1/5, Loss: 0.6955
Epoch 2/5, Loss: 0.6912
Epoch 3/5, Loss: 0.6854
Epoch 4/5, Loss: 0.6634
Epoch 5/5, Loss: 0.6350
Epoch 1/5, Loss: 0.6819
Epoch 2/5, Loss: 0.6256
Epoch 3/5, Loss: 0.5172
Epoch 4/5, Loss: 0.4010
Epoch 5/5, Loss: 0.3050


In [15]:
# Evaluation function to calculate accuracy
def evaluate_model(model, test_loader):
    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            preds = torch.round(torch.sigmoid(outputs.squeeze()))  # Use sigmoid for binary classification
            predictions.extend(preds.tolist())
            true_labels.extend(labels.tolist())

    accuracy = accuracy_score(true_labels, predictions)
    return accuracy

# Evaluate all models
rnn_acc = evaluate_model(rnn_model, test_loader)
lstm_acc = evaluate_model(lstm_model, test_loader)
rnn_fly_acc = evaluate_model(rnn_model_on_the_fly, test_loader)
lstm_fly_acc = evaluate_model(lstm_model_on_the_fly, test_loader)

# Print the accuracy results
print(f'RNN with GloVe Accuracy: {rnn_acc:.4f}')
print(f'LSTM with GloVe Accuracy: {lstm_acc:.4f}')
print(f'RNN with On-the-Fly Embeddings Accuracy: {rnn_fly_acc:.4f}')
print(f'LSTM with On-the-Fly Embeddings Accuracy: {lstm_fly_acc:.4f}')

RNN with GloVe Accuracy: 0.5556
LSTM with GloVe Accuracy: 0.7222
RNN with On-the-Fly Embeddings Accuracy: 0.5884
LSTM with On-the-Fly Embeddings Accuracy: 0.7767
