# Part 1

Build a movie review sentiment classifier using GloVe embeddings and RNNs.

Tasks:
1. Train a model using GloVe embeddings with Vanilla RNNs
2. Train a model using GloVe embeddings with LSTMs
3. Repeat [1] and [2] with on-the-fly embeddings using torch

In [1]:
import kagglehub

# Download latest version
# Fetch the IMDB 50k movie-review dataset through kagglehub. `path` holds the
# value returned by dataset_download; it is joined with the CSV file name in
# the preprocessing cell.
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: C:\Users\kshit\.cache\kagglehub\datasets\lakshmi25npathi\imdb-dataset-of-50k-movie-reviews\versions\1


## Approach Overview
We will build a movie review sentiment classifier using the IMDB dataset, GloVe embeddings, and RNN-based models. The steps are:
1. **Preprocess the IMDB dataset**: Clean and tokenize the text, split into train/test sets.
2. **Load GloVe embeddings**: Download and prepare GloVe word vectors for use in our models.
3. **Prepare data for PyTorch**: Create datasets and dataloaders for training and evaluation.
4. **Model 1: Vanilla RNN with GloVe**: Build and train a simple RNN using pre-trained GloVe embeddings.
5. **Model 2: LSTM with GloVe**: Build and train an LSTM using pre-trained GloVe embeddings.
6. **Model 3: Vanilla RNN with trainable embeddings**: Use a randomly initialized embedding layer, trained on-the-fly.
7. **Model 4: LSTM with trainable embeddings**: Same as above, but with LSTM.
8. **Evaluate all models**: Compare accuracy and performance on the test set.

## Preprocessing the IMDB Dataset

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
from tqdm import tqdm
tqdm.pandas()

# Load IMDB dataset
# `path` comes from the kagglehub download cell; the CSV is read into `df`,
# whose 'review' and 'sentiment' columns are used by the steps below.
data_path = os.path.join(path, 'IMDB Dataset.csv')
df = pd.read_csv(data_path)
# Quick sanity check of what was loaded.
print('Dataset shape:', df.shape)
print(df.head())

# Clean text function
def clean_text(text):
    """Normalize a raw review into lowercase, space-separated alphanumerics.

    Steps, in order:
      1. lowercase the text;
      2. replace HTML tags (e.g. ``<br />``) with a single space;
      3. drop every character that is not a letter, digit or whitespace;
      4. collapse whitespace runs and strip the ends.

    Args:
        text: Raw review string.

    Returns:
        The cleaned string (possibly empty).
    """
    text = text.lower()
    # Substitute a space, not the empty string: reviews use "<br /><br />" as a
    # paragraph break with no surrounding spaces, so deleting the tag would
    # glue the neighbouring words together ("movie.<br />loved" -> "movieloved").
    text = re.sub(r'<.*?>', ' ', text)  # replace HTML tags with a separator
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # remove special chars
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Clean every review in place (progress bar provided by tqdm.pandas()).
df['review'] = df['review'].progress_apply(clean_text)

# Encode sentiment
# 'positive' -> 1, 'negative' -> 0; any other value would map to NaN.
df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Train/test split
# 80/20 split, stratified on the label so both parts keep the class balance.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])
# Take explicit copies: columns are added to both frames later, and assigning
# into frames derived from `df` can trigger pandas' SettingWithCopyWarning.
train_df, test_df = train_df.copy(), test_df.copy()
print('Train:', train_df.shape, 'Test:', test_df.shape)

## Loading GloVe Embeddings

In [None]:
import requests
import zipfile

# Download GloVe embeddings (100d)
# The archive is only fetched when the 100-dimensional vector file is missing.
glove_dir = './glove.6B'
glove_file = os.path.join(glove_dir, 'glove.6B.100d.txt')
if not os.path.exists(glove_file):
    url = 'http://nlp.stanford.edu/data/glove.6B.zip'
    zip_path = 'glove.6B.zip'
    print('Downloading GloVe embeddings...')
    # Use the response as a context manager so the connection is released, and
    # set a timeout so a stalled server cannot hang the notebook forever.
    with requests.get(url, stream=True, timeout=60) as r:
        # Fail loudly on HTTP errors instead of saving an error page as the zip.
        r.raise_for_status()
        with open(zip_path, 'wb') as f:
            # 1 MiB chunks: far fewer Python-level iterations than 1 KiB
            # chunks for a large archive.
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    f.write(chunk)
    print('Extracting...')
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(glove_dir)
    # The archive is no longer needed once extracted.
    os.remove(zip_path)
else:
    print('GloVe embeddings already present.')

# Load GloVe vectors into a dictionary
def load_glove_embeddings(glove_path):
    """Parse a GloVe text file into a ``{word: vector}`` dictionary.

    Every line is stripped and split on whitespace; the first field is taken
    as the word and the remaining fields are converted to a float32 array.

    Args:
        glove_path: Path of the UTF-8 encoded embeddings file.

    Returns:
        dict mapping each word to its ``numpy`` float32 vector.
    """
    table = {}
    with open(glove_path, 'r', encoding='utf8') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split()
            # fields[0] is the token, everything after it is the vector.
            table[fields[0]] = np.asarray(fields[1:], dtype='float32')
    return table

# Read the 100d GloVe file into memory; the printed number is the count of
# distinct words stored in the dictionary.
glove_embeddings = load_glove_embeddings(glove_file)
print('Loaded GloVe embeddings:', len(glove_embeddings))

## Creating Vocabulary

In [None]:
from torchtext.vocab import Vocab
from collections import Counter
import torch

# Tokenize reviews
def tokenize(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

# Build vocabulary from training data
# NOTE: the legacy constructor `Vocab(counter, max_size=...)` and the method
# `get_itos()` belong to different torchtext API generations and cannot be
# combined. This cell uses the current factory function throughout.
from collections import OrderedDict
from torchtext.vocab import vocab as build_vocab

counter = Counter()
for review in train_df['review']:
    counter.update(tokenize(review))

vocab_size = 20000  # Limit vocab size for efficiency (special tokens not counted)
# most_common() yields tokens by decreasing frequency, so the most frequent
# words get the lowest indices after the two special tokens.
vocab = build_vocab(
    OrderedDict(counter.most_common(vocab_size)),
    specials=['<unk>', '<pad>'],
)
# Without a default index, looking up an out-of-vocabulary token raises.
vocab.set_default_index(vocab['<unk>'])

# Prepare embedding matrix for GloVe
embedding_dim = 100
embedding_matrix = np.zeros((len(vocab), embedding_dim))
pad_index = vocab['<pad>']
for i, token in enumerate(vocab.get_itos()):
    if i == pad_index:
        # Keep the padding row at zero so padded positions feed no noise
        # into the recurrent layers.
        continue
    vector = glove_embeddings.get(token)
    if vector is not None:
        embedding_matrix[i] = vector
    else:
        # Tokens without a GloVe vector get a random initialization.
        embedding_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dim,))

# Numericalize text
def numericalize(text):
    """Convert ``text`` into a list of vocabulary indices.

    The text is split with ``tokenize`` and each token is looked up in the
    module-level ``vocab``.
    """
    indices = []
    for token in tokenize(text):
        indices.append(vocab[token])
    return indices

# Store each review's list of vocabulary indices in a new 'input_ids' column
# (progress_apply is the tqdm-enabled variant of apply).
train_df['input_ids'] = train_df['review'].progress_apply(numericalize)
test_df['input_ids'] = test_df['review'].progress_apply(numericalize)

## Creating Datasets and Dataloaders

In [None]:
from torch.utils.data import Dataset, DataLoader
import torch.nn.utils.rnn as rnn_utils

class IMDBDataset(Dataset):
    """Fixed-length view over numericalized reviews.

    Each item is a ``(token_ids, label)`` pair where ``token_ids`` is a long
    tensor of exactly ``max_len`` entries (truncated or right-padded) and
    ``label`` is a float32 scalar tensor.

    Args:
        df: DataFrame with an 'input_ids' column (lists of ints) and a
            'label' column.
        max_len: Length every sequence is truncated or padded to.
        pad_idx: Index used for padding. When ``None`` (default) it is read
            once from the module-level ``vocab['<pad>']``.
    """

    def __init__(self, df, max_len=200, pad_idx=None):
        self.input_ids = df['input_ids'].tolist()
        self.labels = df['label'].tolist()
        self.max_len = max_len
        # Resolve the padding index once instead of on every __getitem__ call.
        self.pad_idx = vocab['<pad>'] if pad_idx is None else pad_idx

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        ids = self.input_ids[idx][:self.max_len]
        # Right-pad; the multiplier is 0 when the review already fills max_len.
        ids = ids + [self.pad_idx] * (self.max_len - len(ids))
        return (
            torch.tensor(ids, dtype=torch.long),
            torch.tensor(self.labels[idx], dtype=torch.float32),
        )

# Every review is truncated or padded to max_len tokens by IMDBDataset.
max_len = 200
batch_size = 128

train_dataset = IMDBDataset(train_df, max_len=max_len)
test_dataset = IMDBDataset(test_df, max_len=max_len)

# Only the training loader shuffles; the test loader keeps the dataset order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
import torch.nn as nn

class VanillaRNN(nn.Module):
    """Vanilla RNN sentiment classifier on frozen pre-trained embeddings.

    Args:
        embedding_matrix: 2-D array of shape (num_embeddings, embedding_dim)
            holding the pre-trained vectors.
        hidden_dim: Size of the RNN hidden state.
        num_layers: Number of stacked RNN layers.
        num_classes: Output size of the final linear layer.
        dropout: Dropout between stacked RNN layers. It is only passed to the
            RNN when ``num_layers > 1``, because with a single layer PyTorch
            ignores it and emits a warning.
        padding_idx: Index of the padding token. When ``None`` (default) it
            is read from the module-level ``vocab['<pad>']``.
    """

    def __init__(self, embedding_matrix, hidden_dim=128, num_layers=1, num_classes=1, dropout=0.2, padding_idx=None):
        super().__init__()
        if padding_idx is None:
            padding_idx = vocab['<pad>']
        self.padding_idx = padding_idx
        _, embedding_dim = embedding_matrix.shape
        self.embedding = nn.Embedding.from_pretrained(
            torch.tensor(embedding_matrix, dtype=torch.float32),
            freeze=True,
            padding_idx=padding_idx,
        )
        self.rnn = nn.RNN(
            embedding_dim, hidden_dim, num_layers, batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return per-sample probabilities for a batch of token ids.

        Args:
            x: Integer tensor of shape (batch, seq_len); sequences are assumed
               to be right-padded with ``padding_idx``.

        Returns:
            Tensor of shape (batch,) when ``num_classes == 1``, otherwise
            (batch, num_classes).
        """
        # Number of real tokens per row; clamp so an all-padding row still
        # selects a valid position.
        lengths = (x != self.padding_idx).sum(dim=1).clamp(min=1)
        out, _ = self.rnn(self.embedding(x))
        # Read the hidden state at the last real token instead of at the last
        # position, which for short reviews follows a long run of padding.
        rows = torch.arange(x.size(0), device=x.device)
        last = out[rows, lengths - 1]
        # squeeze(-1) keeps the batch dimension even for a batch of one.
        return self.sigmoid(self.fc(last)).squeeze(-1)

# Instantiate model
# Use the GPU when CUDA is available, otherwise fall back to the CPU; `device`
# is also read by the training and evaluation helpers.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Vanilla RNN on top of the frozen GloVe embedding matrix.
rnn_model = VanillaRNN(embedding_matrix).to(device)

In [None]:
import torch.optim as optim

def train_model(model, train_loader, test_loader, epochs=3, lr=1e-3):
    """Train ``model`` with Adam and binary cross-entropy.

    Args:
        model: Module whose forward pass returns probabilities in [0, 1].
        train_loader: Iterable of ``(inputs, labels)`` batches.
        test_loader: Accepted for interface compatibility; not used here.
        epochs: Number of passes over ``train_loader``.
        lr: Adam learning rate.

    Returns:
        The same ``model`` instance, trained in place.
    """
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    # Follow the model's own device rather than a module-level global, so the
    # function works for any model regardless of notebook state.
    model_device = next(model.parameters()).device
    num_batches = len(train_loader)
    for epoch in range(epochs):
        # Re-enter training mode every epoch in case the model was switched to
        # eval mode in between.
        model.train()
        total_loss = 0.0
        for inputs, labels in tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs}'):
            inputs, labels = inputs.to(model_device), labels.to(model_device)
            optimizer.zero_grad()
            # Align the output with the label shape: a model that squeezes its
            # output returns a 0-d tensor for a batch of one, which BCELoss
            # rejects as a size mismatch.
            outputs = model(inputs).reshape(labels.shape)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        # Guard against an empty loader instead of dividing by zero.
        mean_loss = total_loss / num_batches if num_batches else float('nan')
        print(f'Epoch {epoch+1}, Loss: {mean_loss:.4f}')
    return model

def evaluate_model(model, data_loader):
    """Compute and print the accuracy of ``model`` at a 0.5 threshold.

    Args:
        model: Module whose forward pass returns probabilities in [0, 1].
        data_loader: Iterable of ``(inputs, labels)`` batches.

    Returns:
        Fraction of correctly classified samples, or ``0.0`` when
        ``data_loader`` yields no samples.
    """
    model.eval()
    # Follow the model's own device rather than a module-level global.
    model_device = next(model.parameters()).device
    correct, total = 0, 0
    with torch.no_grad():
        for inputs, labels in data_loader:
            inputs, labels = inputs.to(model_device), labels.to(model_device)
            # Align with the label shape so a (batch, 1) or 0-d output is
            # compared element-wise rather than through broadcasting.
            outputs = model(inputs).reshape(labels.shape)
            preds = (outputs > 0.5).float()
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    # Avoid ZeroDivisionError on an empty loader.
    acc = correct / total if total else 0.0
    print(f'Accuracy: {acc:.4f}')
    return acc

In [None]:
class LSTMModel(nn.Module):
    """LSTM sentiment classifier on frozen pre-trained embeddings.

    Args:
        embedding_matrix: 2-D array of shape (num_embeddings, embedding_dim)
            holding the pre-trained vectors.
        hidden_dim: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        num_classes: Output size of the final linear layer.
        dropout: Dropout between stacked LSTM layers. It is only passed to the
            LSTM when ``num_layers > 1``, because with a single layer PyTorch
            ignores it and emits a warning.
        padding_idx: Index of the padding token. When ``None`` (default) it
            is read from the module-level ``vocab['<pad>']``.
    """

    def __init__(self, embedding_matrix, hidden_dim=128, num_layers=1, num_classes=1, dropout=0.2, padding_idx=None):
        super().__init__()
        if padding_idx is None:
            padding_idx = vocab['<pad>']
        self.padding_idx = padding_idx
        _, embedding_dim = embedding_matrix.shape
        self.embedding = nn.Embedding.from_pretrained(
            torch.tensor(embedding_matrix, dtype=torch.float32),
            freeze=True,
            padding_idx=padding_idx,
        )
        self.lstm = nn.LSTM(
            embedding_dim, hidden_dim, num_layers, batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return per-sample probabilities for a batch of token ids.

        Args:
            x: Integer tensor of shape (batch, seq_len); sequences are assumed
               to be right-padded with ``padding_idx``.

        Returns:
            Tensor of shape (batch,) when ``num_classes == 1``, otherwise
            (batch, num_classes).
        """
        # Number of real tokens per row; clamp so an all-padding row still
        # selects a valid position.
        lengths = (x != self.padding_idx).sum(dim=1).clamp(min=1)
        out, _ = self.lstm(self.embedding(x))
        # Read the output at the last real token rather than after the padding.
        rows = torch.arange(x.size(0), device=x.device)
        last = out[rows, lengths - 1]
        # squeeze(-1) keeps the batch dimension even for a batch of one.
        return self.sigmoid(self.fc(last)).squeeze(-1)

# Instantiate the LSTM model on the selected device
# (this cell only constructs the model; no training call is made here).
lstm_model = LSTMModel(embedding_matrix).to(device)

In [None]:
# Vanilla RNN with trainable embeddings
class VanillaRNNTrainable(nn.Module):
    """Vanilla RNN sentiment classifier with embeddings learned from scratch.

    Args:
        vocab_size: Number of rows of the embedding table.
        embedding_dim: Size of each embedding vector.
        hidden_dim: Size of the RNN hidden state.
        num_layers: Number of stacked RNN layers.
        num_classes: Output size of the final linear layer.
        dropout: Dropout between stacked RNN layers. It is only passed to the
            RNN when ``num_layers > 1``, because with a single layer PyTorch
            ignores it and emits a warning.
        padding_idx: Index of the padding token. When ``None`` (default) it
            is read from the module-level ``vocab['<pad>']``.
    """

    def __init__(self, vocab_size, embedding_dim=100, hidden_dim=128, num_layers=1, num_classes=1, dropout=0.2, padding_idx=None):
        super().__init__()
        if padding_idx is None:
            padding_idx = vocab['<pad>']
        self.padding_idx = padding_idx
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.rnn = nn.RNN(
            embedding_dim, hidden_dim, num_layers, batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return per-sample probabilities for a batch of token ids.

        Args:
            x: Integer tensor of shape (batch, seq_len); sequences are assumed
               to be right-padded with ``padding_idx``.

        Returns:
            Tensor of shape (batch,) when ``num_classes == 1``, otherwise
            (batch, num_classes).
        """
        # Number of real tokens per row; clamp so an all-padding row still
        # selects a valid position.
        lengths = (x != self.padding_idx).sum(dim=1).clamp(min=1)
        out, _ = self.rnn(self.embedding(x))
        # Read the hidden state at the last real token rather than after the
        # trailing padding.
        rows = torch.arange(x.size(0), device=x.device)
        last = out[rows, lengths - 1]
        # squeeze(-1) keeps the batch dimension even for a batch of one.
        return self.sigmoid(self.fc(last)).squeeze(-1)

# LSTM with trainable embeddings
class LSTMTrainable(nn.Module):
    """LSTM sentiment classifier with embeddings learned from scratch.

    Args:
        vocab_size: Number of rows of the embedding table.
        embedding_dim: Size of each embedding vector.
        hidden_dim: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        num_classes: Output size of the final linear layer.
        dropout: Dropout between stacked LSTM layers. It is only passed to the
            LSTM when ``num_layers > 1``, because with a single layer PyTorch
            ignores it and emits a warning.
        padding_idx: Index of the padding token. When ``None`` (default) it
            is read from the module-level ``vocab['<pad>']``.
    """

    def __init__(self, vocab_size, embedding_dim=100, hidden_dim=128, num_layers=1, num_classes=1, dropout=0.2, padding_idx=None):
        super().__init__()
        if padding_idx is None:
            padding_idx = vocab['<pad>']
        self.padding_idx = padding_idx
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(
            embedding_dim, hidden_dim, num_layers, batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return per-sample probabilities for a batch of token ids.

        Args:
            x: Integer tensor of shape (batch, seq_len); sequences are assumed
               to be right-padded with ``padding_idx``.

        Returns:
            Tensor of shape (batch,) when ``num_classes == 1``, otherwise
            (batch, num_classes).
        """
        # Number of real tokens per row; clamp so an all-padding row still
        # selects a valid position.
        lengths = (x != self.padding_idx).sum(dim=1).clamp(min=1)
        out, _ = self.lstm(self.embedding(x))
        # Read the output at the last real token rather than after the padding.
        rows = torch.arange(x.size(0), device=x.device)
        last = out[rows, lengths - 1]
        # squeeze(-1) keeps the batch dimension even for a batch of one.
        return self.sigmoid(self.fc(last)).squeeze(-1)