In [None]:
!pip install pandas numpy torch==2.0.1 torchtext==0.15.2 nltk scikit-learn

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.vocab import GloVe
from torchtext.data.utils import get_tokenizer
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Load the reviews and encode the sentiment label as an integer
# ('positive' -> 1, 'negative' -> 0); any other label value becomes NaN.
df = pd.read_csv("IMDB Dataset.csv").assign(
    sentiment=lambda frame: frame['sentiment'].map({'positive': 1, 'negative': 0})
)

In [None]:
# Shared tokenizer and stop-word set used by `tokenize` below.
tokenizer = get_tokenizer("basic_english")
stop_words = set(stopwords.words("english"))


def tokenize(text):
    """Split `text` into lower-cased tokens and drop English stop words.

    Args:
        text: a single review as a string.

    Returns:
        list of token strings, in their original order, with every token that
        appears in `stop_words` removed.
    """
    kept = []
    for token in tokenizer(text.lower()):
        if token in stop_words:
            continue
        kept.append(token)
    return kept


# One token list per review.
df['tokens'] = df['review'].apply(tokenize)

In [None]:
from collections import Counter
import torch
from torchtext.vocab import vocab

# Count every token across all reviews (df['tokens'] holds one token list per row).
counter = Counter(token for tokens in df['tokens'] for token in tokens)

# `vocab` is the torchtext.vocab.vocab factory imported above; it builds a Vocab
# from the token counts plus the two special tokens.
vocab_obj = vocab(counter, specials=["<unk>", "<pad>"])
# Tokens that are not in the vocabulary resolve to the <unk> index.
vocab_obj.set_default_index(vocab_obj["<unk>"])

# Sanity check: the padding index, then the index an unseen word falls back to.
print(vocab_obj["<pad>"])
print(vocab_obj["some_word"])

1
0


In [None]:
def numericalize(tokens):
    """Translate a list of tokens into their integer ids in `vocab_obj`.

    Args:
        tokens: iterable of token strings.

    Returns:
        list of ints, one per token, in the same order.
    """
    ids = []
    for token in tokens:
        ids.append(vocab_obj[token])
    return ids


# One id list per review.
df['numerical'] = df['tokens'].apply(numericalize)

# Every review is truncated or right-padded to exactly this many token ids.
MAX_LEN = 300


def pad_input(seq):
    """Return `seq` as a tensor of exactly MAX_LEN ids.

    Longer sequences keep only their first MAX_LEN ids; shorter ones are
    extended on the right with the `<pad>` id from `vocab_obj`.

    Args:
        seq: list of integer token ids.

    Returns:
        1-D torch tensor of length MAX_LEN.
    """
    clipped = seq[:MAX_LEN]
    n_missing = MAX_LEN - len(clipped)
    filler = [vocab_obj["<pad>"]] * n_missing
    return torch.tensor(clipped + filler)


df['padded'] = df['numerical'].apply(pad_input)


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 712, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.11/dist-package

In [None]:
import torch
from sklearn.model_selection import train_test_split

# Inputs: stack the per-review padded tensors into one [num_reviews, MAX_LEN]
# tensor and collect the integer labels.
X = torch.stack(df['padded'].tolist())
y = torch.tensor(df['sentiment'].tolist(), dtype=torch.long)

# Split row indices rather than the data itself: this avoids converting the
# whole token matrix to nested Python lists and back. The fixed seed makes the
# split reproducible, and stratifying keeps the label balance equal in both parts.
SPLIT_SEED = 42
train_idx, test_idx = train_test_split(
    list(range(len(y))),
    test_size=0.2,
    random_state=SPLIT_SEED,
    stratify=y.tolist(),
)
train_idx = torch.tensor(train_idx, dtype=torch.long)
test_idx = torch.tensor(test_idx, dtype=torch.long)

# Index the tensors directly; dtype stays torch.long for both ids and labels.
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# Create datasets and dataloaders
train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
test_dataset = torch.utils.data.TensorDataset(X_test, y_test)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64)

In [None]:
import torch
import torch.nn as nn

class RNNClassifier(nn.Module):
    """Single-layer vanilla RNN binary classifier over pretrained embeddings.

    Args:
        embedding_matrix: 2-D float tensor [vocab_size, emb_dim] used to
            initialise the trainable embedding layer. It is copied, so
            training this model never modifies the caller's tensor.
        hidden_dim: size of the RNN hidden state (default 128).
        pad_idx: optional id of the padding token. When given, the RNN stops
            at the last non-padding position of each sequence instead of
            running over the padding. Assumes padding only appears at the end
            of a sequence. ``None`` (default) processes every position.
    """

    def __init__(self, embedding_matrix, hidden_dim=128, pad_idx=None):
        super().__init__()
        embedding_dim = embedding_matrix.shape[1]
        self.pad_idx = pad_idx
        # from_pretrained wraps the given tensor without copying it; clone so
        # that several models built from one matrix do not share (and
        # overwrite) each other's embedding weights during training.
        self.embedding = nn.Embedding.from_pretrained(
            embedding_matrix.detach().clone(), freeze=False
        )
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return the positive-class probability for each sequence.

        Args:
            x: integer tensor of token ids, [batch_size, seq_len].

        Returns:
            float tensor [batch_size] with values in (0, 1).
        """
        embedded = self.embedding(x)                           # [batch_size, seq_len, emb_dim]
        if self.pad_idx is not None:
            # Number of real tokens per row; clamp so an all-padding row still
            # has length 1 (pack_padded_sequence rejects zero lengths).
            lengths = (x != self.pad_idx).sum(dim=1).clamp(min=1).cpu()
            embedded = nn.utils.rnn.pack_padded_sequence(
                embedded, lengths, batch_first=True, enforce_sorted=False
            )
        _, hidden = self.rnn(embedded)                         # hidden: [1, batch_size, hidden_dim]
        out = self.fc(hidden.squeeze(0))                       # [batch_size, 1]
        return torch.sigmoid(out).squeeze(1)                   # [batch_size]

In [None]:
import torch
import torch.nn as nn

class LSTMClassifier(nn.Module):
    """Single-layer LSTM binary classifier over pretrained embeddings.

    Args:
        embedding_matrix: 2-D float tensor [vocab_size, emb_dim] used to
            initialise the trainable embedding layer. It is copied, so
            training this model never modifies the caller's tensor.
        hidden_dim: size of the LSTM hidden state (default 128).
        pad_idx: optional id of the padding token. When given, the LSTM stops
            at the last non-padding position of each sequence instead of
            running over the padding. Assumes padding only appears at the end
            of a sequence. ``None`` (default) processes every position.
    """

    def __init__(self, embedding_matrix, hidden_dim=128, pad_idx=None):
        super().__init__()
        embedding_dim = embedding_matrix.shape[1]  # taken from the matrix, not hard-coded
        self.pad_idx = pad_idx
        # from_pretrained wraps the given tensor without copying it; clone so
        # that several models built from one matrix do not share (and
        # overwrite) each other's embedding weights during training.
        self.embedding = nn.Embedding.from_pretrained(
            embedding_matrix.detach().clone(), freeze=False
        )
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return the positive-class probability for each sequence.

        Args:
            x: integer tensor of token ids, [batch_size, seq_len].

        Returns:
            float tensor [batch_size] with values in (0, 1).
        """
        embedded = self.embedding(x)                      # [batch_size, seq_len, emb_dim]
        if self.pad_idx is not None:
            # Number of real tokens per row; clamp so an all-padding row still
            # has length 1 (pack_padded_sequence rejects zero lengths).
            lengths = (x != self.pad_idx).sum(dim=1).clamp(min=1).cpu()
            embedded = nn.utils.rnn.pack_padded_sequence(
                embedded, lengths, batch_first=True, enforce_sorted=False
            )
        _, (hidden, _) = self.lstm(embedded)              # hidden: [1, batch_size, hidden_dim]
        out = self.fc(hidden.squeeze(0))                  # [batch_size, 1]
        return torch.sigmoid(out).squeeze(1)              # [batch_size]

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

def train_model(model, loader, epochs=5, device=None):
    """Train a binary classifier with BCE loss and Adam, printing per-epoch stats.

    Args:
        model: module whose forward returns probabilities of shape [batch_size].
        loader: iterable of (inputs, labels) batches.
        epochs: number of passes over `loader`.
        device: torch device to train on. When omitted, CUDA is used if
            available, otherwise the CPU, so the function no longer depends
            on a module-level `device` defined in a later cell.

    Prints one line per epoch with the mean batch loss and the training
    accuracy in percent. Returns nothing.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        correct = 0
        total = 0
        num_batches = 0

        for xb, yb in loader:
            # BCELoss needs float targets.
            xb, yb = xb.to(device), yb.to(device).float()

            optimizer.zero_grad()
            outputs = model(xb)  # shape: [batch_size]
            loss = criterion(outputs, yb)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

            # Training accuracy at a 0.5 decision threshold.
            preds = (outputs >= 0.5).float()
            correct += (preds == yb).sum().item()
            total += yb.size(0)

        # Guard against an empty loader instead of dividing by zero.
        avg_loss = total_loss / num_batches if num_batches else 0.0
        accuracy = correct / total * 100 if total else 0.0
        print(f"Epoch {epoch+1}/{epochs} | Loss: {avg_loss:.4f} | Accuracy: {accuracy:.2f}%")

In [None]:
def evaluate(model, loader, device=None):
    """Print the classification accuracy of `model` over `loader`.

    Args:
        model: module whose forward returns probabilities of shape [batch_size].
        loader: iterable of (inputs, labels) batches.
        device: torch device to evaluate on. When omitted, CUDA is used if
            available, otherwise the CPU, so the function no longer depends
            on a module-level `device` defined in a later cell.

    Prints "Test Accuracy: <pct>%" (0.00% for an empty loader). Returns nothing.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for xb, yb in loader:
            # Labels are cast to float so they compare against the float predictions.
            xb, yb = xb.to(device), yb.to(device).float()
            outputs = model(xb)
            preds = (outputs >= 0.5).float()
            correct += (preds == yb).sum().item()
            total += yb.size(0)
    # Same empty-loader guard as the later evaluate definition in this notebook.
    accuracy = 100 * correct / total if total != 0 else 0
    print(f"Test Accuracy: {accuracy:.2f}%")

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

# -------------------------
# Device configuration
# -------------------------
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# -------------------------
# Reproducibility
# -------------------------
# Everything below is random (data, embeddings, split); fix the seed so a
# Restart & Run All reproduces the same tensors.
SEED = 42

# -------------------------
# Dummy data (replace with real data)
# -------------------------
# NOTE(review): this cell rebinds X, y, train_dataset, test_dataset,
# train_loader and test_loader, which the IMDB train/test split cell above
# also defines. After it runs, training and evaluation use the random
# tensors below, not the reviews.
# Each sample is a sequence of 50 random token ids drawn from a 5000-word
# vocabulary; the embedding layer maps every id to a 100-dimensional vector.
vocab_size = 5000
embedding_dim = 100
sequence_length = 50
num_samples = 1000

torch.manual_seed(SEED)
X = torch.randint(0, vocab_size, (num_samples, sequence_length))
y = torch.randint(0, 2, (num_samples,)).float()  # random 0/1 labels

# -------------------------
# Embedding matrix (random for demo)
# Replace with pretrained like GloVe if available
# -------------------------
embedding_matrix = torch.randn(vocab_size, embedding_dim)

# -------------------------
# Dataset and DataLoader
# -------------------------
dataset = TensorDataset(X, y)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

# 80/20 random split; deterministic given the seed set above.
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

# -------------------------
# Model Definitions
# -------------------------
class RNNClassifier(nn.Module):
    """Single-layer vanilla RNN binary classifier over pretrained embeddings.

    Args:
        embedding_matrix: 2-D float tensor [vocab_size, emb_dim] used to
            initialise the trainable embedding layer. It is copied, so
            training this model never modifies the caller's tensor.
    """

    def __init__(self, embedding_matrix):
        super().__init__()
        # Take the input width from the matrix itself rather than from a
        # module-level variable that could disagree with it.
        emb_dim = embedding_matrix.shape[1]
        # from_pretrained wraps the given tensor without copying it; clone so
        # models built from the same matrix do not share embedding weights.
        self.embedding = nn.Embedding.from_pretrained(
            embedding_matrix.detach().clone(), freeze=False
        )
        self.rnn = nn.RNN(emb_dim, 128, batch_first=True)
        self.fc = nn.Linear(128, 1)

    def forward(self, x):
        """Return probabilities of shape [batch_size, 1] for token ids `x` [batch_size, seq_len]."""
        embedded = self.embedding(x)
        _, hidden = self.rnn(embedded)  # hidden: [1, batch_size, 128]
        return torch.sigmoid(self.fc(hidden.squeeze(0)))

class LSTMClassifier(nn.Module):
    """Single-layer LSTM binary classifier over pretrained embeddings.

    Args:
        embedding_matrix: 2-D float tensor [vocab_size, emb_dim] used to
            initialise the trainable embedding layer. It is copied, so
            training this model never modifies the caller's tensor.
    """

    def __init__(self, embedding_matrix):
        super().__init__()
        # Take the input width from the matrix itself rather than from a
        # module-level variable that could disagree with it.
        emb_dim = embedding_matrix.shape[1]
        # from_pretrained wraps the given tensor without copying it; clone so
        # models built from the same matrix do not share embedding weights.
        self.embedding = nn.Embedding.from_pretrained(
            embedding_matrix.detach().clone(), freeze=False
        )
        self.lstm = nn.LSTM(emb_dim, 128, batch_first=True)
        self.fc = nn.Linear(128, 1)

    def forward(self, x):
        """Return probabilities of shape [batch_size, 1] for token ids `x` [batch_size, seq_len]."""
        embedded = self.embedding(x)
        _, (hidden, _) = self.lstm(embedded)  # hidden: [1, batch_size, 128]
        return torch.sigmoid(self.fc(hidden.squeeze(0)))

# -------------------------
# Training function
# -------------------------
def train_model(model, loader, epochs=5, device=None):
    """Train a binary classifier with BCE loss and Adam, printing the mean loss per epoch.

    Args:
        model: module whose forward returns one probability per sample,
            shaped [batch_size, 1] or [batch_size].
        loader: sized iterable of (inputs, labels) batches.
        epochs: number of passes over `loader`.
        device: torch device to train on; when omitted, CUDA is used if
            available, otherwise the CPU.

    Returns nothing.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for xb, yb in loader:
            xb, yb = xb.to(device), yb.to(device).float()
            optimizer.zero_grad()
            # Flatten to [batch_size]. A bare .squeeze() would also drop the
            # batch axis when a batch holds a single sample, and BCELoss then
            # rejects the input/target size mismatch.
            outputs = model(xb).reshape(-1)
            loss = criterion(outputs, yb)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(loader):.4f}")

# -------------------------
# Evaluation function
# -------------------------
def evaluate(model, loader, device=None):
    """Print the classification accuracy of `model` over `loader`.

    Args:
        model: module whose forward returns one probability per sample,
            shaped [batch_size, 1] or [batch_size].
        loader: iterable of (inputs, labels) batches.
        device: torch device to evaluate on; when omitted, CUDA is used if
            available, otherwise the CPU.

    Prints "Accuracy: <pct>%" (0.00% for an empty loader). Returns nothing.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Move the model as well as the batches, so evaluating a model that has
    # not been through train_model does not hit a device mismatch.
    model = model.to(device)
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for xb, yb in loader:
            xb = xb.to(device)
            yb = yb.to(device).float()
            # Flatten to [batch_size] so predictions line up with the labels
            # even when a batch holds a single sample.
            preds = (model(xb).reshape(-1) > 0.5).float()
            correct += (preds == yb).sum().item()
            total += yb.size(0)
    accuracy = 100 * correct / total if total != 0 else 0
    print(f"Accuracy: {accuracy:.2f}%")

# -------------------------
# Train and evaluate both architectures on the same loaders
# -------------------------
def _train_and_report(banner, model_cls):
    """Print `banner`, build `model_cls` from `embedding_matrix`, train it, then print its accuracy."""
    print(banner)
    model = model_cls(embedding_matrix)
    train_model(model, train_loader)
    evaluate(model, test_loader)
    return model


# RNN first, then LSTM; both start from the same `embedding_matrix`.
rnn_model = _train_and_report("Training RNN...", RNNClassifier)
lstm_model = _train_and_report("\nTraining LSTM...", LSTMClassifier)


Training RNN...
Epoch 1, Loss: 0.7032
Epoch 2, Loss: 0.6154
Epoch 3, Loss: 0.4966
Epoch 4, Loss: 0.3175
Epoch 5, Loss: 0.1441
Accuracy: 49.00%

Training LSTM...
Epoch 1, Loss: 0.6913
Epoch 2, Loss: 0.6488
Epoch 3, Loss: 0.5712
Epoch 4, Loss: 0.4147
Epoch 5, Loss: 0.2134
Accuracy: 53.00%
