In [5]:
import pandas as pd
import re
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from torch.nn.utils.rnn import pad_sequence

# Load and clean data
df = pd.read_csv("IMDB_Dataset.csv")

def clean_text(text):
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    return text.lower()

df["review"] = df["review"].apply(clean_text)

# Encode labels
le = LabelEncoder()
df["sentiment"] = le.fit_transform(df["sentiment"])  # 0 = negative, 1 = positive


In [6]:
tokenized = df["review"].apply(lambda x: x.split())

def build_vocab(tokenized_texts, min_freq=2):
    counter = Counter()
    for tokens in tokenized_texts:
        counter.update(tokens)
    vocab = {"<pad>": 0, "<unk>": 1}
    for word, freq in counter.items():
        if freq >= min_freq:
            vocab[word] = len(vocab)
    return vocab

vocab = build_vocab(tokenized)


In [7]:
def numericalize(tokens, vocab):
    return [vocab.get(token, vocab["<unk>"]) for token in tokens]

class IMDBDataset(Dataset):
    def __init__(self, texts, labels, vocab):
        self.texts = [torch.tensor(numericalize(t.split(), vocab), dtype=torch.long) for t in texts]
        self.labels = torch.tensor(labels.values, dtype=torch.float32)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

def collate_fn(batch):
    texts, labels = zip(*batch)
    padded = pad_sequence(texts, batch_first=True, padding_value=vocab["<pad>"])
    return padded, torch.tensor(labels)


In [8]:
X_train, X_test, y_train, y_test = train_test_split(df["review"], df["sentiment"], test_size=0.2)

train_ds = IMDBDataset(X_train, y_train, vocab)
test_ds = IMDBDataset(X_test, y_test, vocab)

train_dl = DataLoader(train_ds, batch_size=64, shuffle=True, collate_fn=collate_fn)
test_dl = DataLoader(test_ds, batch_size=64, shuffle=False, collate_fn=collate_fn)


In [9]:
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embeds = self.embedding(x)
        _, (hidden, _) = self.lstm(embeds)
        return self.sigmoid(self.fc(hidden[-1])).squeeze()


In [11]:
for inputs, labels in train_dl:
    print("Input shape:", inputs.shape)
    print("Label shape:", labels.shape)
    output = model(inputs)
    print("Output shape:", output.shape)
    print("Output values:", output[:5])
    break


Input shape: torch.Size([64, 748])
Label shape: torch.Size([64])
Output shape: torch.Size([64])
Output values: tensor([0.5089, 0.5089, 0.5089, 0.5089, 0.5089], grad_fn=<SliceBackward0>)


In [12]:
print(set(y_train))
print(train_ds.labels[:5])


{0, 1}
tensor([1., 1., 0., 1., 0.])


In [13]:
device = torch.device("cpu")
model = SentimentLSTM(vocab_size=len(vocab), embedding_dim=128, hidden_dim=128).to(device)

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(5):
    model.train()
    total_loss = 0
    for inputs, labels in train_dl:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        output = model(inputs)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} Loss: {total_loss / len(train_dl):.4f}")


Epoch 1 Loss: 0.6931
Epoch 2 Loss: 0.6927
Epoch 3 Loss: 0.6911
Epoch 4 Loss: 0.6912
Epoch 5 Loss: 0.6884


In [14]:
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for inputs, labels in test_dl:
        inputs, labels = inputs.to(device), labels.to(device)
        output = model(inputs)
        preds = (output >= 0.5).float()
        correct += (preds == labels).sum().item()
        total += labels.size(0)

print(f"Test Accuracy: {correct / total:.2f}")


Test Accuracy: 0.49


In [15]:
def predict_sentiment(text):
    text = clean_text(text)
    tokens = numericalize(text.split(), vocab)
    input_tensor = torch.tensor(tokens, dtype=torch.long).unsqueeze(0)
    with torch.no_grad():
        output = model(input_tensor)
        return "positive" if output.item() >= 0.5 else "negative"

# Try it
print(predict_sentiment("The movie was absolutely wonderful and emotional."))


positive
