# **Task 2 - Sentiment Classification**

RNN

In [1]:
import pandas as pd
df = pd.read_csv('./data/IMDMovieReview_processed.csv')
df.head(10)

Unnamed: 0,review,sentiment,preprocessed
0,One of the other reviewers has mentioned that ...,positive,one reviewers mentioned watching 1 oz episode ...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically theres family little boy jake thinks...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...
5,"Probably my all-time favorite movie, a story o...",positive,probably alltime favorite movie story selfless...
6,I sure would like to see a resurrection of a u...,positive,sure would like see resurrection dated seahunt...
7,"This show was an amazing, fresh & innovative i...",negative,show amazing fresh innovative idea 70s first a...
8,Encouraged by the positive comments about this...,negative,encouraged positive comments film looking forw...
9,If you like original gut wrenching laughter yo...,positive,like original gut wrenching laughter like movi...


Since the dataset is already preprocessed, we just can train directly

In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from nltk.tokenize import word_tokenize
from collections import Counter
import numpy as np

from sklearn.preprocessing import LabelEncoder
from torch.nn.utils.rnn import pad_sequence

In [9]:
texts = list(df['preprocessed'].values)
labels = list(df['sentiment'].values)

In [10]:
# Basic tokenizer
def tokenize(text):
    return text.split()

# Build vocab
all_tokens = [token for text in texts for token in tokenize(text)]
vocab = {'<PAD>': 0, '<UNK>': 1}
vocab.update({word: idx + 2 for idx, (word, _) in enumerate(Counter(all_tokens).items())})

# Encode tokens
def encode(text):
    return [vocab.get(token, vocab['<UNK>']) for token in tokenize(text)]

In [13]:
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)  # 'positive'->1, 'negative'->0

In [15]:
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.encoded_texts = [torch.tensor(encode(text), dtype=torch.long) for text in texts]
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.encoded_texts[idx], self.labels[idx]

# Collate function to pad batches
def collate_fn(batch):
    texts, labels = zip(*batch)
    lengths = torch.tensor([len(x) for x in texts])
    padded_texts = pad_sequence(texts, batch_first=True, padding_value=vocab['<PAD>'])
    labels = torch.tensor(labels, dtype=torch.float32)
    return padded_texts, lengths, labels

In [16]:
X_train, X_test, y_train, y_test = train_test_split(texts, labels_encoded, test_size=0.2, random_state=42)
train_dataset = TextDataset(X_train, y_train)
test_dataset = TextDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

In [17]:
class SimpleRNNClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=vocab['<PAD>'])
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, lengths):
        embedded = self.embedding(x)  # [B, T, E]
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)
        output, hidden = self.rnn(packed)
        out = self.fc(hidden.squeeze(0))  # hidden: [1, B, H] → [B, H]
        return self.sigmoid(out).squeeze()

In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SimpleRNNClassifier(vocab_size=len(vocab), embedding_dim=100, hidden_dim=64).to(device)

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [19]:
TOTAL_EPOCH = 5
for epoch in range(TOTAL_EPOCH):
    model.train()
    total_loss = 0
    for batch in train_loader:
        x, lengths, y = [b.to(device) for b in batch]
        optimizer.zero_grad()
        preds = model(x, lengths)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")

Epoch 1, Loss: 0.6519
Epoch 2, Loss: 0.6067
Epoch 3, Loss: 0.5208
Epoch 4, Loss: 0.4226
Epoch 5, Loss: 0.3685


In [20]:
from sklearn.metrics import classification_report

model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        x, lengths, y = [b.to(device) for b in batch]
        preds = model(x, lengths)
        predicted = (preds > 0.5).float()
        
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(y.cpu().numpy())

In [21]:
all_preds = np.array(all_preds).astype(int)
all_labels = np.array(all_labels).astype(int)

# Generate report
print(classification_report(all_labels, all_preds, target_names=label_encoder.classes_))

              precision    recall  f1-score   support

    negative       0.80      0.79      0.80      4961
    positive       0.80      0.80      0.80      5039

    accuracy                           0.80     10000
   macro avg       0.80      0.80      0.80     10000
weighted avg       0.80      0.80      0.80     10000

