In [1]:
# Cell 1 — Imports
import os
import random
import re
import string
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Cell 2 — Config / Hyperparameters
DATA_DIR = "/kaggle/input/imdb-movie-review-sentiment-analysis"
TRAIN_PATH = os.path.join(DATA_DIR, "train.csv")
TEST_PATH  = os.path.join(DATA_DIR, "test.csv")

MAX_VOCAB_SIZE = 20000
MAX_LEN = 200
EMBEDDING_DIM = 128
HIDDEN_SIZE = 128
NUM_LAYERS = 1
BIDIRECTIONAL = True
BATCH_SIZE = 64
EPOCHS = 6
LR = 1e-3
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Cell 3 — Utility: simple tokenizer (Keras-like behavior)
class SimpleTokenizer:
    def __init__(self, num_words=None, oov_token='<OOV>'):
        self.num_words = num_words
        self.oov_token = oov_token
        self.word_counts = Counter()
        self.word_index = {}
        self.index_word = {}

    @staticmethod
    def _clean_text(text):
        text = text.lower()
        # remove punctuation (keep basic contractions if desired)
        text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
        text = re.sub(r"\s+", " ", text).strip()
        return text

    def fit_on_texts(self, texts):
        for t in texts:
            t = '' if pd.isna(t) else t
            cleaned = self._clean_text(str(t))
            tokens = cleaned.split()
            self.word_counts.update(tokens)

        # most common
        most_common = self.word_counts.most_common(self.num_words) if self.num_words else self.word_counts.most_common()
        # reserve index=1.. for words; 1 is for most common word
        # index 1 reserved for OOV token
        self.word_index = {self.oov_token:1}
        idx = 2
        for w, _ in most_common:
            if w == self.oov_token:
                continue
            if w in self.word_index:
                continue
            self.word_index[w] = idx
            idx += 1
        self.index_word = {i:w for w,i in self.word_index.items()}

    def texts_to_sequences(self, texts):
        seqs = []
        for t in texts:
            t = '' if pd.isna(t) else t
            cleaned = self._clean_text(str(t))
            tokens = cleaned.split()
            seq = []
            for tok in tokens:
                if tok in self.word_index:
                    seq.append(self.word_index[tok])
                else:
                    seq.append(self.word_index[self.oov_token])
            seqs.append(seq)
        return seqs

# padding function
from typing import List

def pad_sequences(sequences: List[List[int]], maxlen: int, padding='post', truncating='post', value=0):
    padded = np.full((len(sequences), maxlen), value, dtype=np.int64)
    for i, seq in enumerate(sequences):
        if len(seq) == 0:
            continue
        if len(seq) <= maxlen:
            if padding == 'post':
                padded[i, :len(seq)] = seq
            else:
                padded[i, -len(seq):] = seq
        else:
            if truncating == 'post':
                seq = seq[:maxlen]
            else:
                seq = seq[-maxlen:]
            if padding == 'post':
                padded[i, :maxlen] = seq
            else:
                padded[i, :] = seq
    return padded

# Cell 4 — Load data
train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)
print('Train shape:', train.shape)
print('Test shape:', test.shape)

# train/val split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train['text'].astype(str),
    train['label'].astype(int),
    test_size=0.1,
    random_state=SEED,
    stratify=train['label']
)

# Cell 5 — Tokenizer + sequences
tokenizer = SimpleTokenizer(num_words=MAX_VOCAB_SIZE, oov_token='<OOV>')
tokenizer.fit_on_texts(train_texts)

print('Vocab size (including OOV):', len(tokenizer.word_index))

X_train_seq = tokenizer.texts_to_sequences(train_texts)
X_val_seq   = tokenizer.texts_to_sequences(val_texts)
X_test_seq  = tokenizer.texts_to_sequences(test['text'].astype(str))

X_train = pad_sequences(X_train_seq, maxlen=MAX_LEN)
X_val   = pad_sequences(X_val_seq,   maxlen=MAX_LEN)
X_test  = pad_sequences(X_test_seq,  maxlen=MAX_LEN)

y_train = train_labels.values.astype(np.int64)
y_val   = val_labels.values.astype(np.int64)

print('X_train shape:', X_train.shape)
print('X_val shape:', X_val.shape)
print('X_test shape:', X_test.shape)

# Cell 6 — Dataset & DataLoader
class ReviewDataset(Dataset):
    def __init__(self, inputs, labels=None):
        self.inputs = torch.tensor(inputs, dtype=torch.long)
        self.labels = torch.tensor(labels, dtype=torch.float) if labels is not None else None

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        if self.labels is None:
            return self.inputs[idx]
        return self.inputs[idx], self.labels[idx]

train_dataset = ReviewDataset(X_train, y_train)
val_dataset   = ReviewDataset(X_val, y_val)
test_dataset  = ReviewDataset(X_test, None)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=False)
val_loader   = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Cell 7 — Class weight (pos_weight for BCEWithLogitsLoss)
counts = np.bincount(y_train)
num_pos = counts[1]
num_neg = counts[0]
pos_weight = torch.tensor([num_neg / (num_pos + 1e-8)], dtype=torch.float).to(DEVICE)
print('Class counts (neg,pos):', num_neg, num_pos)
print('pos_weight for BCEWithLogitsLoss:', pos_weight.item())

# Cell 8 — Model definition
class BiLSTMSentiment(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, num_layers=1, bidirectional=True, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
            dropout=0.0 if num_layers==1 else dropout
        )
        lstm_output_dim = hidden_size * (2 if bidirectional else 1)
        self.fc = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(lstm_output_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        emb = self.embedding(x)
        # pack/pad could be used — here we use fixed-length padded sequences
        output, (hn, cn) = self.lstm(emb)
        # use last hidden state(s)
        if self.lstm.bidirectional:
            # concatenate last forward and backward hidden
            last_hidden = torch.cat((hn[-2,:,:], hn[-1,:,:]), dim=1)
        else:
            last_hidden = hn[-1,:,:]
        logits = self.fc(last_hidden).squeeze(1)
        return logits

vocab_size = min(MAX_VOCAB_SIZE + 2, len(tokenizer.word_index) + 2)  # +2 for padding index 0 and safety
model = BiLSTMSentiment(vocab_size=vocab_size, embed_dim=EMBEDDING_DIM, hidden_size=HIDDEN_SIZE,
                        num_layers=NUM_LAYERS, bidirectional=BIDIRECTIONAL).to(DEVICE)

print(model)

# Cell 9 — Loss, optimizer
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

# Cell 10 — Training loop
from tqdm import tqdm

best_val_f1 = 0.0
best_model_path = 'best_lstm_pytorch.pth'

for epoch in range(1, EPOCHS+1):
    model.train()
    train_losses = []
    for xb, yb in tqdm(train_loader, desc=f"Train Epoch {epoch}"):
        xb = xb.to(DEVICE)
        yb = yb.to(DEVICE)
        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())

    avg_train_loss = np.mean(train_losses)

    # validation
    model.eval()
    val_preds = []
    val_trues = []
    val_losses = []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb = xb.to(DEVICE)
            yb = yb.to(DEVICE)
            logits = model(xb)
            loss = criterion(logits, yb)
            val_losses.append(loss.item())
            probs = torch.sigmoid(logits).cpu().numpy()
            preds = (probs >= 0.5).astype(int)
            val_preds.extend(preds.tolist())
            val_trues.extend(yb.cpu().numpy().astype(int).tolist())

    val_acc = accuracy_score(val_trues, val_preds)
    val_f1 = f1_score(val_trues, val_preds, average='macro')
    avg_val_loss = np.mean(val_losses)

    print(f"Epoch {epoch} — train_loss: {avg_train_loss:.4f} | val_loss: {avg_val_loss:.4f} | val_acc: {val_acc:.4f} | val_macro_f1: {val_f1:.4f}")

    # save best
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        torch.save(model.state_dict(), best_model_path)
        print('Saved best model ->', best_model_path)

# load best model
model.load_state_dict(torch.load(best_model_path))

# Cell 11 — Full validation report
model.eval()
val_preds = []
val_trues = []
with torch.no_grad():
    for xb, yb in val_loader:
        xb = xb.to(DEVICE)
        logits = model(xb)
        probs = torch.sigmoid(logits).cpu().numpy()
        preds = (probs >= 0.5).astype(int)
        val_preds.extend(preds.tolist())
        val_trues.extend(yb.numpy().astype(int).tolist())

print('Val Accuracy:', accuracy_score(val_trues, val_preds))
print('Val Macro F1:', f1_score(val_trues, val_preds, average='macro'))
print(classification_report(val_trues, val_preds))

# Cell 12 — Predict on test and save submission
all_test_preds = []
model.eval()
with torch.no_grad():
    for xb in test_loader:
        xb = xb.to(DEVICE)
        logits = model(xb)
        probs = torch.sigmoid(logits).cpu().numpy()
        preds = (probs >= 0.5).astype(int)
        all_test_preds.extend(preds.tolist())

submission = pd.DataFrame({
    'id': test['id'],
    'label': all_test_preds
})
submission.to_csv('submission_pytorch.csv', index=False)
print('✅ Saved submission_pytorch.csv')

# Cell 13 — Notes / next steps
# - To use pretrained embeddings (GloVe/fastText), load embedding vectors and assign to model.embedding.weight.data
# - Consider using PackedSequence + pack_padded_sequence for better performance on variable-length inputs
# - Try Transformer / HuggingFace models for stronger baselines
# - Tune MAX_LEN, MAX_VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_SIZE, and training schedule


Train shape: (32000, 3)
Test shape: (8000, 2)
Vocab size (including OOV): 20001
X_train shape: (28800, 200)
X_val shape: (3200, 200)
X_test shape: (8000, 200)
Class counts (neg,pos): 14413 14387
pos_weight for BCEWithLogitsLoss: 1.0018072128295898
BiLSTMSentiment(
  (embedding): Embedding(20002, 128, padding_idx=0)
  (lstm): LSTM(128, 128, batch_first=True, bidirectional=True)
  (fc): Sequential(
    (0): Dropout(p=0.3, inplace=False)
    (1): Linear(in_features=256, out_features=64, bias=True)
    (2): ReLU()
    (3): Dropout(p=0.2, inplace=False)
    (4): Linear(in_features=64, out_features=1, bias=True)
  )
)


Train Epoch 1: 100%|██████████| 450/450 [00:06<00:00, 73.45it/s]


Epoch 1 — train_loss: 0.6463 | val_loss: 0.6060 | val_acc: 0.6994 | val_macro_f1: 0.6987
Saved best model -> best_lstm_pytorch.pth


Train Epoch 2: 100%|██████████| 450/450 [00:05<00:00, 85.92it/s]


Epoch 2 — train_loss: 0.5950 | val_loss: 0.5636 | val_acc: 0.7009 | val_macro_f1: 0.6994
Saved best model -> best_lstm_pytorch.pth


Train Epoch 3: 100%|██████████| 450/450 [00:05<00:00, 85.49it/s]


Epoch 3 — train_loss: 0.5162 | val_loss: 0.5360 | val_acc: 0.7300 | val_macro_f1: 0.7300
Saved best model -> best_lstm_pytorch.pth


Train Epoch 4: 100%|██████████| 450/450 [00:05<00:00, 84.75it/s]


Epoch 4 — train_loss: 0.4151 | val_loss: 0.5072 | val_acc: 0.7569 | val_macro_f1: 0.7528
Saved best model -> best_lstm_pytorch.pth


Train Epoch 5: 100%|██████████| 450/450 [00:05<00:00, 84.30it/s]


Epoch 5 — train_loss: 0.3369 | val_loss: 0.4241 | val_acc: 0.8106 | val_macro_f1: 0.8105
Saved best model -> best_lstm_pytorch.pth


Train Epoch 6: 100%|██████████| 450/450 [00:05<00:00, 83.68it/s]


Epoch 6 — train_loss: 0.2588 | val_loss: 0.4133 | val_acc: 0.8222 | val_macro_f1: 0.8221
Saved best model -> best_lstm_pytorch.pth
Val Accuracy: 0.8221875
Val Macro F1: 0.8221387097300421
              precision    recall  f1-score   support

           0       0.83      0.80      0.82      1602
           1       0.81      0.84      0.83      1598

    accuracy                           0.82      3200
   macro avg       0.82      0.82      0.82      3200
weighted avg       0.82      0.82      0.82      3200

✅ Saved submission_pytorch.csv
