In [None]:
# Notebook 4: RNN Model Training and Evaluation

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import numpy as np

# Use your dataset and dataloader from Notebook 3
# Assuming 'dataloader' variable exists from Notebook 3

# Check device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define the RNN model class
class SimpleRNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers=1):
        super(SimpleRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True, num_layers=num_layers)
        self.fc = nn.Linear(hidden_dim, 1)


    def forward(self, x):
        embedded = self.embedding(x)                      # (B, L, E)
        out, hidden = self.rnn(embedded)                  # out: (B, L, H), hidden: (1, B, H)
        last_hidden = hidden[-1]                          # (B, H)
        logits = self.fc(last_hidden)                     # (B, 1)
        return logits.squeeze(1)                          # (B,)



# Hyperparameters
VOCAB_SIZE = len(word2idx)  # from Notebook 3
EMBED_DIM = 128
HIDDEN_DIM = 256
NUM_LAYERS = 1
LR = 0.02
EPOCHS = 5

# Initialize model, loss, optimizer
model = SimpleRNN(VOCAB_SIZE, EMBED_DIM, HIDDEN_DIM, NUM_LAYERS).to(device)
criterion = nn.BCEWithLogitsLoss()  # expects raw logits from model
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


lengths = [len(seq[seq != 0]) for seq, _ in dataloader.dataset]
print(f"Average sequence length (non-padded): {np.mean(lengths):.2f}")

# Training loop
for epoch in range(1, EPOCHS + 1):
    model.train()
    epoch_loss = 0
    epoch_acc = 0
    total = 0
    
    for inputs, labels in dataloader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs)                     # raw logits
        loss = criterion(outputs, labels)
        loss.backward()

        # ✅ Inspect gradients of first trainable parameter
        for name, param in model.named_parameters():
            if param.requires_grad and param.grad is not None:
                print(f"{name}: grad mean = {param.grad.mean():.6f}, grad std = {param.grad.std():.6f}")
                break  # only inspect one param per batch

        optimizer.step()
        
        epoch_loss += loss.item() * inputs.size(0)
        probs = torch.sigmoid(outputs)
        preds = (probs >= 0.5).float()
        epoch_acc += (preds == labels).sum().item()
        total += inputs.size(0)
    
    print(f"Epoch {epoch}/{EPOCHS} - Loss: {epoch_loss/total:.4f} - Accuracy: {epoch_acc/total:.4f}")


# Save model checkpoint
torch.save(model.state_dict(), "rnn_model.pth")
print("Model saved as rnn_model.pth")

# Evaluation on training data (for demonstration)
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, labels in dataloader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = model(inputs)
        preds = (outputs >= 0.5).float()
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

from sklearn.metrics import classification_report, confusion_matrix

print("Classification Report:")
print(classification_report(all_labels, all_preds))

print("Confusion Matrix:")
print(confusion_matrix(all_labels, all_preds))


Using device: cuda
Average sequence length (non-padded): 73.87
embedding.weight: grad mean = 0.000000, grad std = 0.000002
embedding.weight: grad mean = -0.000000, grad std = 0.000002
embedding.weight: grad mean = 0.000000, grad std = 0.000002
embedding.weight: grad mean = 0.000000, grad std = 0.000002
embedding.weight: grad mean = 0.000000, grad std = 0.000002
embedding.weight: grad mean = -0.000000, grad std = 0.000003
embedding.weight: grad mean = -0.000000, grad std = 0.000002
embedding.weight: grad mean = 0.000000, grad std = 0.000003
embedding.weight: grad mean = 0.000000, grad std = 0.000003
embedding.weight: grad mean = -0.000000, grad std = 0.000003
embedding.weight: grad mean = -0.000000, grad std = 0.000004
embedding.weight: grad mean = -0.000000, grad std = 0.000003
embedding.weight: grad mean = 0.000000, grad std = 0.000002
embedding.weight: grad mean = 0.000000, grad std = 0.000003
embedding.weight: grad mean = -0.000000, grad std = 0.000003
embedding.weight: grad mean = 

KeyboardInterrupt: 

: 

In [32]:
print(df['polarity'].value_counts())  # Should be roughly equal
print(df['polarity'].unique())        # Should be only 0 and 1


polarity
0    50000
1    50000
Name: count, dtype: int64
[0 1]
