In [41]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cuda


In [42]:
# Load CSV (your sampled cleaned train data)
df = pd.read_csv(r"C:\Users\abhis\OneDrive\Desktop\week4\Sequence Modelling Basics\data\cleaned_sampled_train.csv")

# Map polarity labels to 0 and 1 (for BCE loss)
df['polarity'] = df['polarity'].map({1: 0, 2: 1})

print("Unique polarity values after mapping:", df['polarity'].unique())
print(f"Dataset size: {len(df)}")


Unique polarity values after mapping: [0 1]
Dataset size: 100000


In [43]:
# Simple tokenizer function
def simple_tokenizer(text):
    return text.lower().strip().split()

df['tokens'] = df['cleaned_review'].apply(simple_tokenizer)

# Load vocabulary (word2idx dict)
with open(r'C:\Users\abhis\OneDrive\Desktop\week4\Sequence Modelling Basics\word2idx.pkl', 'rb') as f:
    word2idx = pickle.load(f)

print(f"Loaded vocab size: {len(word2idx)}")


Loaded vocab size: 20000


In [44]:
# Convert tokens to indices (with UNK fallback)
def tokens_to_indices(tokens):
    return [word2idx.get(token, word2idx['<UNK>']) for token in tokens]

df['indexed_tokens'] = df['tokens'].apply(tokens_to_indices)

# Pad sequences to max length
max_len = 150
df['padded_tokens'] = list(pad_sequences(df['indexed_tokens'], maxlen=max_len, padding='post', truncating='post'))

print(df[['padded_tokens', 'polarity']].head())


                                       padded_tokens  polarity
0  [2, 355, 3981, 108, 16, 355, 3291, 40, 5, 1026...         0
1  [4, 200, 2, 1, 1, 1, 714, 3981, 19, 209, 11078...         0
2  [10, 160, 127, 91, 11, 2, 6168, 1, 527, 2875, ...         0
3  [10, 295, 58, 1918, 6, 51, 143, 8, 132, 10, 10...         0
4  [8, 14, 5, 99, 7, 5, 1, 9, 1137, 67, 4, 1, 39,...         0


In [45]:
class ReviewDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = torch.tensor(texts, dtype=torch.long)
        self.labels = torch.tensor(labels.values, dtype=torch.float32)
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

# Create Dataset and DataLoader
batch_size = 128
dataset = ReviewDataset(df['padded_tokens'].tolist(), df['polarity'])
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Check batch shapes
texts, labels = next(iter(train_loader))
print("Batch text shape:", texts.shape)
print("Batch label shape:", labels.shape)


Batch text shape: torch.Size([128, 150])
Batch label shape: torch.Size([128])


In [46]:
class RNNClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, padding_idx):
        super(RNNClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        embedded = self.embedding(x)  # [batch, seq_len, emb_dim]
        output, hidden = self.rnn(embedded)  # hidden: [1, batch, hidden_dim]
        out = hidden.squeeze(0)  # [batch, hidden_dim]
        out = self.fc(out)  # [batch, output_dim]
        return self.sigmoid(out).squeeze(1)  # [batch]

# Hyperparameters
vocab_size = len(word2idx)
embedding_dim = 100
hidden_dim = 128
output_dim = 1
padding_idx = word2idx['<PAD>']

model = RNNClassifier(vocab_size, embedding_dim, hidden_dim, output_dim, padding_idx).to(device)
print(model)


RNNClassifier(
  (embedding): Embedding(20000, 100, padding_idx=0)
  (rnn): RNN(100, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [47]:
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [48]:
def train_epoch(model, dataloader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    correct_preds = 0
    total = 0

    print(f"Number of batches in dataloader: {len(dataloader)}", flush=True)

    for texts, labels in dataloader:
        texts, labels = texts.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(texts)

        print("Labels (first 5):", labels[:5], flush=True)
        print("Outputs (first 5):", outputs[:5], flush=True)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * texts.size(0)

        preds = (outputs >= 0.5).float()
        correct_preds += (preds == labels).sum().item()
        total += labels.size(0)

    # These won't run yet because of break, but keep for later
    epoch_loss = running_loss / total if total > 0 else 0
    epoch_acc = correct_preds / total if total > 0 else 0
    return epoch_loss, epoch_acc


In [49]:
print(f"Device: {device}")

# Check first batch from dataloader and print shapes and dtype
for texts, labels in dataloader:
    print("Batch texts shape:", texts.shape)
    print("Batch labels shape:", labels.shape)
    print("Batch texts device:", texts.device if hasattr(texts, 'device') else 'CPU')
    print("Batch labels device:", labels.device if hasattr(labels, 'device') else 'CPU')
    print("Sample labels (first 5):", labels[:5])
    break


Device: cuda


NameError: name 'dataloader' is not defined

In [None]:
num_epochs = 5
for epoch in range(num_epochs):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
    print(f"Epoch {epoch+1}: Train Loss = {train_loss:.4f}, Train Accuracy = {train_acc:.4f}")


Number of batches in dataloader: 782
Labels (first 5): tensor([1., 1., 1., 1., 1.], device='cuda:0')
Outputs (first 5): tensor([0.5013, 0.5013, 0.5013, 0.5013, 0.5013], device='cuda:0',
       grad_fn=<SliceBackward0>)
Epoch 1: Train Loss = 0.0000, Train Accuracy = 0.0000
Number of batches in dataloader: 782
Labels (first 5): tensor([1., 0., 1., 0., 1.], device='cuda:0')
Outputs (first 5): tensor([0.5013, 0.5255, 0.5013, 0.5583, 0.5013], device='cuda:0',
       grad_fn=<SliceBackward0>)
Epoch 2: Train Loss = 0.0000, Train Accuracy = 0.0000
Number of batches in dataloader: 782
Labels (first 5): tensor([1., 0., 0., 0., 0.], device='cuda:0')
Outputs (first 5): tensor([0.5013, 0.5013, 0.5013, 0.5013, 0.5013], device='cuda:0',
       grad_fn=<SliceBackward0>)
Epoch 3: Train Loss = 0.0000, Train Accuracy = 0.0000
Number of batches in dataloader: 782
Labels (first 5): tensor([0., 1., 0., 1., 0.], device='cuda:0')
Outputs (first 5): tensor([0.5013, 0.5013, 0.5013, 0.5013, 0.5013], device='cuda