<a href="https://colab.research.google.com/github/adithya98765/Linguix/blob/main/rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import time
import kagglehub
from kagglehub import KaggleDatasetAdapter
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence

# Setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}\n")

# Load & Preprocess
# `dataset_load` is the current kagglehub entry point; `load_dataset` is
# deprecated and emits a warning on every run.
df = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "rupindersinghrana/gender-by-name",
    "name_gender_dataset.csv"
)
df.columns = df.columns.str.lower()
# Keep only names that occur at least 100 times, when a count column exists
if 'count' in df.columns:
    df = df[df['count'] >= 100]
# Keep only name & gender. The explicit copy makes the column assignments
# below operate on an independent frame rather than on a filtered slice.
df = df[['name', 'gender']].copy()
# Normalize: lower-case names, encode gender as M -> 0, F -> 1
df['name'] = df['name'].str.lower()
df['gender'] = df['gender'].str.upper().map({'M': 0, 'F': 1})
# Unmapped gender values became NaN in the step above; drop them together
# with missing names.
df = df.dropna(subset=['name', 'gender'])
# An empty name would produce a zero-length sequence, which cannot be packed
# for the LSTM, so such rows are removed as well.
df = df[df['name'].str.len() > 0]
# `map` yields floats whenever NaN was present; restore integer class labels.
df = df.astype({'gender': 'int64'})
print(f"Dataset size after filtering: {len(df)} samples\n")

# Vocabulary: index 0 is reserved for padding, characters start at 1
names = df['name'].tolist()
chars = sorted(set(''.join(names)))
char2idx = {'<PAD>': 0}
char2idx.update({c: i for i, c in enumerate(chars, start=1)})
vocab_size = len(char2idx)
print(f"Vocab size (including PAD): {vocab_size}\n")

# Dataset & DataLoader
class NameDataset(Dataset):
    """Character-level dataset pairing each name with its class label.

    Args:
        names: Sequence of name strings.
        labels: Sequence of class labels aligned index-by-index with ``names``.
        char2idx: Mapping from a character to its integer vocabulary index.
    """

    def __init__(self, names, labels, char2idx):
        self.names = names
        self.labels = labels
        self.char2idx = char2idx

    def __len__(self):
        """Return the number of names in the dataset."""
        return len(self.names)

    def __getitem__(self, idx):
        """Return ``(sequence, label)`` for item ``idx`` as ``torch.long`` tensors."""
        # Characters that are not in the vocabulary fall back to index 0.
        seq = [self.char2idx.get(c, 0) for c in self.names[idx]]
        # Parenthesised so that both tensors belong to the returned tuple; a
        # trailing comma after the first tensor would return a 1-tuple and
        # leave the label expression unreachable.
        return (
            torch.tensor(seq, dtype=torch.long),
            torch.tensor(self.labels[idx], dtype=torch.long),
        )

def collate_fn(batch):
    """Merge ``(sequence, label)`` samples into one padded mini-batch.

    Args:
        batch: Iterable of ``(sequence_tensor, label_tensor)`` pairs.

    Returns:
        A tuple ``(padded, lengths, labels)``: the sequences right-padded
        with 0 to a common length (batch first), the original length of each
        sequence as a ``torch.long`` tensor, and the stacked labels.
    """
    sequences, targets = zip(*batch)
    # Record the true lengths before padding hides them.
    seq_lengths = torch.tensor(
        [len(sequence) for sequence in sequences], dtype=torch.long
    )
    padded_batch = pad_sequence(sequences, batch_first=True, padding_value=0)
    stacked_targets = torch.stack(targets)
    return padded_batch, seq_lengths, stacked_targets

# Stratified 80/10/10 split into train / validation / test
df_names = df['name'].tolist()
df_labels = df['gender'].tolist()

# Step 1: hold out 20% of the data as a temporary pool.
X_train, X_tmp, y_train, y_tmp = train_test_split(
    df_names,
    df_labels,
    test_size=0.2,
    stratify=df_labels,
    random_state=42,
)
# Step 2: split that pool in half to obtain validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_tmp,
    y_tmp,
    test_size=0.5,
    stratify=y_tmp,
    random_state=42,
)
print(f"Train/Val/Test sizes: {len(X_train)}/{len(X_val)}/{len(X_test)}\n")

batch_size = 128
# Settings common to all three loaders; only the training loader shuffles.
_loader_kwargs = {'batch_size': batch_size, 'collate_fn': collate_fn}
train_loader = DataLoader(
    NameDataset(X_train, y_train, char2idx), shuffle=True, **_loader_kwargs
)
val_loader = DataLoader(
    NameDataset(X_val, y_val, char2idx), shuffle=False, **_loader_kwargs
)
test_loader = DataLoader(
    NameDataset(X_test, y_test, char2idx), shuffle=False, **_loader_kwargs
)

# Model Definition
class GenderRNN(nn.Module):
    """Bidirectional LSTM classifier over character index sequences.

    Args:
        vocab_size: Number of embedding rows; index 0 is the padding index.
        emb_dim: Size of each character embedding.
        hid_dim: Hidden size of the LSTM, per direction.
        num_layers: Number of stacked LSTM layers.
        mlp_hidden: Width of the hidden layer in the classification head.
        num_classes: Number of output classes (size of the logits).
        dropout: Dropout probability used between LSTM layers and in the head.
    """

    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers, mlp_hidden,
                 num_classes, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        # nn.LSTM only applies dropout between stacked layers; with a single
        # layer it has no effect and PyTorch emits a warning, so pass 0 there.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            emb_dim, hid_dim, num_layers,
            batch_first=True, bidirectional=True, dropout=lstm_dropout
        )
        # The head consumes both directions' final states, hence hid_dim * 2.
        self.fc1 = nn.Linear(hid_dim * 2, mlp_hidden)
        self.drop = nn.Dropout(dropout)
        self.fc2 = nn.Linear(mlp_hidden, num_classes)

    def forward(self, x, lengths):
        """Return class logits for a padded batch.

        Args:
            x: Batch-first tensor of character indices, padded with 0.
            lengths: True (unpadded) length of every sequence in ``x``.

        Returns:
            Tensor of logits with one row per sequence and ``num_classes``
            columns.
        """
        emb = self.embedding(x)
        # Packing makes the LSTM stop at each sequence's true end; lengths
        # are moved to the CPU before being handed to the packing routine.
        packed = nn.utils.rnn.pack_padded_sequence(
            emb, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, (hn, _) = self.lstm(packed)
        # The last two entries of hn are the top layer's forward and backward
        # final hidden states.
        h = torch.cat([hn[-2], hn[-1]], dim=1)
        h = self.drop(torch.relu(self.fc1(h)))
        return self.fc2(h)

# Instantiate model
model = GenderRNN(
    vocab_size=vocab_size,
    emb_dim=200,
    hid_dim=512,
    num_layers=2,
    mlp_hidden=256,
    num_classes=2,
    dropout=0.3
).to(device)
print(model, "\n")

# Loss, Optimizer, Scheduler
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
# mode='max' because the scheduler is stepped with validation accuracy:
# the learning rate is halved after `patience` epochs without improvement.
# The `verbose` flag is deprecated in current PyTorch and is therefore not
# passed; inspect optimizer.param_groups[0]['lr'] to see the current rate.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', factor=0.5, patience=2
)

# Training budget and early-stopping state
epochs = 20
patience = 3      # epochs without validation improvement before stopping
wait = 0          # consecutive epochs without improvement so far
best_val = 0.0    # best validation accuracy (percent) seen so far

# Training Loop
# Each epoch: one optimisation pass over the training set, one evaluation
# pass over the validation set, a scheduler step on validation accuracy, and
# checkpointing / early stopping based on that same accuracy.
for epoch in range(1, epochs+1):
    start_time = time.time()
    model.train()
    train_loss, train_correct, train_total = 0, 0, 0
    for seqs, lengths, labels in train_loader:
        # Move inputs and targets in a single, complete assignment. `lengths`
        # is left where it is: the model converts it to a CPU tensor before
        # packing, so a device transfer here would only be undone.
        seqs, labels = seqs.to(device), labels.to(device)
        outputs = model(seqs, lengths)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        # Clip the gradient norm to keep recurrent updates bounded.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)
        optimizer.step()
        # Weight the batch-mean loss by batch size so the epoch average is
        # per-sample even when the last batch is smaller.
        train_loss += loss.item() * labels.size(0)
        preds = outputs.argmax(1)
        train_correct += (preds == labels).sum().item()
        train_total += labels.size(0)
    train_loss /= train_total
    train_acc = train_correct / train_total * 100

    # Validation
    model.eval()
    val_loss, val_correct, val_total = 0, 0, 0
    with torch.no_grad():
        for seqs, lengths, labels in val_loader:
            seqs, labels = seqs.to(device), labels.to(device)
            outputs = model(seqs, lengths)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * labels.size(0)
            preds = outputs.argmax(1)
            val_correct += (preds == labels).sum().item()
            val_total += labels.size(0)
    val_loss /= val_total
    val_acc = val_correct / val_total * 100
    scheduler.step(val_acc)

    print(f"Epoch {epoch}/{epochs} - {time.time()-start_time:.1f}s - "
          f"Train Loss: {train_loss:.4f}, Acc: {train_acc:.2f}% | "
          f"Val Loss: {val_loss:.4f}, Acc: {val_acc:.2f}%")

    # Checkpoint on improvement; otherwise count towards early stopping.
    if val_acc > best_val:
        best_val = val_acc
        torch.save(model.state_dict(), 'best_model.pt')
        wait = 0
    else:
        wait += 1
        if wait >= patience:
            print("Early stopping")
            break

# Test Evaluation
# Restore the checkpoint saved at the best validation accuracy.
# map_location keeps it loadable when the current device differs from the
# one the weights were saved from.
model.load_state_dict(torch.load('best_model.pt', map_location=device))
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for seqs, lengths, labels in test_loader:
        # Complete two-name assignment; `lengths` is left as-is because the
        # model converts it to a CPU tensor before packing.
        seqs, labels = seqs.to(device), labels.to(device)
        preds = model(seqs, lengths).argmax(1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)
print(f"\nTest Accuracy: {correct/total*100:.2f}%")

# Predictions on Unseen Names
def predict_gender(name):
    """Predict a gender label for a single name with the trained model.

    Args:
        name: Name to classify; it is lower-cased before being encoded.

    Returns:
        ``'Female'`` when the model predicts class 1, otherwise ``'Male'``.

    Raises:
        ValueError: If ``name`` is empty, because a zero-length sequence
            cannot be packed for the LSTM.
    """
    lowered = name.lower()
    if not lowered:
        raise ValueError("name must contain at least one character")
    # Characters missing from the vocabulary fall back to index 0.
    seq = torch.tensor([char2idx.get(c, 0) for c in lowered],
                       dtype=torch.long).unsqueeze(0).to(device)
    # Measure the encoded text, not the raw input: lower-casing can change
    # the number of characters for some Unicode input.
    length = torch.tensor([len(lowered)], dtype=torch.long)
    # Predict in eval mode so dropout is disabled, then restore whatever
    # mode the model was in before the call.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            pred = model(seq, length).argmax(1).item()
    finally:
        model.train(was_training)
    return 'Female' if pred == 1 else 'Male'

# Spot-check the model on a handful of common first names
unseen = [
    "Noah", "Sophia", "Liam", "Emma", "Olivia",
    "Mason", "Ava", "Elijah", "Isabella", "Logan",
]
print("\nUnseen predictions:")
for name in unseen:
    predicted = predict_gender(name)
    print(name, "->", predicted)


Using device: cuda



  df = kagglehub.load_dataset(


Dataset size after filtering: 41366 samples

Vocab size (including PAD): 32

Train/Val/Test sizes: 33092/4137/4137

GenderRNN(
  (embedding): Embedding(32, 200, padding_idx=0)
  (lstm): LSTM(200, 512, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
  (fc1): Linear(in_features=1024, out_features=256, bias=True)
  (drop): Dropout(p=0.3, inplace=False)
  (fc2): Linear(in_features=256, out_features=2, bias=True)
) 





Epoch 1/20 - 5.9s - Train Loss: 0.4590, Acc: 78.61% | Val Loss: 0.4369, Acc: 80.54%
Epoch 2/20 - 5.8s - Train Loss: 0.4105, Acc: 81.53% | Val Loss: 0.4219, Acc: 81.05%
Epoch 3/20 - 6.0s - Train Loss: 0.3845, Acc: 82.89% | Val Loss: 0.4131, Acc: 81.68%
Epoch 4/20 - 6.0s - Train Loss: 0.3637, Acc: 84.17% | Val Loss: 0.4180, Acc: 81.31%
Epoch 5/20 - 6.1s - Train Loss: 0.3456, Acc: 84.81% | Val Loss: 0.4141, Acc: 81.92%
Epoch 6/20 - 5.9s - Train Loss: 0.3265, Acc: 85.57% | Val Loss: 0.4244, Acc: 81.05%
Epoch 7/20 - 5.9s - Train Loss: 0.3105, Acc: 86.21% | Val Loss: 0.4296, Acc: 81.12%
Epoch 8/20 - 5.8s - Train Loss: 0.2895, Acc: 86.97% | Val Loss: 0.4645, Acc: 81.00%
Early stopping

Test Accuracy: 83.08%

Unseen predictions:
Noah -> Female
Sophia -> Female
Liam -> Male
Emma -> Female
Olivia -> Female
Mason -> Male
Ava -> Female
Elijah -> Male
Isabella -> Female
Logan -> Female
