# Helper Functions

In [24]:
import json
import time
import random
from collections import defaultdict, Counter
import itertools

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader, WeightedRandomSampler

import matplotlib.pyplot as plt
from tqdm import tqdm

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [25]:
class BiLSTMClassifier(nn.Module):
    """Bidirectional LSTM that maps a batch of token-id sequences to one logit each.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden size of the LSTM, per direction.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the concatenated final hidden
            states just before the linear head (it is not passed to ``nn.LSTM``).
    """

    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256, num_layers=2, dropout=0.3):
        super().__init__()
        # Token id 0 is reserved for padding.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        # Forward and backward final states are concatenated, hence hidden_dim * 2.
        self.fc = nn.Linear(hidden_dim * 2, 1)

    def forward(self, x):
        """Return raw (pre-sigmoid) logits of shape ``(batch,)``.

        Args:
            x: Integer tensor of token ids, batch first.
        """
        embeds = self.embedding(x)
        _, (h_n, _) = self.lstm(embeds)
        # For a bidirectional LSTM the last two entries of h_n are the top
        # layer's forward and backward final hidden states.
        # NOTE(review): sequences are not packed, so these states are taken
        # after any padding positions have been processed as well.
        forward_final = h_n[-2]
        backward_final = h_n[-1]
        last_hidden = torch.cat((forward_final, backward_final), dim=1)
        logits = self.fc(self.dropout(last_hidden))
        # Drop only the trailing feature axis. A bare squeeze() would also drop
        # the batch axis for a one-example batch and return a 0-dim scalar.
        return logits.squeeze(-1)

In [26]:
def load_json_lines(path):
    """Load labelled examples from a JSON Lines file.

    Each non-blank line must be a JSON object with a ``'text'`` entry (a list
    of integer token ids) and a ``'label'`` entry.

    Args:
        path: Path of the file to read.

    Returns:
        ``(texts, labels)``: ``texts`` is a list of 1-D ``torch.long`` tensors,
        ``labels`` is the list of raw label values, both in file order.
    """
    texts, labels = [], []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            ex = json.loads(line)
            texts.append(torch.tensor(ex['text'], dtype=torch.long))
            labels.append(ex['label'])
    return texts, labels

def load_test_json(path):
    """Load unlabelled examples from a JSON Lines file.

    Each non-blank line must be a JSON object with a ``'text'`` entry (a list
    of integer token ids); any other keys are ignored.

    Args:
        path: Path of the file to read.

    Returns:
        A list of 1-D ``torch.long`` tensors, in file order.
    """
    texts = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            ex = json.loads(line)
            texts.append(torch.tensor(ex['text'], dtype=torch.long))
    return texts


In [27]:
def create_weighted_sampler(domains, labels):
    """Build a sampler that balances draws across (domain, label) buckets.

    Every example is weighted by the inverse of its bucket's size, so each
    distinct (domain, label) pair carries the same total sampling weight.

    Args:
        domains: Per-example domain identifiers.
        labels: Per-example labels, aligned with ``domains``.

    Returns:
        A ``WeightedRandomSampler`` drawing ``len(domains)`` indices per epoch,
        with replacement.
    """
    pairs = list(zip(domains, labels))
    bucket_sizes = Counter(pairs)
    inverse_freq = [1.0 / bucket_sizes[pair] for pair in pairs]
    weights = torch.tensor(inverse_freq, dtype=torch.double)
    return WeightedRandomSampler(weights, len(weights), replacement=True)

In [28]:
def predict_with_confidence(model, dataloader, device="cpu"):
    """Run inference and return hard labels together with sigmoid probabilities.

    Args:
        model: Module returning one logit per example; it is switched to eval mode.
        dataloader: Iterable yielding one-element batches ``(X_batch,)``.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        ``(preds, probs)``: ``preds`` holds 0/1 ints (1 when the probability is
        at least 0.5) and ``probs`` holds the matching probabilities, both in
        dataloader order.
    """
    model.eval()
    predictions = []
    confidences = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="🔍 Predicting"):
            (inputs,) = batch
            scores = torch.sigmoid(model(inputs.to(device))).squeeze()
            # A one-example batch squeezes down to a scalar; restore the batch
            # axis so tolist() always yields a list.
            if scores.ndim == 0:
                scores = scores.unsqueeze(0)
            predictions += (scores >= 0.5).int().tolist()
            confidences += scores.cpu().tolist()

    return predictions, confidences

# Validation

In [29]:
def stratified_train_val_split(texts, labels, domains, val_size_per_group=60, random_state=42):
    """Split examples into train/validation sets, stratified by (domain, label).

    Examples are grouped by their (domain, label) pair. Each group is shuffled
    and its first ``val_size_per_group`` items (or the whole group, if it is
    smaller) go to validation; the rest go to training. Both partitions are
    shuffled again before being returned.

    Args:
        texts: Per-example inputs.
        labels: Per-example labels, aligned with ``texts``.
        domains: Per-example domain identifiers, aligned with ``texts``.
        val_size_per_group: Maximum number of validation examples per group.
        random_state: Seed passed to ``random.seed`` (this reseeds the
            module-level ``random`` generator).

    Returns:
        Six lists: ``train_texts, train_labels, train_domains, val_texts,
        val_labels, val_domains``. A partition with no examples yields empty lists.
    """
    random.seed(random_state)
    buckets = defaultdict(list)

    for x, y, d in zip(texts, labels, domains):
        buckets[(d, y)].append((x, y, d))

    train, val = [], []
    for group in buckets.values():
        random.shuffle(group)
        n_val = min(val_size_per_group, len(group))
        val.extend(group[:n_val])
        train.extend(group[n_val:])

    random.shuffle(train)
    random.shuffle(val)

    def columns(rows):
        # zip(*[]) yields nothing to unpack, so an empty partition is handled explicitly.
        if not rows:
            return [], [], []
        xs, ys, ds = zip(*rows)
        return list(xs), list(ys), list(ds)

    tx, ty, td = columns(train)
    vx, vy, vd = columns(val)
    return tx, ty, td, vx, vy, vd


In [30]:
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

def validate_return_acc(model, dataloader, all_labels, all_domains, device="cpu"):
    """Evaluate ``model`` on ``dataloader`` and return its accuracy.

    Args:
        model: Module returning one logit per example; it is switched to eval mode.
        dataloader: Iterable yielding ``(X_batch, y_batch)`` pairs.
        all_labels: Unused; kept so existing callers keep working.
        all_domains: Unused; kept so existing callers keep working.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        The accuracy computed by ``accuracy_score`` over all batches, using a
        0.5 threshold on the sigmoid probability. It is also printed.
    """
    model.eval()
    all_preds, all_targets = [], []

    with torch.no_grad():
        for X_batch, y_batch in dataloader:
            logits = model(X_batch.to(device))
            # reshape(-1) keeps a 1-D batch axis even for a one-example batch.
            # A bare squeeze() would give a 0-dim tensor that extend() cannot iterate.
            probs = torch.sigmoid(logits).reshape(-1)
            preds = (probs >= 0.5).float()
            all_preds.extend(preds.cpu().numpy())
            # Targets are only compared on the host, so they never go to `device`.
            all_targets.extend(y_batch.cpu().numpy())

    acc = accuracy_score(all_targets, all_preds)
    print(f"\n✅ Validation Accuracy: {acc:.4f}")
    return acc

def train_with_validation(model, train_loader, val_loader, val_labels, val_domains, epochs=5, lr=1e-3, device="cpu"):
    """Train ``model`` with Adam and report validation accuracy after every epoch.

    The loss is ``BCEWithLogitsLoss`` with ``pos_weight`` set to the ratio of
    label-0 to label-1 examples in ``train_loader.dataset.tensors[1]``.

    Args:
        model: Module returning one logit per example.
        train_loader: DataLoader over a ``TensorDataset`` whose second tensor
            holds the 0/1 float labels.
        val_loader: DataLoader passed to ``validate_return_acc``.
        val_labels: Forwarded to ``validate_return_acc``.
        val_domains: Forwarded to ``validate_return_acc``.
        epochs: Number of passes over ``train_loader``.
        lr: Adam learning rate.
        device: Device used for the model and the batches.

    Returns:
        List with the validation accuracy measured after each epoch.
    """
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Count labels with tensor ops instead of two Python-level loops over the dataset.
    train_targets = train_loader.dataset.tensors[1]
    n_neg = int((train_targets == 0).sum())
    n_pos = int((train_targets == 1).sum())
    # With no positive examples the weight is never applied; use 1.0 rather than dividing by zero.
    # NOTE(review): callers in this file also use a bucket-balanced sampler, so
    # class imbalance may be compensated twice — confirm this is intended.
    pos_weight = torch.tensor([n_neg / n_pos if n_pos else 1.0], device=device)
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

    val_accuracies = []

    for epoch in range(1, epochs + 1):
        model.train()
        total_loss = 0
        for X_batch, y_batch in tqdm(train_loader, desc=f"📦 Epoch {epoch}", leave=False):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            # Flatten so a one-example batch (0-dim model output) still matches y_batch's shape.
            logits = model(X_batch).reshape(-1)
            loss = criterion(logits, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # max(..., 1) avoids a division by zero when the loader yields no batches.
        print(f"\n📦 Epoch {epoch} — Train Loss: {total_loss / max(len(train_loader), 1):.4f}")
        val_acc = validate_return_acc(model, val_loader, val_labels, val_domains, device)
        val_accuracies.append(val_acc)

    return val_accuracies


In [31]:
# Load full data
# Both domain files are read with load_json_lines and concatenated; `domains`
# tags every example with 0 (first file) or 1 (second file), in the same order.
d1_texts, d1_labels = load_json_lines("data/domain1_train_data.json")
d2_texts, d2_labels = load_json_lines("data/domain2_train_data.json")
texts = d1_texts + d2_texts
labels = d1_labels + d2_labels
domains = [0] * len(d1_labels) + [1] * len(d2_labels)

# Stratified validation split
# Holds out up to 60 examples from every (domain, label) group for validation.
train_texts, train_labels, train_domains, val_texts, val_labels, val_domains = stratified_train_val_split(
    texts, labels, domains, val_size_per_group=60
)

# Tensorize training data
# Sequences are right-padded with 0, the id the model's embedding treats as padding.
X_train = pad_sequence(train_texts, batch_first=True, padding_value=0)
y_train = torch.tensor(train_labels, dtype=torch.float32)
train_dataset = TensorDataset(X_train, y_train)

# Sampler for training
# The sampler re-weights examples so each (domain, label) group is drawn equally often.
sampler = create_weighted_sampler(train_domains, train_labels)
train_loader = DataLoader(train_dataset, batch_size=64, sampler=sampler)

# Tensorize validation
# Validation is padded separately from training and iterated in a fixed order.
X_val = pad_sequence(val_texts, batch_first=True, padding_value=0)
y_val = torch.tensor(val_labels, dtype=torch.float32)
val_dataset = TensorDataset(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Model
# NOTE(review): 17120 is presumably the vocabulary size of the token ids in the
# data files — confirm that it exceeds the largest id.
model = BiLSTMClassifier(vocab_size=17120)

In [32]:
# Candidate values for every hyper-parameter explored by the grid search.
param_grid = dict(
    hidden_dim=[128, 256],
    num_layers=[1, 2],
    dropout=[0.3, 0.5],
    lr=[1e-3, 5e-4],
    batch_size=[32, 64],
)

# Cartesian product of all candidate values; each tuple follows the order of param_names.
param_names = list(param_grid)
param_combinations = list(itertools.product(*(param_grid[name] for name in param_names)))

In [None]:
# Grid search: train a fresh model for every hyper-parameter combination and
# record its validation-accuracy curve.
results = []

for params in param_combinations:
    config = dict(zip(param_names, params))
    print(f"\n🔍 Testing config: {config}")

    model = BiLSTMClassifier(
        vocab_size=17120,
        embedding_dim=128,
        hidden_dim=config['hidden_dim'],
        num_layers=config['num_layers'],
        dropout=config['dropout']
    )

    # Rebuild the loader because batch_size is one of the searched parameters.
    train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], sampler=sampler)

    val_accuracies = train_with_validation(
        model, train_loader, val_loader,
        val_labels, val_domains,
        epochs=10, lr=config['lr'], device=device
    )

    # Validation accuracy can peak mid-run and then decline, so record the
    # best epoch (1-based, first one on ties) alongside the last-epoch value.
    best_epoch = max(range(len(val_accuracies)), key=val_accuracies.__getitem__) + 1

    results.append({
        **config,
        "val_accuracy_final": val_accuracies[-1],
        "val_accuracy_best": val_accuracies[best_epoch - 1],
        "best_epoch": best_epoch,
        "val_accuracies": val_accuracies
    })


🔍 Testing config: {'hidden_dim': 128, 'num_layers': 1, 'dropout': 0.3, 'lr': 0.001, 'batch_size': 32}


                                                             


📦 Epoch 1 — Train Loss: 0.1465

✅ Validation Accuracy: 0.7958


                                                             


📦 Epoch 2 — Train Loss: 0.0548

✅ Validation Accuracy: 0.8500


                                                             


📦 Epoch 3 — Train Loss: 0.0187

✅ Validation Accuracy: 0.8708


                                                             


📦 Epoch 4 — Train Loss: 0.0114

✅ Validation Accuracy: 0.8708


                                                             


📦 Epoch 5 — Train Loss: 0.0121

✅ Validation Accuracy: 0.8458


                                                             


📦 Epoch 6 — Train Loss: 0.0043

✅ Validation Accuracy: 0.8667


                                                             


📦 Epoch 7 — Train Loss: 0.0081

✅ Validation Accuracy: 0.8292


                                                             


📦 Epoch 8 — Train Loss: 0.0044

✅ Validation Accuracy: 0.8583


                                                             


📦 Epoch 9 — Train Loss: 0.0028

✅ Validation Accuracy: 0.8083


                                                              


📦 Epoch 10 — Train Loss: 0.0058

✅ Validation Accuracy: 0.8042

🔍 Testing config: {'hidden_dim': 128, 'num_layers': 1, 'dropout': 0.3, 'lr': 0.001, 'batch_size': 64}


                                                           


📦 Epoch 1 — Train Loss: 0.1850

✅ Validation Accuracy: 0.6458


                                                           


📦 Epoch 2 — Train Loss: 0.0733

✅ Validation Accuracy: 0.8833


                                                           


📦 Epoch 3 — Train Loss: 0.0327

✅ Validation Accuracy: 0.8875


                                                           


📦 Epoch 4 — Train Loss: 0.0160

✅ Validation Accuracy: 0.8750


                                                           


📦 Epoch 5 — Train Loss: 0.0098

✅ Validation Accuracy: 0.8583


                                                           


📦 Epoch 6 — Train Loss: 0.0119

✅ Validation Accuracy: 0.8250


                                                           


📦 Epoch 7 — Train Loss: 0.0050

✅ Validation Accuracy: 0.8625


                                                           


📦 Epoch 8 — Train Loss: 0.0034

✅ Validation Accuracy: 0.8458


                                                           


📦 Epoch 9 — Train Loss: 0.0022

✅ Validation Accuracy: 0.8500


                                                            


📦 Epoch 10 — Train Loss: 0.0016

✅ Validation Accuracy: 0.8333

🔍 Testing config: {'hidden_dim': 128, 'num_layers': 1, 'dropout': 0.3, 'lr': 0.0005, 'batch_size': 32}


                                                             


📦 Epoch 1 — Train Loss: 0.1831

✅ Validation Accuracy: 0.5875


                                                             


📦 Epoch 2 — Train Loss: 0.0836

✅ Validation Accuracy: 0.8667


                                                             


📦 Epoch 3 — Train Loss: 0.0590

✅ Validation Accuracy: 0.8000


                                                             


📦 Epoch 4 — Train Loss: 0.0273

✅ Validation Accuracy: 0.8708


                                                             


📦 Epoch 5 — Train Loss: 0.0120

✅ Validation Accuracy: 0.8708


                                                             


📦 Epoch 6 — Train Loss: 0.0088

✅ Validation Accuracy: 0.8458


                                                             


📦 Epoch 7 — Train Loss: 0.0121

✅ Validation Accuracy: 0.8708


                                                             


📦 Epoch 8 — Train Loss: 0.0051

✅ Validation Accuracy: 0.8417


                                                             


📦 Epoch 9 — Train Loss: 0.0068

✅ Validation Accuracy: 0.8583


                                                              


📦 Epoch 10 — Train Loss: 0.0038

✅ Validation Accuracy: 0.8250

🔍 Testing config: {'hidden_dim': 128, 'num_layers': 1, 'dropout': 0.3, 'lr': 0.0005, 'batch_size': 64}


                                                           


📦 Epoch 1 — Train Loss: 0.2231

✅ Validation Accuracy: 0.5000


                                                           


📦 Epoch 2 — Train Loss: 0.1224

✅ Validation Accuracy: 0.7542


                                                           


📦 Epoch 3 — Train Loss: 0.0681

✅ Validation Accuracy: 0.8458


                                                           


📦 Epoch 4 — Train Loss: 0.0377

✅ Validation Accuracy: 0.8708


                                                           


📦 Epoch 5 — Train Loss: 0.0205

✅ Validation Accuracy: 0.8875


                                                           


📦 Epoch 6 — Train Loss: 0.0189

✅ Validation Accuracy: 0.8500


                                                           


📦 Epoch 7 — Train Loss: 0.0097

✅ Validation Accuracy: 0.8417


                                                           


📦 Epoch 8 — Train Loss: 0.0062

✅ Validation Accuracy: 0.8292


                                                           


📦 Epoch 9 — Train Loss: 0.0048

✅ Validation Accuracy: 0.8333


                                                            


📦 Epoch 10 — Train Loss: 0.0041

✅ Validation Accuracy: 0.8125

🔍 Testing config: {'hidden_dim': 128, 'num_layers': 1, 'dropout': 0.5, 'lr': 0.001, 'batch_size': 32}


                                                             


📦 Epoch 1 — Train Loss: 0.1527

✅ Validation Accuracy: 0.7958


                                                             


📦 Epoch 2 — Train Loss: 0.0734

✅ Validation Accuracy: 0.8542


                                                             


📦 Epoch 3 — Train Loss: 0.0249

✅ Validation Accuracy: 0.8750


                                                             


📦 Epoch 4 — Train Loss: 0.0134

✅ Validation Accuracy: 0.8417


                                                             


📦 Epoch 5 — Train Loss: 0.0070

✅ Validation Accuracy: 0.8542


                                                             


📦 Epoch 6 — Train Loss: 0.0069

✅ Validation Accuracy: 0.8500


                                                             


📦 Epoch 7 — Train Loss: 0.0034

✅ Validation Accuracy: 0.8292


                                                             


📦 Epoch 8 — Train Loss: 0.0035

✅ Validation Accuracy: 0.8292


                                                             


📦 Epoch 9 — Train Loss: 0.0026

✅ Validation Accuracy: 0.8208


                                                              


📦 Epoch 10 — Train Loss: 0.0035

✅ Validation Accuracy: 0.8542

🔍 Testing config: {'hidden_dim': 128, 'num_layers': 1, 'dropout': 0.5, 'lr': 0.001, 'batch_size': 64}


                                                           


📦 Epoch 1 — Train Loss: 0.1879

✅ Validation Accuracy: 0.6083


                                                           


📦 Epoch 2 — Train Loss: 0.0830

✅ Validation Accuracy: 0.8458


                                                           


📦 Epoch 3 — Train Loss: 0.0447

✅ Validation Accuracy: 0.8500


                                                           


📦 Epoch 4 — Train Loss: 0.0169

✅ Validation Accuracy: 0.8625


                                                           


📦 Epoch 5 — Train Loss: 0.0118

✅ Validation Accuracy: 0.8833


                                                           


📦 Epoch 6 — Train Loss: 0.0075

✅ Validation Accuracy: 0.8417


                                                           


📦 Epoch 7 — Train Loss: 0.0099

✅ Validation Accuracy: 0.8542


                                                           


📦 Epoch 8 — Train Loss: 0.0031

✅ Validation Accuracy: 0.8292


                                                           


📦 Epoch 9 — Train Loss: 0.0079

✅ Validation Accuracy: 0.8333


                                                            


📦 Epoch 10 — Train Loss: 0.0026

✅ Validation Accuracy: 0.8333

🔍 Testing config: {'hidden_dim': 128, 'num_layers': 1, 'dropout': 0.5, 'lr': 0.0005, 'batch_size': 32}


                                                             


📦 Epoch 1 — Train Loss: 0.1975

✅ Validation Accuracy: 0.5667


                                                             


📦 Epoch 2 — Train Loss: 0.0920

✅ Validation Accuracy: 0.8458


                                                             


📦 Epoch 3 — Train Loss: 0.0407

✅ Validation Accuracy: 0.9042


                                                             


📦 Epoch 4 — Train Loss: 0.0240

✅ Validation Accuracy: 0.8708


                                                             


📦 Epoch 5 — Train Loss: 0.0163

✅ Validation Accuracy: 0.9125


                                                             


📦 Epoch 6 — Train Loss: 0.0088

✅ Validation Accuracy: 0.8958


                                                             


📦 Epoch 7 — Train Loss: 0.0052

✅ Validation Accuracy: 0.7875


                                                             


📦 Epoch 8 — Train Loss: 0.0051

✅ Validation Accuracy: 0.8958


                                                             


📦 Epoch 9 — Train Loss: 0.0035

✅ Validation Accuracy: 0.8083


                                                              


📦 Epoch 10 — Train Loss: 0.0029

✅ Validation Accuracy: 0.8417

🔍 Testing config: {'hidden_dim': 128, 'num_layers': 1, 'dropout': 0.5, 'lr': 0.0005, 'batch_size': 64}


                                                           


📦 Epoch 1 — Train Loss: 0.2270

✅ Validation Accuracy: 0.5000


                                                           


📦 Epoch 2 — Train Loss: 0.1350

✅ Validation Accuracy: 0.7083


                                                           


📦 Epoch 3 — Train Loss: 0.0795

✅ Validation Accuracy: 0.8250


                                                           


📦 Epoch 4 — Train Loss: 0.0441

✅ Validation Accuracy: 0.8500


                                                           


📦 Epoch 5 — Train Loss: 0.0265

✅ Validation Accuracy: 0.8500


                                                           


📦 Epoch 6 — Train Loss: 0.0151

✅ Validation Accuracy: 0.8292


                                                           


📦 Epoch 7 — Train Loss: 0.0069

✅ Validation Accuracy: 0.8542


                                                           


📦 Epoch 8 — Train Loss: 0.0125

✅ Validation Accuracy: 0.8125


                                                           


📦 Epoch 9 — Train Loss: 0.0051

✅ Validation Accuracy: 0.8333


                                                            


📦 Epoch 10 — Train Loss: 0.0207

✅ Validation Accuracy: 0.8250

🔍 Testing config: {'hidden_dim': 128, 'num_layers': 2, 'dropout': 0.3, 'lr': 0.001, 'batch_size': 32}


                                                             


📦 Epoch 1 — Train Loss: 0.1309

✅ Validation Accuracy: 0.8208


                                                             


📦 Epoch 2 — Train Loss: 0.0381

✅ Validation Accuracy: 0.8542


                                                             


📦 Epoch 3 — Train Loss: 0.0155

✅ Validation Accuracy: 0.8167


                                                             


📦 Epoch 4 — Train Loss: 0.0162

✅ Validation Accuracy: 0.8417


                                                             


📦 Epoch 5 — Train Loss: 0.0039

✅ Validation Accuracy: 0.8375


                                                             


📦 Epoch 6 — Train Loss: 0.0038

✅ Validation Accuracy: 0.8292


                                                             


📦 Epoch 7 — Train Loss: 0.0035

✅ Validation Accuracy: 0.8250


                                                             


📦 Epoch 8 — Train Loss: 0.0092

✅ Validation Accuracy: 0.8250


                                                             


📦 Epoch 9 — Train Loss: 0.0045

✅ Validation Accuracy: 0.8208


                                                              


📦 Epoch 10 — Train Loss: 0.0019

✅ Validation Accuracy: 0.8625

🔍 Testing config: {'hidden_dim': 128, 'num_layers': 2, 'dropout': 0.3, 'lr': 0.001, 'batch_size': 64}


                                                           


📦 Epoch 1 — Train Loss: 0.1600

✅ Validation Accuracy: 0.7917


                                                           


📦 Epoch 2 — Train Loss: 0.0597

✅ Validation Accuracy: 0.8708


                                                           


📦 Epoch 3 — Train Loss: 0.0298

✅ Validation Accuracy: 0.8083


                                                           


📦 Epoch 4 — Train Loss: 0.0150

✅ Validation Accuracy: 0.8583


                                                           


📦 Epoch 5 — Train Loss: 0.0125

✅ Validation Accuracy: 0.8333


                                                           


📦 Epoch 6 — Train Loss: 0.0067

✅ Validation Accuracy: 0.7625


                                                           


📦 Epoch 7 — Train Loss: 0.0077

✅ Validation Accuracy: 0.8333


                                                           


📦 Epoch 8 — Train Loss: 0.0058

✅ Validation Accuracy: 0.8000


                                                           


📦 Epoch 9 — Train Loss: 0.0024

✅ Validation Accuracy: 0.8000


                                                            


📦 Epoch 10 — Train Loss: 0.0011

✅ Validation Accuracy: 0.7750

🔍 Testing config: {'hidden_dim': 128, 'num_layers': 2, 'dropout': 0.3, 'lr': 0.0005, 'batch_size': 32}


                                                             


📦 Epoch 1 — Train Loss: 0.1483

✅ Validation Accuracy: 0.8417


📦 Epoch 2:  61%|██████    | 110/180 [00:16<00:09,  7.07it/s]

In [None]:
# Rank every evaluated configuration by its last-epoch validation accuracy, best first.
results_df = pd.DataFrame(results).sort_values(by="val_accuracy_final", ascending=False)

print("\n🏁 Top Configs by Final Validation Accuracy:")
print(
    results_df[
        ["hidden_dim", "num_layers", "dropout", "lr", "batch_size", "val_accuracy_final"]
    ].to_string(index=False)
)

# Final Prediction

In [16]:
# ------------------ Final Training & Test Submission ------------------
# Re-select the device so this cell can run on its own.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load full training data
# No validation split here: every labelled example from both domains is used.
d1_texts, d1_labels = load_json_lines("data/domain1_train_data.json")
d2_texts, d2_labels = load_json_lines("data/domain2_train_data.json")
texts = d1_texts + d2_texts
labels = d1_labels + d2_labels
domains = [0] * len(d1_labels) + [1] * len(d2_labels)

# Tensorize full data
# Sequences are right-padded with 0, the id the model's embedding treats as padding.
X_all = pad_sequence(texts, batch_first=True, padding_value=0)
y_all = torch.tensor(labels, dtype=torch.float32)
train_dataset = TensorDataset(X_all, y_all)

# Create weighted sampler
# Balances draws across (domain, label) groups.
sampler = create_weighted_sampler(domains, labels)
train_loader = DataLoader(train_dataset, batch_size=64, sampler=sampler)

# Define model and train
# The model uses the class defaults for every hyper-parameter except vocab_size.
model = BiLSTMClassifier(vocab_size=17120)

def train(model, train_loader, epochs=10, lr=1e-3, device="cpu"):
    """Train ``model`` in place with Adam and a class-weighted BCE-with-logits loss.

    ``pos_weight`` is the ratio of label-0 to label-1 examples in
    ``train_loader.dataset.tensors[1]``. The mean batch loss is printed after
    every epoch; nothing is returned.

    Args:
        model: Module returning one logit per example.
        train_loader: DataLoader over a ``TensorDataset`` whose second tensor
            holds the 0/1 float labels.
        epochs: Number of passes over ``train_loader``.
        lr: Adam learning rate.
        device: Device used for the model and the batches.
    """
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Count labels with tensor ops instead of two Python-level loops over the dataset.
    train_targets = train_loader.dataset.tensors[1]
    n_neg = int((train_targets == 0).sum())
    n_pos = int((train_targets == 1).sum())
    # With no positive examples the weight is never applied; use 1.0 rather than dividing by zero.
    pos_weight = torch.tensor([n_neg / n_pos if n_pos else 1.0], device=device)
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

    for epoch in range(1, epochs + 1):
        model.train()
        total_loss = 0
        for X_batch, y_batch in tqdm(train_loader, desc=f"📦 Epoch {epoch}", leave=False):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            # Flatten so a one-example batch (0-dim model output) still matches y_batch's shape.
            logits = model(X_batch).reshape(-1)
            loss = criterion(logits, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # max(..., 1) avoids a division by zero when the loader yields no batches.
        print(f"\n📦 Epoch {epoch} — Train Loss: {total_loss / max(len(train_loader), 1):.4f}")

# Train for 5 epochs (overriding train()'s default of 10) with the default learning rate.
train(model, train_loader, epochs=5, device=device)

# Predict on test set
# All test sequences are padded to the length of the longest one, then batched in file order.
# NOTE(review): the recorded run of this cell shows roughly 15 s per batch before it
# was interrupted; presumably a very long test sequence inflates every padded batch — confirm.
test_texts = load_test_json("data/test_data.json")
X_test = pad_sequence(test_texts, batch_first=True, padding_value=0)
test_dataset = TensorDataset(X_test)
test_loader = DataLoader(test_dataset, batch_size=64)

# Time the inference pass only.
start = time.time()
test_preds, test_probs = predict_with_confidence(model, test_loader, device=device)
print(f"🕒 Prediction completed in {time.time() - start:.2f} seconds.")

# Save outputs
# "id" is the example's position in the test file (0-based).
results_df = pd.DataFrame({
    "id": list(range(len(test_preds))),
    "label": test_preds,
    "confidence": test_probs
})
# The full file keeps the confidence column; the submission file has id and label only.
results_df.to_csv("full_test_predictions.csv", index=False)
results_df[["id", "label"]].to_csv("submission.csv", index=False)
print("✅ Saved: submission.csv and full_test_predictions.csv")


                                                           


📦 Epoch 1 — Train Loss: 0.1563


                                                           


📦 Epoch 2 — Train Loss: 0.0611


                                                           


📦 Epoch 3 — Train Loss: 0.0391


                                                           


📦 Epoch 4 — Train Loss: 0.0213


                                                           


📦 Epoch 5 — Train Loss: 0.0186


🔍 Predicting:   8%|▊         | 5/63 [01:15<14:37, 15.13s/it]


KeyboardInterrupt: 