In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/datasets/aabdollahii/university-questions/train.json
/kaggle/input/datasets/aabdollahii/university-questions/test.json
/kaggle/input/pm-109376216-at-02-13-2026-10-41-16/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
/kaggle/input/pm-109376216-at-02-13-2026-10-41-16/spacy_loggers-1.0.5-py3-none-any.whl
/kaggle/input/pm-109376216-at-02-13-2026-10-41-16/cymem-2.0.13-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl
/kaggle/input/pm-109376216-at-02-13-2026-10-41-16/__script__.py
/kaggle/input/pm-109376216-at-02-13-2026-10-41-16/shellingham-1.5.4-py2.py3-none-any.whl
/kaggle/input/pm-109376216-at-02-13-2026-10-41-16/setuptools-82.0.0-py3-none-any.whl
/kaggle/input/pm-109376216-at-02-13-2026-10-41-16/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl
/kaggle/input/pm-109376216-at-02-13-2026-10-41-16/huggingface_hub-0.34.6-py3-none-any.whl
/kaggle/input/pm-109376216-at-02-13-2026-10-41-16/nvidia_cuda_runtime_cu12-12.8.90-py3-

In [2]:
# ============================================================
#  FULL PIPELINE: LSTM for Persian Question Ambiguity Detection
# ============================================================
#  Stage 1: Preprocessing V2 (train + test)
#  Stage 2: Vocabulary & Dataset
#  Stage 3: LSTM Model Definition
#  Stage 4: Training with dev (train-as-dev) monitoring
#  Stage 5: Final test evaluation & model saving
# ============================================================

import json
import re
import os
import pickle
import numpy as np
import pandas as pd
from collections import Counter

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report, confusion_matrix,
    f1_score, accuracy_score
)

from hazm import Normalizer, word_tokenize

# ============================================================
#  CONFIG
# ============================================================
class Config:
    # Paths
    TRAIN_PATH = "/kaggle/input/datasets/aabdollahii/university-questions/train.json"
    TEST_PATH = "/kaggle/input/datasets/aabdollahii/university-questions/test.json"
    SAVE_DIR = "/kaggle/working/"
    
    # Preprocessing
    MAX_LEN = 64           # max tokens per question (will verify from data)
    MIN_FREQ = 2           # min word frequency to include in vocab
    
    # Model
    EMBED_DIM = 128
    HIDDEN_DIM = 128
    NUM_LAYERS = 2
    DROPOUT = 0.3
    BIDIRECTIONAL = True
    
    # Training
    BATCH_SIZE = 32
    EPOCHS = 10
    LR = 1e-3
    WEIGHT_DECAY = 1e-5
    PATIENCE = 7           # early stopping patience
    
    # Device
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Reproducibility
    SEED = 42

cfg = Config()

# Set seeds
torch.manual_seed(cfg.SEED)
np.random.seed(cfg.SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(cfg.SEED)

print(f"Device: {cfg.DEVICE}")
print(f"PyTorch version: {torch.__version__}")

  from google.cloud.aiplatform.utils import gcs_utils


Device: cpu
PyTorch version: 2.8.0+cu126


In [3]:
# ============================================================
#  STAGE 1: PREPROCESSING V2
# ============================================================
print("\n" + "=" * 65)
print("  STAGE 1: PREPROCESSING V2")
print("=" * 65)

formal_normalizer = Normalizer()

def normalize_v2(text):
    """
    V2 normalization pipeline — safe version (no InformalNormalizer).
    1. Hazm formal normalization (handles ی/ک, spacing, etc.)
    2. Arabic char normalization
    3. Clean punctuation/extra whitespace
    """
    if not isinstance(text, str) or not text.strip():
        return ""
    
    # Step 1: Hazm formal normalization
    text = formal_normalizer.normalize(text)
    
    # Step 2: Additional Arabic → Persian char normalization
    text = text.replace("ي", "ی").replace("ك", "ک")
    text = text.replace("ؤ", "و").replace("إ", "ا").replace("أ", "ا")
    text = text.replace("ة", "ه")
    
    # Step 3: Normalize various dashes and special chars
    text = re.sub(r'[ـ]+', '', text)              # remove kashida (tatweel)
    text = re.sub(r'[‌]+', ' ', text)              # replace ZWNJ with space (hazm handles most)
    
    # Step 4: Keep Persian/Arabic letters, digits, basic punctuation, spaces
    text = re.sub(r'[^\u0600-\u06FF\u0750-\u077F\uFB50-\uFDFF\uFE70-\uFEFF'
                  r'a-zA-Z0-9۰-۹٠-٩\s\.\?\!،؛]', ' ', text)
    
    # Step 5: Clean extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text


def tokenize_text(text):
    """Tokenize using Hazm word_tokenize after normalization."""
    if not text:
        return []
    return word_tokenize(text)


# --- Load Data ---
print("Loading train.json ...")
with open(cfg.TRAIN_PATH, "r", encoding="utf-8") as f:
    train_data = json.load(f)
df_train = pd.DataFrame(train_data)
print(f"  Train shape: {df_train.shape}")
print(f"  Label distribution:\n{df_train['is_ambiguous'].value_counts().to_string()}")

print("\nLoading test.json ...")
with open(cfg.TEST_PATH, "r", encoding="utf-8") as f:
    test_data = json.load(f)
df_test = pd.DataFrame(test_data)
print(f"  Test shape: {df_test.shape}")
has_test_labels = "is_ambiguous" in df_test.columns
if has_test_labels:
    print(f"  Test label distribution:\n{df_test['is_ambiguous'].value_counts().to_string()}")

# --- Apply Normalization ---
print("\nNormalizing train questions ...")
df_train["norm_text"] = df_train["question"].apply(normalize_v2)
print("Normalizing test questions ...")
df_test["norm_text"] = df_test["question"].apply(normalize_v2)

# --- Tokenize ---
print("Tokenizing train ...")
df_train["tokens"] = df_train["norm_text"].apply(tokenize_text)
print("Tokenizing test ...")
df_test["tokens"] = df_test["norm_text"].apply(tokenize_text)

# --- Show Samples ---
print("\n--- Train Samples ---")
for i in range(5):
    print(f"  [{df_train['is_ambiguous'].iloc[i]}] {df_train['question'].iloc[i]}")
    print(f"       → {df_train['tokens'].iloc[i][:15]} ...")
    print()

# --- Sequence Length Analysis ---
train_lengths = df_train["tokens"].apply(len)
print(f"Token length stats (train):")
print(f"  Mean:   {train_lengths.mean():.1f}")
print(f"  Median: {train_lengths.median():.1f}")
print(f"  95th %: {train_lengths.quantile(0.95):.0f}")
print(f"  99th %: {train_lengths.quantile(0.99):.0f}")
print(f"  Max:    {train_lengths.max()}")

# Update MAX_LEN based on data (cover 95th percentile)
suggested_max_len = int(train_lengths.quantile(0.95)) + 2
if suggested_max_len != cfg.MAX_LEN:
    print(f"\n   Updating MAX_LEN: {cfg.MAX_LEN} → {suggested_max_len}")
    cfg.MAX_LEN = suggested_max_len


  STAGE 1: PREPROCESSING V2
Loading train.json ...
  Train shape: (900, 4)
  Label distribution:
is_ambiguous
0    450
1    450

Loading test.json ...
  Test shape: (100, 4)
  Test label distribution:
is_ambiguous
0    50
1    50

Normalizing train questions ...
Normalizing test questions ...
Tokenizing train ...
Tokenizing test ...

--- Train Samples ---
  [0] حداقل نمره قبولی در هر درس برای دانشجویان مقطع کارشناسی چند است؟
       → ['حداقل', 'نمره', 'قبولی', 'در', 'هر', 'درس', 'برای', 'دانشجویان', 'مقطع', 'کارشناسی', 'چند', 'است', '؟'] ...

  [0] حداکثر سنوات مجاز تحصیل در دوره کارشناسی پیوسته چند نیمسال است؟
       → ['حداکثر', 'سنوات', 'مجاز', 'تحصیل', 'در', 'دوره', 'کارشناسی', 'پیوسته', 'چند', 'نیمسال', 'است', '؟'] ...

  [0] آیا دانشجوی کارشناسی می‌تواند با معدل بالای ۱۷، بیش از ۲۰ واحد در ترم بعد اخذ کند؟
       → ['آیا', 'دانشجوی', 'کارشناسی', 'می', 'تواند', 'با', 'معدل', 'بالای', '۱۷', '،', 'بیش', 'از', '۲۰', 'واحد', 'در'] ...

  [0] دانشجوی کارشناسی ارشد حداکثر چند نیمسال می

In [4]:
# ============================================================
#  STAGE 2: VOCABULARY & DATASET
# ============================================================
print("\n" + "=" * 65)
print("  STAGE 2: VOCABULARY & DATASET")
print("=" * 65)

# --- Build Vocabulary (from TRAINING data only) ---
PAD_TOKEN = "<PAD>"
UNK_TOKEN = "<UNK>"
PAD_IDX = 0
UNK_IDX = 1

word_counts = Counter()
for tokens in df_train["tokens"]:
    word_counts.update(tokens)

print(f"Total unique words in train: {len(word_counts)}")

# Filter by min frequency
vocab_words = [w for w, c in word_counts.items() if c >= cfg.MIN_FREQ]
vocab_words.sort()  # deterministic order

word2idx = {PAD_TOKEN: PAD_IDX, UNK_TOKEN: UNK_IDX}
for w in vocab_words:
    word2idx[w] = len(word2idx)

idx2word = {v: k for k, v in word2idx.items()}
vocab_size = len(word2idx)

print(f"Vocab size (min_freq={cfg.MIN_FREQ}): {vocab_size}")
print(f"  (including <PAD> and <UNK>)")

# Check OOV rate on test
if len(df_test) > 0:
    test_tokens_all = [t for tokens in df_test["tokens"] for t in tokens]
    oov_count = sum(1 for t in test_tokens_all if t not in word2idx)
    print(f"Test OOV rate: {oov_count}/{len(test_tokens_all)} = "
          f"{oov_count/max(len(test_tokens_all),1)*100:.1f}%")


def encode_tokens(tokens, word2idx, max_len):
    """Convert token list to padded index array."""
    ids = [word2idx.get(t, UNK_IDX) for t in tokens[:max_len]]
    length = len(ids)
    # Pad
    ids = ids + [PAD_IDX] * (max_len - length)
    return ids, length


# --- PyTorch Dataset ---
class QuestionDataset(Dataset):
    def __init__(self, tokens_list, labels, word2idx, max_len):
        self.tokens_list = tokens_list
        self.labels = labels
        self.word2idx = word2idx
        self.max_len = max_len
    
    def __len__(self):
        return len(self.tokens_list)
    
    def __getitem__(self, idx):
        tokens = self.tokens_list[idx]
        label = self.labels[idx]
        
        ids, length = encode_tokens(tokens, self.word2idx, self.max_len)
        
        return {
            "input_ids": torch.tensor(ids, dtype=torch.long),
            "length": torch.tensor(length, dtype=torch.long),
            "label": torch.tensor(label, dtype=torch.float)
        }


# --- Prepare splits ---
# Full training set
train_tokens = df_train["tokens"].tolist()
train_labels = df_train["is_ambiguous"].values.astype(int)

# Test set
test_tokens = df_test["tokens"].tolist()
if has_test_labels:
    test_labels = df_test["is_ambiguous"].values.astype(int)
else:
    test_labels = np.zeros(len(df_test), dtype=int)  # placeholder

# Dev set = train set (as per your request, for debugging)
dev_tokens = train_tokens
dev_labels = train_labels

# Create datasets
train_dataset = QuestionDataset(train_tokens, train_labels, word2idx, cfg.MAX_LEN)
dev_dataset   = QuestionDataset(dev_tokens, dev_labels, word2idx, cfg.MAX_LEN)
test_dataset  = QuestionDataset(test_tokens, test_labels, word2idx, cfg.MAX_LEN)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=cfg.BATCH_SIZE, shuffle=True)
dev_loader   = DataLoader(dev_dataset, batch_size=cfg.BATCH_SIZE, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=cfg.BATCH_SIZE, shuffle=False)

print(f"\nDataLoaders ready:")
print(f"  Train: {len(train_dataset)} samples, {len(train_loader)} batches")
print(f"  Dev:   {len(dev_dataset)} samples (= train, for debugging)")
print(f"  Test:  {len(test_dataset)} samples")

# --- Compute class weights ---
n_pos = train_labels.sum()
n_neg = len(train_labels) - n_pos
pos_weight = torch.tensor([n_neg / max(n_pos, 1)], dtype=torch.float).to(cfg.DEVICE)
print(f"\nClass balance: neg={n_neg}, pos={n_pos}")
print(f"  pos_weight for BCEWithLogitsLoss: {pos_weight.item():.4f}")


  STAGE 2: VOCABULARY & DATASET
Total unique words in train: 1155
Vocab size (min_freq=2): 652
  (including <PAD> and <UNK>)
Test OOV rate: 90/1026 = 8.8%

DataLoaders ready:
  Train: 900 samples, 29 batches
  Dev:   900 samples (= train, for debugging)
  Test:  100 samples

Class balance: neg=450, pos=450
  pos_weight for BCEWithLogitsLoss: 1.0000


In [5]:
# ============================================================
#  STAGE 3: LSTM MODEL
# ============================================================
print("\n" + "=" * 65)
print("  STAGE 3: LSTM MODEL DEFINITION")
print("=" * 65)

class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers,
                 dropout, bidirectional, pad_idx):
        super().__init__()
        
        self.embedding = nn.Embedding(
            vocab_size, embed_dim, padding_idx=pad_idx
        )
        
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0,
            bidirectional=bidirectional
        )
        
        self.dropout = nn.Dropout(dropout)
        
        # If bidirectional, hidden output is 2 * hidden_dim
        lstm_output_dim = hidden_dim * 2 if bidirectional else hidden_dim
        
        # Attention-like pooling: combine last hidden state + max pool + mean pool
        self.fc = nn.Sequential(
            nn.Linear(lstm_output_dim * 3, lstm_output_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(lstm_output_dim, 1)
        )
    
    def forward(self, input_ids, lengths):
        # input_ids: (batch, max_len)
        # lengths:   (batch,)
        
        embedded = self.dropout(self.embedding(input_ids))
        # embedded: (batch, max_len, embed_dim)
        
        # Pack padded sequences for efficient LSTM
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu().clamp(min=1),
            batch_first=True, enforce_sorted=False
        )
        
        lstm_out, (hidden, cell) = self.lstm(packed)
        
        # Unpack
        lstm_out, _ = nn.utils.rnn.pad_packed_sequence(
            lstm_out, batch_first=True, total_length=input_ids.size(1)
        )
        # lstm_out: (batch, max_len, lstm_output_dim)
        
        # Create mask for padded positions
        mask = (input_ids != PAD_IDX).unsqueeze(-1).float()
        # mask: (batch, max_len, 1)
        
        # --- Pooling strategies ---
        # 1. Last hidden state (concat forward + backward for bidirectional)
        if self.lstm.bidirectional:
            # hidden: (num_layers * 2, batch, hidden_dim)
            last_hidden = torch.cat(
                [hidden[-2], hidden[-1]], dim=-1
            )  # (batch, hidden_dim * 2)
        else:
            last_hidden = hidden[-1]  # (batch, hidden_dim)
        
        # 2. Max pooling (masked)
        lstm_out_masked = lstm_out * mask + (1 - mask) * (-1e9)
        max_pool, _ = lstm_out_masked.max(dim=1)  # (batch, lstm_output_dim)
        
        # 3. Mean pooling (masked)
        sum_pool = (lstm_out * mask).sum(dim=1)  # (batch, lstm_output_dim)
        lengths_expanded = lengths.unsqueeze(-1).float().clamp(min=1).to(sum_pool.device)
        mean_pool = sum_pool / lengths_expanded
        
        # Concatenate all three
        combined = torch.cat([last_hidden, max_pool, mean_pool], dim=-1)
        
        logits = self.fc(self.dropout(combined)).squeeze(-1)
        # logits: (batch,)
        
        return logits


# Instantiate
model = LSTMClassifier(
    vocab_size=vocab_size,
    embed_dim=cfg.EMBED_DIM,
    hidden_dim=cfg.HIDDEN_DIM,
    num_layers=cfg.NUM_LAYERS,
    dropout=cfg.DROPOUT,
    bidirectional=cfg.BIDIRECTIONAL,
    pad_idx=PAD_IDX
).to(cfg.DEVICE)

total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"\nModel Architecture:")
print(model)
print(f"\nTotal parameters:     {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")


  STAGE 3: LSTM MODEL DEFINITION

Model Architecture:
LSTMClassifier(
  (embedding): Embedding(652, 128, padding_idx=0)
  (lstm): LSTM(128, 128, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Sequential(
    (0): Linear(in_features=768, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=256, out_features=1, bias=True)
  )
)

Total parameters:     940,033
Trainable parameters: 940,033


In [6]:
# ============================================================
#  STAGE 4: TRAINING WITH DEV MONITORING
# ============================================================
print("\n" + "=" * 65)
print("  STAGE 4: TRAINING")
print("=" * 65)

criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = torch.optim.Adam(
    model.parameters(), lr=cfg.LR, weight_decay=cfg.WEIGHT_DECAY
)
scheduler = ReduceLROnPlateau(
    optimizer, mode='max', factor=0.5, patience=3
)


def evaluate(model, loader, criterion, device):
    """Evaluate model on a dataloader. Returns loss, preds, labels."""
    model.eval()
    all_preds = []
    all_labels = []
    total_loss = 0.0
    n_batches = 0
    
    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            lengths = batch["length"].to(device)
            labels = batch["label"].to(device)
            
            logits = model(input_ids, lengths)
            loss = criterion(logits, labels)
            
            total_loss += loss.item()
            n_batches += 1
            
            preds = (torch.sigmoid(logits) >= 0.5).long().cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy().astype(int))
    
    avg_loss = total_loss / max(n_batches, 1)
    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)
    
    f1 = f1_score(all_labels, all_preds, average="macro")
    acc = accuracy_score(all_labels, all_preds)
    
    return avg_loss, f1, acc, all_preds, all_labels


# --- Training Loop ---
best_dev_f1 = 0.0
patience_counter = 0
history = {"train_loss": [], "dev_loss": [], "dev_f1": [], "dev_acc": [], "lr": []}

print(f"\nStarting training for {cfg.EPOCHS} epochs ...")
print(f"{'Epoch':>5} | {'Train Loss':>10} | {'Dev Loss':>10} | "
      f"{'Dev F1':>8} | {'Dev Acc':>8} | {'LR':>10} | {'Status'}")
print("-" * 85)

for epoch in range(1, cfg.EPOCHS + 1):
    # --- Train ---
    model.train()
    train_loss_sum = 0.0
    n_train_batches = 0
    
    for batch in train_loader:
        input_ids = batch["input_ids"].to(cfg.DEVICE)
        lengths = batch["length"].to(cfg.DEVICE)
        labels = batch["label"].to(cfg.DEVICE)
        
        optimizer.zero_grad()
        logits = model(input_ids, lengths)
        loss = criterion(logits, labels)
        loss.backward()
        
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        
        optimizer.step()
        
        train_loss_sum += loss.item()
        n_train_batches += 1
    
    avg_train_loss = train_loss_sum / max(n_train_batches, 1)
    
    # --- Evaluate on dev (= train, for debugging) ---
    dev_loss, dev_f1, dev_acc, dev_preds, dev_labels = evaluate(
        model, dev_loader, criterion, cfg.DEVICE
    )
    
    # --- LR scheduler ---
    current_lr = optimizer.param_groups[0]["lr"]
    scheduler.step(dev_f1)
    
    # --- Track history ---
    history["train_loss"].append(avg_train_loss)
    history["dev_loss"].append(dev_loss)
    history["dev_f1"].append(dev_f1)
    history["dev_acc"].append(dev_acc)
    history["lr"].append(current_lr)
    
    # --- Early stopping & checkpointing ---
    status = ""
    if dev_f1 > best_dev_f1:
        best_dev_f1 = dev_f1
        patience_counter = 0
        # Save best model
        torch.save(model.state_dict(), os.path.join(cfg.SAVE_DIR, "best_lstm.pt"))
        status = "★ BEST"
    else:
        patience_counter += 1
        if patience_counter >= cfg.PATIENCE:
            status = " STOP"
        
    print(f"{epoch:>5} | {avg_train_loss:>10.4f} | {dev_loss:>10.4f} | "
          f"{dev_f1:>8.4f} | {dev_acc:>8.4f} | {current_lr:>10.6f} | {status}")
    
    if patience_counter >= cfg.PATIENCE:
        print(f"\n  Early stopping triggered at epoch {epoch} (patience={cfg.PATIENCE})")
        break

print(f"\n  Best Dev F1-Macro: {best_dev_f1:.4f}")

# --- Dev set detailed report (at best checkpoint) ---
print("\n--- Dev Set Report (Train-as-Dev, Best Checkpoint) ---")
model.load_state_dict(torch.load(os.path.join(cfg.SAVE_DIR, "best_lstm.pt")))

dev_loss, dev_f1, dev_acc, dev_preds, dev_labels = evaluate(
    model, dev_loader, criterion, cfg.DEVICE
)

print(f"\nDev F1-Macro:  {dev_f1:.4f}")
print(f"Dev Accuracy:  {dev_acc:.4f}")
print(f"\nClassification Report:")
print(classification_report(
    dev_labels, dev_preds,
    target_names=["Not Ambiguous", "Ambiguous"],
    digits=4
))

cm = confusion_matrix(dev_labels, dev_preds)
print(f"Confusion Matrix:")
print(f"                  Pred_NotAmb  Pred_Amb")
print(f"  True_NotAmb     {cm[0][0]:>8}     {cm[0][1]:>8}")
print(f"  True_Amb        {cm[1][0]:>8}     {cm[1][1]:>8}")



  STAGE 4: TRAINING

Starting training for 10 epochs ...
Epoch | Train Loss |   Dev Loss |   Dev F1 |  Dev Acc |         LR | Status
-------------------------------------------------------------------------------------
    1 |     0.5566 |     0.4265 |   0.8229 |   0.8256 |   0.001000 | ★ BEST
    2 |     0.3643 |     0.2443 |   0.8977 |   0.8978 |   0.001000 | ★ BEST
    3 |     0.2690 |     0.1780 |   0.9378 |   0.9378 |   0.001000 | ★ BEST
    4 |     0.2146 |     0.1137 |   0.9600 |   0.9600 |   0.001000 | ★ BEST
    5 |     0.1845 |     0.0846 |   0.9744 |   0.9744 |   0.001000 | ★ BEST
    6 |     0.1314 |     0.0571 |   0.9867 |   0.9867 |   0.001000 | ★ BEST
    7 |     0.0950 |     0.0328 |   0.9878 |   0.9878 |   0.001000 | ★ BEST
    8 |     0.0921 |     0.0221 |   0.9922 |   0.9922 |   0.001000 | ★ BEST
    9 |     0.0709 |     0.0165 |   0.9956 |   0.9956 |   0.001000 | ★ BEST
   10 |     0.0717 |     0.0161 |   0.9944 |   0.9944 |   0.001000 | 

  Best Dev F1-Macro: 0.99

In [7]:
# ============================================================
#  STAGE 5: FINAL TEST EVALUATION & SAVE
# ============================================================
print("\n" + "=" * 65)
print("  STAGE 5: FINAL TEST EVALUATION & MODEL SAVING")
print("=" * 65)

# --- Test evaluation ---
test_loss, test_f1, test_acc, test_preds, test_labels_arr = evaluate(
    model, test_loader, criterion, cfg.DEVICE
)

print(f"\n{'='*40}")
print(f"  FINAL TEST RESULTS")
print(f"{'='*40}")

if has_test_labels:
    print(f"  Test F1-Macro:  {test_f1:.4f}")
    print(f"  Test Accuracy:  {test_acc:.4f}")
    print(f"\nClassification Report:")
    print(classification_report(
        test_labels_arr, test_preds,
        target_names=["Not Ambiguous", "Ambiguous"],
        digits=4
    ))
    
    cm_test = confusion_matrix(test_labels_arr, test_preds)
    print(f"Confusion Matrix:")
    print(f"                  Pred_NotAmb  Pred_Amb")
    print(f"  True_NotAmb     {cm_test[0][0]:>8}     {cm_test[0][1]:>8}")
    print(f"  True_Amb        {cm_test[1][0]:>8}     {cm_test[1][1]:>8}")
else:
    print(f"  No test labels available — predictions saved only.")
    print(f"  Prediction distribution:")
    print(f"    Not Ambiguous (0): {(test_preds == 0).sum()}")
    print(f"    Ambiguous (1):     {(test_preds == 1).sum()}")

# --- Save predictions ---
df_test["predicted_label"] = test_preds

output_cols = ["id", "question", "norm_text", "predicted_label"]
if has_test_labels:
    output_cols.insert(3, "is_ambiguous")
df_test[output_cols].to_csv(
    os.path.join(cfg.SAVE_DIR, "lstm_test_predictions.csv"),
    index=False, encoding="utf-8-sig"
)

# Submission file
submission = df_test[["id", "predicted_label"]].copy()
submission.columns = ["id", "is_ambiguous"]
submission.to_csv(os.path.join(cfg.SAVE_DIR, "submission.csv"), index=False)

# --- Save model artifacts ---
# 1. Model weights (already saved as best_lstm.pt)
print(f"\n--- Saving Model Artifacts ---")

# 2. Vocabulary
vocab_path = os.path.join(cfg.SAVE_DIR, "vocab.pkl")
with open(vocab_path, "wb") as f:
    pickle.dump({
        "word2idx": word2idx,
        "idx2word": idx2word,
        "vocab_size": vocab_size
    }, f)
print(f"   Vocabulary saved: {vocab_path}")

# 3. Config
config_dict = {
    "max_len": cfg.MAX_LEN,
    "embed_dim": cfg.EMBED_DIM,
    "hidden_dim": cfg.HIDDEN_DIM,
    "num_layers": cfg.NUM_LAYERS,
    "dropout": cfg.DROPOUT,
    "bidirectional": cfg.BIDIRECTIONAL,
    "vocab_size": vocab_size,
    "pad_idx": PAD_IDX,
    "best_dev_f1": best_dev_f1,
}
if has_test_labels:
    config_dict["test_f1"] = test_f1
    config_dict["test_acc"] = test_acc

config_path = os.path.join(cfg.SAVE_DIR, "config.pkl")
with open(config_path, "wb") as f:
    pickle.dump(config_dict, f)
print(f"   Config saved: {config_path}")

# 4. Training history
history_path = os.path.join(cfg.SAVE_DIR, "training_history.pkl")
with open(history_path, "wb") as f:
    pickle.dump(history, f)
print(f"   Training history saved: {history_path}")

print(f"   Model weights: {os.path.join(cfg.SAVE_DIR, 'best_lstm.pt')}")
print(f"   Predictions: lstm_test_predictions.csv")
print(f"   Submission: submission.csv")

# --- Summary ---
print("\n" + "=" * 65)
print("  PIPELINE COMPLETE — SUMMARY")
print("=" * 65)
print(f"  Model:          BiLSTM (2-layer, hidden={cfg.HIDDEN_DIM})")
print(f"  Vocab size:     {vocab_size:,}")
print(f"  Max seq length: {cfg.MAX_LEN}")
print(f"  Best Dev F1:    {best_dev_f1:.4f}")
if has_test_labels:
    print(f"  Test F1-Macro:  {test_f1:.4f}")
    print(f"  Test Accuracy:  {test_acc:.4f}")
print(f"  Saved files:    best_lstm.pt, vocab.pkl, config.pkl, "
      f"training_history.pkl, submission.csv")
print("=" * 65)


  STAGE 5: FINAL TEST EVALUATION & MODEL SAVING

  FINAL TEST RESULTS
  Test F1-Macro:  0.9000
  Test Accuracy:  0.9000

Classification Report:
               precision    recall  f1-score   support

Not Ambiguous     0.9000    0.9000    0.9000        50
    Ambiguous     0.9000    0.9000    0.9000        50

     accuracy                         0.9000       100
    macro avg     0.9000    0.9000    0.9000       100
 weighted avg     0.9000    0.9000    0.9000       100

Confusion Matrix:
                  Pred_NotAmb  Pred_Amb
  True_NotAmb           45            5
  True_Amb               5           45

--- Saving Model Artifacts ---
   Vocabulary saved: /kaggle/working/vocab.pkl
   Config saved: /kaggle/working/config.pkl
   Training history saved: /kaggle/working/training_history.pkl
   Model weights: /kaggle/working/best_lstm.pt
   Predictions: lstm_test_predictions.csv
   Submission: submission.csv

  PIPELINE COMPLETE — SUMMARY
  Model:          BiLSTM (2-layer, hidden=128)
