In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/datasets/aabdollahii/university-questions/train.json
/kaggle/input/datasets/aabdollahii/university-questions/test.json


In [4]:
# ============================================================
#  FULL PIPELINE: LSTM for Persian Question Ambiguity Detection
# ============================================================
#  Stage 1: Preprocessing V2 (train + test)
#  Stage 2: Vocabulary & Dataset
#  Stage 3: LSTM Model Definition
#  Stage 4: Training with dev (train-as-dev) monitoring
#  Stage 5: Final test evaluation & model saving
# ============================================================

import json
import re
import os
import pickle
import numpy as np
import pandas as pd
from collections import Counter

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report, confusion_matrix,
    f1_score, accuracy_score
)

from hazm import Normalizer, word_tokenize

# ============================================================
#  CONFIG
# ============================================================
class Config:
    """Single home for every tunable setting of the pipeline.

    All settings are plain class attributes, grouped by the stage that
    consumes them. MAX_LEN is only a starting value: the preprocessing
    stage overwrites it on the config instance after measuring token
    lengths in the training data.
    """

    # ---- Input / output locations ----
    TRAIN_PATH = "/kaggle/input/datasets/aabdollahii/university-questions/train.json"
    TEST_PATH = "/kaggle/input/datasets/aabdollahii/university-questions/test.json"
    SAVE_DIR = "/kaggle/working/"

    # ---- Preprocessing ----
    MAX_LEN = 64    # token cap per question (starting value, re-derived from data)
    MIN_FREQ = 2    # a word needs at least this many occurrences to enter the vocab

    # ---- Model ----
    EMBED_DIM = 128
    HIDDEN_DIM = 128
    NUM_LAYERS = 2
    DROPOUT = 0.3
    BIDIRECTIONAL = True

    # ---- Training ----
    BATCH_SIZE = 32
    EPOCHS = 30
    LR = 0.001
    WEIGHT_DECAY = 0.00001
    PATIENCE = 7    # early-stopping patience

    # ---- Device: CUDA when PyTorch reports it as available, otherwise CPU ----
    DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    # ---- Reproducibility ----
    SEED = 42

# One shared configuration instance for the rest of the notebook.
cfg = Config()

# Seed NumPy and PyTorch from the same configured value; when CUDA is
# available, every CUDA device is seeded explicitly as well.
np.random.seed(cfg.SEED)
torch.manual_seed(cfg.SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(cfg.SEED)

# Report the runtime environment.
print(f"Device: {cfg.DEVICE}")
print(f"PyTorch version: {torch.__version__}")

Device: cpu
PyTorch version: 2.8.0+cu126


In [None]:
# ============================================================
#  STAGE 1: PREPROCESSING V2
# ============================================================
# Stage banner: blank line, rule, title, rule (one print call, newline-separated).
print("\n" + "=" * 65, "  STAGE 1: PREPROCESSING V2", "=" * 65, sep="\n")

# Shared Hazm normalizer instance, reused for every question.
formal_normalizer = Normalizer()

# Single-character Arabic → Persian substitutions applied after Hazm.
_ARABIC_TO_PERSIAN = str.maketrans({
    "ي": "ی",   # Arabic yeh → Persian yeh
    "ك": "ک",   # Arabic kaf → Persian kaf
    "ؤ": "و",   # waw with hamza → waw
    "إ": "ا",   # alef with hamza below → alef
    "أ": "ا",   # alef with hamza above → alef
    "ة": "ه",   # teh marbuta → heh
})


def normalize_v2(text):
    """
    V2 normalization pipeline — safe version (no InformalNormalizer).

    Steps, in order:
    1. Hazm formal normalization via the module-level ``formal_normalizer``.
    2. Arabic → Persian single-character substitutions.
    3. Remove kashida/tatweel (U+0640); replace ZWNJ (U+200C) with a space.
    4. Replace every character outside the allow-list (Arabic-script blocks,
       ASCII letters and digits, whitespace, and . ? !) with a space.
    5. Collapse whitespace runs to one space and strip both ends.

    Args:
        text: Raw question text.

    Returns:
        The normalized string. Returns "" when ``text`` is not a ``str`` or
        contains only whitespace.
    """
    if not isinstance(text, str) or not text.strip():
        return ""

    # Step 1: Hazm formal normalization
    text = formal_normalizer.normalize(text)

    # Step 2: Additional Arabic → Persian char normalization (one pass)
    text = text.translate(_ARABIC_TO_PERSIAN)

    # Step 3: Drop tatweel and turn ZWNJ into a regular space. Both characters
    # are written as explicit escapes: the ZWNJ is invisible in source, and a
    # stripped literal would silently break the pattern.
    text = re.sub(r'\u0640+', '', text)
    text = re.sub(r'\u200c+', ' ', text)

    # Step 4: Keep Persian/Arabic letters, digits, basic punctuation, spaces
    text = re.sub(r'[^\u0600-\u06FF\u0750-\u077F\uFB50-\uFDFF\uFE70-\uFEFF'
                  r'a-zA-Z0-9۰-۹٠-٩\s\.\?\!،؛]', ' ', text)

    # Step 5: Clean extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text


def tokenize_text(text):
    """Split normalized text into tokens with Hazm's ``word_tokenize``.

    Falsy input (empty string, ``None``) yields an empty list without
    calling the tokenizer.
    """
    return word_tokenize(text) if text else []


# --- Load Data ---
# Training split: parse the JSON file and wrap the records in a DataFrame.
print("Loading train.json ...")
with open(cfg.TRAIN_PATH, encoding="utf-8") as train_file:
    train_data = json.load(train_file)
df_train = pd.DataFrame(train_data)
print(f"  Train shape: {df_train.shape}")
train_label_counts = df_train["is_ambiguous"].value_counts().to_string()
print(f"  Label distribution:\n{train_label_counts}")

# Test split: same procedure; labels are reported only when the column exists.
print("\nLoading test.json ...")
with open(cfg.TEST_PATH, encoding="utf-8") as test_file:
    test_data = json.load(test_file)
df_test = pd.DataFrame(test_data)
print(f"  Test shape: {df_test.shape}")
has_test_labels = "is_ambiguous" in df_test.columns
if has_test_labels:
    test_label_counts = df_test["is_ambiguous"].value_counts().to_string()
    print(f"  Test label distribution:\n{test_label_counts}")

# --- Apply Normalization ---
# Both splits go through the same two passes; every normalization finishes
# before any tokenization starts. Columns are written onto the existing frames.
splits = (("train", df_train), ("test", df_test))

print()  # blank separator line before the progress messages
for split_name, frame in splits:
    print(f"Normalizing {split_name} questions ...")
    frame["norm_text"] = frame["question"].apply(normalize_v2)

# --- Tokenize ---
for split_name, frame in splits:
    print(f"Tokenizing {split_name} ...")
    frame["tokens"] = frame["norm_text"].apply(tokenize_text)

# --- Show Samples ---
print("\n--- Train Samples ---")
# head(5) returns at most five rows, so a training set with fewer than five
# questions is shown in full instead of raising IndexError.
sample_rows = df_train.head(5)
for label, question, tokens in zip(
    sample_rows["is_ambiguous"], sample_rows["question"], sample_rows["tokens"]
):
    print(f"  [{label}] {question}")
    # Only the first 15 tokens are shown to keep each sample on one line.
    print(f"       → {tokens[:15]} ...")
    print()

# --- Sequence Length Analysis ---
# Token count of every training question; each quantile is computed once and reused.
train_lengths = df_train["tokens"].apply(len)
p95_len = train_lengths.quantile(0.95)
p99_len = train_lengths.quantile(0.99)

print("Token length stats (train):")
print(f"  Mean:   {train_lengths.mean():.1f}")
print(f"  Median: {train_lengths.median():.1f}")
print(f"  95th %: {p95_len:.0f}")
print(f"  99th %: {p99_len:.0f}")
print(f"  Max:    {train_lengths.max()}")

# Derive MAX_LEN from the data: truncated 95th-percentile length plus 2.
# NOTE(review): the reason for the +2 margin is not stated in the source — confirm.
suggested_max_len = int(p95_len) + 2
if cfg.MAX_LEN != suggested_max_len:
    print(f"\n   Updating MAX_LEN: {cfg.MAX_LEN} → {suggested_max_len}")
    cfg.MAX_LEN = suggested_max_len


  STAGE 1: PREPROCESSING V2
Loading train.json ...
  Train shape: (900, 4)
  Label distribution:
is_ambiguous
0    450
1    450

Loading test.json ...
  Test shape: (100, 4)
  Test label distribution:
is_ambiguous
0    50
1    50

Normalizing train questions ...
Normalizing test questions ...
Tokenizing train ...
