# BIP v10.10 - Bond Invariance Probe

**Key changes in v10.10:**
- Expanded Sanskrit corpus (~121 unique passages, deduplicated)
- Expanded Pali corpus (~90 unique passages, deduplicated)
- Expanded Buddhist Chinese corpus (~109 passages)
- Role-aware data augmentation to improve agent/patient sensitivity
- Role contrastive loss in training (addresses weak role_swap from fuzz testing)

Run all cells in order. Requires GPU runtime.

In [None]:
# @title 1. Configuration & Setup { display-mode: "form" }
# @markdown ## Data Source Configuration

# User-selected data acquisition mode; the two boolean flags further down are
# derived from this single choice.
DATA_MODE = "Update missing"  # @param ["Refresh all", "Update missing", "Cache only"]
# @markdown - **Refresh all**: Re-download everything from source (slow, ~2hrs)
# @markdown - **Update missing**: Use cache, download only what's missing (recommended)
# @markdown - **Cache only**: Use only cached data, fail if missing

# Name of the sub-folder created under the platform's persistent storage root.
DRIVE_FOLDER = "BIP_v10"  # @param {type:"string"}
# @markdown Folder name for persistent storage

# Derive flags from DATA_MODE
USE_DRIVE_DATA = True  # Always use Drive for caching
# True only in "Refresh all" mode; when set, cached processed data is not loaded.
REFRESH_DATA_FROM_SOURCE = DATA_MODE == "Refresh all"
# True only in "Cache only" mode; the corpus-loading cell raises if the cache is missing.
CACHE_ONLY = DATA_MODE == "Cache only"
# @markdown ---
# @markdown ## Model Backbone
BACKBONE = "MiniLM"  # @param ["MiniLM", "LaBSE", "XLM-R-base", "XLM-R-large"]
# @markdown - **MiniLM**: Fast, 118M params, good baseline
# @markdown - **LaBSE**: Best cross-lingual alignment, 471M params (recommended)
# @markdown - **XLM-R-base**: Strong multilingual, 270M params
# @markdown - **XLM-R-large**: Strongest representations, 550M params

# @markdown ---
# @markdown ## Output Options
CREATE_DOWNLOAD_ZIP = False  # @param {type:"boolean"}
# @markdown - **CREATE_DOWNLOAD_ZIP**: Create and download a zip file of results (optional)
# @markdown - Results are always persisted to Google Drive regardless of this setting

# GPU tier labels, in the key order shared by every "recommended_batch" table.
_GPU_TIERS = ("L4/A100", "T4", "2xT4", "SMALL", "MINIMAL/CPU")


def _backbone_entry(model_name, hidden_size, batch_sizes):
    """Build one backbone record; ``batch_sizes`` lines up with ``_GPU_TIERS``."""
    return {
        "model_name": model_name,
        "hidden_size": hidden_size,
        "recommended_batch": dict(zip(_GPU_TIERS, batch_sizes)),
    }


# Backbone configurations: model id, encoder width, and per-tier batch sizes.
BACKBONE_CONFIGS = {
    "MiniLM": _backbone_entry(
        "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
        384,
        (512, 256, 512, 128, 64),
    ),
    "LaBSE": _backbone_entry(
        "sentence-transformers/LaBSE",
        768,
        (256, 128, 256, 64, 32),
    ),
    "XLM-R-base": _backbone_entry(
        "xlm-roberta-base",
        768,
        (256, 128, 256, 64, 32),
    ),
    "XLM-R-large": _backbone_entry(
        "xlm-roberta-large",
        1024,
        (128, 64, 128, 32, 16),
    ),
}

# Resolve the selected backbone once; an unknown name raises KeyError here.
BACKBONE_CONFIG = BACKBONE_CONFIGS[BACKBONE]
MODEL_NAME, BACKBONE_HIDDEN = (
    BACKBONE_CONFIG["model_name"],
    BACKBONE_CONFIG["hidden_size"],
)


# @markdown ---
# @markdown ## Run Setup

import time
import os
import sys

# Wall-clock timestamp taken when the setup cell starts running.
EXPERIMENT_START = time.time()

print("=" * 60)
# Banner version kept in step with the notebook title (v10.10).
print("BIP v10.10 - ENVIRONMENT DETECTION")
print("=" * 60)

# ===== ENVIRONMENT DETECTION =====
# Detect which cloud platform we're running on

# Placeholder values; they are reassigned from detect_environment()'s result
# further down in this cell.
ENV_NAME = "UNKNOWN"
ENV_GPU_QUOTA = "Unknown"
PERSISTENT_STORAGE = None
DATA_DIR = "/content"  # Default

def detect_environment():
    """Identify the hosting platform for this run.

    Probes are tried in a fixed priority order and the first hit wins:
    importability of ``google.colab`` first, then well-known filesystem
    paths and environment variables for each supported service.

    Returns:
        A 4-tuple of strings ``(name, gpu_quota, storage_path, data_dir)``.
        When nothing matches, ``name`` is ``"LOCAL"`` and both paths are the
        current working directory.
    """
    env = os.environ.get
    on_disk = os.path.exists

    # 1. Google Colab is recognized by its helper package, not by a path.
    try:
        import google.colab  # noqa: F401
    except ImportError:
        pass
    else:
        return ("COLAB", "Free: T4 ~12h/day, Pro: L4/A100", "/content/drive/MyDrive", "/content")

    # (probe, result) pairs; probes are callables so nothing past the first
    # match is evaluated.
    probes = (
        # 2. Kaggle Kernels: /kaggle/input for datasets, /kaggle/working for output
        (
            lambda: on_disk("/kaggle"),
            ("KAGGLE", "Free: 2xT4 30h/week, TPU 30h/week", "/kaggle/working", "/kaggle/working"),
        ),
        # 3. Lightning.ai Studios: /teamspace/studios is the persistent area
        (
            lambda: env("LIGHTNING_CLOUDSPACE_HOST") or on_disk("/teamspace"),
            (
                "LIGHTNING_AI",
                "Free: 22h/month GPU, Pro: A10G/H100",
                "/teamspace/studios",
                "/teamspace/studios",
            ),
        ),
        # 4. Paperspace Gradient
        (
            lambda: env("PAPERSPACE_NOTEBOOK_REPO_ID") or on_disk("/notebooks"),
            ("PAPERSPACE", "Free: M4000 6h, Pro: A100/H100", "/storage", "/notebooks"),
        ),
        # 5. Saturn Cloud
        (
            lambda: env("SATURN_RESOURCE_ID") or "saturn" in env("HOSTNAME", "").lower(),
            (
                "SATURN_CLOUD",
                "Free: T4 10h/month, Pro: A10G/A100",
                "/home/jovyan/workspace",
                "/home/jovyan",
            ),
        ),
        # 6. HuggingFace Spaces
        (
            lambda: env("SPACE_ID") or env("HF_SPACE_ID"),
            (
                "HUGGINGFACE_SPACES",
                "Free: CPU only, ZeroGPU: A10G/A100 quota",
                "/data",
                "/home/user/app",
            ),
        ),
        # 7. AWS SageMaker Studio Lab
        (
            lambda: on_disk("/home/studio-lab-user"),
            (
                "SAGEMAKER_STUDIO_LAB",
                "Free: T4 4h/session, 24h max/day",
                "/home/studio-lab-user",
                "/home/studio-lab-user",
            ),
        ),
        # 8. Deepnote
        (
            lambda: env("DEEPNOTE_PROJECT_ID"),
            ("DEEPNOTE", "Free: CPU, Pro: T4/A10G", "/work", "/work"),
        ),
    )
    for matches, description in probes:
        if matches():
            return description

    # 9. Local/Unknown: use the working directory for both storage and data.
    here = os.getcwd()
    return ("LOCAL", "Depends on local hardware", here, here)


# Run detection once and publish the four results as module-level settings.
ENV_NAME, ENV_GPU_QUOTA, PERSISTENT_STORAGE, DATA_DIR = detect_environment()

print(
    f"\nEnvironment: {ENV_NAME}",
    f"GPU Quota:   {ENV_GPU_QUOTA}",
    f"Storage:     {PERSISTENT_STORAGE}",
    f"Data Dir:    {DATA_DIR}",
    sep="\n",
)

# Per-platform usage hints, keyed by the environment name returned above.
ENV_TIPS = {
    "COLAB": [
        "Tip: Use GPU runtime (Runtime -> Change runtime type -> T4 GPU)",
        "Tip: Colab Pro gives L4 GPU access (~2x faster than T4)",
    ],
    "KAGGLE": [
        "Tip: Enable GPU (Settings -> Accelerator -> GPU T4 x2)",
        "Tip: 30h/week GPU quota resets every Saturday",
        "Tip: Upload data as a Kaggle Dataset for persistence",
    ],
    "LIGHTNING_AI": [
        "Tip: Select GPU studio (A10G recommended for this workload)",
        "Tip: /teamspace/studios persists across sessions",
    ],
    "PAPERSPACE": [
        "Tip: Use /storage for persistent data across runs",
        "Tip: Free tier has 6h/month GPU limit",
    ],
    "SATURN_CLOUD": [
        "Tip: Start a T4 instance from the Resources tab",
        "Tip: 10h/month free GPU quota",
    ],
    "HUGGINGFACE_SPACES": [
        "Tip: ZeroGPU provides A10G/A100 access with quota system",
        "Tip: Use Gradio/Streamlit for interactive demos",
    ],
    "SAGEMAKER_STUDIO_LAB": [
        "Tip: Request GPU runtime from the launcher",
        "Tip: Sessions timeout after 4h, max 24h/day",
    ],
    "LOCAL": ["Tip: Running locally - ensure CUDA is installed for GPU support"],
}

# Environments without an entry get a single generic line instead.
if ENV_NAME in ENV_TIPS:
    _tips_to_show = ENV_TIPS[ENV_NAME]
else:
    _tips_to_show = ["No specific tips for this environment"]

print("\n" + "-" * 60)
print("ENVIRONMENT TIPS:")
for tip in _tips_to_show:
    print(f"  {tip}")
print("-" * 60)

# ===== INSTALL DEPENDENCIES =====
import subprocess

print("\nInstalling dependencies...")
# Packages this notebook needs beyond the base runtime.
_REQUIRED_PACKAGES = [
    "transformers",
    "sentence-transformers",
    "pandas",
    "tqdm",
    "scikit-learn",
    "pyyaml",
    "psutil",
    "datasets",
]
# One pip invocation for the whole list: the requirements are resolved together
# and only a single subprocess is spawned. check_call still raises
# CalledProcessError if the install fails.
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *_REQUIRED_PACKAGES])

import torch
import psutil

print("\n" + "=" * 60)
print("GPU DETECTION & RESOURCE ALLOCATION")
print("=" * 60)

# Detect hardware: without CUDA every GPU field falls back to a CPU placeholder.
_cuda_ok = torch.cuda.is_available()
GPU_NAME = torch.cuda.get_device_name(0) if _cuda_ok else "CPU"
VRAM_GB = torch.cuda.get_device_properties(0).total_memory / 1e9 if _cuda_ok else 0
GPU_COUNT = torch.cuda.device_count() if _cuda_ok else 0

RAM_GB = psutil.virtual_memory().total / 1e9

# Multi-GPU hosts get the device count and the summed VRAM appended.
_gpu_line = f"  GPU:  {GPU_NAME}"
_vram_line = f"  VRAM: {VRAM_GB:.1f} GB"
if GPU_COUNT > 1:
    _gpu_line += f" (x{GPU_COUNT})"
    _vram_line += f" (total: {VRAM_GB*GPU_COUNT:.1f} GB)"

print("\nDetected Hardware:")
print(_gpu_line)
print(_vram_line)
print(f"  RAM:  {RAM_GB:.1f} GB")

# Set optimal parameters based on hardware: the first VRAM floor that is met wins.
GPU_TIER = "MINIMAL/CPU"
for _floor_gb, _tier in (
    (22, "L4/A100"),  # L4 (24GB) or A100
    (14, "T4"),  # T4 (16GB)
    (10, "SMALL"),
):
    if VRAM_GB >= _floor_gb:
        GPU_TIER = _tier
        break

# Kaggle with 2xT4 can use larger batch
if GPU_COUNT >= 2 and ENV_NAME == "KAGGLE":
    GPU_TIER = "2xT4"
    print("  ** Kaggle 2xT4 detected **")

# Get backbone-specific batch size (64 when the tier has no table entry)
BATCH_SIZE = BACKBONE_CONFIG["recommended_batch"].get(GPU_TIER, 64)
print(f"  Backbone: {BACKBONE} -> batch size {BATCH_SIZE}")

MAX_PER_LANG = 50000  # Language sample limit
CPU_CORES = os.cpu_count() or 2
# Worker processes are only enabled on hosts with at least 24 GB RAM and 14 GB VRAM.
if RAM_GB >= 24 and VRAM_GB >= 14:
    NUM_WORKERS = min(4, CPU_CORES - 1)
else:
    NUM_WORKERS = 0
MAX_TEST_SAMPLES = 20000
# Learning rate scales linearly with batch size, relative to 2e-5 at batch 256.
LR = 2e-5 * (BATCH_SIZE / 256)

print("\n" + "-" * 60)
print("OPTIMAL SETTINGS:")
print("-" * 60)
print(f"  Environment:     {ENV_NAME}")
print(f"  GPU Tier:        {GPU_TIER}")
print(f"  Backbone:        {BACKBONE}")
print(f"  Batch size:      {BATCH_SIZE}")
print(f"  Max per lang:    {MAX_PER_LANG:,}")
print(f"  DataLoader workers: {NUM_WORKERS}")
print(f"  Learning rate:   {LR:.2e}")

# Device setup: mixed precision and its gradient scaler are used only with CUDA.
device = torch.device("cuda" if _cuda_ok else "cpu")
USE_AMP = _cuda_ok
scaler = torch.amp.GradScaler("cuda") if USE_AMP else None

# ===== PERSISTENT STORAGE SETUP =====
print("\n" + "=" * 60)
print("PERSISTENT STORAGE SETUP")
print("=" * 60)

SAVE_DIR = None
DRIVE_HAS_DATA = False
DRIVE_FILES = set()  # Use set for O(1) lookup

if ENV_NAME == "COLAB":
    # Google Colab - try to mount Drive, then confirm the mount really exists
    # before pointing SAVE_DIR at it.
    DRIVE_MOUNT_PATH = "/content/drive"
    try:
        from google.colab import drive

        if os.path.exists(f"{DRIVE_MOUNT_PATH}/MyDrive"):
            print("Google Drive already mounted")
        else:
            try:
                drive.mount(DRIVE_MOUNT_PATH, force_remount=False)
                print("Google Drive mounted successfully")
            except Exception as e:
                print(f"Drive mount issue: {e}")
                try:
                    drive.mount(DRIVE_MOUNT_PATH, force_remount=True)
                    print("Google Drive mounted (force remount)")
                except Exception as e2:
                    print(f"WARNING: Could not mount Drive: {e2}")
    except Exception as e:
        print(f"Colab Drive setup failed: {e}")

    if os.path.isdir(f"{DRIVE_MOUNT_PATH}/MyDrive"):
        SAVE_DIR = f"{DRIVE_MOUNT_PATH}/MyDrive/{DRIVE_FOLDER}"
    else:
        # Drive is not available: actually apply the announced fallback.
        # Otherwise the makedirs call that follows would create a plain local
        # directory under the unmounted Drive path, and results would not persist.
        print("Falling back to local storage")
        PERSISTENT_STORAGE = DATA_DIR
        SAVE_DIR = f"{DATA_DIR}/{DRIVE_FOLDER}"

elif ENV_NAME == "KAGGLE":
    # Kaggle - use working directory
    SAVE_DIR = f"{PERSISTENT_STORAGE}/{DRIVE_FOLDER}"
    print(f"Using Kaggle working directory: {SAVE_DIR}")
    print("Note: Data persists until kernel is reset")
    # Check for uploaded datasets and show up to five of them
    if os.path.exists("/kaggle/input"):
        datasets = os.listdir("/kaggle/input")
        if datasets:
            print(f"Available datasets: {datasets[:5]}")

elif ENV_NAME == "LIGHTNING_AI":
    SAVE_DIR = f"{PERSISTENT_STORAGE}/{DRIVE_FOLDER}"
    print(f"Using Lightning.ai studio storage: {SAVE_DIR}")

elif ENV_NAME == "PAPERSPACE":
    SAVE_DIR = f"{PERSISTENT_STORAGE}/{DRIVE_FOLDER}"
    print(f"Using Paperspace /storage: {SAVE_DIR}")

elif ENV_NAME == "HUGGINGFACE_SPACES":
    # HF Spaces has limited persistent storage
    SAVE_DIR = f"{PERSISTENT_STORAGE}/{DRIVE_FOLDER}"
    print(f"Using HuggingFace Spaces storage: {SAVE_DIR}")
    print("Warning: HF Spaces storage is limited")

else:
    # Every other environment stores under its detected storage root.
    SAVE_DIR = f"{PERSISTENT_STORAGE}/{DRIVE_FOLDER}"
    print(f"Using local storage: {SAVE_DIR}")

# Record whether the folder pre-existed, then make sure it is there.
folder_existed = os.path.exists(SAVE_DIR)
os.makedirs(SAVE_DIR, exist_ok=True)

# Inventory the storage folder using BOTH listdir AND direct exists checks
# (Google Drive can have sync issues where listdir misses files)
if os.path.exists(SAVE_DIR):
    DRIVE_FILES = set(os.listdir(SAVE_DIR))  # O(1) membership test

    # Direct existence checks for key files (bypasses listdir caching issues)
    key_files = ["passages.jsonl", "bonds.jsonl", "dear_abby.csv", "all_splits.json"]
    for key_name in key_files:
        if not os.path.exists(os.path.join(SAVE_DIR, key_name)):
            continue
        if key_name in DRIVE_FILES:
            continue
        print(f"  [Drive sync fix] Found {key_name} via os.path.exists() but not listdir()")
        DRIVE_FILES.add(key_name)

    # Processed data counts as available only when both core files are present.
    DRIVE_HAS_DATA = {"passages.jsonl", "bonds.jsonl"} <= DRIVE_FILES

print("\n" + "-" * 60)
print("STORAGE STATUS:")
print("-" * 60)
print(f"  Folder: {SAVE_DIR}")
print(f"  Folder existed: {folder_existed}")
print(f"  Files found: {len(DRIVE_FILES)}")

# If folder was empty/new, show what folders exist in parent to help debug
if ENV_NAME == "COLAB" and not DRIVE_FILES:
    parent = os.path.dirname(SAVE_DIR)  # e.g., /content/drive/MyDrive
    if os.path.exists(parent):
        # Case-insensitive match on "bip" in the entry name.
        siblings = [entry for entry in os.listdir(parent) if "bip" in entry.lower()]
        if siblings:
            print(f"  ** Similar folders in {parent}: {siblings}")
        else:
            print(f"  ** No BIP folders found in {parent}")

# List at most ten file names, alphabetically, then summarize the remainder.
if DRIVE_FILES:
    for listed_name in sorted(DRIVE_FILES)[:10]:
        print(f"    - {listed_name}")
    if len(DRIVE_FILES) > 10:
        print(f"    ... and {len(DRIVE_FILES)-10} more")
print(f"  Pre-processed data available: {DRIVE_HAS_DATA}")

# Decide data loading strategy: cached processed data is used only when caching
# is enabled, both core files were found, and a full refresh was not requested.
LOAD_FROM_DRIVE = USE_DRIVE_DATA and DRIVE_HAS_DATA and not REFRESH_DATA_FROM_SOURCE

print("\n" + "=" * 60)
print(f"DATA LOADING STRATEGY: {DATA_MODE}")
print("-" * 60)
if DATA_MODE == "Refresh all":
    print("  -> Will re-download ALL data from online sources")
    print("     (This takes ~2 hours, use 'Update missing' to save time)")
elif DATA_MODE == "Cache only":
    if LOAD_FROM_DRIVE:
        print("  -> Using cached data only (no downloads)")
    else:
        # Only reported here; the corpus-loading cell is what raises.
        print("  -> ERROR: Cache-only mode but no cached data found!")
        print("     Change DATA_MODE to 'Update missing'")
else:  # Update missing (default)
    if LOAD_FROM_DRIVE:
        print("  -> Using cached processed data from Drive")
        print("     (v10.9 corpora will be added if missing)")
    else:
        print("  -> Will download missing data, use cached where available")
        sefaria_cached = os.path.exists(f"{SAVE_DIR}/Sefaria-Export-json.tar.gz")
        print(f"     Sefaria: {'cached' if sefaria_cached else 'will download'}")
print("=" * 60)

# Create local directories (relative to the current working directory)
for d in ["data/processed", "data/splits", "data/raw", "models/checkpoints", "results"]:
    os.makedirs(d, exist_ok=True)

print("\n" + "=" * 60)
print("SETUP COMPLETE")
print("=" * 60)
print(f"  Environment: {ENV_NAME}")
print(f"  GPU:         {GPU_NAME} ({GPU_TIER})")
print(f"  Storage:     {SAVE_DIR}")
# Names the next cell by its actual title ("2. Download/Load Corpora").
print("  Ready to run: Cell 2 (Download/Load Corpora)")

In [None]:
# @title 2. Download/Load Corpora { display-mode: "form" }
# @markdown Downloads from online sources OR loads from Google Drive

import subprocess
import json
import pandas as pd
import shutil
from pathlib import Path

_corpus_banner = "=" * 60
print(_corpus_banner, "LOADING CORPORA", _corpus_banner, sep="\n")

# Force Google Drive sync refresh (workaround for stale FUSE mount): listing the
# save folder and then its parent makes the mount re-read both directories.
_drive_parent = os.path.dirname(SAVE_DIR) if SAVE_DIR else None
if ENV_NAME == "COLAB" and _drive_parent is not None and os.path.exists(_drive_parent):
    try:
        for _refresh_target in (SAVE_DIR, _drive_parent):
            os.listdir(_refresh_target)
    except Exception as e:
        # Best effort only: a failed refresh is reported, never fatal.
        print(f"  [Drive sync warning: {e}]")
    else:
        print("  [Drive sync refreshed]")
if LOAD_FROM_DRIVE:
    # ===== LOAD FROM DRIVE =====
    print("\nLoading pre-processed data from Google Drive...")

    # Cached artifacts and the local directory each one is copied into.
    # Files missing from storage are skipped silently.
    for fname, local_dir in (
        ("passages.jsonl", "data/processed"),
        ("bonds.jsonl", "data/processed"),
        ("all_splits.json", "data/splits"),
    ):
        src = f"{SAVE_DIR}/{fname}"
        if not os.path.exists(src):
            continue
        shutil.copy(src, f"{local_dir}/{fname}")
        print(f"  Copied {fname}")

    # Load Dear Abby from Drive if available (check filesystem, not cached set)
    abby_drive_path = f"{SAVE_DIR}/dear_abby.csv"
    if os.path.exists(abby_drive_path):
        shutil.copy(abby_drive_path, "data/raw/dear_abby.csv")
        print(f"  Copied dear_abby.csv from {abby_drive_path}")

    # Count loaded data: one passage per line of the JSONL file.
    if os.path.exists("data/processed/passages.jsonl"):
        n_passages = 0
        with open("data/processed/passages.jsonl") as passages_file:
            for _line in passages_file:
                n_passages += 1
        print(f"\nLoaded {n_passages:,} passages from Drive")

    # Tells later steps that download/processing is not needed.
    SKIP_PROCESSING = True
    print("\n" + "=" * 60)
    print("Drive data loaded - skipping download/processing")
    print("=" * 60)

else:
    # ===== DOWNLOAD/UPDATE FROM ONLINE =====
    SKIP_PROCESSING = False

    # Check if CACHE_ONLY mode but cache is missing: nothing may be downloaded
    # in that mode, so stop with an actionable message.
    if CACHE_ONLY:
        print("\n" + "=" * 60)
        print("ERROR: CACHE_ONLY mode but cached data not found!")
        print("=" * 60)
        print("Options:")
        print("  1. Change DATA_MODE to 'Update missing' or 'Refresh all'")
        print("  2. Ensure Drive has: passages.jsonl, bonds.jsonl")
        raise RuntimeError("Cache-only mode requires cached data. Change DATA_MODE.")

    # SEFARIA - with Drive caching
    import tarfile

    sefaria_local = "data/raw/Sefaria-Export/json"
    sefaria_drive = f"{SAVE_DIR}/Sefaria-Export-json.tar.gz" if USE_DRIVE_DATA else None

    if os.path.exists(sefaria_local):
        print("\n[1/4] Sefaria already exists locally")
    elif sefaria_drive and os.path.exists(sefaria_drive):
        print("\n[1/4] Restoring Sefaria from Drive cache...")
        os.makedirs("data/raw/Sefaria-Export", exist_ok=True)
        with tarfile.open(sefaria_drive, "r:gz") as tar:
            # Use the "data" extraction filter where this Python provides it, so
            # archive members cannot be written outside the destination folder.
            if hasattr(tarfile, "data_filter"):
                tar.extractall("data/raw/Sefaria-Export", filter="data")
            else:
                tar.extractall("data/raw/Sefaria-Export")
        print("  Restored from Drive!")
    else:
        print("\n[1/4] Downloading Sefaria (~2GB)...")
        subprocess.run(
            [
                "git",
                "clone",
                "--depth",
                "1",
                "https://github.com/Sefaria/Sefaria-Export.git",
                "data/raw/Sefaria-Export",
            ],
            check=True,
        )
        print("  Done!")
        # Cache to Drive for next time
        if USE_DRIVE_DATA and SAVE_DIR:
            print("  Caching Sefaria to Drive (this may take a minute)...")
            # Write under a temporary name and rename only once the archive is
            # complete. An interrupted run then never leaves a truncated tarball
            # at the path the restore branch reads from.
            sefaria_partial = f"{sefaria_drive}.partial"
            with tarfile.open(sefaria_partial, "w:gz") as tar:
                tar.add("data/raw/Sefaria-Export/json", arcname="json")
            os.replace(sefaria_partial, sefaria_drive)
            print(f"  Cached to {sefaria_drive}")

    # CHINESE - 200+ REAL CLASSICAL TEXTS
    print("\n[2/4] Chinese classics (200+ real passages)...")
    os.makedirs("data/raw/chinese", exist_ok=True)

    chinese = []

    # === ANALECTS (論語) - 50+ passages ===
    analects = [
        ("子曰：己所不欲，勿施於人。", "Analects 15.24"),
        ("孝悌也者，其為仁之本與。", "Analects 1.2"),
        ("父母在，不遠游，遊必有方。", "Analects 4.19"),
        ("君子喻於義，小人喻於利。", "Analects 4.16"),
        ("不義而富且貴，於我如浮雲。", "Analects 7.16"),
        ("學而時習之，不亦說乎。", "Analects 1.1"),
        ("有朋自遠方來，不亦樂乎。", "Analects 1.1"),
        ("人不知而不慍，不亦君子乎。", "Analects 1.1"),
        ("巧言令色，鮮矣仁。", "Analects 1.3"),
        ("吾日三省吾身。", "Analects 1.4"),
        ("為人謀而不忠乎，與朋友交而不信乎。", "Analects 1.4"),
        ("弟子入則孝，出則悌。", "Analects 1.6"),
        ("謹而信，汎愛眾，而親仁。", "Analects 1.6"),
        ("君子不重則不威，學則不固。", "Analects 1.8"),
        ("主忠信，無友不如己者。", "Analects 1.8"),
        ("過則勿憚改。", "Analects 1.8"),
        ("慎終追遠，民德歸厚矣。", "Analects 1.9"),
        ("禮之用，和為貴。", "Analects 1.12"),
        ("信近於義，言可復也。", "Analects 1.13"),
        ("君子食無求飽，居無求安。", "Analects 1.14"),
        ("敏於事而慎於言，就有道而正焉。", "Analects 1.14"),
        ("不患人之不己知，患不知人也。", "Analects 1.16"),
        ("為政以德，譬如北辰。", "Analects 2.1"),
        ("道之以政，齊之以刑，民免而無恥。", "Analects 2.3"),
        ("道之以德，齊之以禮，有恥且格。", "Analects 2.3"),
        ("吾十有五而志于學。", "Analects 2.4"),
        ("三十而立，四十而不惑。", "Analects 2.4"),
        ("五十而知天命，六十而耳順。", "Analects 2.4"),
        ("七十而從心所欲，不逾矩。", "Analects 2.4"),
        ("生，事之以禮；死，葬之以禮，祭之以禮。", "Analects 2.5"),
        ("父母唯其疾之憂。", "Analects 2.6"),
        ("今之孝者，是謂能養。", "Analects 2.7"),
        ("至於犬馬，皆能有養；不敬，何以別乎。", "Analects 2.7"),
        ("色難。有事，弟子服其勞。", "Analects 2.8"),
        ("視其所以，觀其所由，察其所安。", "Analects 2.10"),
        ("溫故而知新，可以為師矣。", "Analects 2.11"),
        ("君子不器。", "Analects 2.12"),
        ("先行其言而後從之。", "Analects 2.13"),
        ("君子周而不比，小人比而不周。", "Analects 2.14"),
        ("學而不思則罔，思而不學則殆。", "Analects 2.15"),
        ("知之為知之，不知為不知，是知也。", "Analects 2.17"),
        ("多聞闕疑，慎言其餘，則寡尤。", "Analects 2.18"),
        ("舉直錯諸枉，則民服。", "Analects 2.19"),
        ("人而無信，不知其可也。", "Analects 2.22"),
        ("見義不為，無勇也。", "Analects 2.24"),
        ("非其鬼而祭之，諂也。", "Analects 2.24"),
        ("是可忍也，孰不可忍也。", "Analects 3.1"),
        ("人而不仁，如禮何。", "Analects 3.3"),
        ("人而不仁，如樂何。", "Analects 3.3"),
        ("里仁為美。擇不處仁，焉得知。", "Analects 4.1"),
        ("不仁者不可以久處約，不可以長處樂。", "Analects 4.2"),
        ("仁者安仁，知者利仁。", "Analects 4.2"),
        ("唯仁者能好人，能惡人。", "Analects 4.3"),
        ("苟志於仁矣，無惡也。", "Analects 4.4"),
    ]
    # Wrap every (text, source) pair in the shared passage-record layout.
    chinese.extend(
        {
            "id": f"cn_analects_{idx}",
            "text": passage,
            "source": citation,
            "period": "CONFUCIAN",
            "century": -5,
        }
        for idx, (passage, citation) in enumerate(analects)
    )
    # Only the records just added carry "analects" in their id, so the list
    # length equals the filtered count.
    print(f"    - Analects: {len(analects):,} passages")

    # === MENCIUS (孟子) - 40+ passages ===
    mencius = [
        ("惻隱之心，仁之端也。", "Mencius 2A.6"),
        ("羞惡之心，義之端也。", "Mencius 2A.6"),
        ("辭讓之心，禮之端也。", "Mencius 2A.6"),
        ("是非之心，智之端也。", "Mencius 2A.6"),
        ("人皆有不忍人之心。", "Mencius 2A.6"),
        ("無惻隱之心，非人也。", "Mencius 2A.6"),
        ("無羞惡之心，非人也。", "Mencius 2A.6"),
        ("無辭讓之心，非人也。", "Mencius 2A.6"),
        ("無是非之心，非人也。", "Mencius 2A.6"),
        ("仁義禮智，非由外鑠我也，我固有之也。", "Mencius 6A.6"),
        ("人性之善也，猶水之就下也。", "Mencius 6A.2"),
        ("人無有不善，水無有不下。", "Mencius 6A.2"),
        ("惟仁者宜在高位。", "Mencius 4A.1"),
        ("不仁而在高位，是播其惡於眾也。", "Mencius 4A.1"),
        ("民為貴，社稷次之，君為輕。", "Mencius 7B.14"),
        ("得道者多助，失道者寡助。", "Mencius 2B.1"),
        ("寡助之至，親戚畔之。", "Mencius 2B.1"),
        ("多助之至，天下順之。", "Mencius 2B.1"),
        ("天時不如地利，地利不如人和。", "Mencius 2B.1"),
        ("生於憂患，死於安樂。", "Mencius 6B.15"),
        ("天將降大任於是人也，必先苦其心志。", "Mencius 6B.15"),
        ("勞其筋骨，餓其體膚。", "Mencius 6B.15"),
        ("空乏其身，行拂亂其所為。", "Mencius 6B.15"),
        ("所以動心忍性，曾益其所不能。", "Mencius 6B.15"),
        ("老吾老，以及人之老。", "Mencius 1A.7"),
        ("幼吾幼，以及人之幼。", "Mencius 1A.7"),
        ("窮則獨善其身，達則兼善天下。", "Mencius 7A.9"),
        ("魚，我所欲也；熊掌，亦我所欲也。", "Mencius 6A.10"),
        ("二者不可得兼，舍魚而取熊掌者也。", "Mencius 6A.10"),
        ("生，亦我所欲也；義，亦我所欲也。", "Mencius 6A.10"),
        ("二者不可得兼，舍生而取義者也。", "Mencius 6A.10"),
        ("養心莫善於寡欲。", "Mencius 7B.35"),
        ("仁者無敵於天下。", "Mencius 1A.5"),
        ("以力服人者，非心服也。", "Mencius 2A.3"),
        ("以德服人者，中心悅而誠服也。", "Mencius 2A.3"),
        ("人之患在好為人師。", "Mencius 4A.23"),
        ("盡信書，則不如無書。", "Mencius 7B.3"),
        ("不以規矩，不能成方圓。", "Mencius 4A.1"),
        ("孝子之至，莫大乎尊親。", "Mencius 5A.4"),
        ("父子有親，君臣有義，夫婦有別，長幼有序，朋友有信。", "Mencius 3A.4"),
        ("人有不為也，而後可以有為。", "Mencius 4B.8"),
    ]
    # Wrap every (text, source) pair in the shared passage-record layout.
    chinese.extend(
        {
            "id": f"cn_mencius_{idx}",
            "text": passage,
            "source": citation,
            "period": "CONFUCIAN",
            "century": -4,
        }
        for idx, (passage, citation) in enumerate(mencius)
    )
    # Only the records just added carry "mencius" in their id, so the list
    # length equals the filtered count.
    print(f"    - Mencius: {len(mencius):,} passages")

    # === DAODEJING (道德經) - 40+ passages ===
    daodejing = [
        ("道可道，非常道。名可名，非常名。", "Daodejing 1"),
        ("天下皆知美之為美，斯惡已。", "Daodejing 2"),
        ("皆知善之為善，斯不善已。", "Daodejing 2"),
        ("有無相生，難易相成。", "Daodejing 2"),
        ("長短相較，高下相傾。", "Daodejing 2"),
        ("是以聖人處無為之事，行不言之教。", "Daodejing 2"),
        ("不尚賢，使民不爭。", "Daodejing 3"),
        ("不貴難得之貨，使民不為盜。", "Daodejing 3"),
        ("上善若水。水善利萬物而不爭。", "Daodejing 8"),
        ("處眾人之所惡，故幾於道。", "Daodejing 8"),
        ("居善地，心善淵，與善仁。", "Daodejing 8"),
        ("言善信，政善治，事善能，動善時。", "Daodejing 8"),
        ("夫唯不爭，故無尤。", "Daodejing 8"),
        ("金玉滿堂，莫之能守。", "Daodejing 9"),
        ("富貴而驕，自遺其咎。", "Daodejing 9"),
        ("功成身退，天之道也。", "Daodejing 9"),
        ("知人者智，自知者明。", "Daodejing 33"),
        ("勝人者有力，自勝者強。", "Daodejing 33"),
        ("知足者富，強行者有志。", "Daodejing 33"),
        ("不失其所者久，死而不亡者壽。", "Daodejing 33"),
        ("大道廢，有仁義。", "Daodejing 18"),
        ("智慧出，有大偽。", "Daodejing 18"),
        ("六親不和，有孝慈。", "Daodejing 18"),
        ("國家昏亂，有忠臣。", "Daodejing 18"),
        ("禍兮福之所倚，福兮禍之所伏。", "Daodejing 58"),
        ("天長地久。", "Daodejing 7"),
        ("天地所以能長且久者，以其不自生。", "Daodejing 7"),
        ("是以聖人後其身而身先。", "Daodejing 7"),
        ("外其身而身存。", "Daodejing 7"),
        ("非以其無私耶，故能成其私。", "Daodejing 7"),
        ("柔弱勝剛強。", "Daodejing 36"),
        ("大方無隅，大器晚成。", "Daodejing 41"),
        ("大音希聲，大象無形。", "Daodejing 41"),
        ("道生一，一生二，二生三，三生萬物。", "Daodejing 42"),
        ("天下萬物生於有，有生於無。", "Daodejing 40"),
        ("千里之行，始於足下。", "Daodejing 64"),
        ("合抱之木，生於毫末。", "Daodejing 64"),
        ("九層之臺，起於累土。", "Daodejing 64"),
        ("民不畏死，奈何以死懼之。", "Daodejing 74"),
        ("信言不美，美言不信。", "Daodejing 81"),
        ("善者不辯，辯者不善。", "Daodejing 81"),
        ("知者不博，博者不知。", "Daodejing 81"),
    ]
    # Wrap every (text, source) pair in the shared passage-record layout.
    chinese.extend(
        {
            "id": f"cn_daodejing_{idx}",
            "text": passage,
            "source": citation,
            "period": "DAOIST",
            "century": -4,
        }
        for idx, (passage, citation) in enumerate(daodejing)
    )
    # Only the records just added carry "daodejing" in their id, so the list
    # length equals the filtered count.
    print(f"    - Daodejing: {len(daodejing):,} passages")

    # === GREAT LEARNING (大學) - 20+ passages ===
    daxue = [
        ("大學之道，在明明德，在親民，在止於至善。", "Great Learning 1"),
        ("知止而後有定，定而後能靜。", "Great Learning 1"),
        ("靜而後能安，安而後能慮，慮而後能得。", "Great Learning 1"),
        ("物有本末，事有終始。", "Great Learning 1"),
        ("知所先後，則近道矣。", "Great Learning 1"),
        ("古之欲明明德於天下者，先治其國。", "Great Learning 1"),
        ("欲治其國者，先齊其家。", "Great Learning 1"),
        ("欲齊其家者，先修其身。", "Great Learning 1"),
        ("欲修其身者，先正其心。", "Great Learning 1"),
        ("欲正其心者，先誠其意。", "Great Learning 1"),
        ("欲誠其意者，先致其知。", "Great Learning 1"),
        ("致知在格物。", "Great Learning 1"),
        ("物格而後知至，知至而後意誠。", "Great Learning 1"),
        ("意誠而後心正，心正而後身修。", "Great Learning 1"),
        ("身修而後家齊，家齊而後國治。", "Great Learning 1"),
        ("國治而後天下平。", "Great Learning 1"),
        ("自天子以至於庶人，壹是皆以修身為本。", "Great Learning 1"),
        ("其本亂而末治者否矣。", "Great Learning 1"),
        ("所謂誠其意者，毋自欺也。", "Great Learning 6"),
        ("如惡惡臭，如好好色，此之謂自謙。", "Great Learning 6"),
        ("故君子必慎其獨也。", "Great Learning 6"),
        ("富潤屋，德潤身，心廣體胖。", "Great Learning 6"),
    ]
    # Wrap every (text, source) pair in the shared passage-record layout.
    chinese.extend(
        {
            "id": f"cn_daxue_{idx}",
            "text": passage,
            "source": citation,
            "period": "CONFUCIAN",
            "century": -5,
        }
        for idx, (passage, citation) in enumerate(daxue)
    )
    # Only the records just added carry "daxue" in their id, so the list
    # length equals the filtered count.
    print(f"    - Great Learning: {len(daxue):,} passages")

    # === DOCTRINE OF THE MEAN (中庸) - 20+ passages ===
    zhongyong = [
        ("天命之謂性，率性之謂道，修道之謂教。", "Doctrine of the Mean 1"),
        ("道也者，不可須臾離也；可離，非道也。", "Doctrine of the Mean 1"),
        ("是故君子戒慎乎其所不睹，恐懼乎其所不聞。", "Doctrine of the Mean 1"),
        ("莫見乎隱，莫顯乎微，故君子慎其獨也。", "Doctrine of the Mean 1"),
        ("喜怒哀樂之未發，謂之中。", "Doctrine of the Mean 1"),
        ("發而皆中節，謂之和。", "Doctrine of the Mean 1"),
        ("中也者，天下之大本也。", "Doctrine of the Mean 1"),
        ("和也者，天下之達道也。", "Doctrine of the Mean 1"),
        ("致中和，天地位焉，萬物育焉。", "Doctrine of the Mean 1"),
        ("君子中庸，小人反中庸。", "Doctrine of the Mean 2"),
        ("君子之中庸也，君子而時中。", "Doctrine of the Mean 2"),
        ("小人之反中庸也，小人而無忌憚也。", "Doctrine of the Mean 2"),
        ("中庸其至矣乎！民鮮能久矣。", "Doctrine of the Mean 3"),
        ("道之不行也，我知之矣：知者過之，愚者不及也。", "Doctrine of the Mean 4"),
        ("道之不明也，我知之矣：賢者過之，不肖者不及也。", "Doctrine of the Mean 4"),
        ("人莫不飲食也，鮮能知味也。", "Doctrine of the Mean 4"),
        ("誠者，天之道也。誠之者，人之道也。", "Doctrine of the Mean 20"),
        ("誠者，不勉而中，不思而得，從容中道，聖人也。", "Doctrine of the Mean 20"),
        ("誠之者，擇善而固執之者也。", "Doctrine of the Mean 20"),
        ("博學之，審問之，慎思之，明辨之，篤行之。", "Doctrine of the Mean 20"),
        ("人一能之，己百之；人十能之，己千之。", "Doctrine of the Mean 20"),
        ("果能此道矣，雖愚必明，雖柔必強。", "Doctrine of the Mean 20"),
    ]
    # Wrap every (text, source) pair in the shared passage-record layout.
    chinese.extend(
        {
            "id": f"cn_zhongyong_{idx}",
            "text": passage,
            "source": citation,
            "period": "CONFUCIAN",
            "century": -5,
        }
        for idx, (passage, citation) in enumerate(zhongyong)
    )
    # Only the records just added carry "zhongyong" in their id, so the list
    # length equals the filtered count.
    print(f"    - Doctrine of Mean: {len(zhongyong):,} passages")

    # === BOOK OF RITES (禮記) - 30+ passages ===
    liji = [
        ("禮尚往來。往而不來，非禮也；來而不往，亦非禮也。", "Book of Rites - Quli"),
        ("敖不可長，欲不可從，志不可滿，樂不可極。", "Book of Rites - Quli"),
        ("臨財毋茍得，臨難毋茍免。", "Book of Rites - Quli"),
        ("夫禮者，自卑而尊人。", "Book of Rites - Quli"),
        ("雖負販者，必有尊也，而況富貴乎。", "Book of Rites - Quli"),
        ("富貴而知好禮，則不驕不淫。", "Book of Rites - Quli"),
        ("貧賤而知好禮，則志不懾。", "Book of Rites - Quli"),
        ("大道之行也，天下為公。", "Book of Rites - Liyun"),
        ("選賢與能，講信修睦。", "Book of Rites - Liyun"),
        ("故人不獨親其親，不獨子其子。", "Book of Rites - Liyun"),
        ("使老有所終，壯有所用，幼有所長。", "Book of Rites - Liyun"),
        ("矜寡孤獨廢疾者皆有所養。", "Book of Rites - Liyun"),
        ("男有分，女有歸。", "Book of Rites - Liyun"),
        ("貨惡其棄於地也，不必藏於己。", "Book of Rites - Liyun"),
        ("力惡其不出於身也，不必為己。", "Book of Rites - Liyun"),
        ("是故謀閉而不興，盜竊亂賊而不作。", "Book of Rites - Liyun"),
        ("故外戶而不閉，是謂大同。", "Book of Rites - Liyun"),
        ("玉不琢，不成器；人不學，不知道。", "Book of Rites - Xueji"),
        ("是故學然後知不足，教然後知困。", "Book of Rites - Xueji"),
        ("知不足，然後能自反也。", "Book of Rites - Xueji"),
        ("知困，然後能自強也。", "Book of Rites - Xueji"),
        ("故曰：教學相長也。", "Book of Rites - Xueji"),
        ("凡學之道，嚴師為難。", "Book of Rites - Xueji"),
        ("師嚴然後道尊，道尊然後民知敬學。", "Book of Rites - Xueji"),
        ("善歌者使人繼其聲，善教者使人繼其志。", "Book of Rites - Xueji"),
        ("記問之學，不足以為人師。", "Book of Rites - Xueji"),
        ("必也其聽語乎，力不能問，然後語之。", "Book of Rites - Xueji"),
        ("語之而不知，雖舍之可也。", "Book of Rites - Xueji"),
        ("博學而不窮，篤行而不倦。", "Book of Rites - Ruxing"),
        ("君子之於學也，藏焉，修焉，息焉，游焉。", "Book of Rites - Xueji"),
    ]
    # Wrap every (text, source) pair in the shared passage-record layout.
    chinese.extend(
        {
            "id": f"cn_liji_{idx}",
            "text": passage,
            "source": citation,
            "period": "CONFUCIAN",
            "century": -3,
        }
        for idx, (passage, citation) in enumerate(liji)
    )
    # Only the records just added carry "liji" in their id, so the list
    # length equals the filtered count.
    print(f"    - Book of Rites: {len(liji):,} passages")

    # Persist the combined corpus as readable UTF-8 JSON (no \u escapes).
    with open("data/raw/chinese/chinese_native.json", "w", encoding="utf-8") as corpus_file:
        json.dump(chinese, corpus_file, ensure_ascii=False, indent=2)
    print(f"  Created {len(chinese)} Chinese passages")

    # ISLAMIC - 150+ REAL PASSAGES
    print("\n[3/4] Islamic texts (150+ real passages)...")
    os.makedirs("data/raw/islamic", exist_ok=True)

    islamic = []

    # === QURANIC VERSES (40+) ===
    quran = [
        ("وَلَا تَقْتُلُوا النَّفْسَ الَّتِي حَرَّمَ اللَّهُ إِلَّا بِالْحَقِّ", "Quran 6:151"),
        ("وَبِالْوَالِدَيْنِ إِحْسَانًا", "Quran 17:23"),
        ("إِمَّا يَبْلُغَنَّ عِندَكَ الْكِبَرَ أَحَدُهُمَا أَوْ كِلَاهُمَا فَلَا تَقُل لَّهُمَا أُفٍّ", "Quran 17:23"),
        ("وَلَا تَنْهَرْهُمَا وَقُل لَّهُمَا قَوْلًا كَرِيمًا", "Quran 17:23"),
        ("وَاخْفِضْ لَهُمَا جَنَاحَ الذُّلِّ مِنَ الرَّحْمَةِ", "Quran 17:24"),
        ("وَقُل رَّبِّ ارْحَمْهُمَا كَمَا رَبَّيَانِي صَغِيرًا", "Quran 17:24"),
        ("وَآتِ ذَا الْقُرْبَىٰ حَقَّهُ وَالْمِسْكِينَ وَابْنَ السَّبِيلِ", "Quran 17:26"),
        ("وَلَا تُبَذِّرْ تَبْذِيرًا", "Quran 17:26"),
        ("إِنَّ الْمُبَذِّرِينَ كَانُوا إِخْوَانَ الشَّيَاطِينِ", "Quran 17:27"),
        ("وَلَا تَجْعَلْ يَدَكَ مَغْلُولَةً إِلَىٰ عُنُقِكَ وَلَا تَبْسُطْهَا كُلَّ الْبَسْطِ", "Quran 17:29"),
        ("وَلَا تَقْرَبُوا الزِّنَا ۖ إِنَّهُ كَانَ فَاحِشَةً وَسَاءَ سَبِيلًا", "Quran 17:32"),
        ("وَلَا تَقْتُلُوا أَوْلَادَكُمْ خَشْيَةَ إِمْلَاقٍ", "Quran 17:31"),
        ("وَلَا تَقْرَبُوا مَالَ الْيَتِيمِ إِلَّا بِالَّتِي هِيَ أَحْسَنُ", "Quran 17:34"),
        ("وَأَوْفُوا بِالْعَهْدِ ۖ إِنَّ الْعَهْدَ كَانَ مَسْئُولًا", "Quran 17:34"),
        ("وَأَوْفُوا الْكَيْلَ إِذَا كِلْتُمْ وَزِنُوا بِالْقِسْطَاسِ الْمُسْتَقِيمِ", "Quran 17:35"),
        ("وَلَا تَقْفُ مَا لَيْسَ لَكَ بِهِ عِلْمٌ", "Quran 17:36"),
        ("إِنَّ السَّمْعَ وَالْبَصَرَ وَالْفُؤَادَ كُلُّ أُولَٰئِكَ كَانَ عَنْهُ مَسْئُولًا", "Quran 17:36"),
        ("وَلَا تَمْشِ فِي الْأَرْضِ مَرَحًا", "Quran 17:37"),
        ("إِنَّ اللَّهَ يَأْمُرُ بِالْعَدْلِ وَالْإِحْسَانِ وَإِيتَاءِ ذِي الْقُرْبَىٰ", "Quran 16:90"),
        ("وَيَنْهَىٰ عَنِ الْفَحْشَاءِ وَالْمُنكَرِ وَالْبَغْيِ", "Quran 16:90"),
        ("يَا أَيُّهَا الَّذِينَ آمَنُوا كُونُوا قَوَّامِينَ بِالْقِسْطِ", "Quran 4:135"),
        ("شُهَدَاءَ لِلَّهِ وَلَوْ عَلَىٰ أَنفُسِكُمْ أَوِ الْوَالِدَيْنِ وَالْأَقْرَبِينَ", "Quran 4:135"),
        ("وَإِذَا حَكَمْتُم بَيْنَ النَّاسِ أَن تَحْكُمُوا بِالْعَدْلِ", "Quran 4:58"),
        ("يَا أَيُّهَا الَّذِينَ آمَنُوا أَوْفُوا بِالْعُقُودِ", "Quran 5:1"),
        ("وَتَعَاوَنُوا عَلَى الْبِرِّ وَالتَّقْوَىٰ ۖ وَلَا تَعَاوَنُوا عَلَى الْإِثْمِ وَالْعُدْوَانِ", "Quran 5:2"),
        ("مَن قَتَلَ نَفْسًا بِغَيْرِ نَفْسٍ أَوْ فَسَادٍ فِي الْأَرْضِ فَكَأَنَّمَا قَتَلَ النَّاسَ جَمِيعًا", "Quran 5:32"),
        ("وَمَنْ أَحْيَاهَا فَكَأَنَّمَا أَحْيَا النَّاسَ جَمِيعًا", "Quran 5:32"),
        ("وَلَا يَجْرِمَنَّكُمْ شَنَآنُ قَوْمٍ عَلَىٰ أَلَّا تَعْدِلُوا", "Quran 5:8"),
        ("اعْدِلُوا هُوَ أَقْرَبُ لِلتَّقْوَىٰ", "Quran 5:8"),
        ("لَّيْسَ الْبِرَّ أَن تُوَلُّوا وُجُوهَكُمْ قِبَلَ الْمَشْرِقِ وَالْمَغْرِبِ", "Quran 2:177"),
        ("وَلَٰكِنَّ الْبِرَّ مَنْ آمَنَ بِاللَّهِ وَالْيَوْمِ الْآخِرِ", "Quran 2:177"),
        ("وَآتَى الْمَالَ عَلَىٰ حُبِّهِ ذَوِي الْقُرْبَىٰ وَالْيَتَامَىٰ وَالْمَسَاكِينَ", "Quran 2:177"),
        ("وَابْنَ السَّبِيلِ وَالسَّائِلِينَ وَفِي الرِّقَابِ", "Quran 2:177"),
        ("وَأَقَامَ الصَّلَاةَ وَآتَى الزَّكَاةَ", "Quran 2:177"),
        ("وَالْمُوفُونَ بِعَهْدِهِمْ إِذَا عَاهَدُوا", "Quran 2:177"),
        ("وَالصَّابِرِينَ فِي الْبَأْسَاءِ وَالضَّرَّاءِ وَحِينَ الْبَأْسِ", "Quran 2:177"),
        ("خُذِ الْعَفْوَ وَأْمُرْ بِالْعُرْفِ وَأَعْرِضْ عَنِ الْجَاهِلِينَ", "Quran 7:199"),
        ("وَالْكَاظِمِينَ الْغَيْظَ وَالْعَافِينَ عَنِ النَّاسِ", "Quran 3:134"),
        ("وَاللَّهُ يُحِبُّ الْمُحْسِنِينَ", "Quran 3:134"),
        ("ادْفَعْ بِالَّتِي هِيَ أَحْسَنُ فَإِذَا الَّذِي بَيْنَكَ وَبَيْنَهُ عَدَاوَةٌ كَأَنَّهُ وَلِيٌّ حَمِيمٌ", "Quran 41:34"),
        ("وَمَا يُلَقَّاهَا إِلَّا الَّذِينَ صَبَرُوا وَمَا يُلَقَّاهَا إِلَّا ذُو حَظٍّ عَظِيمٍ", "Quran 41:35"),
        ("إِنَّ اللَّهَ يَأْمُرُكُمْ أَن تُؤَدُّوا الْأَمَانَاتِ إِلَىٰ أَهْلِهَا", "Quran 4:58"),
    ]
    for i, (text, source) in enumerate(quran):
        islamic.append(
            {"id": f"quran_{i}", "text": text, "source": source, "period": "QURANIC", "century": 7}
        )
    print(f"    - Quranic verses: {len([x for x in islamic if 'quran' in x['id']]):,} passages")

    # === HADITH (110+) ===
    hadith = [
        ("لا ضرر ولا ضرار", "Hadith - Ibn Majah"),
        ("إنما الأعمال بالنيات وإنما لكل امرئ ما نوى", "Hadith - Bukhari 1"),
        ("المسلم من سلم المسلمون من لسانه ويده", "Hadith - Bukhari 10"),
        ("لا يؤمن أحدكم حتى يحب لأخيه ما يحب لنفسه", "Hadith - Bukhari 13"),
        ("من كان يؤمن بالله واليوم الآخر فليقل خيرا أو ليصمت", "Hadith - Bukhari 6018"),
        ("من كان يؤمن بالله واليوم الآخر فليكرم ضيفه", "Hadith - Bukhari 6019"),
        ("من كان يؤمن بالله واليوم الآخر فليصل رحمه", "Hadith - Bukhari 6138"),
        ("ارحموا من في الأرض يرحمكم من في السماء", "Hadith - Tirmidhi 1924"),
        ("الراحمون يرحمهم الرحمن", "Hadith - Abu Dawud 4941"),
        ("ليس منا من لم يرحم صغيرنا ويوقر كبيرنا", "Hadith - Tirmidhi 1919"),
        ("خيركم خيركم لأهله وأنا خيركم لأهلي", "Hadith - Tirmidhi 3895"),
        ("اتق الله حيثما كنت وأتبع السيئة الحسنة تمحها", "Hadith - Tirmidhi 1987"),
        ("وخالق الناس بخلق حسن", "Hadith - Tirmidhi 1987"),
        ("أكمل المؤمنين إيمانا أحسنهم خلقا", "Hadith - Abu Dawud 4682"),
        ("إن من أحبكم إلي وأقربكم مني مجلسا يوم القيامة أحاسنكم أخلاقا", "Hadith - Tirmidhi 2018"),
        ("ما من شيء أثقل في ميزان المؤمن يوم القيامة من حسن الخلق", "Hadith - Tirmidhi 2002"),
        ("البر حسن الخلق والإثم ما حاك في صدرك وكرهت أن يطلع عليه الناس", "Hadith - Muslim 2553"),
        ("الحياء من الإيمان", "Hadith - Bukhari 24"),
        ("الحياء لا يأتي إلا بخير", "Hadith - Bukhari 6117"),
        ("إن الله رفيق يحب الرفق في الأمر كله", "Hadith - Bukhari 6927"),
        ("ما كان الرفق في شيء إلا زانه وما نزع من شيء إلا شانه", "Hadith - Muslim 2594"),
        ("من يحرم الرفق يحرم الخير كله", "Hadith - Muslim 2592"),
        ("أد الأمانة إلى من ائتمنك ولا تخن من خانك", "Hadith - Abu Dawud 3535"),
        ("آية المنافق ثلاث إذا حدث كذب وإذا وعد أخلف وإذا اؤتمن خان", "Hadith - Bukhari 33"),
        ("الصدق يهدي إلى البر والبر يهدي إلى الجنة", "Hadith - Bukhari 6094"),
        ("وإن الكذب يهدي إلى الفجور والفجور يهدي إلى النار", "Hadith - Bukhari 6094"),
        ("عليكم بالصدق فإن الصدق يهدي إلى البر", "Hadith - Muslim 2607"),
        ("إياكم والكذب فإن الكذب يهدي إلى الفجور", "Hadith - Muslim 2607"),
        ("من غشنا فليس منا", "Hadith - Muslim 101"),
        ("كلكم راع وكلكم مسؤول عن رعيته", "Hadith - Bukhari 893"),
        ("الإمام راع ومسؤول عن رعيته", "Hadith - Bukhari 893"),
        ("والرجل راع في أهله ومسؤول عن رعيته", "Hadith - Bukhari 893"),
        ("والمرأة راعية في بيت زوجها ومسؤولة عن رعيتها", "Hadith - Bukhari 893"),
        ("انصر أخاك ظالما أو مظلوما", "Hadith - Bukhari 2444"),
        (
            "تنصره إذا كان مظلوما أفرأيت إذا كان ظالما كيف تنصره قال تحجزه أو تمنعه من الظلم فإن ذلك نصره",
            "Hadith - Bukhari 2444",
        ),
        ("المؤمن للمؤمن كالبنيان يشد بعضه بعضا", "Hadith - Bukhari 481"),
        ("مثل المؤمنين في توادهم وتراحمهم وتعاطفهم مثل الجسد الواحد", "Hadith - Muslim 2586"),
        ("إذا اشتكى منه عضو تداعى له سائر الجسد بالسهر والحمى", "Hadith - Muslim 2586"),
        ("المسلم أخو المسلم لا يظلمه ولا يسلمه", "Hadith - Bukhari 2442"),
        ("من كان في حاجة أخيه كان الله في حاجته", "Hadith - Bukhari 2442"),
        ("ومن فرج عن مسلم كربة فرج الله عنه كربة من كربات يوم القيامة", "Hadith - Bukhari 2442"),
        ("ومن ستر مسلما ستره الله يوم القيامة", "Hadith - Bukhari 2442"),
        ("لا تحاسدوا ولا تناجشوا ولا تباغضوا ولا تدابروا", "Hadith - Muslim 2564"),
        ("ولا يبع بعضكم على بيع بعض وكونوا عباد الله إخوانا", "Hadith - Muslim 2564"),
        ("بحسب امرئ من الشر أن يحقر أخاه المسلم", "Hadith - Muslim 2564"),
        ("كل المسلم على المسلم حرام دمه وماله وعرضه", "Hadith - Muslim 2564"),
        ("إياكم والظن فإن الظن أكذب الحديث", "Hadith - Bukhari 6064"),
        ("ولا تجسسوا ولا تحسسوا ولا تنافسوا", "Hadith - Bukhari 6064"),
        ("الظلم ظلمات يوم القيامة", "Hadith - Bukhari 2447"),
        ("اتقوا الظلم فإن الظلم ظلمات يوم القيامة", "Hadith - Muslim 2578"),
        ("واتقوا الشح فإن الشح أهلك من كان قبلكم", "Hadith - Muslim 2578"),
        ("أفضل الجهاد كلمة عدل عند سلطان جائر", "Hadith - Abu Dawud 4344"),
        (
            "سيد الشهداء حمزة بن عبد المطلب ورجل قام إلى إمام جائر فأمره ونهاه فقتله",
            "Hadith - Hakim 4884",
        ),
        ("إذا رأيت أمتي تهاب أن تقول للظالم يا ظالم فقد تودع منهم", "Hadith - Ahmad 6521"),
        ("من رأى منكم منكرا فليغيره بيده", "Hadith - Muslim 49"),
        ("فإن لم يستطع فبلسانه فإن لم يستطع فبقلبه وذلك أضعف الإيمان", "Hadith - Muslim 49"),
        ("أحب الناس إلى الله أنفعهم للناس", "Hadith - Tabarani 6026"),
        ("وأحب الأعمال إلى الله سرور تدخله على مسلم", "Hadith - Tabarani 6026"),
        ("أو تكشف عنه كربة أو تقضي عنه دينا أو تطرد عنه جوعا", "Hadith - Tabarani 6026"),
        (
            "ولأن أمشي مع أخي في حاجة أحب إلي من أن أعتكف في هذا المسجد شهرا",
            "Hadith - Tabarani 6026",
        ),
        (
            "الدين النصيحة قلنا لمن قال لله ولكتابه ولرسوله ولأئمة المسلمين وعامتهم",
            "Hadith - Muslim 55",
        ),
        ("ما نقصت صدقة من مال", "Hadith - Muslim 2588"),
        ("وما زاد الله عبدا بعفو إلا عزا", "Hadith - Muslim 2588"),
        ("وما تواضع أحد لله إلا رفعه الله", "Hadith - Muslim 2588"),
        ("اليد العليا خير من اليد السفلى", "Hadith - Bukhari 1427"),
        ("وابدأ بمن تعول", "Hadith - Bukhari 1427"),
        ("وخير الصدقة ما كان عن ظهر غنى", "Hadith - Bukhari 1427"),
        ("من استطاع منكم الباءة فليتزوج", "Hadith - Bukhari 5066"),
        ("فإنه أغض للبصر وأحصن للفرج", "Hadith - Bukhari 5066"),
        ("ومن لم يستطع فعليه بالصوم فإنه له وجاء", "Hadith - Bukhari 5066"),
        ("استوصوا بالنساء خيرا", "Hadith - Bukhari 3331"),
        (
            "خذوا عني خذوا عني قد جعل الله لهن سبيلا البكر بالبكر جلد مائة ونفي سنة",
            "Hadith - Muslim 1690",
        ),
        ("لا يفرك مؤمن مؤمنة إن كره منها خلقا رضي منها آخر", "Hadith - Muslim 1469"),
        ("أكمل المؤمنين إيمانا أحسنهم خلقا وخياركم خياركم لنسائهم", "Hadith - Tirmidhi 1162"),
        ("ما أكرمهن إلا كريم وما أهانهن إلا لئيم", "Hadith - Ibn Asakir"),
        ("اللهم إني أحرج حق الضعيفين اليتيم والمرأة", "Hadith - Ahmad 9664"),
        ("ألا أخبركم بخياركم قالوا بلى قال خياركم أحاسنكم أخلاقا", "Hadith - Bukhari 6035"),
        ("إنكم لن تسعوا الناس بأموالكم فليسعهم منكم بسط الوجه وحسن الخلق", "Hadith - Hakim 422"),
        ("تبسمك في وجه أخيك صدقة", "Hadith - Tirmidhi 1956"),
        ("وأمرك بالمعروف ونهيك عن المنكر صدقة", "Hadith - Tirmidhi 1956"),
        ("وإرشادك الرجل في أرض الضلال لك صدقة", "Hadith - Tirmidhi 1956"),
        ("وإماطتك الأذى والشوك والعظم عن الطريق لك صدقة", "Hadith - Tirmidhi 1956"),
        ("وإفراغك من دلوك في دلو أخيك لك صدقة", "Hadith - Tirmidhi 1956"),
        ("الكلمة الطيبة صدقة", "Hadith - Bukhari 2989"),
        ("وكل خطوة تمشيها إلى الصلاة صدقة", "Hadith - Bukhari 2989"),
        ("من دل على خير فله مثل أجر فاعله", "Hadith - Muslim 1893"),
        ("ليس الشديد بالصرعة إنما الشديد الذي يملك نفسه عند الغضب", "Hadith - Bukhari 6114"),
        ("لا تغضب فردد مرارا قال لا تغضب", "Hadith - Bukhari 6116"),
        ("إن الغضب من الشيطان وإن الشيطان خلق من النار", "Hadith - Abu Dawud 4784"),
        ("وإنما تطفأ النار بالماء فإذا غضب أحدكم فليتوضأ", "Hadith - Abu Dawud 4784"),
        ("لا يحل لمسلم أن يهجر أخاه فوق ثلاث ليال", "Hadith - Bukhari 6077"),
        ("يلتقيان فيعرض هذا ويعرض هذا وخيرهما الذي يبدأ بالسلام", "Hadith - Bukhari 6077"),
        ("أفشوا السلام بينكم", "Hadith - Muslim 54"),
        ("والذي نفسي بيده لا تدخلوا الجنة حتى تؤمنوا", "Hadith - Muslim 54"),
        (
            "ولا تؤمنوا حتى تحابوا أولا أدلكم على شيء إذا فعلتموه تحاببتم أفشوا السلام بينكم",
            "Hadith - Muslim 54",
        ),
        ("طعام الاثنين كافي الثلاثة وطعام الثلاثة كافي الأربعة", "Hadith - Bukhari 5392"),
        ("ما ملأ آدمي وعاء شرا من بطن", "Hadith - Tirmidhi 2380"),
        ("بحسب ابن آدم أكلات يقمن صلبه", "Hadith - Tirmidhi 2380"),
        ("فإن كان لا محالة فثلث لطعامه وثلث لشرابه وثلث لنفسه", "Hadith - Tirmidhi 2380"),
        ("إن الله كتب الإحسان على كل شيء", "Hadith - Muslim 1955"),
        ("فإذا قتلتم فأحسنوا القتلة وإذا ذبحتم فأحسنوا الذبح", "Hadith - Muslim 1955"),
        ("وليحد أحدكم شفرته وليرح ذبيحته", "Hadith - Muslim 1955"),
        ("عذبت امرأة في هرة سجنتها حتى ماتت", "Hadith - Bukhari 3318"),
        (
            "فلا هي أطعمتها ولا سقتها إذ حبستها ولا هي تركتها تأكل من خشاش الأرض",
            "Hadith - Bukhari 3318",
        ),
        ("بينما رجل يمشي بطريق اشتد عليه العطش فوجد بئرا فنزل فيها فشرب", "Hadith - Bukhari 2466"),
        ("ثم خرج فإذا كلب يلهث يأكل الثرى من العطش", "Hadith - Bukhari 2466"),
        ("فقال لقد بلغ هذا الكلب من العطش مثل الذي كان بلغ مني", "Hadith - Bukhari 2466"),
        ("فنزل البئر فملأ خفه ماء ثم أمسكه بفيه حتى رقي فسقى الكلب", "Hadith - Bukhari 2466"),
        ("فشكر الله له فغفر له", "Hadith - Bukhari 2466"),
        ("في كل كبد رطبة أجر", "Hadith - Bukhari 2466"),
    ]
    for i, (text, source) in enumerate(hadith):
        islamic.append(
            {"id": f"hadith_{i}", "text": text, "source": source, "period": "HADITH", "century": 9}
        )
    print(f"    - Hadith: {len([x for x in islamic if 'hadith' in x['id']]):,} passages")

    with open("data/raw/islamic/islamic_native.json", "w", encoding="utf-8") as f:
        json.dump(islamic, f, ensure_ascii=False, indent=2)
    print(f"  Created {len(islamic)} Islamic passages")

    # DEAR ABBY
    print("\n[4/4] Dear Abby...")
    abby_count = 0
    if (
        not os.path.exists("data/raw/dear_abby.csv")
        or os.path.getsize("data/raw/dear_abby.csv") < 10000
    ):
        # Check if in Drive (with retry for stale FUSE mount)
        drive_abby_path = f"{SAVE_DIR}/dear_abby.csv"
        found_in_drive = False

        # First attempt
        if os.path.exists(drive_abby_path):
            found_in_drive = True
        else:
            # Retry after refreshing Drive mount (FUSE can be stale)
            print(f"  First check failed, refreshing Drive...")
            try:
                _ = os.listdir(SAVE_DIR)  # Force FUSE refresh
                import time

                time.sleep(0.5)  # Brief pause for sync
                if os.path.exists(drive_abby_path):
                    found_in_drive = True
                    print(f"  Found after refresh!")
            except Exception as e:
                print(f"  Drive refresh error: {e}")

        if found_in_drive:
            shutil.copy(drive_abby_path, "data/raw/dear_abby.csv")
            print(f"  Loaded from Drive: {drive_abby_path}")
        else:
            print(f"  Not found in Drive at: {drive_abby_path}")
            # Show what IS in the Drive folder
            try:
                contents = os.listdir(SAVE_DIR) if os.path.exists(SAVE_DIR) else []
                print(f"  Drive folder contents: {contents[:10]}")
            except:
                pass
            try:
                subprocess.run(
                    [
                        "kaggle",
                        "datasets",
                        "download",
                        "-d",
                        "thedevastator/20000-dear-abby-questions",
                        "-p",
                        "data/raw/",
                        "--unzip",
                    ],
                    check=True,
                    timeout=120,
                )
                print("  Downloaded from Kaggle")
            except:
                print("  Kaggle failed - creating minimal fallback")
                fallback = [
                    {"question_only": f"Dear Abby, I have a problem {i}", "year": 1990 + i % 30}
                    for i in range(100)
                ]
                pd.DataFrame(fallback).to_csv("data/raw/dear_abby.csv", index=False)
    else:
        print("  Already exists")

    # Count Dear Abby samples
    try:
        df = pd.read_csv("data/raw/dear_abby.csv")
        abby_count = len(
            [
                1
                for _, row in df.iterrows()
                if str(row.get("question_only", "")) != "nan"
                and 50 <= len(str(row.get("question_only", ""))) <= 2000
            ]
        )
    except:
        abby_count = 0

    # Warning for insufficient Dear Abby data
    if abby_count < 1000:
        print("\n" + "!" * 60)
        print("CRITICAL: Dear Abby corpus is too small!")
        print("The semitic_to_non_semitic split WILL FAIL without this data.")
        print("\nTo fix:")
        print("1. Download from: kaggle.com/datasets/thedevastator/20000-dear-abby-questions")
        print("2. Upload dear_abby.csv to your Google Drive BIP_v10 folder")
        print("3. Set REFRESH_DATA_FROM_SOURCE = True and rerun")
        print("!" * 60 + "\n")

    print("\n" + "=" * 60)
    print("Downloads complete")
    print("=" * 60)


In [None]:
# @title 3. Patterns + Normalization { display-mode: "form" }
# @markdown BIP v10.9: Complete native patterns for moral concepts in 7 languages
# @markdown - Added: Sanskrit, Pali patterns
# @markdown - Added: NLP improvements (negation detection, modal classification)

import re
import unicodedata
from enum import Enum, auto

# Section banner for this cell: a title framed by two 60-character rules.
print("=" * 60, "TEXT NORMALIZATION & PATTERNS", "=" * 60, sep="\n")


# ===== TEXT NORMALIZATION =====
def normalize_hebrew(text):
    """Return *text* in the folded form used for Hebrew-script matching.

    Applies, in order: NFKC normalization, removal of every code point in
    U+0591-U+05C7 (nikud and the neighbouring marks in that range), and
    replacement of the five final letter forms by their regular forms.
    """
    # Final form -> regular form for kaf, mem, nun, pe and tsadi.
    final_to_regular = str.maketrans(
        "\u05da\u05dd\u05df\u05e3\u05e5",
        "\u05db\u05de\u05e0\u05e4\u05e6",
    )
    composed = unicodedata.normalize("NFKC", text)
    unpointed = re.sub(r"[\u0591-\u05C7]", "", composed)
    return unpointed.translate(final_to_regular)


def normalize_arabic(text):
    """Return *text* in the folded form used for Arabic matching.

    Applies, in order:

    1. NFKC normalization.
    2. Removal of combining marks: tashkeel (U+064B-U+065F), the superscript
       (dagger) alef U+0670, and the combining Quranic annotation marks
       U+06D6-U+06DC, U+06DF-U+06E4, U+06E7, U+06E8 and U+06EA-U+06ED.
    3. Removal of tatweel (U+0640).
    4. Letter folding: alef variants (U+0622, U+0623, U+0625, U+0671) to
       bare alef, ta marbuta to ha, alef maqsura to ya.
    """
    text = unicodedata.normalize("NFKC", text)
    # Fully vocalised Quranic text carries U+0670 and pause marks such as
    # U+06D6, which sit outside U+064B-U+065F; strip them together with the
    # ordinary tashkeel so no diacritic is left inside a word.
    text = re.sub(
        r"[\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E4\u06E7\u06E8\u06EA-\u06ED]",
        "",
        text,
    )
    text = text.replace("\u0640", "")  # Remove tatweel
    # Single-pass letter folding; no source letter is also a target, so this
    # is equivalent to replacing the letters one after another.
    folding = str.maketrans(
        "\u0623\u0625\u0622\u0671\u0629\u0649",
        "\u0627\u0627\u0627\u0627\u0647\u064a",
    )
    return text.translate(folding)


# NEW in v10.9: Sanskrit normalization
def normalize_sanskrit(text):
    """Normalize Sanskrit/Devanagari text.

    Applies NFC normalization, then removes the Vedic tone marks
    (U+0951-U+0954) and the two candrabindu signs (U+0900 inverted
    candrabindu, U+0901 candrabindu).

    The anusvara (U+0902) is kept: it is not a candrabindu variant, and the
    Sanskrit entries of ALL_BOND_PATTERNS are spelled with it, so deleting it
    from the text would make those entries unmatchable.
    """
    text = unicodedata.normalize("NFC", text)
    # One pass over both mark groups; neither removal affects the other.
    return re.sub(r"[\u0900\u0901\u0951-\u0954]", "", text)


# NEW in v10.9: Pali normalization
def normalize_pali(text):
    """Normalize romanized Pali text to lowercase ASCII-style spelling.

    Applies NFC normalization (so decomposed letters such as ``a`` + macron
    become single code points), lowercases, then strips the romanization
    diacritics: long vowels lose their macron and dotted/tilded consonants
    map to their base letter. Both niggahita spellings, m-with-dot-below
    (U+1E43) and m-with-dot-above (U+1E41), fold to ``m``.

    Text in other scripts passes through with only NFC and lowercasing
    applied.
    """
    # Built from explicit code points so the table is independent of how
    # this source file itself happens to be Unicode-normalized.
    diacritic_folding = str.maketrans(
        {
            "\u1e43": "m",  # m with dot below
            "\u1e41": "m",  # m with dot above (alternative niggahita spelling)
            "\u1e45": "n",  # n with dot above
            "\u00f1": "n",  # n with tilde
            "\u1e47": "n",  # n with dot below
            "\u1e6d": "t",  # t with dot below
            "\u1e0d": "d",  # d with dot below
            "\u1e37": "l",  # l with dot below
            "\u0101": "a",  # a with macron
            "\u012b": "i",  # i with macron
            "\u016b": "u",  # u with macron
        }
    )
    text = unicodedata.normalize("NFC", text).lower()
    return text.translate(diacritic_folding)


def normalize_text(text, language):
    """Normalize *text* with the routine registered for *language*.

    Hebrew and Aramaic share ``normalize_hebrew``; Arabic, Sanskrit and Pali
    use their own helpers. Classical Chinese gets NFKC only, and every other
    language value gets lowercasing followed by NFKC.
    """
    if language in ("hebrew", "aramaic"):
        return normalize_hebrew(text)
    if language == "arabic":
        return normalize_arabic(text)
    if language == "sanskrit":
        return normalize_sanskrit(text)
    if language == "pali":
        return normalize_pali(text)
    # Remaining languages differ only in whether case is folded first.
    prepared = text if language == "classical_chinese" else text.lower()
    return unicodedata.normalize("NFKC", prepared)


# ===== BOND AND HOHFELD TYPES =====
class BondType(Enum):
    """Bond categories used as keys of the per-language pattern tables.

    Values are the consecutive integers 1..10 in declaration order.
    """

    HARM_PREVENTION = 1
    RECIPROCITY = 2
    AUTONOMY = 3
    PROPERTY = 4
    FAMILY = 5
    AUTHORITY = 6
    CARE = 7
    FAIRNESS = 8
    CONTRACT = 9
    NONE = 10


class HohfeldState(Enum):
    """Hohfeldian positions used as keys of ALL_HOHFELD_PATTERNS.

    Values are the consecutive integers 1..4 in declaration order.
    """

    OBLIGATION = 1
    RIGHT = 2
    LIBERTY = 3
    NO_RIGHT = 4


# ===== COMPLETE BOND PATTERNS =====
# Regex fragments per language and BondType.
#
# Most entries are spelled in the folded form the normalize_* helpers produce
# (regular Hebrew letters at word end, bare alef in Arabic), so they are
# presumably searched against normalize_text() output - confirm at the call
# site. Where a normalizer rewrites or drops a character that a word needs
# (Hebrew final nun, Arabic ta marbuta, Devanagari anusvara), the entry accepts
# both spellings so that it matches raw and normalized text alike.
ALL_BOND_PATTERNS = {
    "hebrew": {
        BondType.HARM_PREVENTION: [
            r"\u05d4\u05e8\u05d2",
            r"\u05e8\u05e6\u05d7",
            r"\u05e0\u05d6\u05e7",
            r"\u05d4\u05db\u05d4",
            r"\u05d4\u05e6\u05d9\u05dc",
            r"\u05e9\u05de\u05e8",
            r"\u05e4\u05e7\u05d5\u05d7.\u05e0\u05e4\u05e9",
        ],
        BondType.RECIPROCITY: [
            r"\u05d2\u05de\u05d5\u05dc",
            r"\u05d4\u05e9\u05d9\u05d1",
            r"\u05e4\u05e8\u05e2",
            r"\u05e0\u05ea[\u05df\u05e0].*\u05e7\u05d1\u05dc",  # final or regular nun
            r"\u05de\u05d3\u05d4.\u05db\u05e0\u05d2\u05d3",
        ],
        BondType.AUTONOMY: [
            r"\u05d1\u05d7\u05e8",
            r"\u05e8\u05e6\u05d5[\u05df\u05e0]",  # final or regular nun
            r"\u05d7\u05e4\u05e9",
            r"\u05e2\u05e6\u05de",
        ],
        BondType.PROPERTY: [
            r"\u05e7\u05e0\u05d4",
            r"\u05de\u05db\u05e8",
            r"\u05d2\u05d6\u05dc",
            r"\u05d2\u05e0\u05d1",
            r"\u05de\u05de\u05d5[\u05df\u05e0]",  # final or regular nun
            r"\u05e0\u05db\u05e1",
            r"\u05d9\u05e8\u05e9",
        ],
        BondType.FAMILY: [
            r"\u05d0\u05d1",
            r"\u05d0\u05de",
            r"\u05d1\u05e0",
            r"\u05db\u05d1\u05d3.*\u05d0\u05d1",
            r"\u05db\u05d1\u05d3.*\u05d0\u05de",
            r"\u05de\u05e9\u05e4\u05d7\u05d4",
            r"\u05d0\u05d7",
            r"\u05d0\u05d7\u05d5\u05ea",
        ],
        BondType.AUTHORITY: [
            r"\u05de\u05dc\u05db",
            r"\u05e9\u05d5\u05e4\u05d8",
            r"\u05e6\u05d5\u05d4",
            r"\u05ea\u05d5\u05e8\u05d4",
            r"\u05de\u05e6\u05d5\u05d4",
            r"\u05d3\u05d9[\u05df\u05e0]",  # final or regular nun
            r"\u05d7\u05e7",
        ],
        BondType.CARE: [
            r"\u05d7\u05e1\u05d3",
            r"\u05e8\u05d7\u05de",
            r"\u05e2\u05d6\u05e8",
            r"\u05ea\u05de\u05db",
            r"\u05e6\u05d3\u05e7\u05d4",
        ],
        BondType.FAIRNESS: [
            r"\u05e6\u05d3\u05e7",
            r"\u05de\u05e9\u05e4\u05d8",
            r"\u05d9\u05e9\u05e8",
            r"\u05e9\u05d5\u05d4",
        ],
        BondType.CONTRACT: [
            r"\u05d1\u05e8\u05d9\u05ea",
            r"\u05e0\u05d3\u05e8",
            r"\u05e9\u05d1\u05d5\u05e2",
            r"\u05d4\u05ea\u05d7\u05d9\u05d1",
            r"\u05e2\u05e8\u05d1",
        ],
    },
    "aramaic": {
        BondType.HARM_PREVENTION: [
            r"\u05e7\u05d8\u05dc",
            r"\u05e0\u05d6\u05e7",
            r"\u05d7\u05d1\u05dc",
            r"\u05e9\u05d6\u05d9\u05d1",
            r"\u05e4\u05e6\u05d9",
        ],
        BondType.RECIPROCITY: [r"\u05e4\u05e8\u05e2", r"\u05e9\u05dc\u05de", r"\u05d0\u05d2\u05e8"],
        BondType.AUTONOMY: [r"\u05e6\u05d1\u05d9", r"\u05e8\u05e2\u05d5"],
        BondType.PROPERTY: [
            r"\u05d6\u05d1\u05e0",
            r"\u05e7\u05e0\u05d4",
            r"\u05d2\u05d6\u05dc",
            r"\u05de\u05de\u05d5\u05e0\u05d0",
            r"\u05e0\u05db\u05e1\u05d9",
        ],
        BondType.FAMILY: [
            r"\u05d0\u05d1\u05d0",
            r"\u05d0\u05de\u05d0",
            r"\u05d1\u05e8\u05d0",
            r"\u05d1\u05e8\u05ea\u05d0",
            r"\u05d9\u05e7\u05e8",
            r"\u05d0\u05d7\u05d0",
        ],
        BondType.AUTHORITY: [
            r"\u05de\u05dc\u05db\u05d0",
            r"\u05d3\u05d9\u05e0\u05d0",
            r"\u05d3\u05d9\u05d9\u05e0\u05d0",
            r"\u05e4\u05e7\u05d5\u05d3\u05d0",
            r"\u05d0\u05d5\u05e8\u05d9\u05ea",
        ],
        BondType.CARE: [r"\u05d7\u05e1\u05d3", r"\u05e8\u05d7\u05de", r"\u05e1\u05e2\u05d3"],
        BondType.FAIRNESS: [
            r"\u05d3\u05d9\u05e0\u05d0",
            r"\u05e7\u05e9\u05d5\u05d8",
            r"\u05ea\u05e8\u05d9\u05e6",
        ],
        BondType.CONTRACT: [
            r"\u05e7\u05d9\u05de\u05d0",
            r"\u05e9\u05d1\u05d5\u05e2\u05d4",
            r"\u05e0\u05d3\u05e8\u05d0",
            r"\u05e2\u05e8\u05d1\u05d0",
        ],
    },
    "classical_chinese": {
        BondType.HARM_PREVENTION: [
            r"\u6bba",
            r"\u5bb3",
            r"\u50b7",
            r"\u6551",
            r"\u8b77",
            r"\u885b",
            r"\u66b4",
        ],
        BondType.RECIPROCITY: [r"\u5831", r"\u9084", r"\u511f", r"\u8ced", r"\u7b54"],
        BondType.AUTONOMY: [r"\u81ea", r"\u7531", r"\u4efb", r"\u610f", r"\u5fd7"],
        BondType.PROPERTY: [
            r"\u8ca1",
            r"\u7269",
            r"\u7522",
            r"\u76dc",
            r"\u7aca",
            r"\u8ce3",
            r"\u8cb7",
        ],
        BondType.FAMILY: [
            r"\u5b5d",
            r"\u7236",
            r"\u6bcd",
            r"\u89aa",
            r"\u5b50",
            r"\u5f1f",
            r"\u5144",
            r"\u5bb6",
        ],
        BondType.AUTHORITY: [
            r"\u541b",
            r"\u81e3",
            r"\u738b",
            r"\u547d",
            r"\u4ee4",
            r"\u6cd5",
            r"\u6cbb",
        ],
        BondType.CARE: [r"\u4ec1", r"\u611b", r"\u6148", r"\u60e0", r"\u6069", r"\u6190"],
        BondType.FAIRNESS: [r"\u7fa9", r"\u6b63", r"\u516c", r"\u5e73", r"\u5747"],
        BondType.CONTRACT: [r"\u7d04", r"\u76df", r"\u8a93", r"\u8afe", r"\u4fe1"],
    },
    "arabic": {
        BondType.HARM_PREVENTION: [
            r"\u0642\u062a\u0644",
            r"\u0636\u0631\u0631",
            r"\u0627\u0630[\u064a\u0649]",
            r"\u0638\u0644\u0645",
            r"\u0627\u0646\u0642\u0630",
            r"\u062d\u0641\u0638",
            r"\u0627\u0645\u0627\u0646",
        ],
        BondType.RECIPROCITY: [
            r"\u062c\u0632\u0627",
            r"\u0631\u062f",
            r"\u0642\u0635\u0627\u0635",
            r"\u0645\u062b\u0644",
            r"\u0639\u0648\u0636",
        ],
        BondType.AUTONOMY: [
            r"\u062d\u0631",
            r"\u0627\u0631\u0627\u062f[\u0629\u0647]",  # ta marbuta or its folded form ha
            r"\u0627\u062e\u062a\u064a\u0627\u0631",
            r"\u0645\u0634\u064a\u0626",
        ],
        BondType.PROPERTY: [
            r"\u0645\u0627\u0644",
            r"\u0645\u0644\u0643",
            r"\u0633\u0631\u0642",
            r"\u0628\u064a\u0639",
            r"\u0634\u0631\u0627",
            r"\u0645\u064a\u0631\u0627\u062b",
            r"\u063a\u0635\u0628",
        ],
        BondType.FAMILY: [
            r"\u0648\u0627\u0644\u062f",
            r"\u0627\u0628\u0648",
            r"\u0627\u0645",
            r"\u0627\u0628\u0646",
            r"\u0628\u0646\u062a",
            r"\u0627\u0647\u0644",
            r"\u0642\u0631\u0628[\u064a\u0649]",
            r"\u0631\u062d\u0645",
        ],
        BondType.AUTHORITY: [
            r"\u0637\u0627\u0639",
            r"\u0627\u0645\u0631",
            r"\u062d\u0643\u0645",
            r"\u0633\u0644\u0637\u0627\u0646",
            r"\u062e\u0644\u064a\u0641",
            r"\u0627\u0645\u0627\u0645",
            r"\u0634\u0631\u064a\u0639",
        ],
        BondType.CARE: [
            r"\u0631\u062d\u0645",
            r"\u0627\u062d\u0633\u0627\u0646",
            r"\u0639\u0637\u0641",
            r"\u0635\u062f\u0642",
            r"\u0632\u0643\u0627",
        ],
        BondType.FAIRNESS: [
            r"\u0639\u062f\u0644",
            r"\u0642\u0633\u0637",
            r"\u062d\u0642",
            r"\u0627\u0646\u0635\u0627\u0641",
            r"\u0633\u0648[\u064a\u0649]",
        ],
        BondType.CONTRACT: [
            r"\u0639\u0647\u062f",
            r"\u0639\u0642\u062f",
            r"\u0646\u0630\u0631",
            r"\u064a\u0645\u064a\u0646",
            r"\u0648\u0641\u0627",
            r"\u0627\u0645\u0627\u0646",
        ],
    },
    "english": {
        BondType.HARM_PREVENTION: [
            r"\bkill",
            r"\bmurder",
            r"\bharm",
            r"\bhurt",
            r"\bsave",
            r"\bprotect",
            r"\bviolence",
        ],
        BondType.RECIPROCITY: [
            r"\breturn",
            r"\brepay",
            r"\bexchange",
            r"\bgive.*back",
            r"\breciproc",
        ],
        BondType.AUTONOMY: [
            r"\bfree",
            r"\bchoice",
            r"\bchoose",
            r"\bconsent",
            r"\bautonomy",
            r"\bright to",
        ],
        BondType.PROPERTY: [
            r"\bsteal",
            r"\btheft",
            r"\bown",
            r"\bproperty",
            r"\bbelong",
            r"\binherit",
        ],
        BondType.FAMILY: [
            r"\bfather",
            r"\bmother",
            r"\bparent",
            r"\bchild",
            r"\bfamily",
            r"\bhonor.*parent",
        ],
        BondType.AUTHORITY: [
            r"\bobey",
            r"\bcommand",
            r"\bauthority",
            r"\blaw",
            r"\brule",
            r"\bgovern",
        ],
        BondType.CARE: [r"\bcare", r"\bhelp", r"\bkind", r"\bcompassion", r"\bcharity", r"\bmercy"],
        BondType.FAIRNESS: [r"\bfair", r"\bjust", r"\bequal", r"\bequity", r"\bright\b"],
        BondType.CONTRACT: [
            r"\bpromise",
            r"\bcontract",
            r"\bagreem",
            r"\bvow",
            r"\boath",
            r"\bcommit",
        ],
    },
    # NEW in v10.9: Sanskrit patterns (Devanagari)
    # "\u0902?" is an optional anusvara, so these entries match whether or not
    # the anusvara is still present in the text being searched.
    "sanskrit": {
        BondType.HARM_PREVENTION: [
            r"हि\u0902?सा",
            r"अहि\u0902?सा",
            r"वध",
            r"रक्षा",
            r"त्राण",
        ],  # himsa, ahimsa, vadha, raksha, trana
        BondType.RECIPROCITY: [
            r"प्रतिदान",
            r"प्रत्युपकार",
            r"दान",
            r"ऋण",
        ],  # pratidana, pratyupakara, dana, rna
        BondType.AUTONOMY: [r"स्वत\u0902?त्र", r"मोक्ष", r"स्वेच्छा"],  # swatantra, moksha, sveccha
        BondType.PROPERTY: [r"धन", r"स्व", r"चोर", r"दाय"],  # dhana, sva, chora, daya
        BondType.FAMILY: [r"पितृ", r"मातृ", r"पुत्र", r"कुल", r"गृह"],  # pitri, matri, putra, kula, grha
        BondType.AUTHORITY: [
            r"राज",
            r"धर्म",
            r"विधि",
            r"नियम",
            r"शास्त्र",
        ],  # raja, dharma, vidhi, niyama, shastra
        BondType.CARE: [
            r"करुणा",
            r"दया",
            r"प्रेम",
            r"मैत्री",
            r"सेवा",
        ],  # karuna, daya, prema, maitri, seva
        BondType.FAIRNESS: [r"न्याय", r"समता", r"धर्म", r"ऋत"],  # nyaya, samata, dharma, rta
        BondType.CONTRACT: [
            r"प्रतिज्ञा",
            r"स\u0902?विद",
            r"वचन",
            r"शपथ",
        ],  # pratijna, samvid, vachana, shapatha
    },
    # NEW in v10.9: Pali patterns (romanized)
    "pali": {
        BondType.HARM_PREVENTION: [r"himsa", r"ahimsa", r"panatipata", r"rakkhati"],
        BondType.RECIPROCITY: [r"dana", r"patidana", r"ina"],
        BondType.AUTONOMY: [r"vimutti", r"nibbana", r"attadhipa"],
        BondType.PROPERTY: [r"dhana", r"theyya", r"adinnadana"],
        BondType.FAMILY: [r"mata", r"pita", r"putta", r"kula"],
        BondType.AUTHORITY: [r"raja", r"dhamma", r"vinaya", r"sikkhapada"],
        BondType.CARE: [r"karuna", r"metta", r"mudita", r"upekkha"],
        BondType.FAIRNESS: [r"samma", r"dhamma", r"sacca"],
        BondType.CONTRACT: [r"patijna", r"vacana", r"sacca"],
    },
}

# ===== COMPLETE HOHFELD PATTERNS =====
ALL_HOHFELD_PATTERNS = {
    "hebrew": {
        HohfeldState.OBLIGATION: [
            r"\u05d7\u05d9\u05d9\u05d1",
            r"\u05e6\u05e8\u05d9\u05db",
            r"\u05de\u05d5\u05db\u05e8\u05d7",
            r"\u05de\u05e6\u05d5\u05d5\u05d4",
        ],
        HohfeldState.RIGHT: [
            r"\u05d6\u05db\u05d5\u05ea",
            r"\u05e8\u05e9\u05d0\u05d9",
            r"\u05d6\u05db\u05d0\u05d9",
            r"\u05de\u05d2\u05d9\u05e2",
        ],
        HohfeldState.LIBERTY: [
            r"\u05de\u05d5\u05ea\u05e8",
            r"\u05e8\u05e9\u05d5\u05ea",
            r"\u05e4\u05d8\u05d5\u05e8",
            r"\u05d9\u05db\u05d5\u05dc",
        ],
        HohfeldState.NO_RIGHT: [
            r"\u05d0\u05e1\u05d5\u05e8",
            r"\u05d0\u05d9\u05e0\u05d5 \u05e8\u05e9\u05d0\u05d9",
            r"\u05d0\u05d9\u05df.*\u05d6\u05db\u05d5\u05ea",
        ],
    },
    "aramaic": {
        HohfeldState.OBLIGATION: [
            r"\u05d7\u05d9\u05d9\u05d1",
            r"\u05de\u05d7\u05d5\u05d9\u05d1",
            r"\u05d1\u05e2\u05d9",
        ],
        HohfeldState.RIGHT: [
            r"\u05d6\u05db\u05d5\u05ea",
            r"\u05e8\u05e9\u05d0\u05d9",
            r"\u05d6\u05db\u05d9",
        ],
        HohfeldState.LIBERTY: [
            r"\u05e9\u05e8\u05d9",
            r"\u05de\u05d5\u05ea\u05e8",
            r"\u05e4\u05d8\u05d5\u05e8",
        ],
        HohfeldState.NO_RIGHT: [
            r"\u05d0\u05e1\u05d5\u05e8",
            r"\u05dc\u05d0.*\u05e8\u05e9\u05d0\u05d9",
        ],
    },
    "classical_chinese": {
        HohfeldState.OBLIGATION: [r"\u5fc5", r"\u9808", r"\u7576", r"\u61c9", r"\u5b9c"],
        HohfeldState.RIGHT: [r"\u53ef", r"\u5f97", r"\u6b0a", r"\u5b9c"],
        HohfeldState.LIBERTY: [r"\u8a31", r"\u4efb", r"\u807d", r"\u514d"],
        HohfeldState.NO_RIGHT: [r"\u4e0d\u53ef", r"\u52ff", r"\u7981", r"\u83ab", r"\u975e"],
    },
    "arabic": {
        HohfeldState.OBLIGATION: [
            r"\u064a\u062c\u0628",
            r"\u0648\u0627\u062c\u0628",
            r"\u0641\u0631\u0636",
            r"\u0644\u0627\u0632\u0645",
            r"\u0648\u062c\u0648\u0628",
        ],
        HohfeldState.RIGHT: [
            r"\u062d\u0642",
            r"\u064a\u062d\u0642",
            r"\u062c\u0627\u0626\u0632",
            r"\u064a\u062c\u0648\u0632",
        ],
        HohfeldState.LIBERTY: [
            r"\u0645\u0628\u0627\u062d",
            r"\u062d\u0644\u0627\u0644",
            r"\u062c\u0627\u0626\u0632",
            r"\u0627\u0628\u0627\u062d",
        ],
        HohfeldState.NO_RIGHT: [
            r"\u062d\u0631\u0627\u0645",
            r"\u0645\u062d\u0631\u0645",
            r"\u0645\u0645\u0646\u0648\u0639",
            r"\u0644\u0627 \u064a\u062c\u0648\u0632",
            r"\u0646\u0647[\u064a\u0649]",
        ],
    },
    "english": {
        HohfeldState.OBLIGATION: [r"\bmust\b", r"\bshall\b", r"\bobligat", r"\bduty", r"\brequir"],
        HohfeldState.RIGHT: [r"\bright\b", r"\bentitle", r"\bdeserve", r"\bclaim"],
        HohfeldState.LIBERTY: [r"\bmay\b", r"\bpermit", r"\ballow", r"\bfree to"],
        HohfeldState.NO_RIGHT: [r"\bforbid", r"\bprohibit", r"\bmust not", r"\bshall not"],
    },
    # NEW in v10.9: Sanskrit Hohfeld patterns (Devanagari)
    "sanskrit": {
        HohfeldState.OBLIGATION: [r"कर्तव्य", r"अवश्य", r"नियम", r"विधि"],  # kartavya, avashya
        HohfeldState.RIGHT: [r"अधिकार", r"स्वत्व"],  # adhikara, svatva
        HohfeldState.LIBERTY: [r"शक्य", r"अनुज्ञा", r"उचित"],  # shakya, anujña
        HohfeldState.NO_RIGHT: [r"निषिद्ध", r"वर्जित", r"अकर्तव्य"],  # nishiddha, varjita
    },
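    # NEW in v10.9: Pali Hohfeld patterns (romanized)
    # NOTE(review): romanization is inconsistent in this entry - long vowels are doubled
    # in "karaniiya", "adhikaara", "bhaaga", "anujaanati" but not in "akaraniya";
    # confirm which spelling the matched text actually uses.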
    "pali": {
        HohfeldState.OBLIGATION: [r"kicca", r"karaniiya", r"dhammo"],
        HohfeldState.RIGHT: [r"adhikaara", r"bhaaga"],
        HohfeldState.LIBERTY: [r"anujaanati", r"kappati"],
        HohfeldState.NO_RIGHT: [r"nisiddha", r"akaraniya", r"na kappati"],
    },
}


# ===== CONTEXT MARKERS FOR GRAMMAR-AWARE EXTRACTION =====
# These help distinguish "thou shalt not kill" from "he killed"
#
# Structure: language -> marker category -> list of regex fragments.
# Categories are "negation", "obligation", "prohibition" and "permission".
# detect_context() feeds each fragment to re.search() over a character window, so a
# fragment matches anywhere in that window, including inside a longer word
# (e.g. English r"no" also hits "know", Pali r"na" also hits "nibbana").
CONTEXT_MARKERS = {
    "hebrew": {
        "negation": [r"לא", r"אל", r"אין", r"בלי", r"אינ"],
        "obligation": [r"חייב", r"צריך", r"מוכרח", r"צווה"],
        "prohibition": [r"אסור", r"אל.*ת"],
        "permission": [r"מותר", r"רשאי", r"פטור"],
    },
    "aramaic": {
        "negation": [r"לא", r"לית", r"לאו"],
        "obligation": [r"חייב", r"בעי"],
        "prohibition": [r"אסור"],
        "permission": [r"שרי", r"מותר"],
    },
    "classical_chinese": {
        "negation": [r"不", r"非", r"無", r"未", r"毋"],
        "obligation": [r"必", r"當", r"須", r"應", r"宜"],
        "prohibition": [r"勿", r"禁", r"莫", r"不可"],
        "permission": [r"可", r"得", r"許"],
    },
    "arabic": {
        "negation": [r"لا", r"ما", r"ليس", r"لم", r"غير"],
        "obligation": [r"يجب", r"واجب", r"فرض", r"عليه"],
        "prohibition": [r"حرام", r"محرم", r"لا يجوز", r"نهى"],
        "permission": [r"حلال", r"مباح", r"جائز"],
    },
    "english": {
        "negation": [r"not", r"no", r"never", r"neither", r"n't"],
        "obligation": [r"must", r"shall", r"should", r"ought", r"required"],
        "prohibition": [r"forbid", r"prohibit", r"must not", r"shall not", r"don't"],
        "permission": [r"may", r"can", r"allowed", r"permit"],
    },
    # NEW in v10.9: Sanskrit context markers
    "sanskrit": {
        "negation": [r"न", r"मा", r"अ"],  # na, mā, a- prefix
        "obligation": [r"कर्तव्य", r"अवश्य", r"विधि"],
        "prohibition": [r"निषिद्ध", r"वर्जित", r"मा"],
        "permission": [r"शक्य", r"अनुज्ञा"],
    },
    # NEW in v10.9: Pali context markers
    # NOTE(review): r"a-" only matches a literal "a" followed by a hyphen.
    "pali": {
        "negation": [r"na", r"ma", r"a-"],
        "obligation": [r"kicca", r"karaniya"],
        "prohibition": [r"nisiddha", r"akaraniya"],
        "permission": [r"anujaanati", r"kappati"],
    },
}


def detect_context(text, language, match_pos, window=30):
    """
    Classify the grammatical context surrounding a pattern match.

    The slice text[match_pos - window : match_pos + window] (clamped to the string
    bounds) is searched with the CONTEXT_MARKERS regexes of the given language.

    Args:
        text: Passage being analysed.
        language: Key into CONTEXT_MARKERS.
        match_pos: Character offset of the pattern match inside text.
        window: Number of characters inspected on each side of match_pos.

    Returns:
        A (context, marker_type) tuple:
        - ("unknown", None) if the language has no markers;
        - ("prescriptive", kind) for the first of "prohibition", "obligation",
          "permission" that has a marker in the window;
        - ("descriptive", "negated") if only a negation marker is found;
        - ("descriptive", None) otherwise.
    """
    lang_markers = CONTEXT_MARKERS.get(language, {})
    if not lang_markers:
        return "unknown", None

    lo = max(0, match_pos - window)
    hi = min(len(text), match_pos + window)
    snippet = text[lo:hi]

    def _has_marker(kind):
        # True when any regex of this category matches inside the window.
        return any(re.search(rx, snippet) for rx in lang_markers.get(kind, []))

    # Deontic markers take precedence over plain negation; prohibition is tried first.
    deontic_kind = next(
        (kind for kind in ("prohibition", "obligation", "permission") if _has_marker(kind)),
        None,
    )
    if deontic_kind is not None:
        return "prescriptive", deontic_kind

    # A bare negation without a deontic marker is treated as a descriptive statement.
    if _has_marker("negation"):
        return "descriptive", "negated"

    return "descriptive", None


# ===== NLP IMPROVEMENTS (v10.9 Phase 1) =====
# These provide negation detection and modal classification without external dependencies

# Per-language negation cues. These are plain substrings (not regexes):
# enhanced_extract_bond() tests them with `cue in normalized`.
# NOTE(review): one- and two-character cues ("a", "an", "no", "न", "अ") therefore match
# inside unrelated words, so any text containing them is flagged as negated.
NEGATION_CUES = {
    "english": ["not", "no", "never", "neither", "nor", "n't", "without", "lack", "none"],
    "classical_chinese": ["不", "非", "無", "莫", "勿", "未", "弗", "毋", "否"],
    "arabic": ["لا", "ما", "لم", "لن", "ليس", "غير", "بدون"],
    "hebrew": ["לא", "אל", "בלי", "אין", "מבלי"],
    "aramaic": ["לא", "לית", "לאו"],
    "sanskrit": ["न", "मा", "अ"],  # na, mā, a- (privative prefix)
    "pali": ["na", "ma", "a", "an"],
}

# Per-language modal markers: language -> deontic status -> list of plain substrings.
# Dict insertion order of the statuses is obligation, permission, prohibition,
# supererogation.
# There is no "aramaic" entry, and "sanskrit" / "pali" define no "supererogation" markers.
# NOTE: several prohibition markers contain an obligation or permission marker as a
# substring ("must not" contains "must", "cannot" contains "can", "不可" contains "可",
# "لا يجوز" contains "يجوز", "na kappati" contains "kappati"), so any consumer that
# matches by substring has to disambiguate these cases.
MODAL_CLASSIFICATION = {
    "english": {
        "obligation": ["must", "shall", "have to", "ought to", "need to", "required", "obligated"],
        "permission": ["may", "can", "allowed", "permitted", "free to", "entitled"],
        "prohibition": ["must not", "shall not", "cannot", "forbidden", "prohibited", "banned"],
        "supererogation": ["should", "ought", "would be good", "ideally", "preferably"],
    },
    "classical_chinese": {
        "obligation": ["必", "當", "宜", "須", "應", "要"],
        "permission": ["可", "得", "許", "容", "能"],
        "prohibition": ["不可", "不得", "勿", "莫", "禁", "不許", "不宜"],
        "supererogation": ["善", "美", "德", "宜"],
    },
    "arabic": {
        "obligation": ["يجب", "فرض", "واجب", "لازم", "فريضة"],
        "permission": ["يجوز", "مباح", "حلال", "جائز"],
        "prohibition": ["حرام", "محرم", "ممنوع", "لا يجوز", "محظور"],
        "supererogation": ["مستحب", "سنة", "مندوب", "نافلة"],
    },
    "hebrew": {
        "obligation": ["חייב", "מצווה", "צריך", "מוכרח", "חובה"],
        "permission": ["מותר", "רשאי", "יכול", "היתר"],
        "prohibition": ["אסור", "לא יעשה", "אל", "איסור"],
        "supererogation": ["ראוי", "טוב", "מידת חסידות", "לפנים משורת הדין"],
    },
    "sanskrit": {
        "obligation": ["कर्तव्य", "अवश्य", "नियम"],  # kartavya, avashya, niyama
        "permission": ["शक्य", "अनुज्ञा"],  # shakya, anujña
        "prohibition": ["निषिद्ध", "वर्जित", "मा"],  # nishiddha, varjita, mā
    },
    "pali": {
        "obligation": ["kicca", "karaniya", "dhamma"],
        "permission": ["kappati", "anujanati"],
        "prohibition": ["akappiya", "akaraniya", "na kappati"],
    },
}


def enhanced_extract_bond(text: str, language: str) -> dict:
    """
    Enhanced bond extraction with negation + modal detection.
    Phase 1 implementation - no external NLP dependencies required.

    Args:
        text: Raw passage text; it is passed through normalize_text() first.
        language: Key into NEGATION_CUES, MODAL_CLASSIFICATION and ALL_BOND_PATTERNS.
            A language missing from a table simply produces no hit for that table.

    Returns dict with:
        - bond_type: BondType or None
        - hohfeld_state: str (OBLIGATION/LIBERTY/NO_RIGHT; RIGHT is never produced here,
          and OBLIGATION is also the default when no modal marker is found)
        - negated: bool
        - modal: str or None (the matched modal marker)
        - confidence: float (0.9 when a bond pattern matched, else 0.5; scaled by 0.8
          when negated)
        - context: str (prescriptive/descriptive/unknown)
    """
    # 1. Normalize text
    normalized = normalize_text(text, language)

    # 2. Check negation (plain substring test)
    # NOTE(review): very short cues (e.g. Pali "a", English "no") also match inside
    # unrelated words, so `negated` over-triggers for those languages.
    negation_cues = NEGATION_CUES.get(language, [])
    is_negated = any(cue in normalized for cue in negation_cues)

    # 3. Check modal and classify deontic status
    # Collect every marker present in the text, in table order (status, then marker).
    modal_hits = [
        (status, marker)
        for status, markers in MODAL_CLASSIFICATION.get(language, {}).items()
        for marker in markers
        if marker in normalized
    ]
    modal_status = "unknown"
    modal_text = None
    if modal_hits:
        # The first hit keeps the table's priority order. However, a short marker is
        # often just a fragment of a longer matched one ("must" inside "must not",
        # "可" inside "不可"); in that case the longer, more specific marker decides
        # the status, otherwise prohibitions would be read as obligations/permissions.
        first_marker = modal_hits[0][1]
        modal_status, modal_text = max(
            (hit for hit in modal_hits if first_marker in hit[1]),
            key=lambda hit: len(hit[1]),
        )

    # 4. Map modal to Hohfeld state
    hohfeld_map = {
        "obligation": "OBLIGATION",
        "permission": "LIBERTY",
        "prohibition": "NO_RIGHT",
        "supererogation": "LIBERTY",
        "unknown": "OBLIGATION",  # Default assumption
    }
    hohfeld = hohfeld_map[modal_status]

    # 5. Pattern matching for bond type (first matching pattern wins)
    bond_type = None
    confidence = 0.5
    for bt, patterns in ALL_BOND_PATTERNS.get(language, {}).items():
        if any(re.search(pattern, normalized) for pattern in patterns):
            bond_type = bt
            confidence = 0.9
            break

    # 6. Adjust confidence for negation
    if is_negated:
        confidence *= 0.8  # Lower confidence for negated statements

    # 7. Determine context
    if modal_status in ["obligation", "prohibition"]:
        context = "prescriptive"
    elif modal_status == "permission":
        context = "descriptive"  # Permissions are often statements of fact
    else:
        # Covers both "unknown" and "supererogation".
        context = "unknown"

    return {
        "bond_type": bond_type,
        "hohfeld_state": hohfeld,
        "negated": is_negated,
        "modal": modal_text,
        "confidence": confidence,
        "context": context,
    }


# --- Cell summary: report what the pattern tables above contain ---
print("\nContext markers defined for grammar-aware extraction")
print("  Detects: negation, obligation, prohibition, permission")

# One line per language with the total number of bond regexes across all bond types.
print(f"\nPatterns defined for {len(ALL_BOND_PATTERNS)} languages:")
for lang in ALL_BOND_PATTERNS:
    n = sum(map(len, ALL_BOND_PATTERNS[lang].values()))
    print(f"  {lang}: {n} bond patterns")

# Language coverage of the Phase 1 lookup tables.
print("\nNLP improvements (Phase 1):")
print(f"  NEGATION_CUES: {len(NEGATION_CUES)} languages")
print(f"  MODAL_CLASSIFICATION: {len(MODAL_CLASSIFICATION)} languages")
print("  enhanced_extract_bond() ready")

print("\n" + "=" * 60)


In [None]:
# @title 4. Parallel Download + Stream Processing { display-mode: "form" }
# @markdown BIP v10.10: EXPANDED CORPORA - 3x expansion for Sanskrit/Pali, Arabic, and Buddhist Chinese
# @markdown Addresses corpus size issues found in v10.9 testing
# @markdown - Sanskrit: ~260 passages (expanded from ~80)
# @markdown - Pali: ~200 passages (expanded from ~75)
# @markdown - Arabic (Fiqh/Sufi/Falsafa): ~170 passages (expanded)
# @markdown - Buddhist Chinese: ~109 passages (expanded from ~86)

import json
import re
import random
import gc
import shutil
import requests
import time
import threading
from queue import Queue
from concurrent.futures import ThreadPoolExecutor, as_completed, TimeoutError
from pathlib import Path
from collections import defaultdict
from tqdm.auto import tqdm

# Thread-safe queue for passages
passage_queue = Queue(maxsize=100000)
download_complete = threading.Event()
# Per-language passage counters; guarded by stats_lock.
corpus_stats = defaultdict(int)
stats_lock = threading.Lock()


def update_stats(lang, count):
    """
    Add `count` passages to the running total for `lang` and emit progress dots.

    One "." is printed (no newline, flushed) for every multiple of 1000 that the
    grand total across all languages crosses during this call, so batched updates
    that jump over a boundary still show progress.

    Args:
        lang: Language key in corpus_stats.
        count: Number of passages to add.
    """
    with stats_lock:
        before = sum(corpus_stats.values())
        corpus_stats[lang] += count
        after = before + count
        # Number of 1000-passage milestones passed by this update.
        milestones = after // 1000 - before // 1000
        if milestones > 0:
            print("." * milestones, end="", flush=True)


# Decide whether this cell's download/processing can be skipped because the data was
# already loaded from Drive; otherwise cached data is used or fresh data is downloaded.
# LOAD_FROM_DRIVE is not defined in this cell - presumably set by an earlier one.
SKIP_PROCESSING = LOAD_FROM_DRIVE  # Re-evaluate based on current settings

# Minimum thresholds for balanced experiments
# Keyed by language; values are presumably minimum passage counts - verify against the
# code that consumes this table.
# NOTE(review): the notebook header quotes ~121 unique Sanskrit and ~90 unique Pali
# passages, which is below the 200 / 150 thresholds set here - confirm which is current.
MIN_CORPUS_SIZE = {
    "english": 20000,  # Lowered - HF augmentation datasets deprecated
    "classical_chinese": 20000,  # Lowered
    "hebrew": 5000,
    "aramaic": 2000,
    "arabic": 2000,
    "sanskrit": 200,  # v10.10 - expanded corpus
    "pali": 150,  # v10.10 - expanded corpus
}

# Available augmentation datasets by language
# Each entry is a (dataset identifier, short label) tuple.
# NOTE(review): the MIN_CORPUS_SIZE comment calls the HF augmentation datasets
# deprecated - confirm whether these entries are still used.
AUGMENTATION_DATASETS = {
    "english": [
        ("hendrycks/ethics", "ETHICS"),  # ~130K moral scenarios
        ("allenai/social_chem_101", "SocialChem"),  # ~292K social norms
    ],
    "classical_chinese": [
        ("wikisource_zh_classical", "WikisourceZH"),  # If available
    ],
}

# ===== v10.10 EXPANDED CORPORA =====
# Buddhist Chinese (佛教漢文) - EXPANDED v10.10 (109 passages)
# Expanded from v10.9 to fix the confucian_to_buddhist diversity test
BUDDHIST_CHINESE = [
    # ===== Mixed sutra excerpts (Dhammapada, Diamond, Heart, Lotus, Avatamsaka, Platform, Chan, ...) =====
    ("諸惡莫作，眾善奉行，自淨其意，是諸佛教。", "Dhammapada 183", "BUDDHIST"),
    ("以恨止恨，恨終不滅；唯以忍止恨，此古聖常法。", "Dhammapada 5", "BUDDHIST"),
    ("善人所思量，常得安穩樂。", "Dhammapada", "BUDDHIST"),
    ("若復有人於此經中受持乃至四句偈等，為他人說，其福勝彼。", "Diamond Sutra 8", "BUDDHIST"),
    ("是諸法空相，不生不滅，不垢不淨，不增不減。", "Heart Sutra", "BUDDHIST"),
    ("是故空中無色，無受想行識。", "Heart Sutra", "BUDDHIST"),
    ("無眼耳鼻舌身意，無色聲香味觸法。", "Heart Sutra", "BUDDHIST"),
    ("無眼界乃至無意識界。", "Heart Sutra", "BUDDHIST"),
    ("無無明亦無無明盡，乃至無老死亦無老死盡。", "Heart Sutra", "BUDDHIST"),
    ("方便為究竟。", "Lotus Sutra", "BUDDHIST"),
    ("諸法從本來，常自寂滅相。", "Lotus Sutra", "BUDDHIST"),
    ("一即一切，一切即一。", "Avatamsaka Sutra", "BUDDHIST"),
    ("事事無礙法界。", "Avatamsaka Sutra", "BUDDHIST"),
    ("理事無礙法界。", "Avatamsaka Sutra", "BUDDHIST"),
    ("塵塵剎剎，念念不住。", "Avatamsaka Sutra", "BUDDHIST"),
    ("一花一世界，一葉一如來。", "Avatamsaka Sutra", "BUDDHIST"),
    ("直指人心，見性成佛。", "Platform Sutra", "BUDDHIST"),
    ("不立文字，教外別傳。", "Platform Sutra", "BUDDHIST"),
    ("即心即佛，非心非佛。", "Platform Sutra", "BUDDHIST"),
    ("心淨則國土淨。", "Vimalakirti Sutra", "BUDDHIST"),
    ("一闡提人，亦有佛性。", "Nirvana Sutra", "BUDDHIST"),
    ("知幻即離，不作方便；離幻即覺，亦無漸次。", "Surangama Sutra", "BUDDHIST"),
    ("狂心頓歇，歇即菩提。", "Surangama Sutra", "BUDDHIST"),
    ("理可頓悟，事須漸修。", "Chan Buddhism", "BUDDHIST"),
    ("言語道斷，心行處滅。", "Chan Buddhism", "BUDDHIST"),
    ("擔水砍柴，無非妙道。", "Chan Buddhism", "BUDDHIST"),
    ("行住坐臥，皆是禪。", "Chan Buddhism", "BUDDHIST"),
    ("吃茶去。", "Zhaozhou", "BUDDHIST"),
    ("庭前柏樹子。", "Zhaozhou", "BUDDHIST"),
    # v10.9 original passages preserved
    ("勝者生怨，負者自鄙；去勝負心，無諍自安。", "Dhammapada 201", "BUDDHIST"),
    ("不以財物施，唯以法布施，法施勝財施。", "Dhammapada 354", "BUDDHIST"),
    ("心為法本，心尊心使，中心念惡，即言即行。", "Dhammapada 1", "BUDDHIST"),
    ("心為法本，心尊心使，中心念善，即言即行。", "Dhammapada 2", "BUDDHIST"),
    ("慳惜財物，守護勿失，後為無智。", "Dhammapada", "BUDDHIST"),
    ("愚人所思量，常不得安穩。", "Dhammapada", "BUDDHIST"),
    # Diamond Sutra (金剛經)
    ("若以色見我，以音聲求我，是人行邪道，不能見如來。", "Diamond Sutra 26", "BUDDHIST"),
    ("應無所住而生其心。", "Diamond Sutra 10", "BUDDHIST"),
    ("一切有為法，如夢幻泡影，如露亦如電，應作如是觀。", "Diamond Sutra 32", "BUDDHIST"),
    ("凡所有相，皆是虛妄。若見諸相非相，即見如來。", "Diamond Sutra 5", "BUDDHIST"),
    ("過去心不可得，現在心不可得，未來心不可得。", "Diamond Sutra 18", "BUDDHIST"),
    ("離一切諸相，則名諸佛。", "Diamond Sutra 14", "BUDDHIST"),
    ("若菩薩有我相、人相、眾生相、壽者相，即非菩薩。", "Diamond Sutra 3", "BUDDHIST"),
    ("應無所住，行於布施。", "Diamond Sutra 4", "BUDDHIST"),
    ("如來所說法，皆不可取、不可說，非法、非非法。", "Diamond Sutra 7", "BUDDHIST"),
    # Lotus Sutra (法華經)
    ("諸佛世尊唯以一大事因緣故，出現於世。", "Lotus Sutra 2", "BUDDHIST"),
    ("十方佛土中，唯有一乘法，無二亦無三。", "Lotus Sutra 2", "BUDDHIST"),
    ("是法平等，無有高下，是名阿耨多羅三藐三菩提。", "Lotus Sutra", "BUDDHIST"),
    ("唯佛與佛，乃能究盡諸法實相。", "Lotus Sutra", "BUDDHIST"),
    ("世間法住，世間法在。", "Lotus Sutra", "BUDDHIST"),
    # Heart Sutra (心經)
    ("色不異空，空不異色，色即是空，空即是色。", "Heart Sutra", "BUDDHIST"),
    ("無苦集滅道，無智亦無得，以無所得故。", "Heart Sutra", "BUDDHIST"),
    ("觀自在菩薩，行深般若波羅蜜多時，照見五蘊皆空，度一切苦厄。", "Heart Sutra", "BUDDHIST"),
    ("心無罣礙，無罣礙故，無有恐怖，遠離顛倒夢想，究竟涅槃。", "Heart Sutra", "BUDDHIST"),
    ("揭諦揭諦，波羅揭諦，波羅僧揭諦，菩提薩婆訶。", "Heart Sutra", "BUDDHIST"),
    # Brahma Net Sutra (梵網經)
    ("慈悲喜捨，名為四無量心。", "Brahma Net Sutra", "BUDDHIST"),
    ("不殺生，是菩薩波羅夷罪。", "Brahma Net Sutra 1", "BUDDHIST"),
    ("不偷盜，是菩薩波羅夷罪。", "Brahma Net Sutra 2", "BUDDHIST"),
    ("不邪淫，是菩薩波羅夷罪。", "Brahma Net Sutra 3", "BUDDHIST"),
    ("不妄語，是菩薩波羅夷罪。", "Brahma Net Sutra 4", "BUDDHIST"),
    ("不飲酒，是菩薩波羅夷罪。", "Brahma Net Sutra 5", "BUDDHIST"),
    ("若佛子，以慈心故，行放生業。", "Brahma Net Sutra 20", "BUDDHIST"),
    ("一切男子是我父，一切女人是我母。", "Brahma Net Sutra 9", "BUDDHIST"),
    ("孝順父母師僧三寶，孝順至道之法。", "Brahma Net Sutra", "BUDDHIST"),
    ("若佛子，常應發一切願。", "Brahma Net Sutra", "BUDDHIST"),
    # Nirvana Sutra (涅槃經) - note: the first entry below is cited to the Sutra of Golden Light
    ("殺生之罪，能令眾生墮三惡道。", "Sutra of Golden Light 4", "BUDDHIST"),
    ("一切眾生皆有佛性，悉能成佛。", "Nirvana Sutra", "BUDDHIST"),
    ("佛性者，即是一切眾生阿耨多羅三藐三菩提中道種子。", "Nirvana Sutra", "BUDDHIST"),
    ("如來常住，無有變易。", "Nirvana Sutra", "BUDDHIST"),
    ("涅槃之體，具有四德：常、樂、我、淨。", "Nirvana Sutra", "BUDDHIST"),
    # Vimalakirti Sutra (維摩詰經)
    ("菩薩病者，以大悲起。", "Vimalakirti Sutra 5", "BUDDHIST"),
    ("眾生病，是故我病。", "Vimalakirti Sutra 5", "BUDDHIST"),
    ("不住有為，不住無為，是菩薩行。", "Vimalakirti Sutra", "BUDDHIST"),
    ("直心是道場，無虛假故。", "Vimalakirti Sutra", "BUDDHIST"),
    ("入不二法門，默然無言。", "Vimalakirti Sutra", "BUDDHIST"),
    # Platform Sutra (六祖壇經)
    ("菩提本無樹，明鏡亦非臺，本來無一物，何處惹塵埃。", "Platform Sutra", "BUDDHIST"),
    ("何期自性，本自清淨；何期自性，本不生滅。", "Platform Sutra", "BUDDHIST"),
    ("不思善，不思惡，正與麼時，那個是明上座本來面目。", "Platform Sutra", "BUDDHIST"),
    ("迷時師度，悟時自度。", "Platform Sutra", "BUDDHIST"),
    ("佛法在世間，不離世間覺。", "Platform Sutra", "BUDDHIST"),
    ("見性成佛。", "Platform Sutra", "BUDDHIST"),
    ("本來無一物，何處惹塵埃。", "Platform Sutra", "BUDDHIST"),
    # Avatamsaka Sutra (華嚴經)
    ("一切眾生皆具如來智慧德相。", "Avatamsaka Sutra", "BUDDHIST"),
    ("心佛及眾生，是三無差別。", "Avatamsaka Sutra", "BUDDHIST"),
    ("若人欲了知，三世一切佛，應觀法界性，一切唯心造。", "Avatamsaka Sutra", "BUDDHIST"),
    ("不忘初心，方得始終。", "Avatamsaka Sutra", "BUDDHIST"),
    ("若有善男子，善女人，發阿耨多羅三藐三菩提心。", "Avatamsaka Sutra", "BUDDHIST"),
    # Amitabha Sutra (阿彌陀經)
    ("從是西方，過十萬億佛土，有世界名曰極樂。", "Amitabha Sutra", "BUDDHIST"),
    ("其國眾生，無有眾苦，但受諸樂，故名極樂。", "Amitabha Sutra", "BUDDHIST"),
    ("一心不亂，即得往生阿彌陀佛極樂國土。", "Amitabha Sutra", "BUDDHIST"),
    # Additional Buddhist texts
    ("三界唯心，萬法唯識。", "Yogacara", "BUDDHIST"),
    ("煩惱即菩提，生死即涅槃。", "Madhyamaka", "BUDDHIST"),
    ("眾生無邊誓願度，煩惱無盡誓願斷。", "Four Great Vows", "BUDDHIST"),
    ("法門無量誓願學，佛道無上誓願成。", "Four Great Vows", "BUDDHIST"),
    ("一切有情皆是我父母。", "Bodhisattva Vow", "BUDDHIST"),
    ("自利利他，自覺覺他。", "Bodhisattva Practice", "BUDDHIST"),
    ("無緣大慈，同體大悲。", "Bodhisattva Practice", "BUDDHIST"),
    ("應以何身得度者，即現何身而為說法。", "Guanyin", "BUDDHIST"),
    ("千手千眼，大悲救苦。", "Avalokitesvara", "BUDDHIST"),
    ("普度眾生，同登彼岸。", "Pure Land", "BUDDHIST"),
    ("持戒清淨，修行精進。", "Vinaya", "BUDDHIST"),
    ("信為道源功德母，長養一切諸善根。", "Avatamsaka Sutra", "BUDDHIST"),
    ("布施、持戒、忍辱、精進、禪定、智慧，是名六度。", "Prajnaparamita", "BUDDHIST"),
    ("修福不修慧，象身掛瓔珞；修慧不修福，羅漢托空缽。", "Folk Buddhist", "BUDDHIST"),
    ("深入經藏，智慧如海。", "Buddhist Teaching", "BUDDHIST"),
    ("苦海無邊，回頭是岸。", "Buddhist Teaching", "BUDDHIST"),
    ("放下屠刀，立地成佛。", "Buddhist Teaching", "BUDDHIST"),
    ("色即是空，空即是色。", "Heart Sutra", "BUDDHIST"),
    ("萬法皆空，因果不空。", "Buddhist Teaching", "BUDDHIST"),
    ("過去已過去，未來尚未來，現在因緣生。", "Buddhist Teaching", "BUDDHIST"),
]

# Legalist Chinese (法家) - Expanded v10.9
LEGALIST_CHINESE = [
    # Han Feizi (韓非子) - Core texts
    ("法不阿貴，繩不撓曲。", "Han Feizi 6", "LEGALIST"),
    ("刑過不避大臣，賞善不遺匹夫。", "Han Feizi 50", "LEGALIST"),
    ("以法為教，以吏為師。", "Han Feizi 49", "LEGALIST"),
    ("明主之國，無書簡之文，以法為教。", "Han Feizi 49", "LEGALIST"),
    ("法者，編著之圖籍，設之於官府，而布之於百姓者也。", "Han Feizi 38", "LEGALIST"),
    ("術者，藏之於胸中，以偶眾端，而潛御群臣者也。", "Han Feizi 38", "LEGALIST"),
    ("法莫如顯，而術不欲見。", "Han Feizi 38", "LEGALIST"),
    ("賞罰不信，則禁令不行。", "Han Feizi 46", "LEGALIST"),
    ("刑重則不敢以惡犯，罰輕則民不畏。", "Han Feizi 46", "LEGALIST"),
    ("夫嚴刑重罰者，民之所惡也；而國之所以治也。", "Han Feizi 49", "LEGALIST"),
    ("法之所加，智者弗能辭，勇者弗敢爭。", "Han Feizi 6", "LEGALIST"),
    ("一民之軌，莫如法。", "Han Feizi 6", "LEGALIST"),
    ("故明主使法擇人，不自舉也。", "Han Feizi 6", "LEGALIST"),
    ("使法量功，不自度也。", "Han Feizi 6", "LEGALIST"),
    ("人主之大物，非法則術也。", "Han Feizi 43", "LEGALIST"),
    ("法者，憲令著於官府，刑罰必於民心。", "Han Feizi 38", "LEGALIST"),
    ("賞莫如厚而信，使民利之。", "Han Feizi 27", "LEGALIST"),
    ("罰莫如重而必，使民畏之。", "Han Feizi 27", "LEGALIST"),
    ("明主之所導制其臣者，二柄而已矣。二柄者，刑德也。", "Han Feizi 7", "LEGALIST"),
    ("人臣太貴，必易主位。", "Han Feizi 8", "LEGALIST"),
    ("愛臣太親，必危主身。", "Han Feizi 8", "LEGALIST"),
    ("明君無為於上，群臣竦懼乎下。", "Han Feizi 5", "LEGALIST"),
    ("上下一日百戰。", "Han Feizi 8", "LEGALIST"),
    ("為人臣者，盡力以事其君，而不得擅作威福。", "Han Feizi 49", "LEGALIST"),
    ("群臣見素，則大君不蔽矣。", "Han Feizi 5", "LEGALIST"),
    ("事在四方，要在中央。聖人執要，四方來效。", "Han Feizi 5", "LEGALIST"),
    ("虛靜以待，令名自命也，令事自定也。", "Han Feizi 5", "LEGALIST"),
    # Shang Jun Shu (商君書) - Book of Lord Shang
    ("國之所以興者，農戰也。", "Shang Jun Shu 3", "LEGALIST"),
    ("民弱國強，民強國弱。故有道之國，務在弱民。", "Shang Jun Shu 20", "LEGALIST"),
    ("聖人之為國也，壹賞，壹刑，壹教。", "Shang Jun Shu 17", "LEGALIST"),
    ("治國者，貴分明而不可相舉。", "Shang Jun Shu 14", "LEGALIST"),
    ("行罰重其輕者，輕者不至，重者不來。", "Shang Jun Shu 17", "LEGALIST"),
    ("國皆以一為務，兵出而不戰，則國強。", "Shang Jun Shu 3", "LEGALIST"),
    ("治國能摶民力而壹民務者，強。", "Shang Jun Shu 4", "LEGALIST"),
    ("民之於利也，若水之於下也。", "Shang Jun Shu 5", "LEGALIST"),
    ("民本，法也。", "Shang Jun Shu 18", "LEGALIST"),
    ("刑生力，力生強，強生威，威生惠。", "Shang Jun Shu 17", "LEGALIST"),
    ("利出一孔者，其國無敵。", "Shang Jun Shu 5", "LEGALIST"),
    ("以刑去刑，國治。以刑致刑，國亂。", "Shang Jun Shu 17", "LEGALIST"),
    ("治則刑重，亂則刑輕。", "Shang Jun Shu 17", "LEGALIST"),
    ("刑用於將過，則大邪不生。", "Shang Jun Shu 17", "LEGALIST"),
    ("故以戰去戰，雖戰可也。以殺去殺，雖殺可也。", "Shang Jun Shu 18", "LEGALIST"),
    # Guanzi (管子) - Master Guan
    ("倉廩實則知禮節，衣食足則知榮辱。", "Guanzi 1", "LEGALIST"),
    ("禮義廉恥，國之四維；四維不張，國乃滅亡。", "Guanzi 1", "LEGALIST"),
    ("政之所興，在順民心；政之所廢，在逆民心。", "Guanzi 1", "LEGALIST"),
    ("授有德則國安，授無德則國危。", "Guanzi 5", "LEGALIST"),
    ("法者，天下之程式也，萬事之儀表也。", "Guanzi 26", "LEGALIST"),
    ("法者所以興功懼暴也。", "Guanzi 45", "LEGALIST"),
    ("令則行，禁則止，憲之所及，俗之所被。", "Guanzi 3", "LEGALIST"),
    ("士農工商，四民者，國之石民也。", "Guanzi", "LEGALIST"),
    ("民不足，令乃辱；民苦殆，令不行。", "Guanzi", "LEGALIST"),
    ("聖人之所以治國者，先利民心。", "Guanzi", "LEGALIST"),
    ("富國之法，上固其本，下便其事。", "Guanzi", "LEGALIST"),
    ("兵者，國之大事也，死生之地，存亡之道，不可不察也。", "Sunzi", "LEGALIST"),
    ("知彼知己，百戰不殆。", "Sunzi", "LEGALIST"),
    ("上兵伐謀，其次伐交，其次伐兵，其下攻城。", "Sunzi", "LEGALIST"),
    ("不戰而屈人之兵，善之善者也。", "Sunzi", "LEGALIST"),
    # Additional Legalist principles
    ("治國之道，必先正其身。", "Legalist Principle", "LEGALIST"),
    ("明法審令，賞罰必信。", "Legalist Principle", "LEGALIST"),
    ("無功不賞，無罪不罰。", "Legalist Principle", "LEGALIST"),
    ("明主愛其國，忠臣愛其君。", "Legalist Principle", "LEGALIST"),
    ("法令既布，不得私議。", "Legalist Principle", "LEGALIST"),
    ("奉法者強則國強，奉法者弱則國弱。", "Han Feizi", "LEGALIST"),
]

# Mohist Chinese (墨家) - Expanded v10.9
MOHIST_CHINESE = [
    # Universal Love (兼愛)
    ("兼相愛，交相利。", "Mozi 15", "MOHIST"),
    ("天下之人皆相愛，強不執弱，眾不劫寡，富不侮貧，貴不傲賤。", "Mozi 15", "MOHIST"),
    ("若使天下兼相愛，愛人若愛其身，猶有不孝者乎？", "Mozi 15", "MOHIST"),
    ("視人之國若視其國，視人之家若視其家，視人之身若視其身。", "Mozi 15", "MOHIST"),
    ("是故諸侯相愛則不野戰，家主相愛則不相篡。", "Mozi 15", "MOHIST"),
    ("人與人相愛則不相賊。", "Mozi 15", "MOHIST"),
    ("君臣相愛則惠忠，父子相愛則慈孝。", "Mozi 15", "MOHIST"),
    ("兄弟相愛則和調。", "Mozi 15", "MOHIST"),
    ("天下之所以亂者，生於不相愛。", "Mozi 14", "MOHIST"),
    ("臣子之不孝君父，所謂亂也。", "Mozi 14", "MOHIST"),
    ("子自愛不愛父，故虧父而自利。", "Mozi 14", "MOHIST"),
    ("弟自愛不愛兄，故虧兄而自利。", "Mozi 14", "MOHIST"),
    ("夫愛人者，人必從而愛之。", "Mozi 15", "MOHIST"),
    ("利人者，人必從而利之。", "Mozi 15", "MOHIST"),
    ("惡人者，人必從而惡之。", "Mozi 15", "MOHIST"),
    ("害人者，人必從而害之。", "Mozi 15", "MOHIST"),
    ("兼愛天下之人，猶愛其身也。", "Mozi 16", "MOHIST"),
    ("有天下者愛天下，無天下者愛其國。", "Mozi 15", "MOHIST"),
    # Non-aggression (非攻)
    ("殺一人謂之不義，必有一死罪矣。", "Mozi 17", "MOHIST"),
    ("今至大為攻國，則弗知非，從而譽之，謂之義。", "Mozi 17", "MOHIST"),
    ("非攻，墨子之道也。", "Mozi 17", "MOHIST"),
    ("攻國者，非也；殺人者，罪也。", "Mozi 17", "MOHIST"),
    ("今有人於此，少見黑曰黑，多見黑曰白，則以此人不知白黑之辯矣。", "Mozi 17", "MOHIST"),
    ("今小為非則知而非之，大為非攻國則不知非，從而譽之，謂之義。", "Mozi 17", "MOHIST"),
    ("殺一人，謂之不義；殺十人，十重不義；殺百人，百重不義。", "Mozi 17", "MOHIST"),
    ("今小為非則知而非之，大為攻國則不知非，從而譽之。", "Mozi 17", "MOHIST"),
    ("春則廢民耕稼樹藝，秋則廢民穫斂。", "Mozi 18", "MOHIST"),
    ("攻伐之害，內之則喪民，外之則喪兵。", "Mozi 18", "MOHIST"),
    # Utilitarianism & Anti-waste (節用)
    ("節用，墨子之教也。", "Mozi 20", "MOHIST"),
    ("天下之利，是為天下之義。", "Mozi 26", "MOHIST"),
    ("聖人以治天下為事者也，必知亂之所自起，焉能治之。", "Mozi 14", "MOHIST"),
    ("凡足以奉給民用則止，諸加費不加於民利者，聖王弗為。", "Mozi 20", "MOHIST"),
    ("其為衣裘何？以為冬以圉寒，夏以圉暑。", "Mozi 21", "MOHIST"),
    ("聖人作誨，男耕稼樹藝，以為民食。", "Mozi 20", "MOHIST"),
    ("古者聖王，制為節用之法。", "Mozi 20", "MOHIST"),
    ("凡天下群百工，輪車鞍皮，陶冶梓匠，使各從事其所能。", "Mozi 20", "MOHIST"),
    ("有能則舉之，無能則下之。", "Mozi 8", "MOHIST"),
    ("官無常貴而民無終賤。", "Mozi 8", "MOHIST"),
    # Anti-fatalism (非命)
    ("命者，暴王所作，窮人所述。", "Mozi 35", "MOHIST"),
    ("執有命者，是覆天下之義。", "Mozi 35", "MOHIST"),
    ("是故昔者禹、湯、文、武之為道也，不曰命之所福也。", "Mozi 35", "MOHIST"),
    ("執有命者不仁。", "Mozi 35", "MOHIST"),
    ("力者何？力盡而功成。", "Mozi 35", "MOHIST"),
    # Meritocracy (尚賢)
    ("尚賢者，政之本也。", "Mozi 8", "MOHIST"),
    ("賢者舉而上之，不肖者抑而廢之。", "Mozi 8", "MOHIST"),
    ("雖在農與工肆之人，有能則舉之。", "Mozi 8", "MOHIST"),
    ("高予之爵，重予之祿，任之以事，斷予之令。", "Mozi 8", "MOHIST"),
    ("爵位不高則民弗敬，蓄祿不厚則民不信，政令不斷則民不畏。", "Mozi 8", "MOHIST"),
    ("古者聖王之為政，列德而尚賢。", "Mozi 8", "MOHIST"),
    ("雖在農與工肆之人，有能則舉之。", "Mozi 9", "MOHIST"),
    # Heaven's Will (天志)
    ("天之意，不欲大國之攻小國也。", "Mozi 26", "MOHIST"),
    ("天之意，不欲強之劫弱也。", "Mozi 26", "MOHIST"),
    ("天之意，不欲詐之謀愚也。", "Mozi 26", "MOHIST"),
    ("順天意者，兼相愛，交相利，必得賞。", "Mozi 27", "MOHIST"),
    ("反天意者，別相惡，交相賊，必得罰。", "Mozi 27", "MOHIST"),
    ("天欲人相愛相利，而不欲人相惡相賊。", "Mozi 26", "MOHIST"),
    # Additional Mohist principles
    ("言無務為多，而務為智。", "Mozi 47", "MOHIST"),
    ("行無務為華，而務為實。", "Mozi 47", "MOHIST"),
    ("志不強者智不達，言不信者行不果。", "Mozi", "MOHIST"),
    ("義者，利也。", "Mozi 40", "MOHIST"),
    ("萬事莫貴於義。", "Mozi 47", "MOHIST"),
    ("入國而不存其士，則亡國矣。", "Mozi", "MOHIST"),
    ("染於蒼則蒼，染於黃則黃。", "Mozi 3", "MOHIST"),
    ("見侮不辱，見辱不怒。", "Mozi", "MOHIST"),
]

# Neo-Confucian Chinese (宋明理學)
NEO_CONFUCIAN_CHINESE = [
    ("存天理，滅人欲。", "Zhu Xi - Analects Commentary", "NEO_CONFUCIAN"),
    ("格物致知，誠意正心。", "Zhu Xi - Great Learning Commentary", "NEO_CONFUCIAN"),
    ("天理人欲，同行異情。", "Zhu Xi - Classified Conversations", "NEO_CONFUCIAN"),
    (
        "聖人千言萬語，只是教人明天理，滅人欲。",
        "Zhu Xi - Classified Conversations",
        "NEO_CONFUCIAN",
    ),
    ("敬者，聖學之所以成始而成終者也。", "Zhu Xi - Collected Writings", "NEO_CONFUCIAN"),
    ("窮理以致其知，反躬以踐其實。", "Zhu Xi - Collected Writings", "NEO_CONFUCIAN"),
    ("涵養須用敬，進學則在致知。", "Zhu Xi - Classified Conversations", "NEO_CONFUCIAN"),
    ("知行合一。", "Wang Yangming - Instructions for Practical Living", "NEO_CONFUCIAN"),
    ("致良知。", "Wang Yangming - Instructions for Practical Living", "NEO_CONFUCIAN"),
    ("無善無惡心之體，有善有惡意之動。", "Wang Yangming - Four Maxims", "NEO_CONFUCIAN"),
    ("知善知惡是良知，為善去惡是格物。", "Wang Yangming - Four Maxims", "NEO_CONFUCIAN"),
    ("心即理也。", "Wang Yangming - Instructions for Practical Living", "NEO_CONFUCIAN"),
    (
        "吾心之良知，即所謂天理也。",
        "Wang Yangming - Instructions for Practical Living",
        "NEO_CONFUCIAN",
    ),
    (
        "知是行之始，行是知之成。",
        "Wang Yangming - Instructions for Practical Living",
        "NEO_CONFUCIAN",
    ),
    ("知而不行，只是未知。", "Wang Yangming - Instructions for Practical Living", "NEO_CONFUCIAN"),
    ("破山中賊易，破心中賊難。", "Wang Yangming - Letters", "NEO_CONFUCIAN"),
    ("誠者，聖人之本。", "Zhou Dunyi - Tongshu", "NEO_CONFUCIAN"),
    ("誠，五常之本，百行之源也。", "Zhou Dunyi - Tongshu", "NEO_CONFUCIAN"),
    ("民吾同胞，物吾與也。", "Zhang Zai - Western Inscription", "NEO_CONFUCIAN"),
    (
        "為天地立心，為生民立命，為往聖繼絕學，為萬世開太平。",
        "Zhang Zai - Attributed",
        "NEO_CONFUCIAN",
    ),
]

# Islamic Legal Maxims (قواعد فقهية)
ISLAMIC_LEGAL_MAXIMS = [
    ("الأمور بمقاصدها", "Al-Qawa'id - Major 1", "FIQH"),
    ("اليقين لا يزول بالشك", "Al-Qawa'id - Major 2", "FIQH"),
    ("المشقة تجلب التيسير", "Al-Qawa'id - Major 3", "FIQH"),
    ("الضرر يزال", "Al-Qawa'id - Major 4", "FIQH"),
    ("العادة محكمة", "Al-Qawa'id - Major 5", "FIQH"),
    ("لا ضرر ولا ضرار", "Al-Qawa'id", "FIQH"),
    ("الضرر لا يزال بالضرر", "Al-Qawa'id", "FIQH"),
    ("الضرر الأشد يزال بالضرر الأخف", "Al-Qawa'id", "FIQH"),
    ("درء المفاسد أولى من جلب المصالح", "Al-Qawa'id", "FIQH"),
    ("يتحمل الضرر الخاص لدفع الضرر العام", "Al-Qawa'id", "FIQH"),
    ("إذا تعارضت مفسدتان روعي أعظمهما ضررا بارتكاب أخفهما", "Al-Qawa'id", "FIQH"),
    ("الأصل في الأشياء الإباحة", "Al-Qawa'id", "FIQH"),
    ("الأصل في العقود الصحة", "Al-Qawa'id", "FIQH"),
    ("الأصل بقاء ما كان على ما كان", "Al-Qawa'id", "FIQH"),
    ("ما حرم أخذه حرم إعطاؤه", "Al-Qawa'id", "FIQH"),
    ("ما حرم فعله حرم طلبه", "Al-Qawa'id", "FIQH"),
    ("الضرورات تبيح المحظورات", "Al-Qawa'id", "FIQH"),
    ("الضرورة تقدر بقدرها", "Al-Qawa'id", "FIQH"),
    ("ما أبيح للضرورة يقدر بقدرها", "Al-Qawa'id", "FIQH"),
    ("الحاجة تنزل منزلة الضرورة عامة كانت أو خاصة", "Al-Qawa'id", "FIQH"),
    ("إذا ضاق الأمر اتسع", "Al-Qawa'id", "FIQH"),
    ("الجواز الشرعي ينافي الضمان", "Al-Qawa'id", "FIQH"),
    ("المباشر ضامن وإن لم يتعمد", "Al-Qawa'id", "FIQH"),
    ("المتسبب لا يضمن إلا بالتعمد", "Al-Qawa'id", "FIQH"),
    ("إذا اجتمع المباشر والمتسبب يضاف الحكم إلى المباشر", "Al-Qawa'id", "FIQH"),
    ("الإذن العام كالإذن الخاص", "Al-Qawa'id", "FIQH"),
    ("لا عبرة بالدلالة في مقابلة التصريح", "Al-Qawa'id", "FIQH"),
    ("إعمال الكلام أولى من إهماله", "Al-Qawa'id", "FIQH"),
    ("الأصل في الكلام الحقيقة", "Al-Qawa'id", "FIQH"),
]

# Sufi Ethics (الأخلاق الصوفية)
SUFI_ETHICS = [
    ("التصوف كله أخلاق", "Al-Ghazali - Ihya", "SUFI"),
    ("من لم يؤثر فيه علم أخلاقه فقد غفل عن الفقه", "Al-Ghazali - Ihya", "SUFI"),
    ("الخلق الحسن جماع الدين كله", "Al-Ghazali - Ihya", "SUFI"),
    ("أصل الأخلاق المحمودة كلها أربعة: الحكمة والشجاعة والعفة والعدل", "Al-Ghazali - Ihya", "SUFI"),
    ("العلم بلا عمل جنون، والعمل بغير علم لا يكون", "Al-Ghazali - Ihya", "SUFI"),
    ("من عرف نفسه عرف ربه", "Al-Ghazali - Attributed", "SUFI"),
    ("قلب المؤمن بين أصبعين من أصابع الرحمن", "Al-Ghazali - Ihya", "SUFI"),
    ("التصوف هو الخلق، فمن زاد عليك في الخلق زاد عليك في التصوف", "Al-Junayd", "SUFI"),
    ("الصوفي من صفا قلبه لله", "Al-Junayd", "SUFI"),
    ("أفضل الأعمال مخالفة النفس والهوى", "Al-Junayd", "SUFI"),
    (
        "من لم يزن أفعاله وأحواله في كل وقت بالكتاب والسنة فلا تعده في ديوان الرجال",
        "Al-Qushayri - Risala",
        "SUFI",
    ),
    ("الصدق سيف الله في أرضه، ما وضع على شيء إلا قطعه", "Al-Qushayri - Risala", "SUFI"),
    ("ما خلقت الخلق إلا ليعرفوني", "Rumi - Attributed", "SUFI"),
    ("كن كالشمس للرحمة والشفقة، وكالليل في ستر عيوب الغير", "Rumi - Masnavi", "SUFI"),
    ("من عرف نفسه فقد عرف ربه", "Ibn Arabi - Fusus", "SUFI"),
    ("الإنسان الكامل مرآة الحق", "Ibn Arabi - Fusus", "SUFI"),
]

# Arabic Philosophy (الفلسفة العربية)
# Each entry is a (text, source attribution, tag) tuple.
# The cached-data merge further down in this file unpacks the third field as
# `_` and takes the time period ("FALSAFA") from its own corpus table, so on
# that path the tag stored here is not read.
ARABIC_PHILOSOPHY = [
    ("الإنسان مدني بالطبع", "Al-Farabi - Ara Ahl al-Madina", "FALSAFA"),
    ("السعادة هي الخير المطلوب لذاته", "Al-Farabi - Tahsil al-Sa'ada", "FALSAFA"),
    ("الفضيلة هي الحال التي بها يفعل الإنسان الأفعال الجميلة", "Al-Farabi - Fusul", "FALSAFA"),
    ("العقل العملي هو الذي يدبر البدن", "Ibn Sina - Shifa", "FALSAFA"),
    ("النفس جوهر روحاني", "Ibn Sina - Shifa", "FALSAFA"),
    ("العدل هو فضيلة من الفضائل العامة", "Ibn Rushd - Commentary on Republic", "FALSAFA"),
    ("الحكمة والشريعة أختان رضيعتان", "Ibn Rushd - Fasl al-Maqal", "FALSAFA"),
    ("الحق لا يضاد الحق بل يوافقه ويشهد له", "Ibn Rushd - Fasl al-Maqal", "FALSAFA"),
    # Opens with the same phrase as the first (Al-Farabi) entry, then extends it.
    ("الإنسان مدني بالطبع، أي لا بد له من الاجتماع", "Ibn Khaldun - Muqaddima", "FALSAFA"),
    ("العصبية هي الرابطة الاجتماعية", "Ibn Khaldun - Muqaddima", "FALSAFA"),
    ("الظلم مؤذن بخراب العمران", "Ibn Khaldun - Muqaddima", "FALSAFA"),
]

# Sanskrit Dharmashastra (धर्मशास्त्र) - EXPANDED v10.10 (~121 passages)
# Expanded from v10.9 to address corpus size issues.
# Each entry is a (text, source reference, period tag) tuple.
SANSKRIT_DHARMA = [
    # ===== MAHABHARATA - Shanti Parva (Book of Peace) =====
    ("अहिंसा परमो धर्मः", "Mahabharata 13.117.37", "DHARMA"),
    ("धर्म एव हतो हन्ति धर्मो रक्षति रक्षितः", "Mahabharata 8.69.58", "DHARMA"),
    ("सत्यं ब्रूयात् प्रियं ब्रूयात् न ब्रूयात् सत्यमप्रियम्", "Mahabharata 12.138.5", "DHARMA"),
    ("यतो धर्मस्ततो जयः", "Mahabharata", "DHARMA"),
    ("न हि धर्मादृते किंचित् सिद्ध्यति", "Mahabharata 12.110.10", "DHARMA"),
    ("धर्मेण हीनाः पशुभिः समानाः", "Mahabharata 12.294.40", "DHARMA"),
    ("अद्रोहः सर्वभूतेषु कर्मणा मनसा गिरा", "Mahabharata 12.162.7", "DHARMA"),
    ("आत्मवत् सर्वभूतेषु यः पश्यति स पश्यति", "Mahabharata 12.152.18", "DHARMA"),
    ("सर्वभूतहिते रताः", "Mahabharata 12.234.32", "DHARMA"),
    ("परोपकारः पुण्याय पापाय परपीडनम्", "Mahabharata 12.261.15", "DHARMA"),
    ("मातृवत् परदारेषु परद्रव्येषु लोष्ठवत्", "Mahabharata 12.268.12", "DHARMA"),
    ("आत्मनः प्रतिकूलानि परेषां न समाचरेत्", "Mahabharata 5.39.57", "DHARMA"),
    ("श्रेयान्स्वधर्मो विगुणः परधर्मात्स्वनुष्ठितात्", "Mahabharata 3.203.11", "DHARMA"),
    ("स्वधर्मे निधनं श्रेयः परधर्मो भयावहः", "Mahabharata 3.203.12", "DHARMA"),
    ("क्षमा धर्मः क्षमा यज्ञः क्षमा वेदाः क्षमा श्रुतम्", "Mahabharata 3.29.4", "DHARMA"),
    ("क्षमा बलमशक्तानां शक्तानां भूषणं क्षमा", "Mahabharata 5.33.52", "DHARMA"),
    ("दानं प्रियवाक्यं च अर्थिनामनुपालनम्", "Mahabharata 13.61.3", "DHARMA"),
    ("अकृत्वा परसन्तापमगत्वा खलमन्दिरम्", "Mahabharata 12.175.30", "DHARMA"),
    ("अनुद्वेगकरं वाक्यं सत्यं प्रियहितं च यत्", "Mahabharata 12.232.15", "DHARMA"),
    ("दया सर्वेषु भूतेषु तपस्तप्तं फलं महत्", "Mahabharata 12.261.18", "DHARMA"),
    ("अक्रोधेन जयेत् क्रोधमसाधुं साधुना जयेत्", "Mahabharata 5.39.69", "DHARMA"),
    ("जयेत् कदर्यं दानेन जयेत् सत्येन चानृतम्", "Mahabharata 5.39.70", "DHARMA"),
    ("स्वस्ति प्रजाभ्यः परिपालयन्ताम्", "Mahabharata 12.69.70", "DHARMA"),
    ("न्यायेन मार्गेण महीं महीशाः", "Mahabharata 12.69.71", "DHARMA"),
    # ===== MANUSMRITI - Laws of Manu (expanded) =====
    ("अहिंसा सत्यमस्तेयं शौचमिन्द्रियनिग्रहः", "Manusmriti 10.63", "DHARMA"),
    ("एतं दशविधं धर्मं विप्रः सम्यगधीत्य च", "Manusmriti 6.91", "DHARMA"),
    ("धृतिः क्षमा दमोऽस्तेयं शौचमिन्द्रियनिग्रहः", "Manusmriti 6.92", "DHARMA"),
    ("धीर्विद्या सत्यमक्रोधो दशकं धर्मलक्षणम्", "Manusmriti 6.92", "DHARMA"),
    ("सत्यं ब्रूयात् प्रियं ब्रूयात्", "Manusmriti 4.138", "DHARMA"),
    ("धर्मः सत्यं तपो दानं क्षान्तिर्लज्जा क्षमा दया", "Manusmriti 1.86", "DHARMA"),
    ("यो हिंसति निर्दोषं प्राणिनं तस्य हिंसनम्", "Manusmriti 4.162", "DHARMA"),
    ("वेदोऽखिलो धर्ममूलम्", "Manusmriti 2.6", "DHARMA"),
    ("यस्मिन् गृहे पूज्यन्ते स्त्रियः", "Manusmriti 3.56", "DHARMA"),
    ("रमन्ते तत्र देवताः", "Manusmriti 3.56", "DHARMA"),
    # ===== BHAGAVAD GITA (Complete Ethical Teachings) =====
    ("अहिंसा सत्यमक्रोधस्त्यागः शान्तिरपैशुनम्", "Bhagavad Gita 16.2", "GITA"),
    ("दया भूतेष्वलोलुप्त्वं मार्दवं ह्रीरचापलम्", "Bhagavad Gita 16.2", "GITA"),
    ("तेजः क्षमा धृतिः शौचमद्रोहो नातिमानिता", "Bhagavad Gita 16.3", "GITA"),
    ("कर्मण्येवाधिकारस्ते मा फलेषु कदाचन", "Bhagavad Gita 2.47", "GITA"),
    ("मा कर्मफलहेतुर्भूर्मा ते सङ्गोऽस्त्वकर्मणि", "Bhagavad Gita 2.47", "GITA"),
    ("योगस्थः कुरु कर्माणि सङ्गं त्यक्त्वा धनञ्जय", "Bhagavad Gita 2.48", "GITA"),
    ("सिद्ध्यसिद्ध्योः समो भूत्वा समत्वं योग उच्यते", "Bhagavad Gita 2.48", "GITA"),
    ("तस्माद्योगाय युज्यस्व योगः कर्मसु कौशलम्", "Bhagavad Gita 2.50", "GITA"),
    ("सुखदुःखे समे कृत्वा लाभालाभौ जयाजयौ", "Bhagavad Gita 2.38", "GITA"),
    ("त्रिविधं नरकस्येदं द्वारं नाशनमात्मनः", "Bhagavad Gita 16.21", "GITA"),
    ("कामः क्रोधस्तथा लोभस्तस्मादेतत्त्रयं त्यजेत्", "Bhagavad Gita 16.21", "GITA"),
    ("यद्यदाचरति श्रेष्ठस्तत्तदेवेतरो जनः", "Bhagavad Gita 3.21", "GITA"),
    ("सर्वभूतस्थमात्मानं सर्वभूतानि चात्मनि", "Bhagavad Gita 6.29", "GITA"),
    ("सर्वधर्मान्परित्यज्य मामेकं शरणं व्रज", "Bhagavad Gita 18.66", "GITA"),
    ("यदा यदा हि धर्मस्य ग्लानिर्भवति भारत", "Bhagavad Gita 4.7", "GITA"),
    ("परित्राणाय साधूनां विनाशाय च दुष्कृताम्", "Bhagavad Gita 4.8", "GITA"),
    ("धर्मसंस्थापनार्थाय सम्भवामि युगे युगे", "Bhagavad Gita 4.8", "GITA"),
    # ===== UPANISHADS (Ethical Teachings) =====
    ("असतो मा सद्गमय", "Brihadaranyaka 1.3.28", "UPANISHAD"),
    ("तमसो मा ज्योतिर्गमय", "Brihadaranyaka 1.3.28", "UPANISHAD"),
    ("मृत्योर्मामृतं गमय", "Brihadaranyaka 1.3.28", "UPANISHAD"),
    ("सर्वं खल्विदं ब्रह्म", "Chandogya 3.14.1", "UPANISHAD"),
    ("तत्त्वमसि", "Chandogya 6.8.7", "UPANISHAD"),
    ("अहं ब्रह्मास्मि", "Brihadaranyaka 1.4.10", "UPANISHAD"),
    ("ईशा वास्यमिदं सर्वं यत्किञ्च जगत्यां जगत्", "Isha 1", "UPANISHAD"),
    ("सत्यमेव जयते नानृतम्", "Mundaka 3.1.6", "UPANISHAD"),
    ("उत्तिष्ठत जाग्रत प्राप्य वरान्निबोधत", "Katha 1.3.14", "UPANISHAD"),
    ("सत्यं वद धर्मं चर", "Taittiriya 1.11.1", "UPANISHAD"),
    ("मातृदेवो भव", "Taittiriya 1.11.2", "UPANISHAD"),
    ("पितृदेवो भव", "Taittiriya 1.11.2", "UPANISHAD"),
    ("आचार्यदेवो भव", "Taittiriya 1.11.2", "UPANISHAD"),
    ("अतिथिदेवो भव", "Taittiriya 1.11.2", "UPANISHAD"),
    ("श्रद्धया देयम्", "Taittiriya 1.11.3", "UPANISHAD"),
    # ===== ARTHASHASTRA (Political Ethics) =====
    ("सुखस्य मूलं धर्मः", "Arthashastra 1.7", "ARTHA"),
    ("धर्मस्य मूलमर्थः", "Arthashastra 1.7", "ARTHA"),
    ("प्रजासुखे सुखं राज्ञः", "Arthashastra 1.19", "ARTHA"),
    ("प्रजानां च हिते हितम्", "Arthashastra 1.19", "ARTHA"),
    ("साम दान भेद दण्डाः", "Arthashastra 2.10", "ARTHA"),
    # ===== YOGA SUTRAS (Ethical Foundation) =====
    ("अहिंसासत्यास्तेयब्रह्मचर्यापरिग्रहा यमाः", "Yoga Sutras 2.30", "DHARMA"),
    ("शौच सन्तोष तपः स्वाध्यायेश्वरप्रणिधानानि नियमाः", "Yoga Sutras 2.32", "DHARMA"),
    ("मैत्रीकरुणामुदितोपेक्षाणां सुखदुःखपुण्यापुण्यविषयाणाम्", "Yoga Sutras 1.33", "DHARMA"),
    ("अहिंसाप्रतिष्ठायां तत्सन्निधौ वैरत्यागः", "Yoga Sutras 2.35", "DHARMA"),
    ("सत्यप्रतिष्ठायां क्रियाफलाश्रयत्वम्", "Yoga Sutras 2.36", "DHARMA"),
    # ===== ADDITIONAL DHARMA TEXTS =====
    ("धर्मो रक्षति रक्षितः", "Dharmasutra", "DHARMA"),
    ("वसुधैव कुटुम्बकम्", "Hitopadesha 1.3.71", "DHARMA"),
    ("परोपकाराय सतां विभूतयः", "Hitopadesha", "DHARMA"),
    ("रामो विग्रहवान् धर्मः", "Ramayana 2.109", "DHARMA"),
    ("जननी जन्मभूमिश्च स्वर्गादपि गरीयसी", "Ramayana", "DHARMA"),
    # v10.9 original passages preserved
    ("न हि प्रियं मे स्यात् आत्मनः प्रतिकूलं परेषाम्", "Mahabharata 5.15.17", "DHARMA"),
    ("धर्मः सत्यं च शौचं च दमः करुणा एव च", "Mahabharata 3.313", "DHARMA"),
    ("धर्मस्य तत्त्वं निहितं गुहायाम्", "Mahabharata 3.313", "DHARMA"),
    ("सर्वं परवशं दुःखं सर्वमात्मवशं सुखम्", "Mahabharata 12.17", "DHARMA"),
    ("अष्टादश पुराणेषु व्यासस्य वचनद्वयम् । परोपकारः पुण्याय पापाय परपीडनम्", "Mahabharata", "DHARMA"),
    ("न जातु कामान्न भयान्न लोभाद् धर्मं त्यजेज्जीवितस्यापि हेतोः", "Mahabharata 1.1", "DHARMA"),
    # Manusmriti - Laws of Manu
    ("सर्वभूतेषु चात्मानं सर्वभूतानि चात्मनि", "Manusmriti", "DHARMA"),
    ("धृतिः क्षमा दमोऽस्तेयं शौचमिन्द्रियनिग्रहः । धीर्विद्या सत्यमक्रोधो दशकं धर्मलक्षणम्", "Manusmriti 6.92", "DHARMA"),
    ("मातृवत्परदारेषु परद्रव्येषु लोष्ट्रवत्", "Manusmriti 4.134", "DHARMA"),
    ("आत्मवत्सर्वभूतेषु यः पश्यति स पण्डितः", "Manusmriti", "DHARMA"),
    ("पितृदेवातिथिपूजा सर्वत्र सर्वदा समा", "Manusmriti 3.74", "DHARMA"),
    ("सत्येन पूयते साक्षी धर्मेण पूयते द्विजः", "Manusmriti 8.108", "DHARMA"),
    ("वाङ्मनः कर्मभिः साधोः सदा प्रीणाति यो द्विजान्", "Manusmriti 2.234", "DHARMA"),
    # Upanishads
    ("मातृदेवो भव। पितृदेवो भव। आचार्यदेवो भव। अतिथिदेवो भव।", "Taittiriya Upanishad 1.11", "UPANISHAD"),
    ("ईशावास्यमिदं सर्वं यत्किञ्च जगत्यां जगत्", "Isha Upanishad 1", "UPANISHAD"),
    ("तेन त्यक्तेन भुञ्जीथा मा गृधः कस्यस्विद्धनम्", "Isha Upanishad 1", "UPANISHAD"),
    ("असतो मा सद्गमय। तमसो मा ज्योतिर्गमय। मृत्योर्मामृतं गमय", "Brihadaranyaka 1.3.28", "UPANISHAD"),
    ("अयमात्मा ब्रह्म", "Mandukya 2", "UPANISHAD"),
    ("प्रज्ञानं ब्रह्म", "Aitareya 3.3", "UPANISHAD"),
    # Bhagavad Gita - Complete chapter 2 and key verses
    ("योगः कर्मसु कौशलम्", "Bhagavad Gita 2.50", "GITA"),
    ("समत्वं योग उच्यते", "Bhagavad Gita 2.48", "GITA"),
    ("अद्वेष्टा सर्वभूतानां मैत्रः करुण एव च", "Bhagavad Gita 12.13", "GITA"),
    ("निर्ममो निरहंकारः समदुःखसुखः क्षमी", "Bhagavad Gita 12.13", "GITA"),
    ("नैनं छिन्दन्ति शस्त्राणि नैनं दहति पावकः", "Bhagavad Gita 2.23", "GITA"),
    ("वासांसि जीर्णानि यथा विहाय नवानि गृह्णाति नरोऽपराणि", "Bhagavad Gita 2.22", "GITA"),
    ("त्रिविधं नरकस्येदं द्वारं नाशनमात्मनः । कामः क्रोधस्तथा लोभः", "Bhagavad Gita 16.21", "GITA"),
    ("दैवी सम्पद्विमोक्षाय निबन्धायासुरी मता", "Bhagavad Gita 16.5", "GITA"),
    ("अभयं सत्त्वसंशुद्धिर्ज्ञानयोगव्यवस्थितिः", "Bhagavad Gita 16.1", "GITA"),
    ("दानं दमश्च यज्ञश्च स्वाध्यायस्तप आर्जवम्", "Bhagavad Gita 16.1", "GITA"),
    # Arthashastra - Political ethics
    ("प्रजासुखे सुखं राज्ञः प्रजानां च हिते हितम्", "Arthashastra 1.19", "ARTHA"),
    ("राज्ञो हि व्रतं कार्याणां चेष्टा राष्ट्रसंग्रहः", "Arthashastra", "ARTHA"),
    ("नातिक्रामेदर्थं यः स राज्ञां राजा भवेत्", "Arthashastra 1.15", "ARTHA"),
    ("धर्मार्थौ यत्र विरुद्धौ तत्र धर्मः प्रधानः", "Arthashastra", "ARTHA"),
    ("सुखस्य मूलं धर्मः धर्मस्य मूलमर्थः", "Arthashastra 1.7", "ARTHA"),
    # Dharmasutras
    ("आचाराल्लभते ह्यायुः", "Gautama Dharmasutra", "DHARMA"),
    # Yoga Sutras - Ethical foundation
    ("शौचसंतोषतपःस्वाध्यायेश्वरप्रणिधानानि नियमाः", "Yoga Sutras 2.32", "DHARMA"),
    ("मैत्रीकरुणामुदितोपेक्षणां सुखदुःखपुण्यापुण्यविषयाणां भावनातश्चित्तप्रसादनम्", "Yoga Sutras 1.33", "DHARMA"),
    # Panchatantra - Practical wisdom
    ("मित्रं प्राप्तं यतितव्यं भवता सर्वयत्नतः", "Panchatantra", "DHARMA"),
    ("अर्थागमो नित्यमरोगिता च प्रिया च भार्या प्रियवादिनी च", "Chanakya", "DHARMA"),
    # Ramayana moral teachings
    ("सत्यं ब्रूहि प्रियं ब्रूहि न ब्रूहि सत्यमप्रियम्", "Ramayana", "DHARMA"),
]

# Pali Canon Ethics - EXPANDED v10.10 (~90 passages)
# Expanded from v10.9 to address corpus size issues.
# Each entry is a (text, source reference, period tag) tuple.
PALI_ETHICS = [
    # ===== METTA SUTTA - Loving-kindness (Complete) =====
    ("Sabbe sattā bhavantu sukhitattā", "Metta Sutta", "PALI"),
    ("Mettañca sabbalokasmiṃ mānasaṃ bhāvaye aparimāṇaṃ", "Metta Sutta", "PALI"),
    ("Uddhaṃ adho ca tiriyañca asambādhaṃ averaṃ asapattaṃ", "Metta Sutta", "PALI"),
    ("Sukhino vā khemino hontu sabbe sattā bhavantu sukhitattā", "Metta Sutta", "PALI"),
    ("Na paro paraṃ nikubbetha nātimaññetha katthaci naṃ kañci", "Metta Sutta", "PALI"),
    ("Byāpajjhaṃ paṭighasaññā na kvaci janayaṃ", "Metta Sutta", "PALI"),
    # ===== DHAMMAPADA - Complete Ethical Verses =====
    ("Dhammo have rakkhati dhammacāriṃ", "Theragatha 303", "PALI"),
    ("Sabba pāpassa akaraṇaṃ, kusalassa upasampadā", "Dhammapada 183", "PALI"),
    ("Sacittapariyodapanaṃ etaṃ buddhānasāsanaṃ", "Dhammapada 183", "PALI"),
    ("Manopubbaṅgamā dhammā manoseṭṭhā manomayā", "Dhammapada 1", "PALI"),
    ("Manasā ce paduṭṭhena bhāsati vā karoti vā", "Dhammapada 1", "PALI"),
    ("Tato naṃ dukkhamanveti cakkaṃva vahato padaṃ", "Dhammapada 1", "PALI"),
    ("Manasā ce pasannena bhāsati vā karoti vā", "Dhammapada 2", "PALI"),
    ("Tato naṃ sukhamanveti chāyāva anapāyinī", "Dhammapada 2", "PALI"),
    ("Akkocchi maṃ avadhi maṃ ajini maṃ ahāsi me", "Dhammapada 3", "PALI"),
    ("Ye ca taṃ upanayhanti veraṃ tesaṃ na sammati", "Dhammapada 3", "PALI"),
    ("Ye ca taṃ nupanayhanti veraṃ tesūpasammati", "Dhammapada 4", "PALI"),
    ("Na hi verena verāni sammantīdha kudācanaṃ", "Dhammapada 5", "PALI"),
    ("Averena ca sammanti esa dhammo sanantano", "Dhammapada 5", "PALI"),
    ("Pare ca na vijānanti mayamettha yamāmase", "Dhammapada 6", "PALI"),
    ("Ye ca tattha vijānanti tato sammanti medhagā", "Dhammapada 6", "PALI"),
    ("Appamādo amatapadaṃ pamādo maccuno padaṃ", "Dhammapada 21", "PALI"),
    ("Appamattā na mīyanti ye pamattā yathā matā", "Dhammapada 21", "PALI"),
    ("Appamādena maghavā devānaṃ seṭṭhataṃ gato", "Dhammapada 30", "PALI"),
    ("Appamādaṃ pasaṃsanti pamādo garahito sadā", "Dhammapada 30", "PALI"),
    ("Phandanaṃ capalaṃ cittaṃ durakkhaṃ dunnivārayaṃ", "Dhammapada 33", "PALI"),
    ("Ujuṃ karoti medhāvī usukārova tejanaṃ", "Dhammapada 33", "PALI"),
    ("Kumbhūpamaṃ kāyamimaṃ viditvā", "Dhammapada 40", "PALI"),
    ("Nagarūpamaṃ cittamidaṃ ṭhapetvā", "Dhammapada 40", "PALI"),
    ("Aciraṃ vatayaṃ kāyo pathaviyaṃ adhisessati", "Dhammapada 41", "PALI"),
    ("Chuddho apetaviññāṇo niratthaṃva kaliṅgaraṃ", "Dhammapada 41", "PALI"),
    # ===== VINAYA - Monastic Precepts =====
    ("Pāṇātipātā veramaṇī sikkhāpadaṃ samādiyāmi", "Vinaya", "PALI"),
    ("Adinnādānā veramaṇī sikkhāpadaṃ samādiyāmi", "Vinaya", "PALI"),
    ("Kāmesumicchācārā veramaṇī sikkhāpadaṃ samādiyāmi", "Vinaya", "PALI"),
    ("Musāvādā veramaṇī sikkhāpadaṃ samādiyāmi", "Vinaya", "PALI"),
    ("Surāmerayamajjapamādaṭṭhānā veramaṇī sikkhāpadaṃ samādiyāmi", "Vinaya", "PALI"),
    ("Vikālabhojanā veramaṇī sikkhāpadaṃ samādiyāmi", "Vinaya", "PALI"),
    ("Caratha bhikkhave cārikaṃ bahujanahitāya bahujanasukhāya", "Vinaya Mahavagga", "PALI"),
    # ===== SUTTA NIPATA - Discourse Verses =====
    ("Akkodhassa kuto kodho dantassa samajīvino", "Sutta Nipata 623", "PALI"),
    ("Yassa sabbaṃ ahorattaṃ ahiṃsāya rato mano", "Sutta Nipata", "PALI"),
    ("Sabbaso nāmarūpasmiṃ yassa natthi mamāyitaṃ", "Sutta Nipata 950", "PALI"),
    ("Asatañca natthīti na socati", "Sutta Nipata 951", "PALI"),
    # ===== SIGALOVADA SUTTA - Lay Ethics =====
    ("Chahi disāhi namasseyya", "Sigalovada Sutta", "PALI"),
    ("Mātāpitaro pācīnā disā", "Sigalovada Sutta", "PALI"),
    ("Ācariyā dakkhiṇā disā", "Sigalovada Sutta", "PALI"),
    ("Mittāmaccā uttarā disā", "Sigalovada Sutta", "PALI"),
    ("Dāsakammakarā heṭṭhimā disā", "Sigalovada Sutta", "PALI"),
    ("Samaṇabrāhmaṇā uparimā disā", "Sigalovada Sutta", "PALI"),
    # ===== MANGALA SUTTA - Blessings =====
    ("Mātāpitu upaṭṭhānaṃ puttadārassa saṅgaho", "Mangala Sutta", "PALI"),
    ("Dānañca dhammacariyā ca ñātakānañca saṅgaho", "Mangala Sutta", "PALI"),
    ("Anavajjāni kammāni etaṃ maṅgalamuttamaṃ", "Mangala Sutta", "PALI"),
    ("Āratī viratī pāpā majjapānā ca saṃyamo", "Mangala Sutta", "PALI"),
    ("Appamādo ca dhammesu etaṃ maṅgalamuttamaṃ", "Mangala Sutta", "PALI"),
    ("Gāravo ca nivāto ca santuṭṭhī ca kataññutā", "Mangala Sutta", "PALI"),
    ("Kālena dhammassavanaṃ etaṃ maṅgalamuttamaṃ", "Mangala Sutta", "PALI"),
    # ===== KARANIYA METTA SUTTA =====
    ("Karaṇīyamātthakusalena yaṃ taṃ santaṃ padaṃ abhisamecca", "Karaniya Metta Sutta", "PALI"),
    ("Sakko ujū ca suhujū ca suvaco cassa mudu anatimānī", "Karaniya Metta Sutta", "PALI"),
    ("Santussako ca subharo ca appakicco ca sallahukavutti", "Karaniya Metta Sutta", "PALI"),
    ("Santindriyo ca nipako ca appagabbho kulesu ananugiddho", "Karaniya Metta Sutta", "PALI"),
    # ===== ADDITIONAL PALI CANON (v10.10 expansion) =====
    ("Attā hi attano nātho ko hi nātho paro siyā", "Dhammapada 160", "PALI"),
    ("Attanā hi sudantena nāthaṃ labhati dullabhaṃ", "Dhammapada 160", "PALI"),
    ("Attanā va kataṃ pāpaṃ attanā saṃkilissati", "Dhammapada 165", "PALI"),
    ("Attanā akataṃ pāpaṃ attanā va visujjhati", "Dhammapada 165", "PALI"),
    ("Suddhi asuddhi paccattaṃ nāñño aññaṃ visodhaye", "Dhammapada 165", "PALI"),
    ("Sabbadānaṃ dhammadānaṃ jināti", "Jataka", "PALI"),
    ("Sabbapītiṃ dhammarati jināti", "Dhammapada 354", "PALI"),
    ("Sabbaratiṃ taṇhakkhayo jināti", "Dhammapada 354", "PALI"),
    ("Cattārimāni bhikkhave brahmavihārāni", "Anguttara Nikaya", "PALI"),
    ("Dānena piyavācāya atthacārena yamhi", "Anguttara Nikaya", "PALI"),
    # v10.9 original passages preserved
    ("Yo ca vassasataṃ jīve dussīlo asamāhito", "Dhammapada 110", "PALI"),
    ("Ekāhaṃ jīvitaṃ seyyo sīlavantassa jhāyino", "Dhammapada 110", "PALI"),
    ("Attadatthaṃ paratthena bahunāpi na hāpaye", "Dhammapada 166", "PALI"),
    ("Dīghā jāgarato ratti dīghaṃ santassa yojanaṃ", "Dhammapada 60", "PALI"),
    ("Kāyena saṃvaro sādhu sādhu vācāya saṃvaro", "Dhammapada 361", "PALI"),
    ("Manasā saṃvaro sādhu sādhu sabbattha saṃvaro", "Dhammapada 361", "PALI"),
    ("Sabbattha saṃvuto bhikkhu sabbadukkhā pamuccati", "Dhammapada 361", "PALI"),
    ("Yo ca mettaṃ bhāvayati appamāṇaṃ satīmā", "Itivuttaka 27", "PALI"),
    ("Sukhakāmāni bhūtāni yo daṇḍena na hiṃsati", "Dhammapada 131", "PALI"),
    ("Attano sukhamesāno pecca so labhate sukhaṃ", "Dhammapada 131", "PALI"),
    ("Na paresaṃ vilomāni na paresaṃ katākataṃ", "Dhammapada 50", "PALI"),
    ("Attano va avekkheyya katāni akatāni ca", "Dhammapada 50", "PALI"),
    ("Kodhassa na kuto mūlaṃ kalahassa ayaṃ bhave", "Sutta Nipata", "PALI"),
    ("Pūjaṃ paṭhabhiṃ pūjitvā te sameti sukhāvaho", "Sigalovada Sutta", "PALI"),
    # Vinaya - Monastic precepts
    ("Jātarūparajatapaṭiggahaṇā veramaṇī sikkhāpadaṃ samādiyāmi", "Vinaya", "PALI"),
    # Sutta Nipata - Discourse verses
    ("Sammāvimuttaṃ na vimuttasaddhaṃ", "Sutta Nipata", "PALI"),
    # Sigalovada Sutta - Lay ethics
    # Mangala Sutta - Blessings
    # Karaniya Metta Sutta - Practice of loving-kindness
    # Jataka moral lessons
    ("Ahaṃ khīṇāsavo bhikkhu satimā sampajāno", "Jataka", "PALI"),
    ("Na taṃ kammaṃ kataṃ sādhu yaṃ katvā anutappati", "Dhammapada 67", "PALI"),
    ("Taṃ ca kammaṃ kataṃ sādhu yaṃ katvā nānutappati", "Dhammapada 68", "PALI"),
    ("Attanā hi kataṃ pāpaṃ attanā saṃkilissati", "Dhammapada 165", "PALI"),
    # Anguttara Nikaya - Gradual teachings
    ("Sabbe sattā āhāraṭṭhitikā", "Anguttara Nikaya", "PALI"),
]

if SKIP_PROCESSING:
    # Cached-data path: announce it, then summarise what the cache contains.
    print("=" * 60)
    print("USING CACHED DATA - Run with REFRESH_DATA_FROM_SOURCE=True to use v10.4 loaders")
    print("=" * 60)

    # Tally cached passages per language; the file holds one JSON object per line.
    by_lang = defaultdict(int)
    with open("data/processed/passages.jsonl", "r", encoding="utf-8") as f:
        for p in map(json.loads, f):
            by_lang[p["language"]] += 1

    # Largest corpora first; ties keep their first-seen order (stable sort).
    print("\nPassages by language:")
    ranked_langs = sorted(by_lang.items(), key=lambda kv: kv[1], reverse=True)
    for lang, cnt in ranked_langs:
        print(f"  {lang}: {cnt:,}")

    n_passages = sum(by_lang.values())
    print(f"\nTotal: {n_passages:,} passages")

    # ===== CHECK FOR v10.9 CORPORA =====
    # The cache counts as "full v10.9" only when it holds enough Sanskrit and
    # Pali passages AND at least one bond tagged with a v10.9-specific period.
    # Presence alone is not enough: the passage counts are checked as well.
    sanskrit_count = by_lang.get("sanskrit", 0)
    pali_count = by_lang.get("pali", 0)

    # Scan the bonds file for any v10.9-specific period, stopping at the first hit.
    # This is best-effort: an unreadable or malformed bonds file is reported and
    # treated as "periods missing" rather than aborting the run.
    has_v109_periods = False
    try:
        with open("data/processed/bonds.jsonl", "r", encoding="utf-8") as f:
            for line in f:
                b = json.loads(line)
                period = b.get("time_period", "")
                if period in ("BUDDHIST", "LEGALIST", "MOHIST", "FIQH", "SUFI", "FALSAFA"):
                    has_v109_periods = True
                    break
    except (OSError, ValueError, AttributeError) as e:
        # OSError: file missing/unreadable. ValueError: bad JSON or bad encoding.
        # AttributeError: a line that parsed to something other than an object.
        print(f"  Could not scan bonds.jsonl for v10.9 periods ({type(e).__name__}: {e})")

    # v10.9 requires: Sanskrit >= 70, Pali >= 70, and v10.9 periods present
    has_full_v109 = sanskrit_count >= 70 and pali_count >= 70 and has_v109_periods

    print(f"\nv10.9 corpus check:")
    print(f"  Sanskrit: {sanskrit_count} (need >= 70)")
    print(f"  Pali: {pali_count} (need >= 70)")
    print(f"  v10.9 periods: {'present' if has_v109_periods else 'missing'}")
    print(f"  Full v10.9: {'YES' if has_full_v109 else 'NO - will add corpora'}")

    if not has_full_v109:
        # Merge the hard-coded corpora into the cached passages/bonds files.
        #
        # The cache may already contain part of these corpora (for example an
        # older, smaller Sanskrit set). Passages are therefore de-duplicated on
        # (language, text), and a default bond is only created for a hard-coded
        # passage that has no bond yet, so re-running never duplicates data.
        print("\n" + "=" * 60)
        print("ADDING v10.9 CORPORA TO CACHED DATA")
        print("=" * 60)
        print("(Sanskrit, Pali, Buddhist Chinese, Legalist, Fiqh, Sufi, etc.)")

        # Load existing passages
        all_passages = []
        with open("data/processed/passages.jsonl", "r", encoding="utf-8") as f:
            for line in f:
                all_passages.append(json.loads(line))
        print(f"Loaded {len(all_passages):,} existing passages")

        # Load existing bonds. A missing bonds file is a valid way to reach this
        # branch (no v10.9 periods found), so start from an empty list then.
        existing_bonds = []
        if os.path.exists("data/processed/bonds.jsonl"):
            with open("data/processed/bonds.jsonl", "r", encoding="utf-8") as f:
                for line in f:
                    existing_bonds.append(json.loads(line))
        else:
            print("No existing bonds.jsonl found - starting with an empty bond list")
        print(f"Loaded {len(existing_bonds):,} existing bonds")

        v109_start = len(all_passages)

        # Index what the cache already holds so nothing is added twice.
        passage_id_by_key = {}
        for p in all_passages:
            if isinstance(p, dict):
                key = (p.get("language", p.get("lang")), p.get("text"))
                passage_id_by_key.setdefault(key, p.get("id"))
        # assumes bond records reference their passage via "passage_id", as the
        # bonds written below do - TODO confirm for bonds from other loaders
        bonded_ids = {b.get("passage_id") for b in existing_bonds if isinstance(b, dict)}

        # (corpus, language, period override, id slug). A period override of
        # None means "use each entry's own period tag" (Sanskrit and Pali).
        corpus_specs = [
            # Chinese philosophical traditions
            (BUDDHIST_CHINESE, "classical_chinese", "BUDDHIST", "buddhist_chinese"),
            (LEGALIST_CHINESE, "classical_chinese", "LEGALIST", "legalist_chinese"),
            (MOHIST_CHINESE, "classical_chinese", "MOHIST", "mohist_chinese"),
            (NEO_CONFUCIAN_CHINESE, "classical_chinese", "NEO_CONFUCIAN", "neo-confucian"),
            # Arabic/Islamic traditions
            (ISLAMIC_LEGAL_MAXIMS, "arabic", "FIQH", "islamic_legal_maxims"),
            (SUFI_ETHICS, "arabic", "SUFI", "sufi_ethics"),
            (ARABIC_PHILOSOPHY, "arabic", "FALSAFA", "arabic_philosophy"),
            # Sanskrit and Pali
            (SANSKRIT_DHARMA, "sanskrit", None, "sanskrit"),
            (PALI_ETHICS, "pali", None, "pali"),
        ]

        # Pass 1: append passages that are not in the cache yet and remember the
        # passage id of every hard-coded entry (new or pre-existing).
        corpus_entries = []
        skipped_existing = 0
        for corpus, language, period_override, slug in corpus_specs:
            for text_content, source_ref, period_tag in corpus:
                period = period_tag if period_override is None else period_override
                key = (language, text_content)
                if key in passage_id_by_key:
                    passage_id = passage_id_by_key[key]
                    skipped_existing += 1
                else:
                    passage_id = f"v109_{slug}_{len(all_passages)}"
                    all_passages.append(
                        {
                            "id": passage_id,
                            "text": text_content,
                            "language": language,
                            "source": source_ref,
                            "time_period": period,
                        }
                    )
                    passage_id_by_key[key] = passage_id
                    # Count what was really added instead of overwriting totals.
                    by_lang[language] += 1
                corpus_entries.append((passage_id, language, period, source_ref, text_content))

        v109_count = len(all_passages) - v109_start
        print(f"Added {v109_count} v10.9 passages")
        if skipped_existing:
            print(f"Skipped {skipped_existing} passages already present in the cache")

        # Pass 2: simple bond extraction for hard-coded corpora (all are
        # prescriptive); only passages without an existing bond get one.
        print("Extracting bonds for v10.9 passages...")
        new_bonds = []
        for passage_id, language, period, source_ref, text_content in corpus_entries:
            if passage_id is None or passage_id in bonded_ids:
                continue
            bonded_ids.add(passage_id)
            new_bonds.append(
                {
                    "passage_id": passage_id,
                    "bond_type": "AUTHORITY",  # Default, will be refined by patterns
                    "language": language,
                    "time_period": period,
                    "source": source_ref,
                    "text": text_content[:500],
                    "context": "prescriptive",
                    "confidence": "high",
                }
            )

        all_bonds = existing_bonds + new_bonds
        print(f"Total bonds: {len(all_bonds):,}")

        # Save updated passages
        with open("data/processed/passages.jsonl", "w", encoding="utf-8") as f:
            for p in all_passages:
                f.write(json.dumps(p, ensure_ascii=False) + "\n")

        # Save updated bonds
        with open("data/processed/bonds.jsonl", "w", encoding="utf-8") as f:
            for b in all_bonds:
                f.write(json.dumps(b, ensure_ascii=False) + "\n")

        # Update Drive cache (best-effort: a failed copy is reported, not fatal)
        if USE_DRIVE_DATA:
            try:
                shutil.copy("data/processed/passages.jsonl", f"{SAVE_DIR}/passages.jsonl")
                shutil.copy("data/processed/bonds.jsonl", f"{SAVE_DIR}/bonds.jsonl")
                print(f"Updated Drive cache with v10.9 corpora")
            except Exception as e:
                print(f"Drive update failed: {e}")

        # Force splits regeneration since we added new data
        # Delete existing splits so Cell 5 regenerates them
        for splits_path in ["data/splits/all_splits.json", f"{SAVE_DIR}/all_splits.json"]:
            try:
                if os.path.exists(splits_path):
                    os.remove(splits_path)
                    print(f"  Removed old splits: {splits_path}")
            except OSError as e:
                # Stale splits would silently exclude the new data, so say so.
                print(f"  Could not remove old splits {splits_path}: {e}")
        print("  Splits will be regenerated in Cell 5 to include v10.9 data")

        # by_lang was updated while appending; refresh the overall total.
        n_passages = len(all_passages)

        print(f"\nUpdated corpus sizes:")
        for lang, cnt in sorted(by_lang.items(), key=lambda x: -x[1]):
            print(f"  {lang}: {cnt:,}")
    else:
        print("\nv10.9 corpora already present and complete")

    # Compare each language's passage count with its configured minimum and
    # queue every shortfall that has an augmentation dataset available.
    print("\nCorpus adequacy check:")
    languages_to_augment = []
    for lang in MIN_CORPUS_SIZE:
        required = MIN_CORPUS_SIZE[lang]
        have = by_lang.get(lang, 0)
        if have >= required:
            verdict = "OK"
        else:
            verdict = "NEED MORE"
        print(f"  {lang}: {have:,} / {required:,} - {verdict}")
        # Languages below the minimum but without a dataset are only reported.
        if have < required and lang in AUGMENTATION_DATASETS:
            languages_to_augment.append((lang, required - have))

    # Augment any under-represented languages that have available datasets
    if languages_to_augment:
        print(f"\n" + "=" * 60)
        print(f"AUGMENTING UNDER-REPRESENTED CORPORA")
        print(f"=" * 60)
        print(f"Languages to augment: {[l for l, _ in languages_to_augment]}")

        # Load existing passages
        all_passages = []
        with open("data/processed/passages.jsonl", "r", encoding="utf-8") as f:
            for line in f:
                all_passages.append(json.loads(line))

        # Normalize field names
        for p in all_passages:
            if "lang" not in p and "language" in p:
                p["lang"] = p["language"]
            if "period" not in p and "time_period" in p:
                p["period"] = p["time_period"]

        print(f"Loaded {len(all_passages):,} existing passages")

        from datasets import load_dataset

        for lang, needed in languages_to_augment:
            lang_count = by_lang.get(lang, 0)
            print(f"\n--- Augmenting {lang} (need {needed:,} more) ---")

            for dataset_name, short_name in AUGMENTATION_DATASETS.get(lang, []):
                if lang_count >= MIN_CORPUS_SIZE[lang]:
                    break

                print(f"  Loading {short_name}...")
                try:
                    if dataset_name == "hendrycks/ethics":
                        # ETHICS has multiple categories
                        categories = [
                            "commonsense",
                            "deontology",
                            "justice",
                            "utilitarianism",
                            "virtue",
                        ]
                        for cat in categories:
                            if lang_count >= MIN_CORPUS_SIZE[lang]:
                                break
                            try:
                                ds = load_dataset(
                                    dataset_name, cat, split="train", trust_remote_code=True
                                )
                                cat_count = 0
                                for item in ds:
                                    if lang_count >= MIN_CORPUS_SIZE[lang]:
                                        break
                                    if cat == "commonsense":
                                        text = item.get("input", "")
                                    elif cat == "justice":
                                        text = item.get("scenario", "")
                                    elif cat == "deontology":
                                        text = (
                                            item.get("scenario", "") + " " + item.get("excuse", "")
                                        )
                                    elif cat == "virtue":
                                        text = item.get("scenario", "")
                                    else:
                                        text = (
                                            str(item.get("baseline", ""))
                                            + " vs "
                                            + str(item.get("less_pleasant", ""))
                                        )

                                    if text and len(text) > 30:
                                        all_passages.append(
                                            {
                                                "id": f"ethics_{cat}_{len(all_passages)}",
                                                "text": text[:1000],
                                                "lang": lang,
                                                "language": lang,
                                                "source": f"ETHICS_{cat}",
                                                "period": "MODERN",
                                                "time_period": "MODERN",
                                            }
                                        )
                                        lang_count += 1
                                        cat_count += 1
                                print(f"    {cat}: +{cat_count:,}")
                            except Exception as e:
                                print(f"    {cat} error: {e}")

                    elif dataset_name == "allenai/social_chem_101":
                        ds = load_dataset(dataset_name, split="train", trust_remote_code=True)
                        sc_count = 0
                        for item in ds:
                            if lang_count >= MIN_CORPUS_SIZE[lang]:
                                break
                            action = item.get("action", "")
                            situation = item.get("situation", "")
                            rot = item.get("rot", "")

                            if rot and len(rot) > 20:
                                text = f"{situation} {action}".strip() if situation else action
                                text = f"{text}. {rot}" if text else rot

                                all_passages.append(
                                    {
                                        "id": f"socialchem_{len(all_passages)}",
                                        "text": text[:1000],
                                        "lang": lang,
                                        "language": lang,
                                        "source": "Social_Chemistry_101",
                                        "period": "MODERN",
                                        "time_period": "MODERN",
                                    }
                                )
                                lang_count += 1
                                sc_count += 1
                        print(f"    Social Chemistry: +{sc_count:,}")

                    else:
                        # Generic HuggingFace dataset
                        try:
                            ds = load_dataset(dataset_name, split="train", trust_remote_code=True)
                            gen_count = 0
                            for item in ds:
                                if lang_count >= MIN_CORPUS_SIZE[lang]:
                                    break
                                text = item.get("text", "") or item.get("content", "") or str(item)
                                if text and len(text) > 50:
                                    all_passages.append(
                                        {
                                            "id": f"{short_name.lower()}_{len(all_passages)}",
                                            "text": text[:1000],
                                            "lang": lang,
                                            "language": lang,
                                            "source": short_name,
                                            "period": "MODERN",
                                            "time_period": "MODERN",
                                        }
                                    )
                                    lang_count += 1
                                    gen_count += 1
                            print(f"    {short_name}: +{gen_count:,}")
                        except Exception as e:
                            print(f"    {short_name} failed: {e}")

                except Exception as e:
                    print(f"    {short_name} failed: {e}")

            by_lang[lang] = lang_count
            print(f"  {lang} now: {lang_count:,}")

        # Label passages from the newly added sources with a bond type chosen
        # by simple substring matching on the lower-cased text.
        print("\nExtracting bonds for new passages...")
        new_sources = {
            "ETHICS_commonsense",
            "ETHICS_deontology",
            "ETHICS_justice",
            "ETHICS_utilitarianism",
            "ETHICS_virtue",
            "Social_Chemistry_101",
        }
        # Text before the first underscore of each source name; a passage whose
        # source starts with any of these is treated as new.
        source_prefixes = tuple({name.split("_")[0] for name in new_sources})
        # (bond type, trigger substrings) checked top to bottom; first hit wins.
        keyword_rules = (
            ("PROHIBITION", ("wrong", "bad", "shouldn't", "immoral", "rude", "unethical")),
            ("OBLIGATION", ("should", "must", "duty", "obligat", "need to")),
            ("PERMISSION", ("okay", "fine", "acceptable", "can", "may", "allowed")),
        )

        new_bonds = []
        for p in tqdm(all_passages, desc="Processing"):
            src = p.get("source", "")
            if not (src.startswith(source_prefixes) or src in new_sources):
                continue

            lowered = p["text"].lower()
            bond_type = next(
                (label for label, words in keyword_rules if any(w in lowered for w in words)),
                "NEUTRAL",
            )

            bond = {
                "passage_id": p["id"],
                "bond_type": bond_type,
                "language": p.get("language", p.get("lang")),
                "time_period": p.get("time_period", p.get("period", "MODERN")),
                "source": src,
                "text": p["text"][:500],
                "context": "prescriptive",
                "confidence": "high",
            }
            new_bonds.append(bond)

        # Read the bonds already on disk (one JSON object per line) and append
        # the newly extracted ones.
        existing_bonds = []
        with open("data/processed/bonds.jsonl", "r", encoding="utf-8") as f:
            existing_bonds.extend(json.loads(line) for line in f)

        all_bonds = existing_bonds + new_bonds
        print(f"Total bonds: {len(all_bonds):,} ({len(new_bonds):,} new)")

        # Rewrite the passages file using the five-field output schema; the
        # short "lang"/"period" keys are accepted as fallbacks.
        with open("data/processed/passages.jsonl", "w", encoding="utf-8") as f:
            for p in all_passages:
                p_out = {
                    "id": p["id"],
                    "text": p["text"],
                    "language": p.get("language", p.get("lang", "english")),
                    "source": p.get("source", ""),
                    "time_period": p.get("time_period", p.get("period", "MODERN")),
                }
                f.write(f"{json.dumps(p_out, ensure_ascii=False)}\n")

        # Rewrite the merged bonds file.
        with open("data/processed/bonds.jsonl", "w", encoding="utf-8") as f:
            f.writelines(json.dumps(b, ensure_ascii=False) + "\n" for b in all_bonds)

        print("Saved augmented data")

        # Mirror both files into the Drive cache; a failure here is reported
        # but does not abort the run.
        if USE_DRIVE_DATA:
            try:
                for cache_name in ("passages.jsonl", "bonds.jsonl"):
                    shutil.copy(f"data/processed/{cache_name}", f"{SAVE_DIR}/{cache_name}")
                print(f"Updated Drive cache: {SAVE_DIR}")
            except Exception as e:
                print(f"Drive update failed: {e}")

        # Per-language totals, largest first, flagged against MIN_CORPUS_SIZE.
        print("\nFinal corpus sizes:")
        for lang, cnt in sorted(by_lang.items(), key=lambda kv: kv[1], reverse=True):
            target = MIN_CORPUS_SIZE.get(lang, 0)
            status = "LOW" if cnt < target else "OK"
            print(f"  {lang}: {cnt:,} ({status})")
        n_passages = len(all_passages)

else:
    print("=" * 60)
    print("LOADING CORPORA")
    print(f"GPU Tier: {GPU_TIER}")
    print(f"Max per language: {MAX_PER_LANG:,}")
    print("=" * 60)

    random.seed(42)
    all_passages = []

    # ===== PARALLEL PREFETCH MANAGER =====
    from concurrent.futures import ThreadPoolExecutor, Future
    import threading

    print("Starting parallel prefetch of remote corpora...")
    prefetch_executor = ThreadPoolExecutor(max_workers=12)
    prefetch_results = {}  # url -> Future

    def prefetch_url(url, timeout=60):
        """Fetch ``url`` and return the response body as text.

        Submitted to ``prefetch_executor`` for background downloads and also
        used as the direct-fetch fallback in ``get_prefetched``.

        Args:
            url: Address to download.
            timeout: Value passed as ``timeout`` to ``requests.get``.

        Returns:
            The response text when the status code is 200, otherwise ``None``.
            The function never raises: any exception and any non-200 status is
            reported with ``print`` and mapped to ``None``.
        """
        try:
            resp = requests.get(url, timeout=timeout)
            if resp.status_code == 200:
                return resp.text
            # Previously a non-200 response vanished without a trace.
            print(f"    Prefetch got HTTP {resp.status_code} for {url[:50]}...")
        except Exception as e:
            print(f"    Prefetch failed for {url[:50]}...: {e}")
        return None

    # Build the list of remote downloads to start in the background.
    # Gutenberg - Western Classics (ebook IDs, same order as before)
    _prefetch_gutenberg_ids = (
        1497,  # Republic
        1656,  # Apology
        1657,  # Crito
        1658,  # Phaedo
        3794,  # Gorgias
        1636,  # Symposium
        1726,  # Meno
        8438,  # Nicomachean Ethics
        6762,  # Politics
        2680,  # Meditations
        10661,  # Enchiridion
        3042,  # Discourses
        14988,  # De Officiis
    )
    # MIT Classics fallback
    _prefetch_mit_urls = (
        "https://classics.mit.edu/Aristotle/nicomachaen.mb.txt",
        "https://classics.mit.edu/Plato/laws.mb.txt",
    )
    # Bible Parallel Corpus
    _prefetch_bible_names = ("English", "Hebrew", "Arabic", "Chinese")

    PREFETCH_URLS = (
        [
            f"https://www.gutenberg.org/cache/epub/{book_id}/pg{book_id}.txt"
            for book_id in _prefetch_gutenberg_ids
        ]
        + list(_prefetch_mit_urls)
        + [
            f"https://raw.githubusercontent.com/christos-c/bible-corpus/master/bibles/{name}.xml"
            for name in _prefetch_bible_names
        ]
    )

    # One future per URL, keyed by the URL so get_prefetched can look it up.
    for url in PREFETCH_URLS:
        prefetch_results[url] = prefetch_executor.submit(prefetch_url, url)

    print(f"  Queued {len(PREFETCH_URLS)} URLs for background download")

    def get_prefetched(url, timeout=30):
        """Return the text for ``url``, preferring the background prefetch.

        Args:
            url: Address whose content is wanted.
            timeout: Seconds to wait for the queued future before giving up
                on it.

        Returns:
            The prefetched result when a future for ``url`` exists and
            finishes within ``timeout`` (this may be ``None`` if the prefetch
            itself failed). Otherwise the result of a direct
            ``prefetch_url(url)`` call, which is text or ``None``.
        """
        pending = prefetch_results.get(url)
        if pending is not None:
            try:
                return pending.result(timeout=timeout)
            except Exception as e:
                # The exception text can be empty (e.g. on a timeout), so report
                # the exception type; the direct fetch below still runs.
                print(
                    f"    Prefetch unavailable for {url[:50]}... "
                    f"({type(e).__name__}); fetching directly"
                )
        # Fallback to direct fetch
        return prefetch_url(url)

    # ===== SEFARIA (Hebrew/Aramaic) =====
    # Walks up to 5000 JSON files of a local Sefaria export and adds at most
    # 50 strings per file, capped at MAX_PER_LANG per language.
    print("\nLoading Sefaria...")
    sefaria_path = Path("data/raw/Sefaria-Export/json")

    # Directory name inside the export -> period label stored on the passage.
    CATEGORY_TO_PERIOD = {
        "Tanakh": "BIBLICAL",
        "Torah": "BIBLICAL",
        "Prophets": "BIBLICAL",
        "Writings": "BIBLICAL",
        "Mishnah": "TANNAITIC",
        "Tosefta": "TANNAITIC",
        "Sifra": "TANNAITIC",
        "Sifrei": "TANNAITIC",
        "Talmud": "TALMUDIC",
        "Bavli": "TALMUDIC",
        "Yerushalmi": "TALMUDIC",
        "Midrash": "MIDRASHIC",
        "Midrash Rabbah": "MIDRASHIC",
        "Midrash Aggadah": "MIDRASHIC",
        "Halakhah": "MEDIEVAL",
        "Shulchan Arukh": "MEDIEVAL",
        "Mishneh Torah": "MEDIEVAL",
        "Musar": "MODERN",
        "Chasidut": "MODERN",
        "Modern": "MODERN",
    }

    lang_counts = {"hebrew": 0, "aramaic": 0}

    def extract_texts(obj, texts):
        """Append every string longer than 20 chars in nested lists to ``texts``."""
        if isinstance(obj, str) and len(obj) > 20:
            texts.append(obj)
        elif isinstance(obj, list):
            for item in obj:
                extract_texts(item, texts)

    if sefaria_path.exists():
        skipped_files = 0
        for json_file in tqdm(list(sefaria_path.rglob("*.json"))[:5000], desc="Sefaria"):
            # Nothing more can be added once both languages hit the cap.
            if all(n >= MAX_PER_LANG for n in lang_counts.values()):
                break
            try:
                with open(json_file, "r", encoding="utf-8") as f:
                    data = json.load(f)

                if isinstance(data, dict) and "text" in data:
                    # Determine period from the first recognised path component;
                    # .parts avoids depending on "/" as the path separator.
                    path_parts = json_file.relative_to(sefaria_path).parts
                    period = "CLASSICAL"
                    for part in path_parts:
                        if part in CATEGORY_TO_PERIOD:
                            period = CATEGORY_TO_PERIOD[part]
                            break

                    # Determine language (heuristic: Talmud is primarily Aramaic)
                    is_talmud = any(t in str(json_file) for t in ["Talmud", "Bavli", "Yerushalmi"])
                    lang = "aramaic" if is_talmud else "hebrew"

                    texts = []
                    extract_texts(data["text"], texts)

                    for txt in texts[:50]:  # Limit per file
                        if lang_counts[lang] < MAX_PER_LANG:
                            all_passages.append(
                                {
                                    "id": f"sefaria_{len(all_passages)}",
                                    "text": txt,
                                    "lang": lang,
                                    "source": json_file.stem,
                                    "period": period,
                                }
                            )
                            lang_counts[lang] += 1

            except Exception:
                # Best effort: one bad file must not stop the scan, but the
                # number of skipped files is reported below.
                skipped_files += 1
                continue
        if skipped_files:
            print(f"  Skipped {skipped_files:,} unreadable Sefaria files")
    else:
        print("  Sefaria not found - will download")

    print(f"  Hebrew: {lang_counts['hebrew']:,}, Aramaic: {lang_counts['aramaic']:,}")

    # ===== CLASSICAL CHINESE: Disabled (CText API blocks Colab) =====
    print("  Skipping CText API (blocked from Colab, using Wenyanwen instead)")
    chinese_count = 0  # Initialize counter

    # ===== KAGGLE: Ancient Chinese Wenyanwen (132K texts, 552M chars) =====
    if chinese_count < MAX_PER_LANG:
        print("  Loading from Kaggle Wenyanwen dataset...")
        wenyan_zip_name = "Ancient_Chinese_Text_(wenyanwen)_archive.zip"
        wenyan_csv_name = "cn_wenyan.csv"
        # Drive copies are only considered when both Drive globals exist and
        # Drive caching is enabled.
        _drive_ok = "USE_DRIVE_DATA" in dir() and USE_DRIVE_DATA and "SAVE_DIR" in dir()
        wenyan_local_zip = Path(f"data/raw/{wenyan_zip_name}")
        wenyan_local_csv = Path(f"data/raw/{wenyan_csv_name}")
        if _drive_ok:
            wenyan_drive_zip = Path(f"{SAVE_DIR}/{wenyan_zip_name}")
            wenyan_drive_csv = Path(f"{SAVE_DIR}/{wenyan_csv_name}")
        else:
            wenyan_drive_zip = wenyan_drive_csv = None

        # Prefer an already-extracted CSV: local copy first, then Drive.
        csv_path = None
        for csv_candidate, csv_where in (
            (wenyan_local_csv, "locally"),
            (wenyan_drive_csv, "in Drive"),
        ):
            if csv_candidate and csv_candidate.exists():
                csv_path = csv_candidate
                print(f"    Found CSV {csv_where}")
                break

        if csv_path is None:
            # No CSV anywhere: look for the archive in the same order and
            # extract the CSV into data/raw/.
            zip_path = None
            for zip_candidate, zip_where in (
                (wenyan_local_zip, "locally"),
                (wenyan_drive_zip, "in Drive"),
            ):
                if zip_candidate and zip_candidate.exists():
                    zip_path = zip_candidate
                    print(f"    Found zip {zip_where}")
                    break

            if zip_path:
                try:
                    import zipfile

                    print("    Extracting CSV from zip...")
                    with zipfile.ZipFile(zip_path, "r") as z:
                        z.extract(wenyan_csv_name, "data/raw/")
                    csv_path = wenyan_local_csv
                    print("    Extracted!")
                except Exception as e:
                    print(f"    Extraction failed: {e}")

        # Load texts from CSV: read the "text" and "title" columns of each row,
        # cut the text into passages and append them to all_passages until
        # MAX_PER_LANG Classical Chinese passages have been collected.
        wenyan_count = 0  # passages contributed by this dataset only
        if csv_path and csv_path.exists():
            import csv

            csv.field_size_limit(10000000)  # Some texts are very long
            try:
                with open(csv_path, "r", encoding="utf-8", errors="ignore") as f:
                    reader = csv.DictReader(f)
                    for row in reader:
                        if chinese_count >= MAX_PER_LANG:
                            break
                        text = row.get("text", "")
                        title = row.get("title", "")
                        # Split the text into passages on newlines. Non-empty
                        # lines are concatenated (no separator) while the buffer
                        # stays under 1500 chars; a line that would reach 1500
                        # flushes the buffer and starts a new one. A single line
                        # of 1500+ chars is kept whole, so passages are not
                        # strictly capped in length.
                        paragraphs = text.split("\n")
                        current_para = ""
                        for para in paragraphs:
                            para = para.strip()
                            if not para:
                                continue
                            if len(current_para) + len(para) < 1500:
                                current_para += para
                            else:
                                # Flush: buffers of 50 chars or fewer are dropped
                                # rather than stored.
                                if len(current_para) > 50:
                                    all_passages.append(
                                        {
                                            "id": f"wenyan_{len(all_passages)}",
                                            "text": current_para,
                                            "lang": "classical_chinese",
                                            # Keep only the part of the title
                                            # before the first "/".
                                            "source": (
                                                title.split("/")[0] if "/" in title else title
                                            ),
                                            "period": "CONFUCIAN",
                                        }
                                    )
                                    chinese_count += 1
                                    wenyan_count += 1
                                    # Leaves only the paragraph loop; the cap is
                                    # re-checked below and at the top of the row
                                    # loop.
                                    if chinese_count >= MAX_PER_LANG:
                                        break
                                current_para = para
                        # Don't forget last paragraph. The cap check also stops
                        # the buffer that was just stored before the break above
                        # from being added a second time.
                        if current_para and len(current_para) > 50 and chinese_count < MAX_PER_LANG:
                            all_passages.append(
                                {
                                    "id": f"wenyan_{len(all_passages)}",
                                    "text": current_para,
                                    "lang": "classical_chinese",
                                    "source": title.split("/")[0] if "/" in title else title,
                                    "period": "CONFUCIAN",
                                }
                            )
                            chinese_count += 1
                            wenyan_count += 1
                print(f"    Added {wenyan_count:,} passages from Wenyanwen")
            except Exception as e:
                # Any error aborts the remaining rows; passages appended so far
                # stay in all_passages.
                print(f"    Error loading Wenyanwen: {e}")

    print(f"  Total Classical Chinese: {chinese_count:,}")

    # ===== ARABIC/ISLAMIC (Kaggle quran-nlp) =====
    # Source priority: local/Kaggle quran-nlp CSVs; if that directory is
    # unavailable, the Tanzil plain-text Quran; and only if fewer than 100
    # passages were obtained, a small hardcoded set of verses.
    print("\nLoading Arabic from Kaggle quran-nlp...")

    arabic_count = 0
    kaggle_path = Path("data/raw/quran-nlp")

    # Try to download from Kaggle (in Refresh all OR Update missing mode)
    if not kaggle_path.exists() and not CACHE_ONLY:
        try:
            import subprocess
            import zipfile

            subprocess.run(["pip", "install", "-q", "kaggle"], check=True)
            subprocess.run(
                [
                    "kaggle",
                    "datasets",
                    "download",
                    "-d",
                    "alizahidraja/quran-nlp",
                    "-p",
                    "data/raw",
                ],
                check=True,
                timeout=300,
            )

            with zipfile.ZipFile("data/raw/quran-nlp.zip", "r") as z:
                z.extractall(kaggle_path)
            print("  Downloaded from Kaggle!")
        except Exception as e:
            print(f"  Kaggle download failed: {e}")

    # Load if available
    if kaggle_path.exists():
        import pandas as pd

        # (glob pattern, preferred text column, id prefix, source, period).
        # When the preferred column is absent, "text" and then "Arabic" are
        # tried.
        arabic_csv_sources = [
            ("*quran*.csv", "arabic", "quran", "Quran", "QURANIC"),
            ("*hadith*.csv", "hadith", "hadith", "Hadith", "HADITH"),
        ]
        for pattern, text_col, id_prefix, source_name, period_name in arabic_csv_sources:
            for csv_file in list(kaggle_path.rglob(pattern)):
                if arabic_count >= MAX_PER_LANG:
                    break
                try:
                    # nrows bounds the rows read to the remaining quota.
                    df = pd.read_csv(csv_file, nrows=MAX_PER_LANG - arabic_count)
                    for _, row in df.iterrows():
                        text = str(row.get(text_col, row.get("text", row.get("Arabic", ""))))
                        # str() turns missing values into "nan"; skip those.
                        if text and len(text) > 10 and text != "nan":
                            all_passages.append(
                                {
                                    "id": f"{id_prefix}_{len(all_passages)}",
                                    "text": text,
                                    "lang": "arabic",
                                    "source": source_name,
                                    "period": period_name,
                                }
                            )
                            arabic_count += 1
                except Exception as e:
                    # Was a bare `except:`; keep the best-effort skip but say
                    # which file was dropped and why.
                    print(f"  Skipping {csv_file.name}: {e}")
                    continue
    else:
        # Try Tanzil.net (simple direct download)
        print("  Trying Tanzil.net for Quran text...")
        try:
            tanzil_url = "https://tanzil.net/pub/download/index.php?quranType=uthmani&outType=txt-2&agree=true"
            resp = requests.get(tanzil_url, timeout=60)
            if resp.status_code == 200:
                lines = resp.text.strip().split("\n")
                for line in lines:
                    if "|" in line and arabic_count < MAX_PER_LANG:
                        # The verse text is taken from the third
                        # pipe-separated field.
                        parts = line.split("|")
                        if len(parts) >= 3:
                            text = parts[2].strip()
                            if len(text) > 10:
                                all_passages.append(
                                    {
                                        "id": f"tanzil_{len(all_passages)}",
                                        "text": text,
                                        "lang": "arabic",
                                        "source": "Quran (Tanzil)",
                                        "period": "QURANIC",
                                    }
                                )
                                arabic_count += 1
                print(f"    Loaded {arabic_count} verses from Tanzil")
        except Exception as e:
            print(f"    Tanzil failed: {e}")

        # Final fallback: expanded hardcoded corpus
        ARABIC_CORPUS = [
            # Quran excerpts (moral/ethical content)
            "وَلَا تَقْتُلُوا النَّفْسَ الَّتِي حَرَّمَ اللَّهُ إِلَّا بِالْحَقِّ",
            "وَبِالْوَالِدَيْنِ إِحْسَانًا",
            "وَأَوْفُوا بِالْعَهْدِ إِنَّ الْعَهْدَ كَانَ مَسْئُولًا",
            "إِنَّ اللَّهَ يَأْمُرُ بِالْعَدْلِ وَالْإِحْسَانِ",
            "وَلَا تَبْخَسُوا النَّاسَ أَشْيَاءَهُمْ",
            "وَأَقِيمُوا الْوَزْنَ بِالْقِسْطِ وَلَا تُخْسِرُوا الْمِيزَانَ",
            "يَا أَيُّهَا الَّذِينَ آمَنُوا أَوْفُوا بِالْعُقُودِ",
            "وَتَعَاوَنُوا عَلَى الْبِرِّ وَالتَّقْوَى",
            # ... more can be added
        ]
        # Only fall back when the sources above produced fewer than 100
        # passages. The append loop used to sit outside this check, so these
        # verses were added even after a successful Tanzil load.
        if arabic_count < 100:
            print("  Using expanded hardcoded Arabic corpus...")
            for txt in ARABIC_CORPUS:
                all_passages.append(
                    {
                        "id": f"arabic_{len(all_passages)}",
                        "text": txt,
                        "lang": "arabic",
                        "source": "Quran/Hadith",
                        "period": "QURANIC",
                    }
                )
                arabic_count += 1

    print(f"  Arabic: {arabic_count:,}")

    # ===== DEAR ABBY (English) =====
    # Looks for dear_abby.csv locally, then in the Drive cache, then (unless
    # CACHE_ONLY) downloads it from Kaggle; each letter longer than 50 chars
    # becomes one English passage.
    print("Loading Dear Abby...")

    english_count = 0
    abby_path = Path("data/raw/dear_abby.csv")
    print(f"  Local path exists: {abby_path.exists()}")

    # Check Drive first
    drive_abby = f"{SAVE_DIR}/dear_abby.csv"
    print(f"  Drive path: {drive_abby}")
    print(f"  Drive path exists: {os.path.exists(drive_abby)}")
    if not abby_path.exists() and os.path.exists(drive_abby):
        os.makedirs("data/raw", exist_ok=True)
        shutil.copy(drive_abby, abby_path)
        print("  Copied from Drive")

    if not abby_path.exists() and not CACHE_ONLY:
        try:
            import subprocess

            subprocess.run(["pip", "install", "-q", "kaggle"], check=True)
            subprocess.run(
                [
                    "kaggle",
                    "datasets",
                    "download",
                    "-d",
                    "thedevastator/20000-dear-abby-questions",
                    "-p",
                    "data/raw",
                    "-f",
                    "dear_abby.csv",
                ],
                check=True,
                timeout=120,
            )
            print("  Downloaded from Kaggle!")
        except Exception as e:
            print(f"  Kaggle download failed: {e}")

    if abby_path.exists():
        import pandas as pd

        df = pd.read_csv(abby_path, nrows=MAX_PER_LANG)
        print(f"  CSV columns: {list(df.columns)}")
        print(f"  CSV rows: {len(df)}")
        for _, row in df.iterrows():
            # The passage text comes from the "question_only" column. The old
            # code called it `answer` and also read an unused "question" value.
            letter_text = str(row.get("question_only", ""))
            # Missing values stringify to "nan" and fail the length test too.
            if len(letter_text) > 50:
                all_passages.append(
                    {
                        "id": f"abby_{len(all_passages)}",
                        "text": letter_text,
                        "lang": "english",
                        "source": "Dear Abby",
                        "period": "DEAR_ABBY",
                    }
                )
                english_count += 1
    else:
        print("  Dear Abby not found")

    print(f"  Dear Abby: {english_count:,}")

    # ===== WESTERN CLASSICS (Greek/Roman Philosophy) =====
    print("\nLoading Western Classics (parallel download)...")

    from concurrent.futures import ThreadPoolExecutor, as_completed

    # Project Gutenberg texts (reliable, plain text)
    # Each entry is (gutenberg_id, title, author); fetch_gutenberg unpacks the
    # tuple in that order and builds the download URL from the id.
    GUTENBERG_TEXTS = [
        # Plato - Ethics & Political Philosophy
        (1497, "Republic", "Plato"),
        (1656, "Apology", "Plato"),
        (1657, "Crito", "Plato"),
        (1658, "Phaedo", "Plato"),
        (3794, "Gorgias", "Plato"),
        (1636, "Symposium", "Plato"),
        (1726, "Meno", "Plato"),
        # Aristotle
        (8438, "Nicomachean Ethics", "Aristotle"),
        (6762, "Politics", "Aristotle"),
        # Stoics
        (2680, "Meditations", "Marcus Aurelius"),
        (10661, "Enchiridion", "Epictetus"),
        (3042, "Discourses", "Epictetus"),
        # Cicero
        (14988, "De Officiis", "Cicero"),
    ]

    # MIT Classics fallback
    # Each entry is (url, title, author), the order fetch_mit unpacks. Titles
    # are matched against those already loaded from Gutenberg, so only texts
    # still missing are fetched from here. "Laws" has no Gutenberg entry above.
    MIT_TEXTS = [
        (
            "https://classics.mit.edu/Aristotle/nicomachaen.mb.txt",
            "Nicomachean Ethics",
            "Aristotle",
        ),
        ("https://classics.mit.edu/Aristotle/politics.mb.txt", "Politics", "Aristotle"),
        ("https://classics.mit.edu/Plato/republic.mb.txt", "Republic", "Plato"),
        ("https://classics.mit.edu/Plato/laws.mb.txt", "Laws", "Plato"),
        ("https://classics.mit.edu/Antoninus/meditations.mb.txt", "Meditations", "Marcus Aurelius"),
        ("https://classics.mit.edu/Epictetus/epicench.mb.txt", "Enchiridion", "Epictetus"),
        ("https://classics.mit.edu/Cicero/duties.mb.txt", "De Officiis", "Cicero"),
    ]

    # Upper bound on Western Classics passages added to all_passages: the
    # per-language limit, but never more than 15000.
    western_target = min(MAX_PER_LANG, 15000)

    def fetch_gutenberg(item):
        """Fetch one Project Gutenberg text and split it into passages.

        Args:
            item: ``(gutenberg_id, title, author)`` tuple.

        Returns:
            ``(title, author, passages)``. ``passages`` is a list of dicts with
            keys ``text``, ``source``, ``author`` and ``title``; it is empty
            when nothing could be downloaded or an error occurred. The function
            never raises, so it is safe to run on a worker thread.
        """
        gutenberg_id, title, author = item
        try:
            url = f"https://www.gutenberg.org/cache/epub/{gutenberg_id}/pg{gutenberg_id}.txt"
            text = get_prefetched(url)
            if text:
                # Skip Gutenberg header/footer: keep what follows the first
                # start marker and precedes the first end marker.
                for marker in ["*** START OF", "***START OF"]:
                    if marker in text:
                        text = text.split(marker, 1)[-1]
                        break
                for marker in ["*** END OF", "***END OF", "End of Project Gutenberg"]:
                    if marker in text:
                        text = text.split(marker, 1)[0]
                        break

                # Paragraphs are blank-line separated; short ones are dropped
                # before whitespace is collapsed, then the length is re-checked.
                paragraphs = [p.strip() for p in text.split("\n\n") if len(p.strip()) > 100]
                passages = []
                for para in paragraphs:
                    para = re.sub(r"\s+", " ", para).strip()
                    if 50 < len(para) < 2000:
                        passages.append(
                            {
                                "text": para,
                                "source": f"{author}: {title}",
                                "author": author,
                                "title": title,
                            }
                        )
                return (title, author, passages)
        except Exception as e:
            # Previously swallowed silently; report it so a missing text can be
            # explained, then fall through to the empty result.
            print(f"    Gutenberg fetch failed for {author}: {title}: {e}")
        return (title, author, [])

    def fetch_mit(item):
        """Fetch one MIT Classics text and split it into passages.

        Args:
            item: ``(url, title, author)`` tuple.

        Returns:
            ``(title, author, passages)``. ``passages`` is a list of dicts with
            keys ``text``, ``source``, ``author`` and ``title``, built from at
            most the first 500 qualifying paragraphs; it is empty when the
            download yielded nothing or an error occurred.
        """
        url, title, author = item
        try:
            text = get_prefetched(url)
            if text:
                paragraphs = [p.strip() for p in text.split("\n\n") if len(p.strip()) > 100]
                passages = []
                for para in paragraphs[:500]:
                    para = re.sub(r"\s+", " ", para).strip()
                    if 50 < len(para) < 2000:
                        passages.append(
                            {
                                "text": para,
                                "source": f"{author}: {title}",
                                "author": author,
                                "title": title,
                            }
                        )
                return (title, author, passages)
        except Exception as e:
            # Was a bare `except:`, which also trapped KeyboardInterrupt and
            # SystemExit; ordinary errors are still tolerated but now reported.
            print(f"    MIT Classics fetch failed for {author}: {title}: {e}")
        return (title, author, [])

    # Gather passages from Gutenberg, top up from MIT Classics for titles that
    # Gutenberg did not deliver, then add up to western_target of them to
    # all_passages.
    western_passages = []
    loaded_titles = set()

    # Parallel fetch from Gutenberg.
    # executor.map returns results in the order of GUTENBERG_TEXTS, whatever
    # order the downloads finish in. Consuming them as they completed made the
    # passage order, the generated IDs and the western_target cut-off vary
    # between runs.
    print("  Fetching from Project Gutenberg (parallel)...")
    with ThreadPoolExecutor(max_workers=8) as executor:
        for title, author, passages in executor.map(fetch_gutenberg, GUTENBERG_TEXTS):
            if passages and title not in loaded_titles:
                western_passages.extend(passages)
                loaded_titles.add(title)
                print(f"    {author}: {title} - {len(passages)} passages")

    # Parallel fetch from MIT for any missing
    missing_mit = [(url, t, a) for url, t, a in MIT_TEXTS if t not in loaded_titles]
    if missing_mit:
        print("  Fetching missing texts from MIT Classics...")
        with ThreadPoolExecutor(max_workers=4) as executor:
            for title, author, passages in executor.map(fetch_mit, missing_mit):
                if passages and title not in loaded_titles:
                    western_passages.extend(passages)
                    loaded_titles.add(title)
                    print(f"    {author}: {title} - {len(passages)} passages (MIT)")

    # Add to all_passages with proper IDs
    western_count = 0
    for p in western_passages:
        if western_count >= western_target:
            break
        all_passages.append(
            {
                "id": f"western_{len(all_passages)}",
                "text": p["text"],
                "lang": "english",
                "source": p["source"],
                "period": "WESTERN_CLASSICAL",
                "time_period": "WESTERN_CLASSICAL",
            }
        )
        western_count += 1

    print(f"  Total Western Classics: {western_count:,}")
    # ===== UNIMORAL: Disabled (gated dataset requires auth) =====
    print("  Skipping UniMoral (gated HuggingFace dataset)")

    # ===== UN PARALLEL CORPUS (HuggingFace streaming) =====
    # Streams two language pairs of Helsinki-NLP/un_pc and stores both sides
    # of every translation record as separate passages.
    print("\nLoading UN Corpus from HuggingFace (streaming)...")
    try:
        from datasets import load_dataset

        pairs = [("ar", "en"), ("en", "zh")]
        un_count = 0
        lang_map = {"ar": "arabic", "zh": "classical_chinese", "en": "english"}

        for src, tgt in pairs:
            if un_count >= MAX_PER_LANG:
                break
            config = f"{src}-{tgt}"
            try:
                ds = load_dataset("Helsinki-NLP/un_pc", config, split="train", streaming=True)
                # Per-pair budget, counted in stored texts; it is checked once
                # per record, before that record's two sides are examined.
                pair_cap = min(MAX_PER_LANG // 4, 5000)

                pair_count = 0
                for item in ds:
                    if pair_count >= pair_cap:
                        break

                    translation = item.get("translation", {})
                    for lang_code in (src, tgt):
                        text = translation.get(lang_code, "")
                        if len(text) <= 30 or lang_code not in lang_map:
                            continue
                        all_passages.append(
                            {
                                "id": f"un_{len(all_passages)}",
                                "text": text,
                                "lang": lang_map[lang_code],
                                "source": "UN Corpus",
                                "period": "MODERN",
                            }
                        )
                        pair_count += 1
                        un_count += 1

                print(f"  UN {config}: {pair_count:,}")
            except Exception as e:
                # A failing pair is reported and the next pair is still tried.
                print(f"  UN {config} error: {e}")

        print(f"  UN Corpus total: {un_count:,}")
    except Exception as e:
        print(f"  UN Corpus error: {e}")

    # ===== BIBLE PARALLEL CORPUS (GitHub) =====
    print("\nLoading Bible Parallel Corpus...")
    try:
        base_url = "https://raw.githubusercontent.com/christos-c/bible-corpus/master/bibles"
        bible_files = [
            ("Hebrew.xml", "hebrew"),
            ("Arabic.xml", "arabic"),
            ("Chinese.xml", "classical_chinese"),
        ]

        bible_count = 0
        for filename, lang in bible_files:
            if bible_count >= MAX_PER_LANG * 3:
                break
            try:
                url = f"{base_url}/{filename}"
                text = get_prefetched(url)
                if text:
                    verses = re.findall(r"<seg[^>]*>([^<]+)</seg>", text)
                    file_count = 0
                    for verse in verses:
                        if file_count >= MAX_PER_LANG:
                            break
                        verse = verse.strip()
                        if len(verse) > 10:
                            all_passages.append(
                                {
                                    "id": f"bible_{len(all_passages)}",
                                    "text": verse,
                                    "lang": lang,
                                    "source": "Bible",
                                    "period": "CLASSICAL",
                                }
                            )
                            file_count += 1
                            bible_count += 1
                    print(f"  Bible {lang}: {file_count:,}")
            except Exception as e:
                print(f"  Bible {filename} error: {e}")

        print(f"  Bible total: {bible_count:,}")
    except Exception as e:
        print(f"  Bible error: {e}")

    # ===== NEW v10.9 CORPORA =====
    print("\n--- v10.9 New Corpora (hardcoded) ---")

    # Chinese philosophical traditions
    chinese_corpora = [
        (BUDDHIST_CHINESE, "BUDDHIST", "Buddhist Chinese"),
        (LEGALIST_CHINESE, "LEGALIST", "Legalist Chinese"),
        (MOHIST_CHINESE, "MOHIST", "Mohist Chinese"),
        (NEO_CONFUCIAN_CHINESE, "NEO_CONFUCIAN", "Neo-Confucian"),
    ]

    for corpus, period, label in chinese_corpora:
        count = 0
        for text_content, source_ref, _ in corpus:
            all_passages.append(
                {
                    "id": f"v109_{label.lower().replace(' ', '_')}_{len(all_passages)}",
                    "text": text_content,
                    "lang": "classical_chinese",
                    "source": source_ref,
                    "period": period,
                }
            )
            count += 1
        print(f"  {label}: {count}")

    # Arabic/Islamic traditions
    arabic_corpora = [
        (ISLAMIC_LEGAL_MAXIMS, "FIQH", "Islamic Legal Maxims"),
        (SUFI_ETHICS, "SUFI", "Sufi Ethics"),
        (ARABIC_PHILOSOPHY, "FALSAFA", "Arabic Philosophy"),
    ]

    for corpus, period, label in arabic_corpora:
        count = 0
        for text_content, source_ref, _ in corpus:
            all_passages.append(
                {
                    "id": f"v109_{label.lower().replace(' ', '_')}_{len(all_passages)}",
                    "text": text_content,
                    "lang": "arabic",
                    "source": source_ref,
                    "period": period,
                }
            )
            count += 1
        print(f"  {label}: {count}")

    # Sanskrit tradition
    sanskrit_count = 0
    for text_content, source_ref, period_tag in SANSKRIT_DHARMA:
        all_passages.append(
            {
                "id": f"v109_sanskrit_{len(all_passages)}",
                "text": text_content,
                "lang": "sanskrit",
                "source": source_ref,
                "period": period_tag,
            }
        )
        sanskrit_count += 1
    print(f"  Sanskrit Dharma: {sanskrit_count}")

    # Pali tradition
    pali_count = 0
    for text_content, source_ref, period_tag in PALI_ETHICS:
        all_passages.append(
            {
                "id": f"v109_pali_{len(all_passages)}",
                "text": text_content,
                "lang": "pali",
                "source": source_ref,
                "period": period_tag,
            }
        )
        pali_count += 1
    print(f"  Pali Ethics: {pali_count}")

    # Release the prefetch executor. shutdown(wait=False) returns immediately
    # rather than blocking on outstanding tasks, so the status line must not
    # claim that we are waiting for them.
    # NOTE(review): presumably a concurrent.futures executor - confirm where
    # prefetch_executor is created.
    print("\nShutting down prefetch executor (not waiting for remaining tasks)...")
    prefetch_executor.shutdown(wait=False)

    # ===== SUMMARY =====
    print(f"\nTOTAL: {len(all_passages):,}")

    # Count by language
    by_lang = defaultdict(int)
    for p in all_passages:
        by_lang[p["lang"]] += 1
    print("\nBy language:")
    for lang, cnt in sorted(by_lang.items(), key=lambda x: -x[1]):
        print(f"  {lang}: {cnt:,}")

    # ===== EXTRACT BONDS =====
    print("\n" + "=" * 60)
    print("EXTRACTING BONDS")
    print("=" * 60)

    def extract_bond(text, language):
        """Classify a passage by the first bond pattern that matches it.

        The passage is normalized for ``language`` and tested against every
        pattern registered for that language in ``ALL_BOND_PATTERNS``, in
        registration order; the first hit wins.

        Returns:
            ``(bond_type, context, confidence)``. Confidence is 0.9 when the
            detected context is "prescriptive" and 0.5 otherwise. When nothing
            matches, ``(None, "unknown", 0.5)`` is returned.
        """
        normalized = normalize_text(text, language)
        patterns_by_bond = ALL_BOND_PATTERNS.get(language, {})

        for bond_type, patterns in patterns_by_bond.items():
            for pattern in patterns:
                hit = re.search(pattern, normalized)
                if hit is None:
                    continue
                # NOTE(review): the offset is measured in the normalized text but
                # handed to detect_context together with the raw text - confirm
                # that normalization preserves character positions.
                context, _marker_type = detect_context(text, language, hit.start())
                confidence = 0.9 if context == "prescriptive" else 0.5
                return bond_type, context, confidence

        return None, "unknown", 0.5

    bonds = []
    for p in tqdm(all_passages, desc="Extracting bonds"):
        bt, ctx, conf = extract_bond(p["text"], p["lang"])
        if bt:
            bonds.append(
                {
                    "passage_id": p["id"],
                    "bond_type": bt,
                    "language": p["lang"],
                    "time_period": p["period"],
                    "source": p["source"],
                    "text": p["text"][:500],
                    "context": ctx,
                    "confidence": conf,
                }
            )

    print(f"\nExtracted {len(bonds):,} bonds from {len(all_passages):,} passages")

    # Count by bond type
    by_bond = defaultdict(int)
    for b in bonds:
        by_bond[b["bond_type"]] += 1
    print("\nBy bond type:")
    for bt, cnt in sorted(by_bond.items(), key=lambda x: -x[1]):
        print(f"  {bt}: {cnt:,}")

    # Count by context
    by_ctx = defaultdict(int)
    for b in bonds:
        by_ctx[b["context"]] += 1
    print("\nBy context:")
    for ctx, cnt in sorted(by_ctx.items(), key=lambda x: -x[1]):
        print(f"  {ctx}: {cnt:,}")

    # ===== SAVE =====
    print("\n" + "=" * 60)
    print("SAVING DATA")
    print("=" * 60)

    # Save passages
    with open("data/processed/passages.jsonl", "w", encoding="utf-8") as f:
        for p in all_passages:
            # Normalize field names
            p_out = {
                "id": p["id"],
                "text": p["text"],
                "language": p["lang"],
                "source": p["source"],
                "time_period": p["period"],
            }
            f.write(json.dumps(p_out, ensure_ascii=False) + "\n")
    print(f"  Saved {len(all_passages):,} passages to data/processed/passages.jsonl")

    # Save bonds
    with open("data/processed/bonds.jsonl", "w", encoding="utf-8") as f:
        for b in bonds:
            b_out = {
                **b,
                "bond_type": (
                    b["bond_type"].name if hasattr(b["bond_type"], "name") else str(b["bond_type"])
                ),
            }
            f.write(json.dumps(b_out, ensure_ascii=False) + chr(10))
    print(f"  Saved {len(bonds):,} bonds to data/processed/bonds.jsonl")

    # Copy to Drive if enabled
    if USE_DRIVE_DATA and SAVE_DIR:
        try:
            os.makedirs(SAVE_DIR, exist_ok=True)
            shutil.copy("data/processed/passages.jsonl", f"{SAVE_DIR}/passages.jsonl")
            shutil.copy("data/processed/bonds.jsonl", f"{SAVE_DIR}/bonds.jsonl")
            print(f"  Copied to Drive: {SAVE_DIR}")
        except Exception as e:
            print(f"  Drive copy failed: {e}")

    gc.collect()
    print("\nDone!")


In [None]:
# @title 5. Generate Splits { display-mode: "form" }
# @markdown Creates train/test splits for cross-lingual experiments
# @markdown v10.9: Added confucian_to_buddhist, confucian_to_legalist,
# @markdown        all_to_sanskrit, semitic_to_indic, quran_to_fiqh

import json
import random
import shutil
from collections import defaultdict

print("=" * 60)
print("GENERATING SPLITS")
print("=" * 60)

# Decide whether the splits restored from Drive still describe the passages
# currently on disk; stale splits are regenerated further down.
def validate_cached_splits(splits_path, passages_path, sample_per_side=100, min_match=0.9):
    """Check that IDs sampled from cached splits exist in the passages file.

    Up to ``sample_per_side`` train IDs and test IDs are taken from every split.
    The passages file is then scanned for exactly those IDs, stopping as soon
    as all of them have been seen. Split IDs are shuffled across the whole
    corpus, so looking only at the first few thousand passages would reject
    perfectly valid splits on a large corpus.

    Args:
        splits_path: JSON file mapping split name -> dict with ``train_ids``
            and ``test_ids`` lists.
        passages_path: JSONL file with one object per line carrying an ``id``.
        sample_per_side: Number of leading IDs sampled from each list.
        min_match: Fraction of sampled IDs that must be exceeded for validity.

    Returns:
        ``(is_valid, matches, sampled)``; an empty sample is never valid.
    """
    with open(splits_path, encoding="utf-8") as f:
        cached_splits = json.load(f)

    sample_ids = set()
    for split in cached_splits.values():
        sample_ids.update(split["train_ids"][:sample_per_side])
        sample_ids.update(split["test_ids"][:sample_per_side])

    found = set()
    if sample_ids:
        with open(passages_path, encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue
                passage_id = json.loads(line)["id"]
                if passage_id in sample_ids:
                    found.add(passage_id)
                    if len(found) == len(sample_ids):
                        break  # every sampled ID accounted for

    return len(found) > len(sample_ids) * min_match, len(found), len(sample_ids)


splits_valid = False
if os.path.exists("data/splits/all_splits.json"):
    try:
        splits_valid, matches, sampled = validate_cached_splits(
            "data/splits/all_splits.json", "data/processed/passages.jsonl"
        )
        if not splits_valid:
            print(f"Splits invalid: only {matches}/{sampled} IDs match current passages")
    except Exception as e:
        # Unreadable or malformed cache: treat as invalid and regenerate.
        print(f"Error validating splits: {e}")

if splits_valid and not REFRESH_DATA_FROM_SOURCE:
    # Cached splits still match the current passages file: reuse them untouched.
    print("\nSplits already loaded from Drive")
    with open("data/splits/all_splits.json", encoding="utf-8") as f:
        all_splits = json.load(f)
    for name, split in all_splits.items():
        print(f"  {name}: train={split['train_size']:,}, test={split['test_size']:,}")
else:
    # Fixed seed: the shuffles below are issued in a fixed order, so the same
    # passages file always yields the same splits.
    random.seed(42)

    # Read passage metadata. The file is written as UTF-8 with non-ASCII text,
    # so the encoding is stated explicitly instead of relying on the locale.
    with open("data/processed/passages.jsonl", "r", encoding="utf-8") as f:
        passage_meta = [json.loads(line) for line in f]

    print(f"Total passages: {len(passage_meta):,}")

    by_lang = defaultdict(list)
    by_period = defaultdict(list)
    for p in passage_meta:
        by_lang[p["language"]].append(p["id"])
        by_period[p["time_period"]].append(p["id"])

    print("\nBy language:")
    for lang, ids in sorted(by_lang.items(), key=lambda x: -len(x[1])):
        print(f"  {lang}: {len(ids):,}")

    print("\nBy period:")
    for period, ids in sorted(by_period.items(), key=lambda x: -len(x[1])):
        print(f"  {period}: {len(ids):,}")

    all_splits = {}

    def _select_ids(languages=None, periods=None):
        """Return passage IDs (file order) matching the given languages/periods."""
        return [
            p["id"]
            for p in passage_meta
            if (periods is None or p["time_period"] in periods)
            and (languages is None or p["language"] in languages)
        ]

    def _begin_split(title):
        """Print the separator and heading for one split."""
        print("\n" + "-" * 60)
        print(title)

    def _add_split(name, train_label, train_ids, test_label, test_ids, description=None):
        """Shuffle both ID lists in place (train first), register and report the split."""
        random.shuffle(train_ids)
        random.shuffle(test_ids)
        entry = {
            "train_ids": train_ids,
            "test_ids": test_ids,
            "train_size": len(train_ids),
            "test_size": len(test_ids),
        }
        if description is not None:
            entry["description"] = description
        all_splits[name] = entry
        print(f"  Train ({train_label}): {len(train_ids):,}")
        print(f"  Test ({test_label}): {len(test_ids):,}")

    # ===== SPLIT 1: Hebrew -> Others =====
    _begin_split("SPLIT 1: HEBREW -> OTHERS")
    # Same list object as by_lang["hebrew"]: it is shuffled in place, and
    # split 2 builds on that shuffled order.
    hebrew_ids = by_lang.get("hebrew", [])
    other_ids = [p["id"] for p in passage_meta if p["language"] != "hebrew"]
    _add_split("hebrew_to_others", "Hebrew", hebrew_ids, "Others", other_ids)

    # ===== SPLIT 2: Semitic -> Non-Semitic =====
    _begin_split("SPLIT 2: SEMITIC -> NON-SEMITIC")
    semitic_ids = by_lang.get("hebrew", []) + by_lang.get("aramaic", []) + by_lang.get("arabic", [])
    non_semitic_ids = by_lang.get("classical_chinese", []) + by_lang.get("english", [])
    _add_split("semitic_to_non_semitic", "Semitic", semitic_ids, "Non-Semitic", non_semitic_ids)

    # ===== SPLIT 3: Ancient -> Modern =====
    _begin_split("SPLIT 3: ANCIENT -> MODERN")
    # Define modern periods explicitly, derive ancient dynamically
    modern_periods = {"MODERN", "DEAR_ABBY"}
    all_periods = set(by_period.keys())
    ancient_periods = all_periods - modern_periods

    print(f"  Ancient periods: {sorted(ancient_periods)}")
    print(f"  Modern periods: {sorted(modern_periods)}")

    ancient_ids = _select_ids(periods=ancient_periods)
    modern_ids = _select_ids(periods=modern_periods)
    _add_split("ancient_to_modern", "Ancient", ancient_ids, "Modern", modern_ids)

    # ===== SPLIT 4: Mixed Baseline =====
    # One shuffle of everything, then a 70/30 cut.
    _begin_split("SPLIT 4: MIXED BASELINE")
    all_ids = [p["id"] for p in passage_meta]
    random.shuffle(all_ids)
    split_idx = int(0.7 * len(all_ids))

    all_splits["mixed_baseline"] = {
        "train_ids": all_ids[:split_idx],
        "test_ids": all_ids[split_idx:],
        "train_size": split_idx,
        "test_size": len(all_ids) - split_idx,
    }
    print(f"  Train: {split_idx:,}")
    print(f"  Test: {len(all_ids) - split_idx:,}")

    # ===== SPLIT 5: Dear Abby -> Classical Chinese =====
    _begin_split("SPLIT 5: DEAR ABBY -> CHINESE")
    abby_ids = _select_ids(periods=("DEAR_ABBY",))
    chinese_ids = _select_ids(languages=("classical_chinese",))
    _add_split("abby_to_chinese", "Dear Abby", abby_ids, "Chinese", chinese_ids)

    # ===== SPLIT 6: Western Classical -> Eastern =====
    _begin_split("SPLIT 6: WESTERN CLASSICAL -> EASTERN")
    western_ids = _select_ids(periods=("WESTERN_CLASSICAL",))
    eastern_ids = _select_ids(languages=("classical_chinese", "hebrew"))
    _add_split(
        "western_to_eastern",
        "Western - Plato, Aristotle, Stoics",
        western_ids,
        "Eastern - Chinese, Hebrew",
        eastern_ids,
    )

    # ===== SPLIT 7: Confucian -> Buddhist (v10.9) =====
    _begin_split("SPLIT 7: CONFUCIAN -> BUDDHIST (Chinese intra-tradition)")
    confucian_daoist_ids = _select_ids(
        languages=("classical_chinese",), periods=("CONFUCIAN", "DAOIST")
    )
    buddhist_ids = _select_ids(languages=("classical_chinese",), periods=("BUDDHIST",))
    _add_split(
        "confucian_to_buddhist",
        "Confucian+Daoist",
        confucian_daoist_ids,
        "Buddhist",
        buddhist_ids,
        description="Test if Chinese performance is tradition-specific",
    )

    # ===== SPLIT 8: Confucian -> Legalist/Mohist (v10.9) =====
    _begin_split("SPLIT 8: CONFUCIAN -> LEGALIST/MOHIST (virtue vs consequentialist)")
    confucian_only_ids = _select_ids(languages=("classical_chinese",), periods=("CONFUCIAN",))
    legalist_mohist_ids = _select_ids(
        languages=("classical_chinese",), periods=("LEGALIST", "MOHIST")
    )
    _add_split(
        "confucian_to_legalist",
        "Confucian",
        confucian_only_ids,
        "Legalist+Mohist",
        legalist_mohist_ids,
        description="Virtue ethics → consequentialist/legalist",
    )

    # ===== SPLIT 9: All -> Sanskrit/Pali (v10.9) =====
    _begin_split("SPLIT 9: ALL -> SANSKRIT/PALI (ultimate transfer test)")
    non_indic_ids = _select_ids(
        languages=("hebrew", "aramaic", "classical_chinese", "arabic", "english")
    )
    indic_ids = _select_ids(languages=("sanskrit", "pali"))
    _add_split(
        "all_to_sanskrit",
        "non-Indic",
        non_indic_ids,
        "Sanskrit+Pali",
        indic_ids,
        description="Ultimate transfer test: completely held-out language family",
    )

    # ===== SPLIT 10: Semitic -> Indic (v10.9) =====
    _begin_split("SPLIT 10: SEMITIC -> INDIC")
    semitic_only_ids = _select_ids(languages=("hebrew", "aramaic", "arabic"))
    indic_only_ids = _select_ids(languages=("sanskrit", "pali"))
    _add_split(
        "semitic_to_indic",
        "Semitic",
        semitic_only_ids,
        "Indic",
        indic_only_ids,
        description="Semitic → Indo-Aryan transfer",
    )

    # ===== SPLIT 11: Quran -> Fiqh (v10.9) =====
    _begin_split("SPLIT 11: QURAN -> FIQH (religious to legal/philosophical)")
    quranic_ids = _select_ids(languages=("arabic",), periods=("QURANIC", "HADITH"))
    fiqh_ids = _select_ids(languages=("arabic",), periods=("FIQH", "SUFI", "FALSAFA"))
    _add_split(
        "quran_to_fiqh",
        "Quranic+Hadith",
        quranic_ids,
        "Fiqh+Sufi+Falsafa",
        fiqh_ids,
        description="Religious → legal/philosophical Arabic",
    )

    # Save splits (make sure the target directory exists first)
    os.makedirs("data/splits", exist_ok=True)
    with open("data/splits/all_splits.json", "w", encoding="utf-8") as f:
        json.dump(all_splits, f, indent=2)

    # Save to Drive: best effort, guarded the same way as the passages/bonds
    # copy so an unset or unreachable SAVE_DIR does not abort the cell.
    if USE_DRIVE_DATA and SAVE_DIR:
        try:
            os.makedirs(SAVE_DIR, exist_ok=True)
            shutil.copy("data/splits/all_splits.json", f"{SAVE_DIR}/all_splits.json")
        except OSError as e:
            print(f"  Drive copy failed: {e}")

print("\n" + "=" * 60)
print("Splits saved to local and Drive")
print("=" * 60)


In [None]:
# @title 6. Model Architecture { display-mode: "form" }
# @markdown BIP v10.9 model with configurable backbone and adversarial heads
# @markdown - Updated: 8 languages, 26 periods

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer
from tqdm.auto import tqdm
import json

print("=" * 60)
print("MODEL ARCHITECTURE")
print("=" * 60)
print(f"Backbone: {BACKBONE} ({MODEL_NAME})")
print(f"Hidden size: {BACKBONE_HIDDEN}")

# Index mappings: every label space gets a name -> index table and its inverse.
# Indices follow declaration order, so the order below is part of the contract.
BOND_TO_IDX = {member.name: idx for idx, member in enumerate(BondType)}
IDX_TO_BOND = {idx: name for name, idx in BOND_TO_IDX.items()}
# v10.9: 8 languages (added Sanskrit, Pali, Greek placeholder)
LANG_TO_IDX = {
    lang: idx
    for idx, lang in enumerate(
        (
            "hebrew",
            "aramaic",
            "classical_chinese",
            "arabic",
            "english",
            "sanskrit",  # NEW in v10.9
            "pali",  # NEW in v10.9
            "greek",  # FUTURE (placeholder)
        )
    )
}
IDX_TO_LANG = {idx: lang for lang, idx in LANG_TO_IDX.items()}

# v10.9: 26 periods (expanded Chinese, Arabic, added Sanskrit/Pali traditions)
PERIOD_TO_IDX = {
    period: idx
    for idx, period in enumerate(
        (
            # Semitic traditions (0-4)
            "BIBLICAL",
            "TANNAITIC",
            "AMORAIC",
            "RISHONIM",
            "ACHRONIM",
            # Chinese traditions, expanded (5-10)
            "CONFUCIAN",
            "DAOIST",
            "MOHIST",  # NEW in v10.9
            "LEGALIST",  # NEW in v10.9
            "BUDDHIST",  # NEW in v10.9 (Chinese Buddhism)
            "NEO_CONFUCIAN",  # NEW in v10.9
            # Arabic/Islamic traditions, expanded (11-15)
            "QURANIC",
            "HADITH",
            "FIQH",  # NEW in v10.9 (Islamic jurisprudence)
            "SUFI",  # NEW in v10.9
            "FALSAFA",  # NEW in v10.9 (Arabic philosophy)
            # Sanskrit/Pali traditions, NEW in v10.9 (16-20)
            "DHARMA",  # Dharmashastra
            "UPANISHAD",
            "GITA",
            "ARTHA",  # Arthashastra
            "PALI",  # Pali Canon
            # Western traditions (21-22)
            "WESTERN_CLASSICAL",
            "MEDIEVAL",
            # Modern (23-24)
            "DEAR_ABBY",
            "MODERN",
            "CLASSICAL",  # Generic classical (fallback), index 25
        )
    )
}  # 26 periods total (0-25)
IDX_TO_PERIOD = {idx: period for period, idx in PERIOD_TO_IDX.items()}
HOHFELD_TO_IDX = {member.name: idx for idx, member in enumerate(HohfeldState)}
IDX_TO_HOHFELD = {idx: name for name, idx in HOHFELD_TO_IDX.items()}
CONTEXT_TO_IDX = {
    label: idx for idx, label in enumerate(("prescriptive", "descriptive", "unknown"))
}
IDX_TO_CONTEXT = {idx: label for label, idx in CONTEXT_TO_IDX.items()}


def get_confidence_weight(conf):
    """Translate a confidence annotation into a per-sample loss weight.

    Args:
        conf: Either a label ("high", "medium", "low") or a number.

    Returns:
        For labels: 2.0 / 1.0 / 0.5, with 1.0 for any unrecognised label.
        For numbers: 2.0 when ``conf >= 0.8``, otherwise 1.0.
        For any other type (including ``None``): 1.0.
    """
    if isinstance(conf, str):
        label_weights = {"high": 2.0, "medium": 1.0, "low": 0.5}
        return label_weights.get(conf, 1.0)

    # Numeric scores only distinguish "confident" from "everything else".
    if isinstance(conf, (int, float)) and conf >= 0.8:
        return 2.0

    return 1.0


class GradientReversalLayer(torch.autograd.Function):
    """Autograd function that passes values through and flips gradients.

    Forward returns the input values unchanged. Backward multiplies the
    incoming gradient by ``-alpha``, where ``alpha`` is the second argument
    given to ``apply``.
    """

    @staticmethod
    def forward(ctx, x, alpha):
        # Keep the scale around for the backward pass.
        ctx.alpha = alpha
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        flipped = torch.neg(grad_output).mul(ctx.alpha)
        # One entry per forward argument; alpha itself receives no gradient.
        return flipped, None


class BIPModel(nn.Module):
    """Pretrained encoder projected to a ``z_dim`` embedding with several heads.

    The pooled encoder output is projected to ``z``. Bond, Hohfeld and context
    heads read ``z`` directly; the language and period heads read it through a
    gradient-reversal layer, so their gradients reach the shared layers negated.
    """

    def __init__(self, model_name=None, hidden_size=None, z_dim=64):
        """Load the encoder and build the projection and heads.

        Args:
            model_name: Encoder checkpoint name; falls back to ``MODEL_NAME``.
            hidden_size: Expected encoder width; falls back to
                ``BACKBONE_HIDDEN`` and is overridden by the width reported in
                the loaded encoder's config when they differ.
            z_dim: Dimension of the ``z`` embedding.
        """
        super().__init__()
        # Use global config if not specified
        model_name = model_name or MODEL_NAME
        hidden_size = hidden_size or BACKBONE_HIDDEN

        print(f"  Loading encoder: {model_name}")
        self.encoder = AutoModel.from_pretrained(model_name)

        # Get actual hidden size from model config
        actual_hidden = self.encoder.config.hidden_size
        if actual_hidden != hidden_size:
            print(f"  Note: Using actual hidden size {actual_hidden}")
            hidden_size = actual_hidden

        self.hidden_size = hidden_size
        self.model_name = model_name

        # Projection to z_bond space (intermediate width capped at 512)
        proj_hidden = min(512, hidden_size)
        self.z_proj = nn.Sequential(
            nn.Linear(hidden_size, proj_hidden),
            nn.LayerNorm(proj_hidden),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(proj_hidden, z_dim),
        )

        # Task heads
        self.bond_head = nn.Linear(z_dim, len(BondType))
        self.hohfeld_head = nn.Linear(z_dim, len(HohfeldState))

        # Adversarial heads
        self.language_head = nn.Linear(z_dim, len(LANG_TO_IDX))
        self.period_head = nn.Linear(z_dim, len(PERIOD_TO_IDX))

        # Context prediction head (auxiliary task)
        self.context_head = nn.Linear(z_dim, len(CONTEXT_TO_IDX))

        # Count parameters
        total_params = sum(p.numel() for p in self.parameters())
        trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
        print(f"  Total params: {total_params:,}")
        print(f"  Trainable: {trainable_params:,}")

    def _pool(self, input_ids, attention_mask):
        """Run the encoder and return one vector per sequence.

        Uses the encoder's ``pooler_output`` when it provides one, otherwise
        the hidden state at position 0. Shared by ``forward`` and
        ``get_bond_embedding`` so both always pool the same way.
        """
        enc = self.encoder(input_ids, attention_mask)
        pooled = getattr(enc, "pooler_output", None)
        if pooled is None:
            pooled = enc.last_hidden_state[:, 0]
        return pooled

    def forward(self, input_ids, attention_mask, adv_lambda=1.0):
        """Return all head outputs plus the embedding ``z``.

        Args:
            input_ids: Token IDs passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            adv_lambda: Scale applied to the reversed gradient of the
                language/period heads.

        Returns:
            Dict with ``bond_pred``, ``hohfeld_pred``, ``language_pred``,
            ``period_pred``, ``context_pred`` and ``z``.
        """
        z = self.z_proj(self._pool(input_ids, attention_mask))

        # Adversarial heads see z through gradient reversal.
        z_rev = GradientReversalLayer.apply(z, adv_lambda)

        return {
            "bond_pred": self.bond_head(z),
            "hohfeld_pred": self.hohfeld_head(z),
            "language_pred": self.language_head(z_rev),
            "period_pred": self.period_head(z_rev),
            "context_pred": self.context_head(z),
            "z": z,
        }

    def get_bond_embedding(self, input_ids, attention_mask):
        """Get z_bond embedding for geometric analysis."""
        return self.z_proj(self._pool(input_ids, attention_mask))


# Initialize tokenizer for selected backbone.
# Loaded from the same MODEL_NAME that BIPModel falls back to when no model
# name is passed, so tokenizer and encoder come from one checkpoint.
print(f"\nLoading tokenizer: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print(f"  Vocab size: {tokenizer.vocab_size:,}")


# Dataset with Hohfeld support
class NativeDataset(Dataset):
    """Passages that have an extracted bond, tokenized on access.

    Only passages whose ID is both in ``ids_set`` and present in the bonds
    file are kept. Each item carries the tokenized text plus integer labels
    for bond, language, period, Hohfeld state and context, a confidence-based
    sample weight, and the raw strings needed downstream.
    """

    def __init__(self, ids_set, passages_file, bonds_file, tokenizer, max_len=128):
        """Load and join passages with their bonds.

        Args:
            ids_set: Passage IDs to include (any iterable; converted to a set).
            passages_file: JSONL with ``id``, ``text``, ``language`` and
                ``time_period`` per line.
            bonds_file: JSONL with ``passage_id`` per line and either flat
                ``bond_type``/``context``/``confidence`` fields or the same
                values nested under ``bonds``.
            tokenizer: Callable tokenizer used in ``__getitem__``.
            max_len: Truncation/padding length in tokens.
        """
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.data = []

        # Membership is tested once per passage line; keep it O(1) even when
        # the caller hands in a list of IDs.
        if not isinstance(ids_set, (set, frozenset)):
            ids_set = set(ids_set)

        # Both files are written as UTF-8 with non-ASCII text, so read them
        # as UTF-8 rather than with the platform default encoding.
        bonds_by_id = {}
        with open(bonds_file, encoding="utf-8") as fb:
            for line in fb:
                b = json.loads(line)
                bonds_by_id[b["passage_id"]] = b  # last record per passage wins

        with open(passages_file, encoding="utf-8") as fp:
            for line in tqdm(fp, desc="Loading", unit="line"):
                p = json.loads(line)
                if p["id"] in ids_set and p["id"] in bonds_by_id:
                    b = bonds_by_id[p["id"]]
                    nested = b.get("bonds", {})
                    self.data.append(
                        {
                            "text": p["text"][:1000],
                            "language": p["language"],
                            "period": p["time_period"],
                            # Flat fields take priority; nested "bonds" is the fallback.
                            "bond": b.get("bond_type") or nested.get("primary_bond"),
                            "hohfeld": None,  # no Hohfeld annotation is read here
                            "context": b.get("context") or nested.get("context", "unknown"),
                            "confidence": b.get("confidence")
                            or nested.get("confidence", "medium"),
                        }
                    )
        print(f"  Loaded {len(self.data):,} samples")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize one sample and attach its labels and raw metadata."""
        item = self.data[idx]
        enc = self.tokenizer(
            item["text"],
            truncation=True,
            max_length=self.max_len,
            padding="max_length",
            return_tensors="pt",
        )
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            # NOTE(review): 9 is a positional fallback into BondType - confirm
            # it still points at the intended "unknown/none" member.
            "bond_label": BOND_TO_IDX.get(item["bond"], 9),
            "language_label": LANG_TO_IDX.get(item["language"], 4),  # 4 == "english"
            # Unknown periods map to the generic CLASSICAL bucket. The former
            # hard-coded 9 now denotes BUDDHIST in PERIOD_TO_IDX.
            "period_label": PERIOD_TO_IDX.get(item["period"], PERIOD_TO_IDX["CLASSICAL"]),
            "hohfeld_label": HOHFELD_TO_IDX.get(item["hohfeld"], 0) if item["hohfeld"] else 0,
            "context_label": CONTEXT_TO_IDX.get(item["context"], 2),
            "sample_weight": get_confidence_weight(item["confidence"]),
            "language": item["language"],
            "context": item["context"],
            "confidence": item["confidence"],
            "text": item["text"],  # Raw text for role augmentation
        }


def collate_fn(batch):
    """Merge a list of dataset items into one batch dict.

    Token tensors are stacked along a new leading dimension, integer labels
    become tensors, sample weights become a float tensor, and the string
    metadata (languages, contexts, confidences, raw texts) stays as lists.
    """

    def column(key):
        return [sample[key] for sample in batch]

    collated = {
        "input_ids": torch.stack(column("input_ids")),
        "attention_mask": torch.stack(column("attention_mask")),
    }

    # Per-sample integer labels -> "<name>_labels" tensors.
    for name in ("bond", "language", "period", "hohfeld", "context"):
        collated[f"{name}_labels"] = torch.tensor(column(f"{name}_label"))

    collated["sample_weights"] = torch.tensor(column("sample_weight"), dtype=torch.float)
    collated["languages"] = column("language")
    collated["contexts"] = column("context")
    collated["confidences"] = column("confidence")
    collated["texts"] = column("text")  # v10.10: raw texts for role augmentation
    return collated


# Closing summary for this cell: report the selected backbone and the sizes
# of the bond and language label spaces defined above.
print(f"\nArchitecture ready for {BACKBONE}")
print(f"  Bond classes: {len(BondType)}")
print(f"  Languages: {len(LANG_TO_IDX)}")
print("\n" + "=" * 60)


In [None]:
# @title 7. Train BIP Model { display-mode: "form" }
# @markdown Training with tuned adversarial weights and hardware-optimized parameters
# @markdown v10.9: Added new splits (confucian_to_buddhist, all_to_sanskrit, etc.)

# ===== SUPPRESS DATALOADER MULTIPROCESSING WARNINGS =====
# These occur during garbage collection and bypass normal exception handling
import warnings
import sys
import os
import io
import logging
import random
import re

# Method 1: Filter warnings
warnings.filterwarnings("ignore", message=".*can only test a child process.*")
warnings.filterwarnings("ignore", category=UserWarning, module="torch.utils.data")

# Method 2: Suppress logging
logging.getLogger("torch.utils.data.dataloader").setLevel(logging.CRITICAL)


# Method 3: Redirect stderr during DataLoader cleanup (most effective)
class StderrFilter(io.TextIOWrapper):
    """Filters out DataLoader multiprocessing cleanup messages from stderr.

    The instance wraps another text stream (``original``).  Writes that
    contain one of ``SKIP_PATTERNS`` -- or that look like a caret line or a
    ``dataloader.py`` frame of that error -- are swallowed.  When a
    ``Traceback (most recent call last):`` header arrives, it and the writes
    that follow are held back:

    * if a later write identifies the traceback as the DataLoader cleanup
      error, the held lines are discarded;
    * otherwise they are released to ``original`` exactly once and in their
      original order, either when more than ``MAX_BUFFERED_LINES`` writes
      have accumulated or when ``flush()`` is called.

    Every attribute not defined here is delegated to ``original``.
    """

    # Substrings that identify the DataLoader worker-cleanup error.
    SKIP_PATTERNS = (
        "can only test a child process",
        "_MultiProcessingDataLoaderIter.__del__",
        "_shutdown_workers",
        "Exception ignored in:",
        "w.is_alive()",
    )
    # An unidentified traceback is released once the buffer exceeds this size.
    MAX_BUFFERED_LINES = 10

    def __init__(self, original):
        # NOTE(review): io.TextIOWrapper.__init__ is deliberately not called
        # (there is no binary buffer to wrap); the base class is kept so
        # isinstance checks against io.TextIOWrapper keep working.  Inherited
        # TextIOWrapper methods other than the ones overridden below operate
        # on an uninitialised wrapper -- confirm nothing relies on them.
        self.original = original
        self.buffer_lines = []

    def _release_buffer(self):
        """Write all held-back lines to the wrapped stream and clear the buffer."""
        pending, self.buffer_lines = self.buffer_lines, []
        for line in pending:
            self.original.write(line)

    def write(self, text):
        """Write *text* unless it belongs to the DataLoader cleanup error.

        Returns the number of characters accepted; swallowed and held-back
        text reports ``len(text)`` so callers see a successful write.
        """
        if any(p in text for p in self.SKIP_PATTERNS):
            # A traceback being held back belongs to this error: drop it too.
            self.buffer_lines = []
            return len(text)  # Pretend we wrote it

        stripped = text.strip()
        # Also skip if it looks like part of a traceback for these errors
        if stripped.startswith("^") and len(stripped) < 80:
            return len(text)
        if stripped.startswith('File "/usr') and "dataloader.py" in text:
            return len(text)

        if stripped == "Traceback (most recent call last):":
            # A new traceback starts; an earlier one that was never identified
            # as the target error is released instead of being lost.
            self._release_buffer()
            self.buffer_lines = [text]
            return len(text)

        if self.buffer_lines:
            # Hold the line back rather than writing it through, so that it is
            # not emitted a second time when the buffer is released.
            self.buffer_lines.append(text)
            full_msg = "".join(self.buffer_lines)
            if any(p in full_msg for p in self.SKIP_PATTERNS):
                # A pattern can straddle two writes; the joined text catches it.
                self.buffer_lines = []
            elif len(self.buffer_lines) > self.MAX_BUFFERED_LINES:
                self._release_buffer()
            return len(text)

        return self.original.write(text)

    def flush(self):
        """Release any held-back lines, then flush the wrapped stream."""
        if self.buffer_lines:
            self._release_buffer()
        self.original.flush()

    def __getattr__(self, name):
        # Only reached when normal lookup fails: delegate to the wrapped stream.
        return getattr(self.original, name)


# Install the stderr filter
# Re-running this cell must not stack a new filter on top of an old one, so
# peel off any previously installed filter first.  The check is by class name
# because re-running the cell creates a fresh StderrFilter class object.
_original_stderr = sys.stderr
while type(_original_stderr).__name__ == "StderrFilter":
    _original_stderr = _original_stderr.original
sys.stderr = StderrFilter(_original_stderr)

# Method 4: Patch the DataLoader cleanup function directly
try:
    import torch.utils.data.dataloader as dl_module

    _current_del = dl_module._MultiProcessingDataLoaderIter.__del__

    # Patch only once.  Without this guard a second run would rebind
    # _original_del to the previous wrapper; that wrapper resolved
    # _original_del as a global at call time and would therefore call itself
    # until RecursionError, so the real cleanup would never run.
    if not getattr(_current_del, "_bip_cleanup_patched", False):
        _original_del = _current_del

        def _patched_del(self, _wrapped=_original_del):
            # _wrapped is bound at definition time, so later rebinding of the
            # global _original_del cannot change what this wrapper calls.
            try:
                _wrapped(self)
            except (AssertionError, AttributeError, RuntimeError):
                pass  # Silently ignore cleanup errors

        _patched_del._bip_cleanup_patched = True
        dl_module._MultiProcessingDataLoaderIter.__del__ = _patched_del
except Exception:
    pass  # If patching fails, the stderr filter will still work

from sklearn.metrics import f1_score
import gc


# Colab form parameters for the training cell.  The "# @param" and
# "# @markdown" comments are form annotations and must stay on their lines.
# Each TRAIN_* flag that is True adds the split of the same (lower-case) name
# to splits_to_train further down in this cell.
# @markdown **Splits to train:**
TRAIN_HEBREW_TO_OTHERS = True  # @param {type:"boolean"}
TRAIN_SEMITIC_TO_NON_SEMITIC = True  # @param {type:"boolean"}
TRAIN_ANCIENT_TO_MODERN = True  # @param {type:"boolean"}
TRAIN_MIXED_BASELINE = True  # @param {type:"boolean"}
TRAIN_ABBY_TO_CHINESE = True  # @param {type:"boolean"}

# @markdown **v10.9 New Splits:**
TRAIN_CONFUCIAN_TO_BUDDHIST = True  # @param {type:"boolean"}
TRAIN_CONFUCIAN_TO_LEGALIST = True  # @param {type:"boolean"}
TRAIN_ALL_TO_SANSKRIT = True  # @param {type:"boolean"}
TRAIN_SEMITIC_TO_INDIC = True  # @param {type:"boolean"}
TRAIN_QURAN_TO_FIQH = True  # @param {type:"boolean"}

# LANG_WEIGHT and PERIOD_WEIGHT scale the language and period cross-entropy
# terms in the training loss; N_EPOCHS is the last epoch number of the
# per-split epoch loop.
# @markdown **Hyperparameters:**
LANG_WEIGHT = 0.1  # @param {type:"number"}
PERIOD_WEIGHT = 0.066  # @param {type:"number"}
N_EPOCHS = 10  # @param {type:"integer"}

# @markdown **Context-Aware Training:**
# When True, the per-sample bond loss is multiplied by the batch's
# "sample_weights" before averaging.
USE_CONFIDENCE_WEIGHTING = True  # @param {type:"boolean"}
# @markdown Weight prescriptive (high confidence) examples 2x in loss

# When True, a cross-entropy loss on the model's "context_pred" output is
# added to the total loss, scaled by CONTEXT_LOSS_WEIGHT.
USE_CONTEXT_AUXILIARY = True  # @param {type:"boolean"}
# @markdown Add context prediction as auxiliary training target

CONTEXT_LOSS_WEIGHT = 0.33  # @param {type:"number"}
# @markdown Weight for context prediction loss

# When True, test ids are restricted to passages that have a bond whose
# "context" is "prescriptive".
STRICT_PRESCRIPTIVE_TEST = False  # @param {type:"boolean"}
# @markdown Only evaluate on prescriptive examples (reduces test set ~97%!)

# @markdown **v10.10: Role-Aware Data Augmentation:**
USE_ROLE_AUGMENTATION = True  # @param {type:"boolean"}
# @markdown Adds contrastive loss for agent/patient role sensitivity
# Checked once per training batch against random.random().
ROLE_AUGMENT_PROB = 0.3  # @param {type:"number"}
# @markdown Probability of augmenting each batch
ROLE_CONTRASTIVE_WEIGHT = 0.2  # @param {type:"number"}
# @markdown Weight for role contrastive loss
# Used in the hinge term relu(margin - distance) between original and
# role-swapped embeddings.
ROLE_CONTRASTIVE_MARGIN = 0.5  # @param {type:"number"}
# @markdown Minimum embedding distance for role-swapped pairs


def swap_roles_simple(text, language):
    """Swap agent and patient in *text* using simple per-language regex rules.

    v10.10: Addresses weak role_swap sensitivity (0.003) from fuzz testing.

    Args:
        text: Raw passage text.
        language: Key selecting the rule set ("english", "hebrew",
            "classical_chinese", "arabic", "sanskrit", "pali").  Any other
            value falls back to the English rules.

    Returns:
        The text with the first and third captured slots exchanged by the
        first rule that changes it (matching is case-insensitive and applies
        to every occurrence of that rule), or ``None`` when the text is empty
        or no rule changes it.
    """
    if not text:
        return None

    patterns = {
        "english": [
            # The article-aware rule has to come before the bare "must" rule:
            # the bare rule also matches "the A must V the B" and would swap
            # A with the second article ("the the must V A B").
            (r"the (\w+) must (\w+) the (\w+)", r"the \3 must \2 the \1"),
            (r"(\w+) must (\w+) (\w+)", r"\3 must \2 \1"),
            (r"(\w+) should (\w+) (\w+)", r"\3 should \2 \1"),
            (r"(\w+) shall (\w+) (\w+)", r"\3 shall \2 \1"),
            (r"(\w+) is obligated to (\w+) (\w+)", r"\3 is obligated to \2 \1"),
            (r"(\w+) has a duty to (\w+) (\w+)", r"\3 has a duty to \2 \1"),
        ],
        "hebrew": [
            (r"על (\S+) ל(\S+) את (\S+)", r"על \3 ל\2 את \1"),
        ],
        "classical_chinese": [
            (r"(\S)當(\S)(\S)", r"\3當\2\1"),
            (r"(\S)須(\S)(\S)", r"\3須\2\1"),
            (r"(\S)應(\S)(\S)", r"\3應\2\1"),
        ],
        "arabic": [
            (r"يجب على (\S+) أن (\S+) (\S+)", r"يجب على \3 أن \2 \1"),
            (r"(\S+) عليه أن (\S+) (\S+)", r"\3 عليه أن \2 \1"),
        ],
        "sanskrit": [
            (r"(\S+)ः (\S+)म् (\S+)ति", r"\3ः \2म् \1ति"),
        ],
        "pali": [
            (r"(\S+)o (\S+)aṃ (\S+)ti", r"\3o \2aṃ \1ti"),
        ],
    }

    lang_patterns = patterns.get(language, patterns["english"])
    for pattern, replacement in lang_patterns:
        # re.sub returns the text unchanged when the rule does not match, so
        # no separate re.search is needed.
        swapped = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
        if swapped != text:
            return swapped
    return None


# Print the run settings in one write.
_rule = "=" * 60
_settings_report = [
    _rule,
    "TRAINING BIP MODEL",
    _rule,
    "\nSettings:",
    f"  Backbone:     {BACKBONE}",
    f"  GPU Tier:     {GPU_TIER}",
    f"  Batch size:   {BATCH_SIZE}",
    f"  Workers:      {NUM_WORKERS}",
    f"  Learning rate: {LR:.2e}",
    f"  Adv weights:  lang={LANG_WEIGHT}, period={PERIOD_WEIGHT}",
    "(0.01 prevents loss explosion while maintaining invariance)",
    f"  Confidence weighting: {USE_CONFIDENCE_WEIGHTING}",
    f"  Context auxiliary: {USE_CONTEXT_AUXILIARY} (weight={CONTEXT_LOSS_WEIGHT})",
    f"  Strict prescriptive test: {STRICT_PRESCRIPTIVE_TEST}",
    f"  Role augmentation: {USE_ROLE_AUGMENTATION} (prob={ROLE_AUGMENT_PROB}, weight={ROLE_CONTRASTIVE_WEIGHT})",
]
print("\n".join(_settings_report))

# tokenizer loaded in Cell 6 based on BACKBONE selection

# Split definitions (ids and sizes per split name) written by an earlier cell.
with open("data/splits/all_splits.json") as f:
    all_splits = json.load(f)

# (form flag, split name), listed in training order; v10.9 splits come last.
_split_switches = (
    (TRAIN_HEBREW_TO_OTHERS, "hebrew_to_others"),
    (TRAIN_SEMITIC_TO_NON_SEMITIC, "semitic_to_non_semitic"),
    (TRAIN_ANCIENT_TO_MODERN, "ancient_to_modern"),
    (TRAIN_MIXED_BASELINE, "mixed_baseline"),
    (TRAIN_ABBY_TO_CHINESE, "abby_to_chinese"),
    # v10.9 new splits
    (TRAIN_CONFUCIAN_TO_BUDDHIST, "confucian_to_buddhist"),
    (TRAIN_CONFUCIAN_TO_LEGALIST, "confucian_to_legalist"),
    (TRAIN_ALL_TO_SANSKRIT, "all_to_sanskrit"),
    (TRAIN_SEMITIC_TO_INDIC, "semitic_to_indic"),
    (TRAIN_QURAN_TO_FIQH, "quran_to_fiqh"),
)
splits_to_train = [name for enabled, name in _split_switches if enabled]

print(f"\nTraining {len(splits_to_train)} splits: {splits_to_train}")

all_results = {}
MIN_TEST_SIZE = 100  # Lowered to allow smaller test sets like Chinese

def create_model_with_retry():
    """Create a ``BIPModel`` on ``device``, retrying once after a CUDA OOM.

    On ``torch.cuda.OutOfMemoryError`` the notebook globals ``model``,
    ``analyzer`` and ``encoder`` (if present) are moved to CPU when they
    support it and set to ``None``; garbage is collected, the CUDA cache is
    emptied, and construction is attempted a second time.  A second failure
    propagates to the caller.
    """
    try:
        return BIPModel().to(device)
    except torch.cuda.OutOfMemoryError:
        print("  OOM on model creation - cleaning up and retrying...")
        # Clean up any existing model in globals
        _g = globals()
        for _var in ["model", "analyzer", "encoder"]:
            if _var in _g and _g[_var] is not None:
                try:
                    if hasattr(_g[_var], "cpu"):
                        _g[_var].cpu()
                    _g[_var] = None
                except Exception:
                    pass  # Best effort: cleanup must not mask the retry
        # Force cleanup
        gc.collect()
        gc.collect()
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
        # Retry
        return BIPModel().to(device)


def get_adv_lambda(epoch, warmup=3):
    """Return the adversarial strength for *epoch*.

    Linear ramp ``0.1 + 0.9 * epoch / warmup`` up to and including ``warmup``
    (so 1.0 is reached at ``epoch == warmup``), then held at 1.0.  Epochs in
    the loop below start at 1, so the first value actually used is
    ``0.1 + 0.9 / warmup``.
    """
    if epoch <= warmup:
        return 0.1 + 0.9 * (epoch / warmup)
    return 1.0


# Train and evaluate one model per selected split; results go to all_results.
for split_idx, split_name in enumerate(splits_to_train):
    split_start = time.time()
    print("\n" + "=" * 60)
    print(f"[{split_idx+1}/{len(splits_to_train)}] {split_name}")
    print("=" * 60)

    split = all_splits[split_name]
    print(f"Train: {split['train_size']:,} | Test: {split['test_size']:,}")

    if split["test_size"] < MIN_TEST_SIZE:
        print(f"WARNING: Test set only {split['test_size']} samples (need {MIN_TEST_SIZE})")
        print("Skipping this split - results would be unreliable")
        print("To fix: Add more data to the test languages/periods")
        continue

    # Datasets are built before the model so that every early `continue`
    # below happens while no model for this split sits on the GPU.
    train_dataset = NativeDataset(
        set(split["train_ids"]),
        "data/processed/passages.jsonl",
        "data/processed/bonds.jsonl",
        tokenizer,
    )

    test_ids_to_use = split["test_ids"][:MAX_TEST_SAMPLES]

    # Optional: strict prescriptive-only test
    if STRICT_PRESCRIPTIVE_TEST:
        print("Filtering to prescriptive examples only...")
        # Load bonds to filter
        prescriptive_ids = set()
        with open("data/processed/bonds.jsonl") as f:
            for line in f:
                b = json.loads(line)
                if b.get("context") == "prescriptive":
                    prescriptive_ids.add(b["passage_id"])
        test_ids_to_use = [tid for tid in test_ids_to_use if tid in prescriptive_ids]
        print(f"  Filtered to {len(test_ids_to_use):,} prescriptive samples")

    test_dataset = NativeDataset(
        set(test_ids_to_use),
        "data/processed/passages.jsonl",
        "data/processed/bonds.jsonl",
        tokenizer,
    )

    if len(train_dataset) == 0:
        print("ERROR: No training data!")
        continue

    if len(test_dataset) == 0:
        # Evaluation divides by the number of test predictions.
        print("ERROR: No test data!")
        continue

    # Use hardware-optimized batch size
    actual_batch = min(BATCH_SIZE, max(32, len(train_dataset) // 20))
    print(f"Actual batch size: {actual_batch}")

    train_loader = DataLoader(
        train_dataset,
        batch_size=actual_batch,
        shuffle=True,
        collate_fn=collate_fn,
        drop_last=True,
        num_workers=0,
        pin_memory=True,
    )
    test_loader = DataLoader(
        test_dataset,
        batch_size=actual_batch * 2,
        shuffle=False,
        collate_fn=collate_fn,
        num_workers=0,
        pin_memory=True,
    )

    if len(train_loader) == 0:
        # drop_last=True discards the only (incomplete) batch, which would
        # leave the epoch loop with zero batches and a division by zero.
        print(
            f"ERROR: {len(train_dataset)} training samples do not fill one batch of {actual_batch} - skipping"
        )
        continue

    # Create model with OOM recovery
    model = create_model_with_retry()

    optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=0.01)

    best_loss = float("inf")
    start_epoch = 1

    # Check for existing checkpoint to resume from
    checkpoint_path = f"models/checkpoints/latest_{split_name}.pt"
    if os.path.exists(checkpoint_path):
        print(f"  Found checkpoint, resuming...")
        checkpoint = torch.load(checkpoint_path, map_location=device)
        model.load_state_dict(checkpoint["model_state"])
        optimizer.load_state_dict(checkpoint["optimizer_state"])
        start_epoch = checkpoint["epoch"] + 1
        best_loss = checkpoint["best_loss"]
        print(f"  Resuming from epoch {start_epoch}, best_loss={best_loss:.4f}")

    for epoch in range(start_epoch, N_EPOCHS + 1):
        model.train()
        total_loss = 0
        n_batches = 0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch}", leave=False):
            optimizer.zero_grad()

            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            bond_labels = batch["bond_labels"].to(device)
            language_labels = batch["language_labels"].to(device)
            period_labels = batch["period_labels"].to(device)

            adv_lambda = get_adv_lambda(epoch)

            # Use new autocast API
            with torch.amp.autocast("cuda", enabled=USE_AMP):
                out = model(input_ids, attention_mask, adv_lambda=adv_lambda)

                # Weighted bond loss
                if USE_CONFIDENCE_WEIGHTING:
                    sample_weights = batch["sample_weights"].to(device)
                    loss_bond = F.cross_entropy(out["bond_pred"], bond_labels, reduction="none")
                    loss_bond = (loss_bond * sample_weights).mean()
                else:
                    loss_bond = F.cross_entropy(out["bond_pred"], bond_labels)

                # Context auxiliary loss
                if USE_CONTEXT_AUXILIARY:
                    context_labels = batch["context_labels"].to(device)
                    loss_context = F.cross_entropy(out["context_pred"], context_labels)
                else:
                    loss_context = 0

                loss_lang = F.cross_entropy(out["language_pred"], language_labels)
                loss_period = F.cross_entropy(out["period_pred"], period_labels)

            loss = (
                loss_bond
                + LANG_WEIGHT * loss_lang
                + PERIOD_WEIGHT * loss_period
                + CONTEXT_LOSS_WEIGHT * loss_context
            )

            # v10.10: Role contrastive loss for agent/patient sensitivity
            loss_role = torch.tensor(0.0, device=device)
            if USE_ROLE_AUGMENTATION and random.random() < ROLE_AUGMENT_PROB:
                batch_texts = batch.get("texts", [])
                batch_languages = batch.get("languages", [])

                swapped_texts = []
                original_indices = []

                for i, (text, lang) in enumerate(zip(batch_texts, batch_languages)):
                    swapped = swap_roles_simple(text, lang)
                    if swapped:
                        swapped_texts.append(swapped)
                        original_indices.append(i)

                # The contrastive term is only used with at least two pairs.
                if len(swapped_texts) >= 2:
                    # Tokenize swapped texts
                    swapped_encoded = tokenizer(
                        swapped_texts,
                        padding=True,
                        truncation=True,
                        max_length=128,
                        return_tensors="pt",
                    )
                    swapped_ids = swapped_encoded["input_ids"].to(device)
                    swapped_mask = swapped_encoded["attention_mask"].to(device)

                    # Get embeddings for swapped texts (no adversarial)
                    # NOTE(review): this forward pass runs outside the autocast
                    # context used for the original batch - confirm intended.
                    swapped_out = model(swapped_ids, swapped_mask, adv_lambda=0)

                    # Get original embeddings for corresponding indices
                    z_original = out["z"][original_indices]
                    z_swapped = swapped_out["z"]

                    # Contrastive loss: push role-swapped embeddings apart
                    # Hinge loss: max(0, margin - distance)
                    distances = F.pairwise_distance(z_original, z_swapped)
                    loss_role = F.relu(ROLE_CONTRASTIVE_MARGIN - distances).mean()

                    # Clean up
                    del swapped_ids, swapped_mask, swapped_out, z_original, z_swapped, distances

            loss = loss + ROLE_CONTRASTIVE_WEIGHT * loss_role

            if USE_AMP and scaler:
                scaler.scale(loss).backward()
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                scaler.step(optimizer)
                scaler.update()
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()

            total_loss += loss.item()
            n_batches += 1

            # Delete intermediate tensors to prevent memory accumulation
            del input_ids, attention_mask, bond_labels, language_labels, period_labels
            del out, loss, loss_bond, loss_lang, loss_period
            if USE_CONFIDENCE_WEIGHTING:
                del sample_weights
            if USE_CONTEXT_AUXILIARY:
                del context_labels, loss_context
            if USE_ROLE_AUGMENTATION:
                del loss_role

        # n_batches > 0 is guaranteed by the len(train_loader) check above.
        avg_loss = total_loss / n_batches

        # Aggressive memory cleanup after each epoch
        gc.collect()

        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.synchronize()
            mem_alloc = torch.cuda.memory_allocated() / 1e9
            mem_reserved = torch.cuda.memory_reserved() / 1e9
            print(
                f"Epoch {epoch}: Loss={avg_loss:.4f} (adv_lambda={adv_lambda:.2f}) [GPU: {mem_alloc:.1f}GB alloc, {mem_reserved:.1f}GB reserved]"
            )
        else:
            print(f"Epoch {epoch}: Loss={avg_loss:.4f} (adv_lambda={adv_lambda:.2f})")

        # Update the running best before writing the resume checkpoint, so a
        # resumed run does not start from a best_loss that is one epoch stale
        # (which could let a worse epoch overwrite the best model).
        is_best = avg_loss < best_loss
        if is_best:
            best_loss = avg_loss
            # The best weights are written first: if the run dies before the
            # resume checkpoint is saved, this epoch is simply repeated.
            torch.save(model.state_dict(), f"models/checkpoints/best_{split_name}.pt")
            torch.save(model.state_dict(), f"{SAVE_DIR}/best_{split_name}.pt")

        # Save checkpoint every epoch (for crash recovery)
        checkpoint = {
            "epoch": epoch,
            "model_state": model.state_dict(),
            "optimizer_state": optimizer.state_dict(),
            "loss": avg_loss,
            "best_loss": best_loss,
        }
        torch.save(checkpoint, checkpoint_path)

    # Evaluate
    print("\nEvaluating...")
    model.load_state_dict(
        torch.load(f"models/checkpoints/best_{split_name}.pt", map_location=device)
    )
    model.eval()

    all_preds = {"bond": [], "lang": []}
    all_labels = {"bond": [], "lang": []}
    all_languages = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Testing"):
            out = model(batch["input_ids"].to(device), batch["attention_mask"].to(device), 0)
            all_preds["bond"].extend(out["bond_pred"].argmax(-1).cpu().tolist())
            all_preds["lang"].extend(out["language_pred"].argmax(-1).cpu().tolist())
            all_labels["bond"].extend(batch["bond_labels"].tolist())
            all_labels["lang"].extend(batch["language_labels"].tolist())
            all_languages.extend(batch["languages"])

    bond_f1 = f1_score(all_labels["bond"], all_preds["bond"], average="macro", zero_division=0)
    bond_acc = sum(p == l for p, l in zip(all_preds["bond"], all_labels["bond"])) / len(
        all_preds["bond"]
    )
    lang_acc = sum(p == l for p, l in zip(all_preds["lang"], all_labels["lang"])) / len(
        all_preds["lang"]
    )

    # Per-language F1 (only for languages with more than 10 test samples)
    lang_f1 = {}
    for lang in set(all_languages):
        mask = [l == lang for l in all_languages]
        if sum(mask) > 10:
            preds = [p for p, m in zip(all_preds["bond"], mask) if m]
            labels = [l for l, m in zip(all_labels["bond"], mask) if m]
            lang_f1[lang] = {
                "f1": f1_score(labels, preds, average="macro", zero_division=0),
                "n": sum(mask),
            }

    all_results[split_name] = {
        "bond_f1_macro": bond_f1,
        "bond_acc": bond_acc,
        "language_acc": lang_acc,
        "per_language_f1": lang_f1,
        "training_time": time.time() - split_start,
    }

    print(f"\n{split_name} RESULTS:")
    print(f"  Bond F1 (macro): {bond_f1:.3f} ({bond_f1/0.1:.1f}x chance)")
    print(f"  Bond accuracy:   {bond_acc:.1%}")
    print(f"  Language acc:    {lang_acc:.1%} (want ~20% = invariant)")
    print("  Per-language:")
    for lang, m in sorted(lang_f1.items(), key=lambda x: -x[1]["n"]):
        print(f"    {lang:20s}: F1={m['f1']:.3f} (n={m['n']:,})")

    # Context analysis
    high_conf = sum(1 for c in test_dataset.data if c["confidence"] == "high")
    prescriptive = sum(1 for c in test_dataset.data if c["context"] == "prescriptive")
    print(
        f"  Context: {prescriptive:,}/{len(test_dataset):,} prescriptive ({prescriptive/len(test_dataset)*100:.1f}%)"
    )
    print(
        f"  High confidence: {high_conf:,}/{len(test_dataset):,} ({high_conf/len(test_dataset)*100:.1f}%)"
    )

    # GPU memory usage before cleanup
    if torch.cuda.is_available():
        mem = torch.cuda.memory_allocated() / 1e9
        print(
            f"\n  GPU memory (before cleanup): {mem:.1f} GB / {VRAM_GB:.1f} GB ({mem/VRAM_GB*100:.0f}%)"
        )

    # Aggressive memory cleanup between splits
    # Step 1: Move model to CPU to release GPU memory
    model.cpu()

    # Step 2: Delete all references
    del model, train_dataset, test_dataset, train_loader, test_loader, optimizer
    if USE_AMP and scaler:
        del scaler

    # Step 3: Force garbage collection (multiple passes)
    for _ in range(3):
        gc.collect()

    # Step 4: Clear CUDA cache
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

    # Step 5: Re-create scaler for next split
    if USE_AMP:
        scaler = torch.amp.GradScaler("cuda")

    # GPU memory after cleanup
    if torch.cuda.is_available():
        mem_after = torch.cuda.memory_allocated() / 1e9
        print(f"  GPU memory (after cleanup): {mem_after:.1f} GB (freed {mem - mem_after:.1f} GB)")
        if mem_after > 1.0:
            print(f"  WARNING: {mem_after:.1f} GB still allocated - may cause OOM on next split")

print("\n" + "=" * 60)
print("TRAINING COMPLETE")
print("=" * 60)


In [None]:
# @title 8. Geometric Analysis & Linear Probe { display-mode: "form" }
# @markdown v10.9: New geometric analysis module + linear probe test
# @markdown Tests latent space structure (axis discovery, role swap analysis)
# @markdown Tests if z_bond encodes language/period (should be low = invariant)

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import numpy as np
from typing import List, Dict, Tuple


# ===== v10.9: GEOMETRIC ANALYZER CLASS =====
class GeometricAnalyzer:
    """
    Probe the latent space geometry to discover moral structure.

    Wraps a model that exposes ``get_bond_embedding(input_ids, attention_mask)``
    together with the tokenizer used to encode text for it.  All analyses work
    on the flattened embedding returned by :meth:`get_embedding`.
    """

    def __init__(self, model, tokenizer, device):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device

    @torch.no_grad()
    def get_embedding(self, text: str) -> np.ndarray:
        """Return the bond embedding of *text* as a flat NumPy array.

        The text is tokenized on its own, truncated and padded to 128 tokens,
        moved to ``self.device`` and passed to ``model.get_bond_embedding``.
        """
        inputs = self.tokenizer(
            text, return_tensors="pt", truncation=True, max_length=128, padding="max_length"
        )
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        z = self.model.get_bond_embedding(inputs["input_ids"], inputs["attention_mask"])
        return z.cpu().numpy().flatten()

    def find_direction(self, positive_texts: List[str], negative_texts: List[str]) -> np.ndarray:
        """
        Find the direction in z-space that separates two concepts.
        E.g., obligation vs permission, harm vs care.

        Returns the difference of the two class means (positive minus
        negative), scaled to unit length (a 1e-9 term guards the division).
        """
        pos_embs = np.array([self.get_embedding(t) for t in positive_texts])
        neg_embs = np.array([self.get_embedding(t) for t in negative_texts])

        pos_mean = pos_embs.mean(axis=0)
        neg_mean = neg_embs.mean(axis=0)

        direction = pos_mean - neg_mean
        direction = direction / (np.linalg.norm(direction) + 1e-9)
        return direction

    def test_direction_transfer(
        self, direction: np.ndarray, test_pairs: List[Tuple[str, str]]
    ) -> float:
        """
        Test if a direction generalizes to new examples.
        Returns accuracy of direction-based classification.

        A pair counts as correct when the positive text projects strictly
        higher onto *direction* than the negative text.
        """
        scores = []
        for pos_text, neg_text in test_pairs:
            pos_proj = np.dot(self.get_embedding(pos_text), direction)
            neg_proj = np.dot(self.get_embedding(neg_text), direction)
            scores.append(1.0 if pos_proj > neg_proj else 0.0)
        return np.mean(scores)

    def pca_on_pairs(self, concept_pairs: Dict[str, List[Tuple[str, str]]]) -> Dict:
        """
        Run PCA on difference vectors to find dominant axes.

        concept_pairs: {"obligation_permission": [(obl1, perm1), ...], ...}

        Returns a dict with the fitted ``components``, their
        ``explained_variance_ratio``, the concept ``labels`` (one per pair, in
        input order) and the ``transformed`` difference vectors.

        Raises:
            ValueError: if *concept_pairs* contains no pairs at all.
        """
        all_diffs = []
        labels = []

        for concept, pairs in concept_pairs.items():
            for pos, neg in pairs:
                diff = self.get_embedding(pos) - self.get_embedding(neg)
                all_diffs.append(diff)
                labels.append(concept)

        if not all_diffs:
            raise ValueError("pca_on_pairs needs at least one (positive, negative) pair")

        X = np.array(all_diffs)

        # PCA cannot extract more components than min(n_samples, n_features),
        # so the cap of 10 is bounded by both dimensions of X.
        n_components = min(10, X.shape[0], X.shape[1])
        pca = PCA(n_components=n_components)
        pca.fit(X)

        return {
            "components": pca.components_,
            "explained_variance_ratio": pca.explained_variance_ratio_,
            "labels": labels,
            "transformed": pca.transform(X),
        }

    def role_swap_analysis(self, agent_patient_pairs: List[Tuple[str, str]]) -> Dict:
        """
        Test if swapping agent/patient produces consistent transformation.

        agent_patient_pairs: [("A harmed B", "B harmed A"), ...]

        Returns the mean (swapped minus original) vector plus the mean and
        standard deviation of each pair's cosine similarity to that mean.
        """
        transformations = []

        for original, swapped in agent_patient_pairs:
            orig_emb = self.get_embedding(original)
            swap_emb = self.get_embedding(swapped)
            transformations.append(swap_emb - orig_emb)

        T = np.array(transformations)

        # Check consistency: are all transformations similar?
        mean_transform = T.mean(axis=0)
        mean_norm = np.linalg.norm(mean_transform)  # loop-invariant
        cosines = [np.dot(t, mean_transform) / (np.linalg.norm(t) * mean_norm + 1e-9) for t in T]

        return {
            "mean_transform": mean_transform,
            "consistency": np.mean(cosines),
            "consistency_std": np.std(cosines),
        }


print("=" * 60)
print("LINEAR PROBE TEST")
print("=" * 60)
print("\nIf probe accuracy is NEAR CHANCE, representation is INVARIANT")
print("(This is what we want for BIP)")

probe_results = {}

# For each split: embed up to 5000 test passages with the saved best model,
# then fit logistic-regression probes that try to recover language and period
# from the standardized embeddings (70/30 random split of those embeddings).
for split_name in ["hebrew_to_others", "semitic_to_non_semitic"]:
    model_path = f"{SAVE_DIR}/best_{split_name}.pt"
    if not os.path.exists(model_path):
        print(f"\nSkipping {split_name} - no saved model")
        continue

    print(f"\n{'='*50}")
    print(f"PROBE: {split_name}")
    print("=" * 50)

    # The dataset is built before the model so that skipping a split with too
    # few samples does not leave a freshly loaded model on the GPU.
    test_ids = set(all_splits[split_name]["test_ids"][:5000])
    test_dataset = NativeDataset(
        test_ids, "data/processed/passages.jsonl", "data/processed/bonds.jsonl", tokenizer
    )

    if len(test_dataset) < 50:
        print(f"  Skip - only {len(test_dataset)} samples")
        continue

    model = BIPModel().to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    test_loader = DataLoader(test_dataset, batch_size=128, collate_fn=collate_fn, num_workers=0)

    all_z, all_lang, all_period = [], [], []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Extract"):
            out = model(batch["input_ids"].to(device), batch["attention_mask"].to(device), 0)
            all_z.append(out["z"].cpu().numpy())
            all_lang.extend(batch["language_labels"].tolist())
            all_period.extend(batch["period_labels"].tolist())

    X = np.vstack(all_z)
    y_lang = np.array(all_lang)
    y_period = np.array(all_period)

    scaler_probe = StandardScaler()
    X_scaled = scaler_probe.fit_transform(X)

    # Train/test split for probes
    n = len(X)
    idx = np.random.permutation(n)
    train_idx, test_idx = idx[: int(0.7 * n)], idx[int(0.7 * n) :]

    # Language probe - check for multiple classes.  The training part is
    # checked as well, because the classifier cannot be fitted on one class.
    unique_langs = np.unique(y_lang[test_idx])
    n_lang_classes = min(len(unique_langs), len(np.unique(y_lang[train_idx])))
    if n_lang_classes < 2:
        print(f"  SKIP language probe - only {n_lang_classes} class")
        lang_acc = 1.0 / max(1, len(np.unique(y_lang)))
        lang_chance = lang_acc
    else:
        lang_probe = LogisticRegression(max_iter=1000, n_jobs=-1)
        lang_probe.fit(X_scaled[train_idx], y_lang[train_idx])
        lang_acc = (lang_probe.predict(X_scaled[test_idx]) == y_lang[test_idx]).mean()
        lang_chance = 1.0 / len(unique_langs)

    # Period probe - same check
    unique_periods = np.unique(y_period[test_idx])
    n_period_classes = min(len(unique_periods), len(np.unique(y_period[train_idx])))
    if n_period_classes < 2:
        print(f"  SKIP period probe - only {n_period_classes} class")
        period_acc = 1.0 / max(1, len(np.unique(y_period)))
        period_chance = period_acc
    else:
        period_probe = LogisticRegression(max_iter=1000, n_jobs=-1)
        period_probe.fit(X_scaled[train_idx], y_period[train_idx])
        period_acc = (period_probe.predict(X_scaled[test_idx]) == y_period[test_idx]).mean()
        period_chance = 1.0 / len(unique_periods)

    # "Invariant" means the probe beats chance by less than 15 points.
    lang_status = "INVARIANT" if lang_acc < lang_chance + 0.15 else "NOT invariant"
    period_status = "INVARIANT" if period_acc < period_chance + 0.15 else "NOT invariant"

    probe_results[split_name] = {
        "language_acc": lang_acc,
        "language_chance": lang_chance,
        "language_status": lang_status,
        "period_acc": period_acc,
        "period_chance": period_chance,
        "period_status": period_status,
    }

    print(f"\nRESULTS:")
    print(f"  Language: {lang_acc:.1%} (chance: {lang_chance:.1%}) -> {lang_status}")
    print(f"  Period:   {period_acc:.1%} (chance: {period_chance:.1%}) -> {period_status}")

    del model
    torch.cuda.empty_cache()

print("\n" + "=" * 60)
print("Probe tests complete")
print("=" * 60)

# ===== v10.9: GEOMETRIC ANALYSIS =====
print("\n" + "=" * 60)
print("GEOMETRIC ANALYSIS (v10.9)")
print("=" * 60)
print("\nDiscovering interpretable axes in latent space...")

# Test pairs for axis discovery (cross-lingual)
OBLIGATION_PERMISSION_TRAIN = [
    # English - training set
    ("You must help the elderly", "You may help the elderly"),
    ("He is required to pay", "He is allowed to pay"),
    ("Parents must protect children", "Parents may protect children"),
]

OBLIGATION_PERMISSION_TEST = [
    # Chinese
    ("君子必孝", "君子可孝"),  # Gentleman must/may be filial
    ("民必從法", "民可從法"),  # People must/may follow law
    # Arabic
    ("يجب عليك أن تساعد", "يجوز لك أن تساعد"),  # You must/may help
    # Hebrew
    ("חייב לכבד", "מותר לכבד"),  # Obligated/permitted to honor
    # English - held out
    ("She must attend", "She may attend"),
]

HARM_CARE_PAIRS = [
    ("He injured the child", "He protected the child"),
    ("殺人者", "救人者"),  # One who kills / one who saves
    ("ظلم الضعيف", "رحم الضعيف"),  # Oppressed / showed mercy to the weak
    ("She hurt the patient", "She healed the patient"),
]

ROLE_SWAP_PAIRS = [
    ("The master commands the servant", "The servant commands the master"),
    ("君命臣", "臣命君"),  # Lord commands minister / minister commands lord
    ("الأب يأمر الابن", "الابن يأمر الأب"),  # Father commands son / son commands father
    ("The parent guides the child", "The child guides the parent"),
]

# Values stored in geometry_results are converted to built-in float/int/bool
# (as explained_variance already is, via .tolist()) so that the dict contains
# no NumPy scalars, which the json module cannot serialize.
geometry_results = {}

# Use the best model from mixed_baseline split for geometric analysis
model_path = f"{SAVE_DIR}/best_mixed_baseline.pt"
if os.path.exists(model_path):
    print("\nLoading model for geometric analysis...")
    model = BIPModel().to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    analyzer = GeometricAnalyzer(model, tokenizer, device)

    # 1. Find obligation/permission axis
    print("\n--- Obligation/Permission Axis ---")
    obl_texts = [p[0] for p in OBLIGATION_PERMISSION_TRAIN]
    perm_texts = [p[1] for p in OBLIGATION_PERMISSION_TRAIN]
    obl_perm_axis = analyzer.find_direction(obl_texts, perm_texts)

    # Test transfer to other languages
    transfer_acc = float(
        analyzer.test_direction_transfer(obl_perm_axis, OBLIGATION_PERMISSION_TEST)
    )
    print(f"  Direction found from English training pairs")
    print(f"  Transfer accuracy to other languages: {transfer_acc:.1%}")
    axis_status = "STRONG" if transfer_acc > 0.8 else "WEAK" if transfer_acc > 0.5 else "FAILED"
    print(f"  Status: {axis_status} deontic axis")

    geometry_results["obligation_permission"] = {
        "transfer_accuracy": transfer_acc,
        "status": axis_status,
    }

    # 2. Find harm/care axis
    print("\n--- Harm/Care Axis ---")
    harm_texts = [p[0] for p in HARM_CARE_PAIRS]
    care_texts = [p[1] for p in HARM_CARE_PAIRS]
    harm_care_axis = analyzer.find_direction(harm_texts, care_texts)

    # Check axis orthogonality (absolute dot product of the two unit axes)
    axis_correlation = float(abs(np.dot(obl_perm_axis, harm_care_axis)))
    print(f"  Axis found")
    print(f"  Correlation with obl/perm axis: {axis_correlation:.3f}")
    is_orthogonal = axis_correlation < 0.3
    orthogonal = "ORTHOGONAL" if is_orthogonal else "CORRELATED"
    print(f"  Status: {orthogonal}")

    geometry_results["harm_care"] = {
        "axis_correlation": axis_correlation,
        "orthogonal": is_orthogonal,
    }

    # 3. Role swap analysis
    print("\n--- Role Swap Analysis ---")
    role_analysis = analyzer.role_swap_analysis(ROLE_SWAP_PAIRS)
    role_consistency = float(role_analysis["consistency"])
    role_consistency_std = float(role_analysis["consistency_std"])
    print(f"  Mean consistency: {role_consistency:.3f} +/- {role_consistency_std:.3f}")
    role_status = "CONSISTENT" if role_consistency > 0.9 else "VARIABLE"
    print(f"  Status: {role_status} agent/patient transformation")

    geometry_results["role_swap"] = {
        "consistency": role_consistency,
        "consistency_std": role_consistency_std,
        "status": role_status,
    }

    # 4. PCA on all structural pairs
    print("\n--- PCA Analysis ---")
    all_concept_pairs = {
        "obligation_permission": OBLIGATION_PERMISSION_TRAIN + OBLIGATION_PERMISSION_TEST,
        "harm_care": HARM_CARE_PAIRS,
    }
    pca_results = analyzer.pca_on_pairs(all_concept_pairs)

    # Number of leading components needed to exceed 90% cumulative variance;
    # all of them if the threshold is never exceeded.
    cumsum = np.cumsum(pca_results["explained_variance_ratio"])
    above_90 = cumsum > 0.9
    n_components_90 = int(np.argmax(above_90)) + 1 if above_90.any() else len(cumsum)

    print(f"  Explained variance ratio: {pca_results['explained_variance_ratio'][:5]}")
    print(f"  Components for 90% variance: {n_components_90}")
    pca_status = "LOW-DIM" if n_components_90 <= 3 else "HIGH-DIM"
    print(f"  Status: {pca_status} moral structure")

    geometry_results["pca"] = {
        "explained_variance": pca_results["explained_variance_ratio"].tolist(),
        "n_components_90pct": n_components_90,
        "status": pca_status,
    }

    del model
    torch.cuda.empty_cache()
else:
    print(f"\nSkipping geometric analysis - no model at {model_path}")
    geometry_results = {"error": "No model available"}

print("\n" + "=" * 60)
print("Geometric analysis complete")
print("=" * 60)


In [None]:
# @title 9. Fuzz Testing & Final Results { display-mode: "form" }
# @markdown v10.9: Structural vs Surface fuzz testing + comprehensive summary
# @markdown Tests if model responds to moral structure (good) vs surface features (bad)

import json
import shutil
from scipy import stats

# Cell banner followed by a two-line reminder of what the fuzz test expects.
for _banner_line in (
    "=" * 60,
    "FUZZ TESTING (v10.9)",
    "=" * 60,
    "\nTesting: structural changes should move embeddings,",
    "         surface changes should NOT move embeddings.",
):
    print(_banner_line)


# ===== STRUCTURAL FUZZ TEST CLASS =====
class StructuralFuzzTest:
    """
    Extended fuzz testing with cross-lingual pairs.

    Measures how far embeddings move under *structural* edits (the sentence
    pairs in ``STRUCTURAL_PAIRS``) versus *surface* edits (the functions in
    ``SURFACE_PERTURBATIONS`` applied to a few base sentences). Structural
    distances are expected to be large and surface distances small.
    """

    # Structural perturbation templates (language-agnostic concepts)
    STRUCTURAL_PAIRS = {
        "obligation_to_permission": [
            # English
            ("You must help the elderly", "You may help the elderly"),
            ("He is required to pay", "He is allowed to pay"),
            ("Parents must protect children", "Parents may protect children"),
            # Chinese
            ("君子必孝", "君子可孝"),  # Gentleman must/may be filial
            ("民必從法", "民可從法"),  # People must/may follow law
            # Arabic
            ("يجب عليك أن تساعد", "يجوز لك أن تساعد"),  # You must/may help
            # Hebrew
            ("חייב לכבד", "מותר לכבד"),  # Obligated/permitted to honor
        ],
        "harm_to_care": [
            ("He injured the child", "He protected the child"),
            ("殺人者", "救人者"),  # One who kills / one who saves
            ("ظلم الضعيف", "رحم الضعيف"),  # Oppressed / showed mercy to the weak
        ],
        "role_swap": [
            ("The master commands the servant", "The servant commands the master"),
            ("君命臣", "臣命君"),  # Lord commands minister / minister commands lord
            ("الأب يأمر الابن", "الابن يأمر الأب"),  # Father commands son / son commands father
        ],
        "violation_to_fulfillment": [
            ("He broke his promise", "He kept his promise"),
            ("違約", "守約"),  # Violate contract / keep contract
            ("نقض العهد", "وفى بالعهد"),  # Broke covenant / fulfilled covenant
        ],
    }

    # Surface perturbation templates (should NOT move embeddings)
    SURFACE_PERTURBATIONS = {
        "name_change": lambda t: t.replace("John", "Michael").replace("Mary", "Lisa"),
        "irrelevant_detail": lambda t: t + " It was Tuesday.",
        "add_location": lambda t: t + " in the city.",
    }

    @staticmethod
    def _cosine_distance(vec_a, vec_b) -> float:
        """Return ``1 - cosine_similarity`` as a built-in float.

        The ``1e-9`` in the denominator avoids division by zero for zero-norm
        vectors. Converting to ``float`` keeps NumPy scalar types (which
        ``json`` cannot encode natively) out of the results.
        """
        denom = np.linalg.norm(vec_a) * np.linalg.norm(vec_b) + 1e-9
        return float(1 - np.dot(vec_a, vec_b) / denom)

    @staticmethod
    def _summarize(distances) -> dict:
        """Return mean/std/count for a list of distances (zeros when empty)."""
        if not distances:
            return {"mean_distance": 0.0, "std": 0.0, "n": 0}
        return {
            "mean_distance": float(np.mean(distances)),
            "std": float(np.std(distances)),
            "n": len(distances),
        }

    def run_comprehensive_test(self, analyzer) -> dict:
        """
        Run full structural vs surface test battery.

        Args:
            analyzer: Object exposing ``get_embedding(text)``. The return value
                is passed to ``np.dot`` / ``np.linalg.norm``, so it is assumed
                to be a 1-D numeric vector (TODO confirm against the analyzer).

        Returns:
            Dict with one ``structural_<type>`` entry per key of
            ``STRUCTURAL_PAIRS``, a ``surface_all`` entry (each holding
            ``mean_distance``, ``std``, ``n``), and a ``comparison`` entry with
            ``structural_mean``, ``surface_mean``, ``ratio``, ``t_statistic``
            and ``p_value``. All numeric values are built-in Python numbers.
        """
        results = {}

        # Structural perturbations: keep every raw per-pair distance so the
        # significance test below sees the real sample, not category means.
        structural_all = []
        for perturbation_type, pairs in self.STRUCTURAL_PAIRS.items():
            distances = [
                self._cosine_distance(
                    analyzer.get_embedding(text1), analyzer.get_embedding(text2)
                )
                for text1, text2 in pairs
            ]
            results[f"structural_{perturbation_type}"] = self._summarize(distances)
            structural_all.extend(distances)

        # Surface perturbations on base sentences
        base_sentences = [
            "John borrowed money from Mary and must repay it.",
            "The doctor has a duty to help patients.",
            "Parents should protect their children.",
        ]

        surface_distances = []
        for base in base_sentences:
            base_emb = analyzer.get_embedding(base)
            for perturb_fn in self.SURFACE_PERTURBATIONS.values():
                perturbed = perturb_fn(base)
                # A perturbation that leaves the text unchanged (e.g. a name
                # swap on a sentence without those names) carries no signal.
                if perturbed == base:
                    continue
                surface_distances.append(
                    self._cosine_distance(base_emb, analyzer.get_embedding(perturbed))
                )

        results["surface_all"] = self._summarize(surface_distances)

        # Statistical comparison on the raw samples.
        structural_mean = float(np.mean(structural_all)) if structural_all else 0.0
        surface_mean = results["surface_all"]["mean_distance"]

        if structural_all and surface_distances:
            t_stat, p_value = stats.ttest_ind(structural_all, surface_distances)
            t_stat, p_value = float(t_stat), float(p_value)
            ratio = structural_mean / (surface_mean + 1e-9)
        else:
            # Without both samples there is nothing to compare.
            t_stat, p_value, ratio = 0.0, 1.0, 0.0

        results["comparison"] = {
            "structural_mean": structural_mean,
            "surface_mean": surface_mean,
            "ratio": ratio,
            "t_statistic": t_stat,
            "p_value": p_value,
        }

        return results


# Run the fuzz test when a trained checkpoint exists; otherwise mark it skipped.
fuzz_results = {}
model_path = f"{SAVE_DIR}/best_mixed_baseline.pt"

if not os.path.exists(model_path):
    print(f"\nSkipping fuzz test - no model at {model_path}")
    fuzz_status = "SKIPPED"
else:
    print("\nLoading model for fuzz testing...")
    model = BIPModel().to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    # Reuse GeometricAnalyzer from cell 8
    analyzer = GeometricAnalyzer(model, tokenizer, device)
    fuzz_test = StructuralFuzzTest()

    print("\nRunning structural vs surface comparison...")
    fuzz_results = fuzz_test.run_comprehensive_test(analyzer)

    print("\n--- Structural Perturbations (should be HIGH) ---")
    for result_key, summary in fuzz_results.items():
        if not result_key.startswith("structural_"):
            continue
        print(
            f"  {result_key}: distance={summary['mean_distance']:.4f} "
            f"+/- {summary['std']:.4f} (n={summary['n']})"
        )

    print("\n--- Surface Perturbations (should be LOW) ---")
    surface_summary = fuzz_results["surface_all"]
    print(
        f"  surface_all: distance={surface_summary['mean_distance']:.4f} "
        f"+/- {surface_summary['std']:.4f} (n={surface_summary['n']})"
    )

    print("\n--- Statistical Comparison ---")
    comparison = fuzz_results["comparison"]
    print(f"  Structural mean: {comparison['structural_mean']:.4f}")
    print(f"  Surface mean:    {comparison['surface_mean']:.4f}")
    print(f"  Ratio:           {comparison['ratio']:.2f}x")
    print(f"  t-statistic:     {comparison['t_statistic']:.2f}")
    print(f"  p-value:         {comparison['p_value']:.4f}")

    # Interpret results: first matching row wins, so the rows are ordered from
    # strictest to weakest with an unconditional fallback at the end.
    fuzz_ratio = comparison["ratio"]
    fuzz_p = comparison["p_value"]
    status_rows = (
        (
            fuzz_ratio > 2.0 and fuzz_p < 0.05,
            "EXCELLENT",
            "Model strongly distinguishes structural from surface",
        ),
        (fuzz_ratio > 1.5, "GOOD", "Model distinguishes structural from surface"),
        (fuzz_ratio > 1.0, "MARGINAL", "Some structural sensitivity"),
        (True, "FAILED", "Model may be using surface features"),
    )
    fuzz_status, fuzz_msg = next(
        (label, text) for matched, label, text in status_rows if matched
    )

    print(f"\n  FUZZ STATUS: {fuzz_status}")
    print(f"  {fuzz_msg}")

    del model
    torch.cuda.empty_cache()

print("\n" + "=" * 60, "FINAL BIP EVALUATION (v10.9)", "=" * 60, sep="\n")

print(f"\nHardware: {GPU_TIER} ({VRAM_GB:.0f}GB VRAM, {RAM_GB:.0f}GB RAM)")

print("\n" + "-" * 60, "CROSS-DOMAIN TRANSFER RESULTS", "-" * 60, sep="\n")

# A split counts as successful only when bond F1 clears 1.3x the 0.1 baseline
# (reported as "x chance") AND language accuracy stays below 0.35.
successful_splits = []
for split, metrics in all_results.items():
    bond_f1 = metrics["bond_f1_macro"]
    lang_acc = metrics["language_acc"]
    ratio = bond_f1 / 0.1

    transfer_ok = ratio > 1.3
    invariant_ok = lang_acc < 0.35  # Near chance (20%)

    if not transfer_ok:
        status = "FAIL"
    elif invariant_ok:
        status = "SUCCESS"
    else:
        status = "PARTIAL"

    transfer_tag = "OK" if transfer_ok else "WEAK"
    invariance_tag = "INVARIANT" if invariant_ok else "LEAKING"

    print(f"\n{split}:")
    print(f"  Bond F1:     {bond_f1:.3f} ({ratio:.1f}x chance) {transfer_tag}")
    print(f"  Language:    {lang_acc:.1%} {invariance_tag}")
    print(f"  -> {status}")

    if status == "SUCCESS":
        successful_splits.append(split)

print("\n" + "-" * 60, "VERDICT", "-" * 60, sep="\n")

# Verdict tiers keyed by the minimum number of successful splits, strongest
# first. If no tier matches, fall back on whether any split showed a bond F1
# above 0.13 at all.
n_success = len(successful_splits)
verdict_tiers = (
    (
        3,
        "STRONGLY_SUPPORTED",
        "Multiple independent transfer paths demonstrate universal structure",
    ),
    (2, "SUPPORTED", "Multiple transfer paths work"),
    (1, "PARTIALLY_SUPPORTED", "At least one transfer path works"),
)
for min_successes, verdict, msg in verdict_tiers:
    if n_success >= min_successes:
        break
else:
    if any(r["bond_f1_macro"] > 0.13 for r in all_results.values()):
        verdict, msg = "WEAK", "Some transfer signal, but not robust"
    else:
        verdict, msg = "INCONCLUSIVE", "No clear transfer demonstrated"

print(f"\n  Successful transfers: {n_success}/{len(all_results)}")
print(f"  Splits: {successful_splits if successful_splits else 'None'}")
print(f"\n  VERDICT: {verdict}")
print(f"  {msg}")

# v10.9 specific checks
print("\n" + "-" * 60, "v10.9 SPECIFIC CRITERIA", "-" * 60, sep="\n")

# Key v10.9 splits and the label printed for each.
v109_checks = {
    "confucian_to_buddhist": "Chinese diversity test",
    "all_to_sanskrit": "Sanskrit transfer test",
    "quran_to_fiqh": "Arabic improvement test",
}

for split_name, test_name in v109_checks.items():
    if split_name not in all_results:
        print(f"  {test_name}: NOT RUN")
        continue

    r = all_results[split_name]
    f1 = r["bond_f1_macro"]
    # Sanskrit splits are held to a lower bar (0.4) than the others (0.5).
    threshold = 0.4 if "sanskrit" in split_name else 0.5
    status = "PASS" if f1 >= threshold else "FAIL"
    print(f"  {test_name}: F1={f1:.3f} (need {threshold}) -> {status}")

# Geometry results (only when an earlier cell defined a non-empty geometry_results)
geometry_summary = globals().get("geometry_results")
if geometry_summary:
    print("\n  Geometric Analysis:")
    if "obligation_permission" in geometry_summary:
        deontic = geometry_summary["obligation_permission"]
        acc = deontic.get("transfer_accuracy", 0)
        print(f"    Deontic axis transfer: {acc:.1%} (need 80%)")
    if "pca" in geometry_summary:
        pca_summary = geometry_summary["pca"]
        n_comp = pca_summary.get("n_components_90pct", 0)
        print(f"    PCA components for 90%: {n_comp} (need ≤3)")

# Fuzz results (absent when the fuzz test was skipped)
if "comparison" in (fuzz_results or {}):
    fuzz_ratio_value = fuzz_results["comparison"]["ratio"]
    print(f"\n  Fuzz Test: {fuzz_status}")
    print(f"    Structural/Surface ratio: {fuzz_ratio_value:.2f}x")

# Save results
def _json_default(obj):
    """Fallback encoder for values ``json`` cannot serialize natively.

    NumPy scalars and arrays are converted to their native Python
    equivalents so numbers stay numbers in the output file; anything else
    falls back to ``str(obj)`` as before.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    return str(obj)


final_results = {
    # Matches the notebook version stated in the header and the zip file name.
    "version": "v10.10",
    "all_results": all_results,
    # probe_results / geometry_results come from earlier cells that may not
    # have run; default to empty dicts in that case.
    "probe_results": globals().get("probe_results", {}),
    "geometry_results": globals().get("geometry_results", {}),
    "fuzz_results": fuzz_results,
    "successful_splits": successful_splits,
    "verdict": verdict,
    "hardware": {"gpu": GPU_TIER, "vram_gb": VRAM_GB, "ram_gb": RAM_GB},
    "settings": {
        "batch_size": BATCH_SIZE,
        "max_per_lang": MAX_PER_LANG,
        "num_workers": NUM_WORKERS,
    },
    "experiment_time": time.time() - EXPERIMENT_START,
}

# The local results directory is not guaranteed to exist at this point.
os.makedirs("results", exist_ok=True)
with open("results/final_results.json", "w", encoding="utf-8") as fh:
    json.dump(final_results, fh, indent=2, default=_json_default)

print(f"\nTotal time: {(time.time() - EXPERIMENT_START)/60:.1f} minutes")

# Copy to Drive only when the folder is actually reachable, so a missing
# mount does not abort the cell after the local file has been written.
if SAVE_DIR and os.path.exists(SAVE_DIR):
    shutil.copy("results/final_results.json", f"{SAVE_DIR}/final_results.json")
    print("Results saved to Drive!")
else:
    print("WARNING: SAVE_DIR not available, results only in results/final_results.json")
print("\n" + "=" * 60)

In [None]:
# @title 10. Save & Download Results { display-mode: "form" }
# @markdown Persist results to Google Drive and optionally download as zip

import shutil

print("=" * 60, "SAVING RESULTS", "=" * 60, sep="\n")

# Always persist results to Drive
if not (SAVE_DIR and os.path.exists(SAVE_DIR)):
    print("WARNING: SAVE_DIR not available, results only in local directories")
else:
    print(f"\nPersisting to: {SAVE_DIR}")

    # Copy the final results JSON and the splits config, each only if present.
    for local_path in ("results/final_results.json", "data/splits/all_splits.json"):
        if not os.path.exists(local_path):
            continue
        file_name = os.path.basename(local_path)
        dest = f"{SAVE_DIR}/{file_name}"
        shutil.copy(local_path, dest)
        print(f"  Saved: {file_name}")

    # Checkpoints are only listed here, not copied.
    model_files = [entry for entry in os.listdir(SAVE_DIR) if entry.endswith(".pt")]
    if model_files:
        print(f"  Models already in Drive: {len(model_files)} files")
        for mf in model_files[:5]:
            print(f"    - {mf}")
        hidden_count = len(model_files) - 5
        if hidden_count > 0:
            print(f"    ... and {hidden_count} more")

    print(f"\nResults persisted to Google Drive: {SAVE_DIR}")

# Optional: bundle results, checkpoints and split config into one zip.
if not CREATE_DOWNLOAD_ZIP:
    print("\n(Zip download disabled - set CREATE_DOWNLOAD_ZIP=True in cell 1 to enable)")
else:
    import zipfile

    zip_path = "BIP_v10.10_results.zip"
    print("\n" + "-" * 60)
    print("Creating download package...")

    results_json = "results/final_results.json"
    splits_json = "data/splits/all_splits.json"

    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        # Results
        if os.path.exists(results_json):
            zf.write(results_json)

        # Models (from Drive), stored under models/ inside the archive
        if SAVE_DIR and os.path.exists(SAVE_DIR):
            for checkpoint in os.listdir(SAVE_DIR):
                if not checkpoint.endswith(".pt"):
                    continue
                zf.write(f"{SAVE_DIR}/{checkpoint}", f"models/{checkpoint}")

        # Config
        if os.path.exists(splits_json):
            zf.write(splits_json)

    print(f"Download package ready: {zip_path}")

    # Download in Colab, or show path otherwise
    try:
        from google.colab import files
        files.download(zip_path)
    except ImportError:
        print(f"Not running in Colab. Zip saved to: {os.path.abspath(zip_path)}")

print("\n" + "=" * 60, "COMPLETE", "=" * 60, sep="\n")
