# BIP Cross-Temporal & Cross-Lingual Morality Experiment (v8)

**Testing the Bond Invariance Principle across 2500+ years and 6 languages**

This experiment tests whether moral cognition has invariant structure by training and testing across:

| Corpus | Language(s) | Time Period | Passages |
|--------|-------------|-------------|----------|
| **Sefaria** | Hebrew, Aramaic, Judeo-Arabic | ~1000 BCE - 1900 CE | ~200K+ |
| **Chinese Classics** | Classical Chinese | ~500 BCE - 200 CE | ~15K |
| **Islamic Texts** | Arabic | ~600 - 1300 CE | ~20K |
| **Dear Abby** | English | 1956 - 2020 | ~50K |

**Hypothesis**: If BIP holds, bond-level features should transfer across millennia AND languages with minimal degradation.

---

## v8 Changes
- **6 languages**: Hebrew, Aramaic, Judeo-Arabic, Classical Chinese, Arabic, English
- **New corpora**: Chinese Text Project classics, Quran + Hadith
- **Language-based splits**: Train on one language family, test on another
- **Full Sefaria**: All categories with proper language detection
- **L4 optimized**: Large batch sizes, full corpus processing

---

## Methodological Notes

**Label Source**: Bond/Hohfeld labels are extracted from the English translations of all corpora. This tests whether diverse source texts encode moral structures that can be aligned via translation.

**Encoder**: `paraphrase-multilingual-MiniLM-L12-v2` - trained on 50+ languages including all target languages.

---

In [None]:
#@title 1. Setup and Install Dependencies { display-mode: "form" }
#@markdown Installs packages and detects GPU. Tuned for L4 runtime.

import time

# Wall-clock reference used for the "Total elapsed" line of the progress report.
EXPERIMENT_START = time.time()

# Experiment banner (the trailing empty string yields the blank separator line).
print(
    "=" * 60,
    "BIP MULTILINGUAL EXPERIMENT (v8)",
    "Cross-Temporal & Cross-Lingual Moral Transfer",
    "=" * 60,
    "",
    sep="\n",
)

# Progress tracker: ordered pipeline stages, shown in this order in the report.
TASKS = [
    "Install dependencies",
    "Download Sefaria corpus (Hebrew/Aramaic)",
    "Download Chinese classics",
    "Download Islamic texts (Quran + Hadith)",
    "Download Dear Abby (English)",
    "Preprocess all corpora",
    "Extract bond structures",
    "Generate splits (temporal + lingual)",
    "Train BIP model",
    "Linear probe test",
    "Evaluate results",
]

# Per-task state: every task starts as "pending".
task_status = dict.fromkeys(TASKS, "pending")
# Seconds spent per finished task, filled in as tasks are marked "done".
task_times = dict()
# Start timestamp of the most recently started task (None until one starts).
task_start_time = None

def print_progress():
    """Print a checklist of all tasks with their status and the total elapsed time.

    Each task in TASKS is shown as "[X]" (done), "[>]" (running) or "[ ]"
    (anything else). Finished tasks that have an entry in task_times also
    show their duration in seconds. The report ends with the minutes elapsed
    since EXPERIMENT_START.
    """
    rule = "-" * 50
    marks = {"done": "[X]", "running": "[>]"}

    print()
    print(rule)
    print("EXPERIMENT PROGRESS:")
    print(rule)
    for name in TASKS:
        state = task_status[name]
        box = marks.get(state, "[ ]")
        suffix = ""
        # Only finished tasks with a recorded duration get a timing suffix.
        if state == "done" and name in task_times:
            suffix = f" ({task_times[name]:.1f}s)"
        print(f"  {box} {name}{suffix}")
    minutes = (time.time() - EXPERIMENT_START) / 60
    print(rule)
    print(f"  Total elapsed: {minutes:.1f} minutes")
    print(flush=True)

def mark_task(task, status):
    """Record a new status for *task* and reprint the progress report.

    Marking a task "running" stores the current time as the shared start
    timestamp. Marking a task "done" stores the time since that timestamp in
    task_times, provided a timestamp has been set. Any other status is stored
    without touching the timing state.
    """
    global task_start_time
    if status == "done":
        if task_start_time is not None:
            task_times[task] = time.time() - task_start_time
    elif status == "running":
        task_start_time = time.time()
    task_status[task] = status
    print_progress()

# Show the initial (all pending) checklist, then start the first task.
print_progress()

mark_task("Install dependencies", "running")

import os
import subprocess
import sys

# Install dependencies
print("Installing dependencies...")
deps = [
    "transformers",
    "torch",
    "sentence-transformers",
    "pandas",
    "tqdm",
    "psutil",
    "scikit-learn",
    "requests",
    "beautifulsoup4",
    "lxml",
]

# One pip invocation per package, run with the notebook's own interpreter;
# check=True raises CalledProcessError if an install fails.
for dep in deps:
    print(f"  Installing {dep}...")
    subprocess.run([sys.executable, "-m", "pip", "install", "-q", dep], check=True)

print()

# Detect accelerator
# A Colab TPU runtime is recognised by the COLAB_TPU_ADDR environment variable.
USE_TPU = 'COLAB_TPU_ADDR' in os.environ
TPU_TYPE = "TPU (Colab)" if USE_TPU else None
if USE_TPU:
    print("TPU detected!")

import torch
import json
import psutil
import shutil

# GPU-model flags default to False and are only set on the CUDA path.
IS_L4 = IS_A100 = IS_V100 = False

if torch.cuda.is_available():
    # CUDA takes precedence over a detected TPU.
    gpu_name = torch.cuda.get_device_name(0)
    gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
    ACCELERATOR = f"GPU: {gpu_name} ({gpu_mem:.1f}GB)"
    device = torch.device("cuda")
    # Substring match on the reported device name.
    IS_L4, IS_A100, IS_V100 = (tag in gpu_name for tag in ('L4', 'A100', 'V100'))
elif USE_TPU:
    ACCELERATOR = TPU_TYPE
    import torch_xla.core.xla_model as xm
    device = xm.xla_device()
else:
    ACCELERATOR = "CPU (slow!)"
    device = torch.device("cpu")

print(f"Accelerator: {ACCELERATOR}")
print(f"Device: {device}")

# System resources
mem = psutil.virtual_memory()
disk = shutil.disk_usage('/')
print(f"System RAM: {mem.used/1e9:.1f}/{mem.total/1e9:.1f} GB ({mem.percent}%)")
print(f"Disk: {disk.used/1e9:.1f}/{disk.total/1e9:.1f} GB ({100*disk.used/disk.total:.1f}%)")

if torch.cuda.is_available():
    gpu_used = torch.cuda.memory_allocated()/1e9
    gpu_total = torch.cuda.get_device_properties(0).total_memory/1e9
    print(f"GPU RAM: {gpu_used:.1f}/{gpu_total:.1f} GB")

# Batch sizes based on GPU: 512 on L4/A100, 384 on V100, 256 otherwise.
BASE_BATCH_SIZE = 512 if (IS_L4 or IS_A100) else (384 if IS_V100 else 256)
if IS_L4 or IS_A100:
    print(f"\n*** L4/A100: batch_size={BASE_BATCH_SIZE} ***")

# Mixed precision is enabled exactly when CUDA is available.
USE_AMP = torch.cuda.is_available()
if USE_AMP:
    print("Enabling mixed precision (FP16)...")
    from torch.cuda.amp import autocast, GradScaler
    scaler = GradScaler()
else:
    scaler = None

# Google Drive mount
# On Colab, results are persisted to Google Drive. Outside Colab the
# google.colab package does not exist, so fall back to a local directory
# instead of aborting the whole setup cell with an ImportError.
print()
print("=" * 60)
print("MOUNTING GOOGLE DRIVE")
print("=" * 60)
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    drive.mount('/content/drive')
    SAVE_DIR = '/content/drive/MyDrive/BIP_multilingual_results'
else:
    SAVE_DIR = os.path.join('results', 'BIP_multilingual_results')
    print(f"google.colab not available - saving results locally to {SAVE_DIR}")
os.makedirs(SAVE_DIR, exist_ok=True)

# Working directories used by the later cells (relative to the current directory).
for d in ["data/processed", "data/splits", "data/raw", "models/checkpoints", "results"]:
    os.makedirs(d, exist_ok=True)

def print_resources(label=""):
    """Print a one-line summary of RAM, GPU memory (if CUDA is available) and disk usage.

    Args:
        label: Optional tag; when non-empty the line is prefixed with "[label] ".
    """
    ram = psutil.virtual_memory()
    storage = shutil.disk_usage('/')

    fields = [f"RAM: {ram.used/1e9:.1f}/{ram.total/1e9:.1f}GB"]
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated()/1e9
        capacity = torch.cuda.get_device_properties(0).total_memory/1e9
        fields.append(f"GPU: {allocated:.1f}/{capacity:.1f}GB")
    fields.append(f"Disk: {storage.used/1e9:.0f}/{storage.total/1e9:.0f}GB")

    prefix = f"[{label}] " if label else ""
    print(prefix + " | ".join(fields))

# Report resource usage at the end of setup, then mark the first task finished
# (mark_task also reprints the progress checklist).
print_resources("Setup complete")

mark_task("Install dependencies", "done")


In [None]:
#@title 2. Download Sefaria Corpus (Hebrew/Aramaic/Judeo-Arabic) { display-mode: "form" }
#@markdown Full Sefaria corpus with language detection.

import subprocess
import os

mark_task("Download Sefaria corpus (Hebrew/Aramaic)", "running")

sefaria_path = 'data/raw/Sefaria-Export'

if not os.path.exists(sefaria_path) or not os.path.exists(f"{sefaria_path}/json"):
    print("="*60)
    print("CLONING SEFARIA CORPUS")
    print("="*60)
    print("This downloads ~3.5GB and takes 5-15 minutes.")
    print("-"*60)
    
    process = subprocess.Popen(
        ['git', 'clone', '--depth', '1', '--progress',
         'https://github.com/Sefaria/Sefaria-Export.git', sefaria_path],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1
    )
    
    for line in process.stdout:
        print(line, end='', flush=True)
    process.wait()
    print("-"*60)
else:
    print("Sefaria already exists, skipping download.")

print("\nVerifying...")
!du -sh {sefaria_path} 2>/dev/null || echo "Not found"
json_count = !find {sefaria_path}/json -name "*.json" 2>/dev/null | wc -l
print(f"JSON files: {json_count[0]}")

print_resources("After Sefaria")

mark_task("Download Sefaria corpus (Hebrew/Aramaic)", "done")


In [None]:
#@title 3. Download Chinese Classics (Confucian/Daoist) { display-mode: "form" }
#@markdown Downloads classical Chinese texts with English translations.
#@markdown Sources: Analects, Dao De Jing, Mencius, etc.

import os
import json
import requests
from pathlib import Path

mark_task("Download Chinese classics", "running")

chinese_dir = Path('data/raw') / 'chinese_classics'
chinese_dir.mkdir(parents=True, exist_ok=True)

print("=" * 60, "DOWNLOADING CHINESE CLASSICS", "=" * 60, "", sep="\n")

# We'll create a curated dataset of classical Chinese texts with translations
# Using publicly available bilingual editions

# Catalogue of texts to fetch, keyed by a short id. Each entry carries the
# display title, a tradition tag ('period'), a signed century (negative = BCE)
# and the download URL, which follows one pattern per id.
CHINESE_TEXTS = {
    text_id: {
        'title': title,
        'period': period,
        'century': century,
        'url': f'https://raw.githubusercontent.com/ctext-org/ctext-data/master/{text_id}.json',
    }
    for text_id, title, period, century in [
        ('analects', 'Analects of Confucius (論語)', 'CONFUCIAN', -5),
        ('daodejing', 'Dao De Jing (道德經)', 'DAOIST', -6),
        ('mencius', 'Mencius (孟子)', 'CONFUCIAN', -4),
        ('xunzi', 'Xunzi (荀子)', 'CONFUCIAN', -3),
        ('zhuangzi', 'Zhuangzi (莊子)', 'DAOIST', -4),
    ]
}

# Fallback: Create synthetic bilingual corpus from available sources
# Since ctext-org data might not be directly available, we'll use a backup approach

def download_chinese_texts(min_passages=100):
    """Download the texts listed in CHINESE_TEXTS, falling back to embedded samples.

    Each URL is fetched and parsed as JSON. The payload may be either a bare
    list of entries or a dict holding the entries under 'passages'. An entry
    is kept when it is a dict with a Chinese text ('chinese' or 'text') and an
    English translation ('english' or 'translation') of at least 30 characters.

    Args:
        min_passages: If fewer than this many passages were downloaded in
            total, the downloads are discarded and the embedded corpus from
            generate_embedded_chinese_corpus() is returned instead.

    Returns:
        A list of passage dicts with keys 'id', 'text_original',
        'text_english', 'source', 'period' and 'century'.
    """
    passages = []

    for text_id, info in CHINESE_TEXTS.items():
        print(f"Processing {info['title']}...")

        try:
            resp = requests.get(info['url'], timeout=30)
            if resp.status_code == 200:
                data = resp.json()
                # Check the container type BEFORE calling .get(): calling
                # data.get(...) on a list raises AttributeError.
                if isinstance(data, list):
                    entries = data
                elif isinstance(data, dict):
                    entries = data.get('passages', [])
                else:
                    entries = []

                n_before = len(passages)
                for i, entry in enumerate(entries):
                    if not isinstance(entry, dict):
                        continue
                    chinese = entry.get('chinese', entry.get('text', ''))
                    english = entry.get('english', entry.get('translation', ''))

                    if chinese and english and len(english) >= 30:
                        passages.append({
                            'id': f'chinese_{text_id}_{i}',
                            'text_original': chinese,
                            'text_english': english,
                            'source': info['title'],
                            'period': info['period'],
                            'century': info['century']
                        })
                print(f"  Downloaded {len(passages) - n_before} passages")
                continue
            # Make non-200 responses visible instead of failing silently.
            print(f"  Direct download failed: HTTP {resp.status_code}")
        except Exception as e:
            # Deliberately broad: the download is best-effort, the error is
            # reported, and the embedded corpus below is the safety net.
            print(f"  Direct download failed: {e}")

        # Fallback: Use embedded sample data for core texts
        print(f"  Using embedded samples...")

    # If downloads failed, use embedded classical Chinese samples
    if len(passages) < min_passages:
        print("\nUsing embedded Chinese classics dataset...")
        passages = generate_embedded_chinese_corpus()

    return passages

def generate_embedded_chinese_corpus():
    """Build the embedded fallback corpus of classical Chinese passages.

    The corpus holds 30 hand-picked (Chinese, English) pairs: 10 from the
    Analects, 10 from the Dao De Jing, 5 from Mencius and 5 from Zhuangzi.

    Returns:
        A list of passage dicts with keys 'id' ('chinese_<work>_<index>'),
        'text_original', 'text_english', 'source', 'period' and 'century'
        (negative = BCE), ordered work by work.
    """
    # Core Analects passages (Confucius)
    analects = [
        ("學而時習之，不亦說乎？有朋自遠方來，不亦樂乎？人不知而不慍，不亦君子乎？",
         "Is it not pleasant to learn with a constant perseverance and application? Is it not delightful to have friends coming from distant quarters? Is he not a man of complete virtue, who feels no discomposure though men may take no note of him?"),
        ("其為人也孝弟，而好犯上者，鮮矣；不好犯上，而好作亂者，未之有也。君子務本，本立而道生。孝弟也者，其為仁之本與！",
         "They are few who, being filial and fraternal, are fond of offending against their superiors. There have been none, who, not liking to offend against their superiors, have been fond of stirring up confusion. The superior man bends his attention to what is radical. That being established, all practical courses naturally grow up. Filial piety and fraternal submission are the root of all benevolent actions."),
        ("巧言令色，鮮矣仁！",
         "Fine words and an insinuating appearance are seldom associated with true virtue."),
        ("吾日三省吾身：為人謀而不忠乎？與朋友交而不信乎？傳不習乎？",
         "I daily examine myself on three points: whether in transacting business for others I have been faithful; whether in intercourse with friends I have been sincere; whether I have mastered and practiced the instructions of my teacher."),
        ("弟子入則孝，出則弟，謹而信，汎愛眾，而親仁。行有餘力，則以學文。",
         "A youth, when at home, should be filial, and, abroad, respectful to his elders. He should be earnest and truthful. He should overflow in love to all, and cultivate the friendship of the good. When he has time and opportunity, after the performance of these things, he should employ them in polite studies."),
        ("君子不重則不威，學則不固。主忠信，無友不如己者，過則勿憚改。",
         "If the scholar be not grave, he will not call forth any veneration, and his learning will not be solid. Hold faithfulness and sincerity as first principles. Have no friends not equal to yourself. When you have faults, do not fear to abandon them."),
        ("父在，觀其志；父沒，觀其行；三年無改於父之道，可謂孝矣。",
         "While a man's father is alive, look at the bent of his will; when his father is dead, look at his conduct. If for three years he does not alter from the way of his father, he may be called filial."),
        ("禮之用，和為貴。先王之道，斯為美；小大由之。有所不行，知和而和，不以禮節之，亦不可行也。",
         "In practicing the rules of propriety, a natural ease is to be prized. In the ways prescribed by the ancient kings, this is the excellent quality, and in things small and great we follow them. Yet it is not to be observed in all cases. If one, knowing how such ease should be prized, manifests it without regulating it by the rules of propriety, this likewise is not to be done."),
        ("信近於義，言可復也。恭近於禮，遠恥辱也。因不失其親，亦可宗也。",
         "When agreements are made according to what is right, what is spoken can be made good. When respect is shown according to what is proper, one keeps far from shame and disgrace. When the parties upon whom a man leans are proper persons to be intimate with, he can make them his guides and masters."),
        # Translation corrected: the previous English text belonged to a
        # different Analects passage, not to this one (1.14).
        ("君子食無求飽，居無求安，敏於事而慎於言，就有道而正焉，可謂好學也已。",
         "He who aims to be a man of complete virtue in his food does not seek to gratify his appetite, nor in his dwelling place does he seek the appliances of ease; he is earnest in what he is doing, and careful in his speech; he frequents the company of men of principle that he may be rectified: such a person may be said indeed to love to learn."),
    ]

    # Dao De Jing passages (Laozi)
    daodejing = [
        ("道可道，非常道。名可名，非常名。無名天地之始；有名萬物之母。",
         "The Tao that can be told is not the eternal Tao. The name that can be named is not the eternal name. The nameless is the beginning of heaven and earth. The named is the mother of ten thousand things."),
        ("天下皆知美之為美，斯惡已。皆知善之為善，斯不善已。",
         "When people see some things as beautiful, other things become ugly. When people see some things as good, other things become bad."),
        ("不尚賢，使民不爭；不貴難得之貨，使民不為盜；不見可欲，使民心不亂。",
         "Not exalting the gifted prevents quarreling. Not collecting treasures prevents stealing. Not seeing desirable things prevents confusion of the heart."),
        ("道沖而用之或不盈。淵兮似萬物之宗。",
         "The Tao is an empty vessel; it is used, but never filled. Oh, unfathomable source of ten thousand things!"),
        ("天地不仁，以萬物為芻狗；聖人不仁，以百姓為芻狗。",
         "Heaven and Earth are impartial; they see the ten thousand things as straw dogs. The wise are impartial; they see the people as straw dogs."),
        ("上善若水。水善利萬物而不爭，處眾人之所惡，故幾於道。",
         "The highest good is like water. Water gives life to the ten thousand things and does not strive. It flows in places men reject and so is like the Tao."),
        ("持而盈之，不如其已；揣而銳之，不可長保。金玉滿堂，莫之能守；富貴而驕，自遺其咎。功成身退，天之道。",
         "Fill your bowl to the brim and it will spill. Keep sharpening your knife and it will blunt. Chase after money and security and your heart will never unclench. Care about people's approval and you will be their prisoner. Do your work, then step back. The only path to serenity."),
        ("三十輻共一轂，當其無，有車之用。埏埴以為器，當其無，有器之用。鑿戶牖以為室，當其無，有室之用。故有之以為利，無之以為用。",
         "Thirty spokes share the wheel's hub; it is the center hole that makes it useful. Shape clay into a vessel; it is the space within that makes it useful. Cut doors and windows for a room; it is the holes which make it useful. Therefore benefit comes from what is there; usefulness from what is not there."),
        ("五色令人目盲；五音令人耳聾；五味令人口爽；馳騁畋獵，令人心發狂；難得之貨，令人行妨。",
         "Colors blind the eye. Sounds deafen the ear. Flavors numb the taste. Thoughts weaken the mind. Desires wither the heart."),
        ("大道廢，有仁義；智慧出，有大偽；六親不和，有孝慈；國家昏亂，有忠臣。",
         "When the great Tao is forgotten, goodness and piety appear. When the body's intelligence declines, cleverness and knowledge step forth. When there is no peace in the family, filial piety begins. When the country falls into chaos, patriotism is born."),
    ]

    # Mencius passages
    mencius = [
        ("孟子見梁惠王。王曰：叟不遠千里而來，亦將有以利吾國乎？孟子對曰：王何必曰利？亦有仁義而已矣。",
         "Mencius went to see king Hui of Liang. The king said, 'Venerable sir, since you have not counted it far to come here, a distance of a thousand li, may I presume that you are provided with counsels to profit my kingdom?' Mencius replied, 'Why must your Majesty use that word profit? What I am provided with, are counsels to benevolence and righteousness, and these are my only topics.'"),
        ("人皆有不忍人之心。先王有不忍人之心，斯有不忍人之政矣。",
         "All men have a mind which cannot bear to see the sufferings of others. The ancient kings had this commiserating mind, and they had likewise a commiserating government."),
        ("惻隱之心，仁之端也；羞惡之心，義之端也；辭讓之心，禮之端也；是非之心，智之端也。",
         "The feeling of commiseration is the principle of benevolence. The feeling of shame and dislike is the principle of righteousness. The feeling of modesty and complaisance is the principle of propriety. The feeling of approving and disapproving is the principle of knowledge."),
        ("人之所以異於禽獸者幾希，庶民去之，君子存之。",
         "That whereby man differs from the lower animals is but small. The mass of people cast it away, while superior men preserve it."),
        ("得道者多助，失道者寡助。寡助之至，親戚畔之；多助之至，天下順之。",
         "When one by his dao commands much support, the whole kingdom will submit to him. When one by his dao commands little support, even his relatives will turn away from him."),
    ]

    # Zhuangzi passages
    zhuangzi = [
        ("北冥有魚，其名為鯤。鯤之大，不知其幾千里也。化而為鳥，其名為鵬。鵬之背，不知其幾千里也。",
         "In the northern darkness there is a fish and his name is Kun. The Kun is so huge I don't know how many thousand li he measures. He changes and becomes a bird whose name is Peng. The back of the Peng measures I don't know how many thousand li across."),
        ("昔者莊周夢為胡蝶，栩栩然胡蝶也，自喻適志與！不知周也。俄然覺，則蘧蘧然周也。不知周之夢為胡蝶與，胡蝶之夢為周與？",
         "Once Zhuang Zhou dreamed he was a butterfly, a butterfly flitting and fluttering around, happy with himself and doing as he pleased. He didn't know he was Zhuang Zhou. Suddenly he woke up and there he was, solid and unmistakable Zhuang Zhou. But he didn't know if he was Zhuang Zhou who had dreamed he was a butterfly, or a butterfly dreaming he was Zhuang Zhou."),
        ("吾生也有涯，而知也無涯。以有涯隨無涯，殆已！",
         "Your life has a limit but knowledge has none. If you use what is limited to pursue what has no limit, you will be in danger."),
        ("泉涸，魚相與處於陸，相呴以濕，相濡以沫，不如相忘於江湖。",
         "When the springs dry up and the fish are left stranded on the ground, they spew each other with moisture and wet each other down with spit - but it would be much better if they could forget each other in the rivers and lakes."),
        ("人皆知有用之用，而莫知無用之用也。",
         "Everyone knows the usefulness of what is useful, but few know the usefulness of what is useless."),
    ]

    # One row per work: (id slug, source title, period tag, century, pairs).
    works = [
        ('analects', 'Analects of Confucius (論語)', 'CONFUCIAN', -5, analects),
        ('daodejing', 'Dao De Jing (道德經)', 'DAOIST', -6, daodejing),
        ('mencius', 'Mencius (孟子)', 'CONFUCIAN', -4, mencius),
        ('zhuangzi', 'Zhuangzi (莊子)', 'DAOIST', -4, zhuangzi),
    ]

    passages = []
    for slug, source, period, century, pairs in works:
        # The index restarts at 0 for every work, giving ids like 'chinese_mencius_3'.
        passages.extend(
            {
                'id': f'chinese_{slug}_{i}',
                'text_original': chinese,
                'text_english': english,
                'source': source,
                'period': period,
                'century': century,
            }
            for i, (chinese, english) in enumerate(pairs)
        )

    return passages

# Download/generate Chinese corpus
chinese_passages = download_chinese_texts()

print()
print(f"Total Chinese passages: {len(chinese_passages)}")

# Save to file (UTF-8, keeping the Chinese characters unescaped)
chinese_json_path = chinese_dir / 'chinese_classics.json'
with open(chinese_json_path, 'w', encoding='utf-8') as f:
    json.dump(chinese_passages, f, ensure_ascii=False, indent=2)

print(f"Saved to {chinese_json_path}")

print_resources("After Chinese")

mark_task("Download Chinese classics", "done")


In [None]:
#@title 4. Download Islamic Texts (Quran + Hadith) { display-mode: "form" }
#@markdown Downloads Quran and major Hadith collections with English translations.

import os
import json
import requests
from pathlib import Path
from tqdm.auto import tqdm

mark_task("Download Islamic texts (Quran + Hadith)", "running")

# Raw download directory for the Quran/Hadith corpus.
islamic_dir = Path('data/raw') / 'islamic_texts'
islamic_dir.mkdir(parents=True, exist_ok=True)

print("=" * 60, "DOWNLOADING ISLAMIC TEXTS", "=" * 60, "", sep="\n")

def download_quran():
    """Download the Quran (Arabic text plus an English translation) from the Quran.com API v4.

    For each of the 114 surahs the Arabic text and the translation are
    fetched and joined on 'verse_key'. HTML markup is stripped from the
    translation, and verses whose cleaned translation is shorter than 20
    characters are skipped. The download is best-effort: a surah that fails
    is skipped, and the failures are summarised at the end rather than
    silently ignored.

    Returns:
        A list of passage dicts with keys 'id' ('quran_<surah>_<verse>'),
        'text_original', 'text_english', 'source', 'period' ('QURANIC') and
        'century' (7). May be empty if every request failed.
    """
    import re
    import time

    # Compiled once: strips any remaining HTML tag from the translation text.
    tag_pattern = re.compile(r'<[^>]+>')

    passages = []
    failed_surahs = []  # (surah number, reason) for every surah that was skipped

    print("Downloading Quran...")

    for surah_num in tqdm(range(1, 115), desc="Surahs"):
        # Arabic text
        arabic_url = f"https://api.quran.com/api/v4/quran/verses/uthmani?chapter_number={surah_num}"
        # English translation.
        # NOTE(review): the original comment named this "Sahih International";
        # confirm that translation resource 131 is the intended one.
        english_url = f"https://api.quran.com/api/v4/quran/translations/131?chapter_number={surah_num}"

        try:
            arabic_resp = requests.get(arabic_url, timeout=30)
            english_resp = requests.get(english_url, timeout=30)

            if arabic_resp.status_code == 200 and english_resp.status_code == 200:
                arabic_data = arabic_resp.json()
                english_data = english_resp.json()

                # NOTE(review): both lookups index 'verse_key' directly; if the
                # API omits that field the surah is recorded as failed below.
                arabic_verses = {v['verse_key']: v['text_uthmani'] for v in arabic_data.get('verses', [])}
                english_verses = {t['verse_key']: t['text'] for t in english_data.get('translations', [])}

                for verse_key, arabic in arabic_verses.items():
                    english = english_verses.get(verse_key)
                    if english is None:
                        continue
                    # Clean HTML from translation: put a space before footnote
                    # markers, then drop all tags.
                    english = english.replace('<sup', ' <sup').replace('</sup>', '')
                    english = tag_pattern.sub('', english).strip()

                    if len(english) >= 20:
                        passages.append({
                            'id': f'quran_{verse_key.replace(":", "_")}',
                            'text_original': arabic,
                            'text_english': english,
                            'source': f'Quran {verse_key}',
                            'period': 'QURANIC',
                            'century': 7
                        })
            else:
                failed_surahs.append(
                    (surah_num, f"HTTP {arabic_resp.status_code}/{english_resp.status_code}")
                )
        except Exception as e:
            # Best-effort: remember the failure and move on to the next surah.
            failed_surahs.append((surah_num, repr(e)))

        # Rate limiting - applied after every surah, including failed ones.
        time.sleep(0.1)

    if failed_surahs:
        first_num, first_reason = failed_surahs[0]
        print(f"  Warning: {len(failed_surahs)} surah(s) failed "
              f"(first: surah {first_num}: {first_reason})")

    print(f"  Downloaded {len(passages)} Quran verses")
    return passages

def download_hadith(max_books=20):
    """Download Hadith with English translations from the sunnah.com API.

    For each collection, up to *max_books* books are requested. A hadith is
    kept when it has both Arabic and English text and the English text is at
    least 50 characters long. The download is best-effort: failed book
    requests are counted and reported, never raised.

    Args:
        max_books: Maximum number of books fetched per collection.

    Returns:
        A list of passage dicts with keys 'id', 'text_original',
        'text_english', 'source', 'period' ('HADITH') and 'century' (9).
        May be empty if every request failed.
    """
    import time

    passages = []

    print("\nDownloading Hadith collections...")

    # Major collections to download: (API name, display title, number of books)
    collections = [
        ('bukhari', 'Sahih Bukhari', 97),  # ~97 books
        ('muslim', 'Sahih Muslim', 56),    # ~56 books
        ('abudawud', 'Sunan Abu Dawud', 43),
        ('tirmidhi', 'Jami at-Tirmidhi', 49),
    ]

    # NOTE(security): an API key is committed in source. Prefer supplying your
    # own via the SUNNAH_API_KEY environment variable; the embedded key is
    # kept only as a backward-compatible default.
    api_key = os.environ.get('SUNNAH_API_KEY', 'SqD712P3E82xnwOAEOkGd5JZH8s9wRR24TqNFzjk')
    headers = {'X-API-Key': api_key}

    for collection_name, collection_title, n_books in collections:
        print(f"  {collection_title}...")
        collection_passages = []
        n_failed = 0

        # Books 1..min(n_books, max_books) inclusive. The previous bound
        # stopped one book short of the stated limit of 20.
        for book_num in range(1, min(n_books, max_books) + 1):
            url = f"https://api.sunnah.com/v1/collections/{collection_name}/books/{book_num}/hadiths"

            try:
                resp = requests.get(url, headers=headers, timeout=30)
                if resp.status_code == 200:
                    data = resp.json()
                    # NOTE(review): assumes flat 'arabicText'/'englishText'
                    # fields on each item; confirm against the API schema.
                    for hadith in data.get('data', []):
                        arabic = hadith.get('arabicText', '')
                        english = hadith.get('englishText', '')

                        if arabic and english and len(english) >= 50:
                            collection_passages.append({
                                'id': f'hadith_{collection_name}_{hadith.get("hadithNumber", len(collection_passages))}',
                                'text_original': arabic,
                                'text_english': english,
                                'source': f'{collection_title} {hadith.get("hadithNumber", "")}',
                                'period': 'HADITH',
                                'century': 9  # Most compiled ~9th century
                            })
                else:
                    n_failed += 1
            except Exception:
                # Not a bare except: Ctrl-C must still interrupt the download.
                n_failed += 1

            # Rate limiting - applied after every request, including failed ones.
            time.sleep(0.2)

        passages.extend(collection_passages)
        print(f"    Got {len(collection_passages)} hadith")
        if n_failed:
            print(f"    Warning: {n_failed} book request(s) failed")

    return passages

def generate_embedded_islamic_corpus():
    """Fallback: build the embedded corpus of Quranic verses and Hadith.

    Returns:
        A list of 20 passage dicts (10 Quranic, then 10 Hadith) with keys
        'id', 'text_original', 'text_english', 'source', 'period' and
        'century'. IDs are unique: when several samples share one reference
        (two excerpts each from 17:34 and 5:32), the second and later ones
        get a numeric suffix such as 'quran_17_34_2'.
    """
    passages = []
    id_counts = {}  # base id -> number of times it has been issued so far

    def unique_id(base_id):
        """Return base_id on first use, then base_id_2, base_id_3, ..."""
        count = id_counts.get(base_id, 0) + 1
        id_counts[base_id] = count
        return base_id if count == 1 else f'{base_id}_{count}'

    # Key Quranic verses on moral themes: (Arabic, English, surah:verse)
    quran_samples = [
        ("بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ", "In the name of Allah, the Entirely Merciful, the Especially Merciful.", "1:1"),
        ("وَقَضَىٰ رَبُّكَ أَلَّا تَعْبُدُوا إِلَّا إِيَّاهُ وَبِالْوَالِدَيْنِ إِحْسَانًا", "And your Lord has decreed that you not worship except Him, and to parents, good treatment.", "17:23"),
        ("وَلَا تَقْتُلُوا النَّفْسَ الَّتِي حَرَّمَ اللَّهُ إِلَّا بِالْحَقِّ", "And do not kill the soul which Allah has forbidden, except by right.", "17:33"),
        ("وَأَوْفُوا بِالْعَهْدِ إِنَّ الْعَهْدَ كَانَ مَسْئُولًا", "And fulfill every commitment. Indeed, the commitment is ever that about which one will be questioned.", "17:34"),
        ("وَلَا تَقْرَبُوا مَالَ الْيَتِيمِ إِلَّا بِالَّتِي هِيَ أَحْسَنُ", "And do not approach the property of an orphan, except in the way that is best.", "17:34"),
        ("إِنَّ اللَّهَ يَأْمُرُ بِالْعَدْلِ وَالْإِحْسَانِ وَإِيتَاءِ ذِي الْقُرْبَىٰ", "Indeed, Allah orders justice and good conduct and giving to relatives.", "16:90"),
        ("يَا أَيُّهَا الَّذِينَ آمَنُوا كُونُوا قَوَّامِينَ بِالْقِسْطِ شُهَدَاءَ لِلَّهِ", "O you who have believed, be persistently standing firm in justice, witnesses for Allah.", "4:135"),
        ("وَتَعَاوَنُوا عَلَى الْبِرِّ وَالتَّقْوَىٰ وَلَا تَعَاوَنُوا عَلَى الْإِثْمِ وَالْعُدْوَانِ", "And cooperate in righteousness and piety, but do not cooperate in sin and aggression.", "5:2"),
        ("مَنْ قَتَلَ نَفْسًا بِغَيْرِ نَفْسٍ أَوْ فَسَادٍ فِي الْأَرْضِ فَكَأَنَّمَا قَتَلَ النَّاسَ جَمِيعًا", "Whoever kills a soul unless for a soul or for corruption done in the land - it is as if he had slain mankind entirely.", "5:32"),
        ("وَمَنْ أَحْيَاهَا فَكَأَنَّمَا أَحْيَا النَّاسَ جَمِيعًا", "And whoever saves one - it is as if he had saved mankind entirely.", "5:32"),
    ]

    for arabic, english, ref in quran_samples:
        passages.append({
            'id': unique_id(f'quran_{ref.replace(":", "_")}'),
            'text_original': arabic,
            'text_english': english,
            'source': f'Quran {ref}',
            'period': 'QURANIC',
            'century': 7
        })

    # Key Hadith on moral themes: (Arabic, English, collection + number)
    hadith_samples = [
        ("إنما الأعمال بالنيات وإنما لكل امرئ ما نوى", "Actions are judged by intentions, and everyone will be rewarded according to what he intended.", "Bukhari 1"),
        ("لا يؤمن أحدكم حتى يحب لأخيه ما يحب لنفسه", "None of you truly believes until he loves for his brother what he loves for himself.", "Bukhari 13"),
        ("من كان يؤمن بالله واليوم الآخر فليقل خيرا أو ليصمت", "Whoever believes in Allah and the Last Day should speak good or remain silent.", "Bukhari 6018"),
        ("المسلم من سلم المسلمون من لسانه ويده", "A Muslim is one from whose tongue and hand other Muslims are safe.", "Bukhari 10"),
        ("لا ضرر ولا ضرار", "There should be neither harm nor reciprocal harm.", "Ibn Majah 2341"),
        ("ارحموا من في الأرض يرحمكم من في السماء", "Be merciful to those on earth and the One in the heavens will be merciful to you.", "Tirmidhi 1924"),
        ("الدين النصيحة", "The religion is sincerity and sincere advice.", "Muslim 55"),
        ("من رأى منكم منكرا فليغيره بيده", "Whoever among you sees an evil, let him change it with his hand.", "Muslim 49"),
        ("لا يحل مال امرئ مسلم إلا بطيب نفس منه", "It is not permissible to take the property of a Muslim except with his willing consent.", "Ahmad 20172"),
        ("كلكم راع وكلكم مسؤول عن رعيته", "Each of you is a shepherd and each of you is responsible for his flock.", "Bukhari 7138"),
    ]

    for arabic, english, ref in hadith_samples:
        passages.append({
            'id': unique_id(f'hadith_{ref.replace(" ", "_").lower()}'),
            'text_original': arabic,
            'text_english': english,
            'source': ref,
            'period': 'HADITH',
            'century': 9
        })

    return passages

# Try downloading, fall back to embedded
quran_passages = download_quran()
hadith_passages = download_hadith()

islamic_passages = [*quran_passages, *hadith_passages]

# If not enough, use embedded (the downloaded passages are replaced, not merged)
if len(islamic_passages) < 50:
    print("\nUsing embedded Islamic text samples...")
    islamic_passages = generate_embedded_islamic_corpus()

print()
print(f"Total Islamic passages: {len(islamic_passages)}")
print(f"  Quranic: {sum(1 for p in islamic_passages if p['period'] == 'QURANIC')}")
print(f"  Hadith: {sum(1 for p in islamic_passages if p['period'] == 'HADITH')}")

# Save (UTF-8, keeping the Arabic text unescaped)
islamic_json_path = islamic_dir / 'islamic_texts.json'
with open(islamic_json_path, 'w', encoding='utf-8') as f:
    json.dump(islamic_passages, f, ensure_ascii=False, indent=2)

print(f"Saved to {islamic_json_path}")

print_resources("After Islamic")

mark_task("Download Islamic texts (Quran + Hadith)", "done")


In [None]:
#@title 5. Download Dear Abby Dataset (English) { display-mode: "form" }
#@markdown Downloads the Dear Abby advice column dataset.

import os
import shutil
import subprocess
import pandas as pd
from pathlib import Path

mark_task("Download Dear Abby (English)", "running")

# Shallow-clone the repository that ships the Dear Abby CSV (skipped when the
# checkout already exists from a previous run).
sqnd_path = 'sqnd-probe-data'
if not os.path.exists(sqnd_path):
    print("Cloning sqnd-probe repo...")
    process = subprocess.Popen(
        ['git', 'clone', '--depth', '1', '--progress',
         'https://github.com/ahb-sjsu/sqnd-probe.git', sqnd_path],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1
    )
    # Stream git's progress output line by line
    for line in process.stdout:
        print(line, end='', flush=True)
    # Surface a failed clone instead of silently continuing; a previously
    # copied CSV may still be usable, so this is a warning rather than an error.
    if process.wait() != 0:
        print(f"WARNING: git clone exited with code {process.returncode}")
else:
    print("Repo already cloned.")

# Derive the source path from sqnd_path so the two cannot drift apart
dear_abby_source = Path(sqnd_path) / 'dear_abby_data' / 'raw_da_qs.csv'
dear_abby_path = Path('data/raw/dear_abby.csv')

if dear_abby_source.exists():
    # shutil.copy raises on failure (the shell `cp` only printed an error),
    # and the target directory is created if it does not exist yet.
    dear_abby_path.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy(dear_abby_source, dear_abby_path)
    print(f"Copied Dear Abby data")
elif not dear_abby_path.exists():
    raise FileNotFoundError("Dear Abby dataset not found!")

# Sanity check: the CSV must load and expose a 'year' column
df_check = pd.read_csv(dear_abby_path)
print(f"\nDear Abby: {len(df_check):,} entries")
print(f"Year range: {df_check['year'].min():.0f} - {df_check['year'].max():.0f}")

print_resources("After Dear Abby")

mark_task("Download Dear Abby (English)", "done")

In [None]:
#@title 6. Define Data Structures and Load All Corpora { display-mode: "form" }
#@markdown Unified data structures for multilingual corpus.

import json
import hashlib
import re
import os
import gc
import pandas as pd
from pathlib import Path
from dataclasses import dataclass, field, asdict
from typing import List, Dict, Set, Optional
from enum import Enum
from collections import defaultdict
from tqdm.auto import tqdm

mark_task("Preprocess all corpora", "running")

print("="*60)
print("DEFINING DATA STRUCTURES")
print("="*60)

class TimePeriod(Enum):
    """Historical period label for a passage, covering every corpus.

    Values are contiguous integers 0-12. The trailing comments give the
    approximate date range each member stands for.
    """
    # Jewish texts
    BIBLICAL = 0        # ~1000-500 BCE
    SECOND_TEMPLE = 1   # ~500 BCE - 70 CE
    TANNAITIC = 2       # ~70-200 CE
    AMORAIC = 3         # ~200-500 CE
    GEONIC = 4          # ~600-1000 CE
    RISHONIM = 5        # ~1000-1500 CE
    ACHRONIM = 6        # ~1500-1800 CE
    MODERN_HEBREW = 7   # ~1800-present
    # Chinese texts
    CONFUCIAN = 8       # ~500-200 BCE
    DAOIST = 9          # ~600-200 BCE
    # Islamic texts
    QURANIC = 10        # ~610-632 CE
    HADITH = 11         # ~700-900 CE
    # Modern English
    DEAR_ABBY = 12      # 1956-2020

class Language(Enum):
    """Source language of a passage's original text (six classes, values 0-5)."""
    HEBREW = 0
    ARAMAIC = 1
    JUDEO_ARABIC = 2
    CLASSICAL_CHINESE = 3
    ARABIC = 4
    ENGLISH = 5

class BondType(Enum):
    """Moral relation ("bond") categories used as labels.

    NONE is the fallback label for passages in which no other category is
    detected.
    """
    HARM_PREVENTION = 0
    RECIPROCITY = 1
    AUTONOMY = 2
    PROPERTY = 3
    FAMILY = 4
    AUTHORITY = 5
    EMERGENCY = 6
    CONTRACT = 7
    CARE = 8
    FAIRNESS = 9
    NONE = 10

class HohfeldianState(Enum):
    """Hohfeldian normative position of a passage.

    NOTE(review): these integer values are NOT the class indices used for
    training. HOHFELD_TO_IDX maps OBLIGATION->0, RIGHT->1, LIBERTY->2 and
    None->3, so RIGHT and OBLIGATION are swapped relative to this enum.
    Confirm which ordering downstream code expects.
    """
    RIGHT = 0
    OBLIGATION = 1
    LIBERTY = 2
    # No keyword pattern exists for NO_RIGHT in HOHFELD_PATTERNS, so the
    # extractor never produces this state.
    NO_RIGHT = 3

@dataclass
class Passage:
    """A single text passage with its English rendering and provenance metadata."""
    id: str               # unique passage identifier
    text_original: str    # text in the source language
    text_english: str     # English text (same as text_original for Dear Abby)
    time_period: str      # period name; a TimePeriod member name for Sefaria / Dear Abby
    century: int          # representative century of the passage
    source: str           # human-readable citation
    source_type: str  # sefaria, chinese, islamic, dear_abby
    category: str         # corpus-specific category (Sefaria top-level folder, period name, "advice")
    language: str         # lowercase language key, e.g. 'hebrew', 'arabic', 'english'
    word_count: int = 0   # number of whitespace-separated words in the English text
    bond_types: List[str] = field(default_factory=list)  # filled in by bond extraction
    
    def to_dict(self):
        """Return the passage as a plain dict (via dataclasses.asdict)."""
        return asdict(self)

# Language detection for Sefaria
def detect_sefaria_language(text: str, category: str) -> str:
    """Classify a Sefaria passage as 'hebrew', 'aramaic' or 'judeo_arabic'.

    Rules, in order of precedence:
      1. Empty text is reported as 'hebrew'.
      2. Talmud, Bavli, Yerushalmi and Zohar categories are reported as 'aramaic'.
      3. Text with more than 30% of its characters in the Arabic Unicode
         block (U+0600..U+06FF) is reported as 'judeo_arabic'.
      4. Everything else is 'hebrew'.
    """
    if not text:
        return 'hebrew'

    if category in {'Talmud', 'Bavli', 'Yerushalmi', 'Zohar'}:
        return 'aramaic'

    # Share of characters that fall inside the Arabic script block
    arabic_count = sum('\u0600' <= ch <= '\u06FF' for ch in text)
    mostly_arabic = arabic_count > len(text) * 0.3
    return 'judeo_arabic' if mostly_arabic else 'hebrew'

# Maps a Sefaria category name to the time period assigned to its texts.
# load_sefaria_full looks up the first path component under "json/" here and
# falls back to TimePeriod.AMORAIC for categories that are not listed.
CATEGORY_TO_PERIOD = {
    'Tanakh': TimePeriod.BIBLICAL,
    'Torah': TimePeriod.BIBLICAL,
    'Prophets': TimePeriod.BIBLICAL,
    'Writings': TimePeriod.BIBLICAL,
    'Mishnah': TimePeriod.TANNAITIC,
    'Tosefta': TimePeriod.TANNAITIC,
    'Talmud': TimePeriod.AMORAIC,
    'Bavli': TimePeriod.AMORAIC,
    'Yerushalmi': TimePeriod.AMORAIC,
    'Midrash': TimePeriod.AMORAIC,
    'Halakhah': TimePeriod.RISHONIM,
    'Kabbalah': TimePeriod.RISHONIM,
    'Liturgy': TimePeriod.GEONIC,
    'Philosophy': TimePeriod.RISHONIM,
    'Chasidut': TimePeriod.ACHRONIM,
    'Musar': TimePeriod.ACHRONIM,
    'Responsa': TimePeriod.ACHRONIM,
    'Modern': TimePeriod.MODERN_HEBREW,
}

# One representative century per time period. Negative numbers presumably
# denote centuries BCE (the BCE periods in TimePeriod all map to negative
# values) -- TODO confirm.
PERIOD_TO_CENTURY = {
    TimePeriod.BIBLICAL: -6,
    TimePeriod.SECOND_TEMPLE: -2,
    TimePeriod.TANNAITIC: 2,
    TimePeriod.AMORAIC: 4,
    TimePeriod.GEONIC: 8,
    TimePeriod.RISHONIM: 12,
    TimePeriod.ACHRONIM: 17,
    TimePeriod.MODERN_HEBREW: 20,
    TimePeriod.CONFUCIAN: -4,
    TimePeriod.DAOIST: -5,
    TimePeriod.QURANIC: 7,
    TimePeriod.HADITH: 9,
    TimePeriod.DEAR_ABBY: 20,
}

def _flatten_sefaria_texts(h, e, ref, stem, category, time_period, century) -> List[Passage]:
    """Recursively pair parallel original/English structures into Passage objects.

    `h` and `e` are walked in lockstep: two strings form a leaf (one candidate
    passage), two lists are zipped element-wise, and any other combination
    yields nothing. `ref` is the dotted 1-based position within the document.
    """
    if isinstance(h, str) and isinstance(e, str):
        # Strip inline markup tags from both sides
        h_clean = re.sub(r'<[^>]+>', '', h).strip()
        e_clean = re.sub(r'<[^>]+>', '', e).strip()
        # Only the English side is length-filtered (labels come from English)
        if not 50 <= len(e_clean) <= 2000:
            return []
        lang = detect_sefaria_language(h_clean, category)
        pid = hashlib.md5(f"{stem}:{ref}:{h_clean[:50]}".encode()).hexdigest()[:12]
        return [Passage(
            id=f"sefaria_{pid}",
            text_original=h_clean,
            text_english=e_clean,
            time_period=time_period.name,
            century=century,
            source=f"{stem} {ref}".strip(),
            source_type="sefaria",
            category=category,
            language=lang,
            word_count=len(e_clean.split())
        )]

    if isinstance(h, list) and isinstance(e, list):
        result = []
        for i, (hh, ee) in enumerate(zip(h, e), start=1):
            child_ref = f"{ref}.{i}" if ref else str(i)
            result.extend(_flatten_sefaria_texts(
                hh, ee, child_ref, stem, category, time_period, century))
        return result

    return []


def load_sefaria_full(base_path: str) -> List[Passage]:
    """Load FULL Sefaria corpus with language detection.

    Scans every ``*.json`` file under ``<base_path>/json``. The first path
    component below that directory is used as the category, which selects the
    time period (default: AMORAIC) and century. Files that cannot be read or
    parsed are skipped and counted; non-dict documents are ignored.

    Args:
        base_path: Root of the Sefaria export.

    Returns:
        All extracted passages; an empty list when the json directory is missing.
    """
    passages = []
    json_path = Path(base_path) / "json"

    if not json_path.exists():
        print(f"Warning: {json_path} not found")
        return []

    json_files = list(json_path.rglob("*.json"))
    print(f"Found {len(json_files):,} JSON files")
    print("Processing ALL files...")

    skipped = 0
    for json_file in tqdm(json_files, desc="Sefaria", unit="file"):
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            # Unlike a bare except, KeyboardInterrupt is no longer swallowed.
            skipped += 1
            continue

        if not isinstance(data, dict):
            continue

        rel_path = json_file.relative_to(json_path)
        category = str(rel_path.parts[0]) if rel_path.parts else "unknown"
        time_period = CATEGORY_TO_PERIOD.get(category, TimePeriod.AMORAIC)
        century = PERIOD_TO_CENTURY.get(time_period, 0)

        # NOTE(review): when a file has neither 'he' nor 'en', both sides fall
        # back to 'text', so original and English text are identical -- confirm
        # this matches the layout of the export being used.
        hebrew = data.get('he', data.get('text', []))
        english = data.get('text', data.get('en', []))

        passages.extend(_flatten_sefaria_texts(
            hebrew, english, "", json_file.stem, category, time_period, century))

    if skipped:
        print(f"Skipped {skipped:,} unreadable or invalid JSON files")

    return passages

def load_chinese(path: str) -> List[Passage]:
    """Read the Chinese classics JSON file and wrap each record in a Passage.

    Every record must provide 'id', 'text_original', 'text_english', 'period',
    'century' and 'source'. The period doubles as the category, and the word
    count is taken from the English text.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        records = json.load(handle)

    return [
        Passage(
            id=rec['id'],
            text_original=rec['text_original'],
            text_english=rec['text_english'],
            time_period=rec['period'],
            century=rec['century'],
            source=rec['source'],
            source_type='chinese',
            category=rec['period'],
            language='classical_chinese',
            word_count=len(rec['text_english'].split()),
        )
        for rec in records
    ]

def load_islamic(path: str) -> List[Passage]:
    """Read the Islamic texts JSON file and convert each entry to a Passage.

    Entries carry 'id', 'text_original', 'text_english', 'period', 'century'
    and 'source'. All passages are tagged as Arabic, with the period reused as
    the category.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        entries = json.load(handle)

    def as_passage(entry) -> Passage:
        # Word count is computed from the English rendering
        return Passage(
            id=entry['id'],
            text_original=entry['text_original'],
            text_english=entry['text_english'],
            time_period=entry['period'],
            century=entry['century'],
            source=entry['source'],
            source_type='islamic',
            category=entry['period'],
            language='arabic',
            word_count=len(entry['text_english'].split()),
        )

    return list(map(as_passage, entries))

def load_dear_abby(path: str, max_passages: Optional[int] = None) -> List[Passage]:
    """Load Dear Abby corpus.

    Reads the CSV at `path` and turns each usable 'question_only' cell into an
    English Passage. Questions that are missing, or shorter than 50 or longer
    than 2000 characters, are dropped.

    Args:
        path: Location of the Dear Abby CSV.
        max_passages: Stop after this many passages; None or 0 means no limit.

    Returns:
        The list of passages in CSV row order.
    """
    passages = []
    df = pd.read_csv(path)

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Dear Abby", unit="row"):
        question = str(row.get('question_only', ''))
        # A missing cell is read as NaN, which str() turns into the text 'nan'
        if not question or question == 'nan' or len(question) < 50 or len(question) > 2000:
            continue

        # The .get() default only covers a missing column. A blank cell is NaN,
        # and int(NaN) raises ValueError, so NaN gets the same 1990 fallback.
        raw_year = row.get('year', 1990)
        year = 1990 if pd.isna(raw_year) else int(raw_year)
        pid = hashlib.md5(f"abby:{idx}:{question[:50]}".encode()).hexdigest()[:12]

        passages.append(Passage(
            id=f"abby_{pid}",
            text_original=question,
            text_english=question,
            time_period=TimePeriod.DEAR_ABBY.name,
            century=20 if year < 2000 else 21,
            source=f"Dear Abby {year}",
            source_type="dear_abby",
            category="advice",
            language="english",
            word_count=len(question.split())
        ))

        if max_passages and len(passages) >= max_passages:
            break

    return passages

# Load all corpora
print()
print("="*60)
print("LOADING ALL CORPORA")
print("="*60)
print()

all_passages = []

# 1. Sefaria (full) -- handled separately because it also gets a language breakdown
print("1. Loading Sefaria (full)...")
sefaria = load_sefaria_full("data/raw/Sefaria-Export")
print(f"   Loaded: {len(sefaria):,} passages")
all_passages += sefaria
del sefaria
gc.collect()

# Language breakdown for Sefaria
sefaria_langs = defaultdict(int)
for p in [p for p in all_passages if p.source_type == 'sefaria']:
    sefaria_langs[p.language] = sefaria_langs[p.language] + 1
print(f"   Languages: {dict(sefaria_langs)}")

# 2-4. The remaining corpora share one load / report / append / release cycle
for _banner, _loader, _corpus_path in (
    ("\n2. Loading Chinese classics...", load_chinese,
     "data/raw/chinese_classics/chinese_classics.json"),
    ("\n3. Loading Islamic texts...", load_islamic,
     "data/raw/islamic_texts/islamic_texts.json"),
    ("\n4. Loading Dear Abby...", load_dear_abby,
     "data/raw/dear_abby.csv"),
):
    print(_banner)
    _loaded = _loader(_corpus_path)
    print(f"   Loaded: {len(_loaded):,} passages")
    all_passages += _loaded
    # Drop the temporary list right away to keep peak memory down
    del _loaded

gc.collect()

print()
print("="*60)
print(f"TOTAL PASSAGES: {len(all_passages):,}")
print("="*60)

# Tally passages per source, language and time period in a single pass
by_source = defaultdict(int)
by_language = defaultdict(int)
by_period = defaultdict(int)
for p in all_passages:
    by_source[p.source_type] = by_source[p.source_type] + 1
    by_language[p.language] = by_language[p.language] + 1
    by_period[p.time_period] = by_period[p.time_period] + 1

# Each table is printed largest-first; ties keep first-seen order
print("\nBy source:")
for src, cnt in sorted(by_source.items(), key=lambda kv: kv[1], reverse=True):
    print(f"  {src:15s}: {cnt:>8,}")

print("\nBy language:")
for lang, cnt in sorted(by_language.items(), key=lambda kv: kv[1], reverse=True):
    print(f"  {lang:20s}: {cnt:>8,}")

# Only the ten most frequent periods are shown
print("\nBy time period:")
top_periods = sorted(by_period.items(), key=lambda kv: kv[1], reverse=True)
for period, cnt in top_periods[:10]:
    print(f"  {period:20s}: {cnt:>8,}")

print_resources("After loading")

mark_task("Preprocess all corpora", "done")


In [None]:
#@title 7. Extract Bond Structures { display-mode: "form" }
#@markdown Labels extracted from English translations for all languages.

import gc
import json
import re
from collections import defaultdict
from tqdm.auto import tqdm

mark_task("Extract bond structures", "running")

print("="*60)
print("EXTRACTING BOND STRUCTURES")
print("="*60)
print()
print("Labels derived from English translations for all corpora.")
print()

# Keyword regexes (whole-word matches) used to tag a passage's English text
# with bond types. Dictionary order matters: extract_bond_structure reports
# the first matching entry as the primary relation, so earlier keys win.
# Some keywords appear under two bond types ("danger" and "rescue" are in both
# HARM_PREVENTION and EMERGENCY).
RELATION_PATTERNS = {
    BondType.HARM_PREVENTION: [r'\b(kill|murder|harm|hurt|save|rescue|protect|danger|attack|injure|wound|destroy|blood|death|violence)\b'],
    BondType.RECIPROCITY: [r'\b(return|repay|owe|debt|mutual|exchange|give back|pay back|reciprocate|reward|recompense)\b'],
    BondType.AUTONOMY: [r'\b(choose|decision|consent|agree|force|coerce|right|freedom|liberty|self-determination|free will)\b'],
    BondType.PROPERTY: [r'\b(property|own|steal|theft|buy|sell|land|possess|belong|asset|wealth|money|inheritance)\b'],
    BondType.FAMILY: [r'\b(honor|parent|marry|divorce|inherit|family|mother|father|child|son|daughter|spouse|husband|wife|brother|sister|filial)\b'],
    BondType.AUTHORITY: [r'\b(obey|command|law|judge|rule|teach|leader|king|master|servant|subject|authority|govern|submit)\b'],
    BondType.CARE: [r'\b(care|help|assist|feed|clothe|visit|nurture|tend|support|comfort|mercy|compassion|kindness)\b'],
    BondType.FAIRNESS: [r'\b(fair|just|equal|deserve|bias|impartial|equity|discrimination|justice|righteous)\b'],
    BondType.EMERGENCY: [r'\b(emergency|urgent|crisis|danger|life-threatening|immediate|desperate|dire|peril|rescue)\b'],
    BondType.CONTRACT: [r'\b(contract|agreement|promise|vow|oath|covenant|pledge|commit|bind|treaty|negotiate|witness)\b'],
}

# Keyword regexes for the Hohfeldian state. They are tried in dictionary order
# (OBLIGATION, then RIGHT, then LIBERTY) and the first match wins. There is no
# entry for HohfeldianState.NO_RIGHT.
HOHFELD_PATTERNS = {
    HohfeldianState.OBLIGATION: [r'\b(must|shall|duty|require|should|ought|obligated|commanded)\b'],
    HohfeldianState.RIGHT: [r'\b(right to|entitled|deserve|claim|due)\b'],
    HohfeldianState.LIBERTY: [r'\b(may|can|permitted|allowed|free to|at liberty)\b'],
}

def extract_bond_structure(passage: Passage) -> Dict:
    """Derive keyword-based bond labels from a passage's English text.

    Returns a dict with:
      - 'bonds': one ``{'relation': name}`` entry per matching bond type, in
        RELATION_PATTERNS order (``NONE`` when nothing matches),
      - 'primary_relation': the first of those relations,
      - 'hohfeld_state': name of the first matching Hohfeldian state, or None,
      - 'signature': the sorted, de-duplicated relation names joined by "|".
    """
    text = passage.text_english.lower()

    def matches_any(patterns) -> bool:
        return any(re.search(pat, text, re.IGNORECASE) for pat in patterns)

    relations = [
        bond.name
        for bond, patterns in RELATION_PATTERNS.items()
        if matches_any(patterns)
    ] or ['NONE']

    # First Hohfeldian state (in dictionary order) whose patterns match
    hohfeld = next(
        (state.name for state, patterns in HOHFELD_PATTERNS.items() if matches_any(patterns)),
        None,
    )

    return {
        'bonds': [{'relation': name} for name in relations],
        'primary_relation': relations[0],
        'hohfeld_state': hohfeld,
        'signature': "|".join(sorted(set(relations)))
    }

print("Writing to disk...")

bond_counts = defaultdict(int)

# Passages and their bond structures go to two parallel JSONL files, written
# line for line in the same order so they can later be zipped together.
with open("data/processed/passages.jsonl", 'w') as f_pass, \
     open("data/processed/bond_structures.jsonl", 'w') as f_bond:

    for passage in tqdm(all_passages, desc="Extracting", unit="passage"):
        bond_struct = extract_bond_structure(passage)
        passage.bond_types = [b['relation'] for b in bond_struct['bonds']]

        for bond in passage.bond_types:
            bond_counts[bond] += 1

        bond_record = {
            'passage_id': passage.id,
            'bond_structure': bond_struct
        }
        f_pass.write(json.dumps(passage.to_dict()) + '\n')
        f_bond.write(json.dumps(bond_record) + '\n')

# Everything is on disk now; release the in-memory corpus
n_passages = len(all_passages)
del all_passages
gc.collect()

print(f"\nSaved {n_passages:,} passages")

print("\nBond distribution:")
# Percentages are relative to the number of bond labels, not passages
total_bonds = sum(bond_counts.values())
for bond, cnt in sorted(bond_counts.items(), key=lambda kv: kv[1], reverse=True):
    pct = 100 * cnt / total_bonds
    print(f"  {bond:20s}: {cnt:>8,} ({pct:5.1f}%)")

print_resources("After extraction")

mark_task("Extract bond structures", "done")


In [None]:
#@title 8. Generate Splits (Temporal + Lingual) { display-mode: "form" }
#@markdown Creates both temporal and language-based splits.

import random
import gc
import json
import shutil
from collections import defaultdict
from tqdm.auto import tqdm

random.seed(42)

mark_task("Generate splits (temporal + lingual)", "running")

print("="*60)
print("GENERATING SPLITS")
print("="*60)
print()

# Load only the lightweight metadata needed for splitting (no text)
print("Reading passage metadata...")
passage_meta = []
with open("data/processed/passages.jsonl", 'r') as f:
    for line in tqdm(f, desc="Reading", unit="line"):
        p = json.loads(line)
        passage_meta.append({
            key: p[key]
            for key in ('id', 'language', 'source_type', 'time_period', 'century')
        })

print(f"Total passages: {len(passage_meta):,}")

# Index passage ids by language, corpus and time period
by_language = defaultdict(list)
by_source = defaultdict(list)
by_period = defaultdict(list)

for p in passage_meta:
    for _index, _attr in ((by_language, 'language'),
                          (by_source, 'source_type'),
                          (by_period, 'time_period')):
        _index[p[_attr]].append(p['id'])

print("\nBy language:")
for lang, ids in sorted(by_language.items(), key=lambda kv: len(kv[1]), reverse=True):
    print(f"  {lang:20s}: {len(ids):>8,}")

print("\nBy source:")
for src, ids in sorted(by_source.items(), key=lambda kv: len(kv[1]), reverse=True):
    print(f"  {src:15s}: {len(ids):>8,}")

# ==========================================
# SPLIT 1: Temporal (Ancient -> Modern)
# ==========================================
print("\n" + "-"*60)
print("SPLIT 1: TEMPORAL (Ancient → Modern)")
print("-"*60)

# Periods used for training ("ancient") versus evaluation ("modern")
ancient_periods = {'BIBLICAL', 'SECOND_TEMPLE', 'TANNAITIC', 'AMORAIC', 'GEONIC', 'RISHONIM',
                   'CONFUCIAN', 'DAOIST', 'QURANIC', 'HADITH'}
modern_periods = {'ACHRONIM', 'MODERN_HEBREW', 'DEAR_ABBY'}

ancient_ids = [p['id'] for p in passage_meta if p['time_period'] in ancient_periods]
modern_ids = [p['id'] for p in passage_meta if p['time_period'] in modern_periods]

# Shuffle order matters for reproducibility: ancient first, then modern
random.shuffle(ancient_ids)
random.shuffle(modern_ids)

# Use 10% of modern for validation, the remaining 90% for testing
n_valid = len(modern_ids) // 10

split_temporal = dict(
    name='temporal_ancient_to_modern',
    train_ids=ancient_ids,
    valid_ids=modern_ids[:n_valid],
    test_ids=modern_ids[n_valid:],
    train_size=len(ancient_ids),
    valid_size=n_valid,
    test_size=len(modern_ids) - n_valid,
    description='Train on ancient texts (all languages), test on modern',
)
print("  Train: {train_size:,} | Valid: {valid_size:,} | Test: {test_size:,}".format(**split_temporal))

# ==========================================
# SPLIT 2: Semitic -> English
# ==========================================
print("\n" + "-"*60)
print("SPLIT 2: SEMITIC → ENGLISH")
print("-"*60)

# Training pool: every Semitic-language passage (fresh list, so shuffling it
# leaves the per-language lists untouched)
semitic_ids = [
    *by_language['hebrew'],
    *by_language['aramaic'],
    *by_language.get('judeo_arabic', []),
    *by_language['arabic'],
]
english_ids = by_language['english']

random.shuffle(semitic_ids)
random.shuffle(english_ids)

# 10% of English for validation, the rest for testing
n_valid = len(english_ids) // 10

split_semitic_to_english = dict(
    name='semitic_to_english',
    train_ids=semitic_ids,
    valid_ids=english_ids[:n_valid],
    test_ids=english_ids[n_valid:],
    train_size=len(semitic_ids),
    valid_size=n_valid,
    test_size=len(english_ids) - n_valid,
    description='Train on Hebrew/Aramaic/Arabic, test on English',
)
print("  Train: {train_size:,} | Valid: {valid_size:,} | Test: {test_size:,}".format(**split_semitic_to_english))

# ==========================================
# SPLIT 3: Chinese -> All Others
# ==========================================
print("\n" + "-"*60)
print("SPLIT 3: CHINESE → ALL OTHERS")
print("-"*60)

chinese_ids = by_language['classical_chinese']
other_ids = [p['id'] for p in passage_meta if p['language'] != 'classical_chinese']

random.shuffle(other_ids)

# Validation: at most 500 passages, at most half the Chinese training set, and
# never more than the evaluation pool actually contains.
n_valid = min(len(chinese_ids) // 2, 500, len(other_ids))
# Test: up to 10,000 of the passages that remain AFTER the validation slice.
# Taking the minimum against the full pool size over-reported test_size
# whenever the pool held fewer than n_valid + 10,000 passages.
n_test = min(len(other_ids) - n_valid, 10000)

split_chinese_to_others = {
    'name': 'chinese_to_others',
    'train_ids': chinese_ids,
    'valid_ids': other_ids[:n_valid],
    'test_ids': other_ids[n_valid:n_valid+n_test],
    'train_size': len(chinese_ids),
    'valid_size': n_valid,
    'test_size': n_test,
    'description': 'Train on Classical Chinese, test on other languages'
}
print(f"  Train: {split_chinese_to_others['train_size']:,} | Valid: {split_chinese_to_others['valid_size']:,} | Test: {split_chinese_to_others['test_size']:,}")

# ==========================================
# SPLIT 4: Mixed (In-Domain Baseline)
# ==========================================
print("\n" + "-"*60)
print("SPLIT 4: MIXED (In-Domain Baseline)")
print("-"*60)

all_ids = [p['id'] for p in passage_meta]
random.shuffle(all_ids)

# 70% train / 15% validation / remainder test
n = len(all_ids)
n_train = int(0.7 * n)
n_valid = int(0.15 * n)

split_mixed = dict(
    name='mixed_baseline',
    train_ids=all_ids[:n_train],
    valid_ids=all_ids[n_train:n_train + n_valid],
    test_ids=all_ids[n_train + n_valid:],
    train_size=n_train,
    valid_size=n_valid,
    test_size=n - n_train - n_valid,
    description='Random split across all corpora (in-domain baseline)',
)
print("  Train: {train_size:,} | Valid: {valid_size:,} | Test: {test_size:,}".format(**split_mixed))

# ==========================================
# SPLIT 5: Hebrew-only -> Others
# ==========================================
print("\n" + "-"*60)
print("SPLIT 5: HEBREW → NON-HEBREW")
print("-"*60)

hebrew_ids = by_language['hebrew']
non_hebrew_ids = [p['id'] for p in passage_meta if p['language'] != 'hebrew']

random.shuffle(hebrew_ids)
random.shuffle(non_hebrew_ids)

# Validation: 10% of the non-Hebrew pool, capped at 5,000.
# Test: whatever remains after validation, capped at 20,000.
n_valid = min(len(non_hebrew_ids) // 10, 5000)
n_test = min(len(non_hebrew_ids) - n_valid, 20000)

split_hebrew_to_others = dict(
    name='hebrew_to_others',
    train_ids=hebrew_ids,
    valid_ids=non_hebrew_ids[:n_valid],
    test_ids=non_hebrew_ids[n_valid:n_valid + n_test],
    train_size=len(hebrew_ids),
    valid_size=n_valid,
    test_size=n_test,
    description='Train on Hebrew only, test on all other languages',
)
print("  Train: {train_size:,} | Valid: {valid_size:,} | Test: {test_size:,}".format(**split_hebrew_to_others))

# Collect every split under its own 'name' and write them to a single file
all_splits = {
    s['name']: s
    for s in (
        split_temporal,
        split_semitic_to_english,
        split_chinese_to_others,
        split_hebrew_to_others,
        split_mixed,
    )
}

with open("data/splits/all_splits.json", 'w') as f:
    f.write(json.dumps(all_splits, indent=2))

# Compute baselines
print("\n" + "="*60)
print("COMPUTING BASELINES")
print("="*60)

from itertools import zip_longest

bond_counts = defaultdict(int)
time_counts = defaultdict(int)
lang_counts = defaultdict(int)

# Walk the two parallel JSONL files in lockstep. zip_longest (not zip) is used
# so that a length mismatch is detected instead of being silently truncated.
with open("data/processed/passages.jsonl", 'r') as fp, \
     open("data/processed/bond_structures.jsonl", 'r') as fb:
    for _line_no, (p_line, b_line) in enumerate(zip_longest(fp, fb), start=1):
        if p_line is None or b_line is None:
            raise ValueError(
                f"Line count mismatch between passages.jsonl and "
                f"bond_structures.jsonl at line {_line_no}"
            )

        p = json.loads(p_line)
        b = json.loads(b_line)

        # ID integrity check (an explicit raise still runs under `python -O`)
        if b['passage_id'] != p['id']:
            raise ValueError(f"ID mismatch: {b['passage_id']} != {p['id']}")

        bond_counts[b['bond_structure']['primary_relation']] += 1
        time_counts[p['time_period']] += 1
        lang_counts[p['language']] += 1

print("ID integrity check: PASSED ✓")

# Class counts are the number of labels actually OBSERVED in the data, which
# may be smaller than the size of the corresponding label set.
N_BOND = len(bond_counts)
N_TIME = len(time_counts)
N_LANG = len(lang_counts)

baselines = {
    'bond_counts': dict(bond_counts),
    'time_counts': dict(time_counts),
    'language_counts': dict(lang_counts),
    # Uniform-guess accuracy; 0.0 instead of a ZeroDivisionError on empty data
    'chance_bond': 1.0 / N_BOND if N_BOND else 0.0,
    'chance_time': 1.0 / N_TIME if N_TIME else 0.0,
    'chance_language': 1.0 / N_LANG if N_LANG else 0.0,
    'n_bond_classes': N_BOND,
    'n_time_classes': N_TIME,
    'n_language_classes': N_LANG,
}

with open("data/splits/baselines.json", 'w') as f:
    json.dump(baselines, f, indent=2)

print(f"\nChance baselines:")
print(f"  Bond:     {baselines['chance_bond']:.1%} ({N_BOND} classes)")
print(f"  Time:     {baselines['chance_time']:.1%} ({N_TIME} classes)")
print(f"  Language: {baselines['chance_language']:.1%} ({N_LANG} classes)")

# Save to Drive
print("\nSaving to Google Drive...")
shutil.copytree("data/processed", f"{SAVE_DIR}/processed", dirs_exist_ok=True)
shutil.copytree("data/splits", f"{SAVE_DIR}/splits", dirs_exist_ok=True)

del passage_meta
gc.collect()

print_resources("After splits")

mark_task("Generate splits (temporal + lingual)", "done")


In [None]:
#@title 9. Define Model Architecture { display-mode: "form" }
#@markdown BIP model with multilingual encoder.

import gc
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer
from tqdm.auto import tqdm

print("="*60)
print("MODEL ARCHITECTURE")
print("="*60)
print()

gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

class GradientReversal(torch.autograd.Function):
    """Autograd function that copies its input forward and flips gradients backward.

    The forward pass returns a clone of ``x``. The backward pass multiplies the
    incoming gradient by ``-lambda_``; ``lambda_`` itself receives no gradient.
    """

    @staticmethod
    def forward(ctx, x, lambda_):
        # Remember the scaling factor for the backward pass
        ctx.lambda_ = lambda_
        return torch.clone(x)

    @staticmethod
    def backward(ctx, grad_output):
        reversed_grad = grad_output.neg() * ctx.lambda_
        return reversed_grad, None

def gradient_reversal(x, lambda_=1.0):
    """Apply the GradientReversal autograd function to `x` with strength `lambda_`."""
    return GradientReversal.apply(x, lambda_)

class BIPEncoder(nn.Module):
    """Pretrained transformer encoder with attention-masked mean pooling.

    Args:
        model_name: Hugging Face model identifier passed to
            ``AutoModel.from_pretrained``.

    Attributes:
        encoder: The loaded transformer.
        d_model: Width of the pooled sentence embedding.
    """

    def __init__(self, model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        # Read the embedding width from the loaded model so that a different
        # `model_name` reports the right size. 384 (the width of the default
        # MiniLM model) stays as the fallback for configs without `hidden_size`.
        self.d_model = getattr(self.encoder.config, 'hidden_size', 384)

    def forward(self, input_ids, attention_mask):
        """Return one mean-pooled embedding per input sequence.

        Token states are averaged over positions where `attention_mask` is
        non-zero. The denominator is clamped so an all-zero mask does not
        divide by zero.
        """
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        hidden = outputs.last_hidden_state
        mask = attention_mask.unsqueeze(-1).float()
        return (hidden * mask).sum(1) / mask.sum(1).clamp(min=1e-9)

class BIPModel(nn.Module):
    """Encoder plus bond/label projections and four linear classification heads.

    The time-period and language heads read the bond embedding through a
    gradient-reversal layer (adversarial heads). The Hohfeld and bond-type
    heads read the bond embedding directly.
    """

    def __init__(self, d_model=384, d_bond=64, d_label=32, n_periods=13, n_languages=6, n_hohfeld=4, n_bonds=11):
        super().__init__()
        self.encoder = BIPEncoder()

        # Two structurally identical MLP projections with different output widths
        self.bond_proj = self._projection(d_model, d_bond)
        self.label_proj = self._projection(d_model, d_label)

        # Classifiers -- all operate on the bond embedding
        self.time_classifier = nn.Linear(d_bond, n_periods)
        self.language_classifier = nn.Linear(d_bond, n_languages)
        self.hohfeld_classifier = nn.Linear(d_bond, n_hohfeld)
        self.bond_classifier = nn.Linear(d_bond, n_bonds)

    @staticmethod
    def _projection(d_in, d_out):
        """Build Linear -> ReLU -> Dropout(0.1) -> Linear with a d_in // 2 hidden layer."""
        d_hidden = d_in // 2
        return nn.Sequential(
            nn.Linear(d_in, d_hidden),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(d_hidden, d_out)
        )

    def forward(self, input_ids, attention_mask, adversarial_lambda=1.0):
        """Return embeddings and the logits of all four heads as a dict."""
        pooled = self.encoder(input_ids, attention_mask)
        z_bond = self.bond_proj(pooled)
        z_label = self.label_proj(pooled)

        # Same values as z_bond, but gradients flowing back are reversed
        z_bond_adv = gradient_reversal(z_bond, adversarial_lambda)

        outputs = {'z_bond': z_bond, 'z_label': z_label}
        outputs['time_pred'] = self.time_classifier(z_bond_adv)
        outputs['language_pred'] = self.language_classifier(z_bond_adv)
        outputs['hohfeld_pred'] = self.hohfeld_classifier(z_bond)
        outputs['bond_pred'] = self.bond_classifier(z_bond)
        return outputs

    def extract_z_bond(self, input_ids, attention_mask):
        """Compute the bond embedding without tracking gradients."""
        with torch.no_grad():
            return self.bond_proj(self.encoder(input_ids, attention_mask))

# Index mappings: label string -> integer class index used as training target.
# Time period, language and bond indices equal the values of the
# TimePeriod, Language and BondType enums.
TIME_PERIOD_TO_IDX = {
    'BIBLICAL': 0, 'SECOND_TEMPLE': 1, 'TANNAITIC': 2, 'AMORAIC': 3,
    'GEONIC': 4, 'RISHONIM': 5, 'ACHRONIM': 6, 'MODERN_HEBREW': 7,
    'CONFUCIAN': 8, 'DAOIST': 9, 'QURANIC': 10, 'HADITH': 11, 'DEAR_ABBY': 12
}

LANGUAGE_TO_IDX = {
    'hebrew': 0, 'aramaic': 1, 'judeo_arabic': 2,
    'classical_chinese': 3, 'arabic': 4, 'english': 5
}

# None (no Hohfeldian keyword found) is its own class, index 3.
# NOTE(review): this ordering differs from the HohfeldianState enum
# (RIGHT=0, OBLIGATION=1) -- confirm which one downstream code relies on.
HOHFELD_TO_IDX = {'OBLIGATION': 0, 'RIGHT': 1, 'LIBERTY': 2, None: 3}

BOND_TYPE_TO_IDX = {
    'HARM_PREVENTION': 0, 'RECIPROCITY': 1, 'AUTONOMY': 2, 'PROPERTY': 3,
    'FAMILY': 4, 'AUTHORITY': 5, 'EMERGENCY': 6, 'CONTRACT': 7,
    'CARE': 8, 'FAIRNESS': 9, 'NONE': 10
}

class MoralDataset(Dataset):
    """Torch dataset over the passages selected by a set of passage ids.

    The passage and bond-structure JSONL files are read in lockstep. A line
    pair whose ids disagree is ignored. Matching passages are kept in memory
    and tokenized lazily in ``__getitem__``.
    """

    def __init__(self, passage_ids: set, passages_file: str, bonds_file: str, tokenizer, max_len=64):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.data = []

        with open(passages_file, 'r') as fp, open(bonds_file, 'r') as fb:
            for p_line, b_line in tqdm(zip(fp, fb), desc="Loading", unit="line"):
                p = json.loads(p_line)
                b = json.loads(b_line)

                # Skip misaligned line pairs and passages outside this split
                if b['passage_id'] != p['id'] or p['id'] not in passage_ids:
                    continue

                # Use original text for non-English, English text for English
                text_key = 'text_english' if p['language'] == 'english' else 'text_original'
                structure = b['bond_structure']

                self.data.append({
                    'text': p[text_key][:1000],
                    'time_period': p['time_period'],
                    'language': p['language'],
                    'source_type': p['source_type'],
                    'hohfeld': structure['hohfeld_state'],
                    'bond': structure['primary_relation']
                })

        print(f"  Loaded {len(self.data):,} samples")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize sample `idx` and attach its integer labels.

        Labels missing from the index maps fall back to the last class of the
        respective map (12 / 5 / 3 / 10).
        """
        item = self.data[idx]
        enc = self.tokenizer(
            item['text'],
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            return_tensors='pt',
        )
        sample = {
            'input_ids': enc['input_ids'].squeeze(0),
            'attention_mask': enc['attention_mask'].squeeze(0),
        }
        sample['time_label'] = TIME_PERIOD_TO_IDX.get(item['time_period'], 12)
        sample['language_label'] = LANGUAGE_TO_IDX.get(item['language'], 5)
        sample['hohfeld_label'] = HOHFELD_TO_IDX.get(item['hohfeld'], 3)
        sample['bond_label'] = BOND_TYPE_TO_IDX.get(item['bond'], 10)
        sample['source_type'] = item['source_type']
        sample['language'] = item['language']
        return sample

def collate_fn(batch):
    """Merge a list of dataset samples into one batch dict.

    Token tensors are stacked, integer labels become 1-D tensors (note the
    pluralised key names), and the string fields are returned as plain lists.
    """
    def column(key):
        return [sample[key] for sample in batch]

    collated = {
        'input_ids': torch.stack(column('input_ids')),
        'attention_mask': torch.stack(column('attention_mask')),
    }
    for out_key, in_key in (
        ('time_labels', 'time_label'),
        ('language_labels', 'language_label'),
        ('hohfeld_labels', 'hohfeld_label'),
        ('bond_labels', 'bond_label'),
    ):
        collated[out_key] = torch.tensor(column(in_key))
    collated['source_types'] = column('source_type')
    collated['languages'] = column('language')
    return collated

print("Model defined!")
print(f"  Time periods: {len(TIME_PERIOD_TO_IDX)}")
print(f"  Languages: {len(LANGUAGE_TO_IDX)}")
print(f"  Bond types: {len(BOND_TYPE_TO_IDX)}")
print(f"  Base batch: {BASE_BATCH_SIZE}")

print_resources("Model defined")


In [None]:
#@title 10. Train BIP Model { display-mode: "form" }
#@markdown Trains on selected splits with adversarial time/language invariance.

#@markdown **Select splits to train:**
TRAIN_TEMPORAL = True  #@param {type:"boolean"}
TRAIN_SEMITIC_TO_ENGLISH = True  #@param {type:"boolean"}
TRAIN_HEBREW_TO_OTHERS = True  #@param {type:"boolean"}
TRAIN_MIXED_BASELINE = True  #@param {type:"boolean"}

import gc
import json
import os
import time
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from sklearn.metrics import f1_score
from tqdm.auto import tqdm

# Cell overview
# -------------
# For every selected split this cell:
#   1. builds train/valid/test MoralDataset objects from the split's id lists,
#   2. trains a fresh BIPModel for up to 5 epochs on the sum of the four
#      cross-entropy losses (hohfeld, bond, time, language),
#   3. keeps the checkpoint with the lowest validation bond loss
#      (early stop after 3 epochs without improvement),
#   4. evaluates that checkpoint on the test set and stores the metrics in
#      `all_results[split_name]`.
# Names such as BIPModel, MoralDataset, collate_fn, device, USE_AMP, scaler,
# BASE_BATCH_SIZE, IS_L4, IS_A100 and mark_task come from earlier cells.

mark_task("Train BIP model", "running")

gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("="*60)
print("TRAINING BIP MODEL")
print("="*60)
print()

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# Select splits to train
splits_to_train = []
if TRAIN_TEMPORAL:
    splits_to_train.append('temporal_ancient_to_modern')
if TRAIN_SEMITIC_TO_ENGLISH:
    splits_to_train.append('semitic_to_english')
if TRAIN_HEBREW_TO_OTHERS:
    splits_to_train.append('hebrew_to_others')
if TRAIN_MIXED_BASELINE:
    splits_to_train.append('mixed_baseline')

print(f"Training {len(splits_to_train)} splits: {splits_to_train}")

# The split file holds the id lists for every split; parse it once rather
# than once per split.
with open("data/splits/all_splits.json", 'r') as f:
    splits = json.load(f)

# torch.save does not create missing parent directories.
os.makedirs("models/checkpoints", exist_ok=True)

all_results = {}

for split_idx, split_name in enumerate(splits_to_train):
    split_start = time.time()
    print()
    print("="*60)
    print(f"TRAINING [{split_idx+1}/{len(splits_to_train)}]: {split_name}")
    print("="*60)

    split = splits[split_name]
    checkpoint_path = f"models/checkpoints/best_model_{split_name}.pt"

    print(f"Description: {split.get('description', '')}")
    print(f"Train: {split['train_size']:,} | Valid: {split['valid_size']:,} | Test: {split['test_size']:,}")
    print()

    # Create datasets
    print("Creating datasets...")
    train_dataset = MoralDataset(set(split['train_ids']), "data/processed/passages.jsonl",
                                  "data/processed/bond_structures.jsonl", tokenizer)
    valid_dataset = MoralDataset(set(split['valid_ids']), "data/processed/passages.jsonl",
                                  "data/processed/bond_structures.jsonl", tokenizer)
    test_dataset = MoralDataset(set(split['test_ids']), "data/processed/passages.jsonl",
                                 "data/processed/bond_structures.jsonl", tokenizer)

    # Bail out before allocating a model or spending time on training when
    # the split cannot be trained or evaluated.
    if len(train_dataset) == 0:
        print("ERROR: No training data!")
        continue
    if len(test_dataset) == 0:
        print("ERROR: No test data!")
        continue

    # Create model (after the checks above so a skipped split never holds
    # device memory).
    model = BIPModel().to(device)

    # Batch size scaling
    batch_size = min(BASE_BATCH_SIZE, max(32, len(train_dataset) // 20))
    # drop_last=True discards an incomplete final batch; cap the batch size
    # at the dataset size so a tiny train set still yields at least one batch.
    batch_size = min(batch_size, len(train_dataset))
    num_workers = 4 if IS_L4 or IS_A100 else 2

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              collate_fn=collate_fn, drop_last=True, num_workers=num_workers,
                              pin_memory=True, prefetch_factor=4, persistent_workers=True)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size*2, shuffle=False,
                              collate_fn=collate_fn, num_workers=num_workers, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size*2, shuffle=False,
                             collate_fn=collate_fn, num_workers=num_workers, pin_memory=True)

    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

    n_epochs = 5
    best_valid_loss = float('inf')
    patience_counter = 0
    checkpoint_saved = False

    print(f"Training {n_epochs} epochs (batch={batch_size})...")

    for epoch in range(1, n_epochs + 1):
        epoch_start = time.time()
        model.train()
        total_loss = 0
        n_batches = 0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch}", unit="batch", leave=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            time_labels = batch['time_labels'].to(device)
            language_labels = batch['language_labels'].to(device)
            hohfeld_labels = batch['hohfeld_labels'].to(device)
            bond_labels = batch['bond_labels'].to(device)

            with torch.cuda.amp.autocast(enabled=USE_AMP):
                # NOTE(review): adversarial_lambda presumably scales a
                # gradient-reversal term inside BIPModel - confirm there.
                outputs = model(input_ids, attention_mask, adversarial_lambda=1.0)

                loss_time = F.cross_entropy(outputs['time_pred'], time_labels)
                loss_lang = F.cross_entropy(outputs['language_pred'], language_labels)
                loss_hohfeld = F.cross_entropy(outputs['hohfeld_pred'], hohfeld_labels)
                loss_bond = F.cross_entropy(outputs['bond_pred'], bond_labels)

            loss = loss_hohfeld + loss_bond + loss_time + loss_lang

            optimizer.zero_grad()
            if USE_AMP and scaler:
                scaler.scale(loss).backward()
                # Unscale before clipping so the norm threshold applies to
                # the true gradients.
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                scaler.step(optimizer)
                scaler.update()
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()

            total_loss += loss.item()
            n_batches += 1

        train_loss = total_loss / n_batches

        # Validation (bond loss only; this is the model-selection criterion)
        model.eval()
        valid_loss = 0.0
        with torch.no_grad():
            for batch in valid_loader:
                outputs = model(batch['input_ids'].to(device), batch['attention_mask'].to(device), 0)
                valid_loss += F.cross_entropy(outputs['bond_pred'], batch['bond_labels'].to(device)).item()
        if len(valid_loader) > 0:
            valid_loss /= len(valid_loader)
        else:
            # No validation data for this split: select on the training loss
            # instead of dividing by zero.
            valid_loss = train_loss

        epoch_time = time.time() - epoch_start
        print(f"Epoch {epoch}: Loss={train_loss:.4f}/{valid_loss:.4f} | {epoch_time:.1f}s")

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= 3:
                print("Early stopping")
                break

    if not checkpoint_saved:
        # Happens when the validation loss is never below +inf (e.g. NaN).
        # Save the final weights so evaluation cannot silently pick up a
        # stale checkpoint left over from a previous run.
        print("WARNING: validation loss never improved; saving final weights")
        torch.save(model.state_dict(), checkpoint_path)

    # Evaluate
    print("\nEvaluating...")
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    model.eval()

    all_preds = {'time': [], 'lang': [], 'bond': []}
    all_labels = {'time': [], 'lang': [], 'bond': []}
    all_languages = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Testing", unit="batch"):
            outputs = model(batch['input_ids'].to(device), batch['attention_mask'].to(device), 0)

            all_preds['time'].extend(outputs['time_pred'].argmax(-1).cpu().tolist())
            all_preds['lang'].extend(outputs['language_pred'].argmax(-1).cpu().tolist())
            all_preds['bond'].extend(outputs['bond_pred'].argmax(-1).cpu().tolist())
            all_labels['time'].extend(batch['time_labels'].tolist())
            all_labels['lang'].extend(batch['language_labels'].tolist())
            all_labels['bond'].extend(batch['bond_labels'].tolist())
            all_languages.extend(batch['languages'])

    # Metrics
    time_acc = sum(p == l for p, l in zip(all_preds['time'], all_labels['time'])) / len(all_preds['time'])
    lang_acc = sum(p == l for p, l in zip(all_preds['lang'], all_labels['lang'])) / len(all_preds['lang'])
    bond_acc = sum(p == l for p, l in zip(all_preds['bond'], all_labels['bond'])) / len(all_preds['bond'])
    bond_f1 = f1_score(all_labels['bond'], all_preds['bond'], average='macro', zero_division=0)

    # Per-language bond F1
    lang_bond_f1 = {}
    for lang in set(all_languages):
        mask = [l == lang for l in all_languages]
        preds = [p for p, m in zip(all_preds['bond'], mask) if m]
        labels = [l for l, m in zip(all_labels['bond'], mask) if m]
        lang_bond_f1[lang] = {
            'f1': f1_score(labels, preds, average='macro', zero_division=0),
            'n': sum(mask)
        }

    split_time = time.time() - split_start

    all_results[split_name] = {
        'time_acc': time_acc,
        'language_acc': lang_acc,
        'bond_acc': bond_acc,
        'bond_f1_macro': bond_f1,
        'per_language_f1': lang_bond_f1,
        'training_time': split_time
    }

    print(f"\n{split_name} RESULTS ({split_time/60:.1f} min):")
    print(f"  Time accuracy (adversary): {time_acc:.1%}")
    print(f"  Language accuracy (adversary): {lang_acc:.1%}")
    print(f"  Bond F1 (macro): {bond_f1:.3f}")
    print("  Per-language Bond F1:")
    for lang, metrics in sorted(lang_bond_f1.items(), key=lambda x: -x[1]['n']):
        print(f"    {lang:20s}: F1={metrics['f1']:.3f} (n={metrics['n']:,})")

    # Cleanup: drop the loaders too, otherwise the persistent training
    # workers stay alive and accumulate across splits.
    del model, optimizer, train_loader, valid_loader, test_loader
    del train_dataset, valid_dataset, test_dataset
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

print()
print("="*60)
print("TRAINING COMPLETE")
print("="*60)

mark_task("Train BIP model", "done")


In [None]:
#@title 11. Linear Probe Test { display-mode: "form" }
#@markdown Tests if time/language can be decoded from frozen z_bond.

import os

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Cell overview
# -------------
# For every trained split except 'mixed_baseline', reload the best checkpoint,
# extract z_bond for the split's test set, and fit logistic-regression probes
# for the time and language labels on one random half, scoring on the other.
# A probe within 10 points of uniform chance is reported as "invariant".
# Relies on names from earlier cells: all_results, BIPModel, MoralDataset,
# collate_fn, tokenizer, device, BASE_BATCH_SIZE, DataLoader, tqdm, torch,
# json, gc, mark_task.

mark_task("Linear probe test", "running")

print("="*60)
print("LINEAR PROBE TEST")
print("="*60)
print()


def _fit_probe(X_train, y_train, X_test, y_test):
    """Fit a logistic-regression probe; return (held-out accuracy, chance).

    Chance is uniform over the classes present in the held-out half.  When
    the training half contains a single class, LogisticRegression.fit would
    raise ValueError, so the constant prediction of that class is scored
    instead (such a probe carries no information about leakage).
    """
    chance = 1.0 / len(np.unique(y_test))
    train_classes = np.unique(y_train)
    if len(train_classes) < 2:
        print("  NOTE: only one class present; probe is not informative")
        return float((y_test == train_classes[0]).mean()), chance
    # multi_class is deprecated in scikit-learn; the default already fits a
    # multinomial model for more than two classes.
    probe = LogisticRegression(max_iter=1000, n_jobs=-1)
    probe.fit(X_train, y_train)
    return float((probe.predict(X_test) == y_test).mean()), chance


linear_probe_results = {}

# Parse the split definitions once for all probes.
with open("data/splits/all_splits.json", 'r') as f:
    probe_splits = json.load(f)

for split_name in all_results.keys():
    if split_name == 'mixed_baseline':
        continue  # Skip baseline for probe test

    print(f"\n{'='*50}")
    print(f"PROBE: {split_name}")
    print(f"{'='*50}")

    split = probe_splits[split_name]

    test_dataset = MoralDataset(set(split['test_ids']), "data/processed/passages.jsonl",
                                 "data/processed/bond_structures.jsonl", tokenizer)
    if len(test_dataset) < 2:
        # A 50/50 probe split needs at least one sample on each side.
        print("Skipping probe: fewer than 2 test samples")
        continue
    test_loader = DataLoader(test_dataset, batch_size=BASE_BATCH_SIZE, collate_fn=collate_fn, num_workers=4)

    model = BIPModel().to(device)
    model.load_state_dict(torch.load(f"models/checkpoints/best_model_{split_name}.pt",
                                     map_location=device))
    model.eval()

    for param in model.parameters():
        param.requires_grad = False

    print("Extracting z_bond...")
    all_z = []
    all_time = []
    all_lang = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Extract", unit="batch"):
            z = model.extract_z_bond(batch['input_ids'].to(device), batch['attention_mask'].to(device))
            all_z.append(z.cpu().numpy())
            all_time.extend(batch['time_labels'].tolist())
            all_lang.extend(batch['language_labels'].tolist())

    X = np.vstack(all_z)
    y_time = np.array(all_time)
    y_lang = np.array(all_lang)

    # 50/50 split
    np.random.seed(42)
    idx = np.random.permutation(len(X))
    train_idx, test_idx = idx[:len(idx)//2], idx[len(idx)//2:]

    # Standardize using statistics from the probe-training half only, so the
    # held-out half does not leak into the preprocessing.  The variable is
    # deliberately NOT called `scaler`: that name is the notebook-global AMP
    # gradient scaler used by the training cell.
    feature_scaler = StandardScaler()
    X_train = feature_scaler.fit_transform(X[train_idx])
    X_test = feature_scaler.transform(X[test_idx])

    # Time probe
    print("\nFitting time probe...")
    time_probe_acc, time_chance = _fit_probe(X_train, y_time[train_idx], X_test, y_time[test_idx])

    # Language probe
    print("Fitting language probe...")
    lang_probe_acc, lang_chance = _fit_probe(X_train, y_lang[train_idx], X_test, y_lang[test_idx])

    time_invariant = time_probe_acc < (time_chance + 0.10)
    lang_invariant = lang_probe_acc < (lang_chance + 0.10)

    print(f"\nRESULTS:")
    print(f"  Time probe:     {time_probe_acc:.1%} (chance: {time_chance:.1%}) {'✓ INVARIANT' if time_invariant else '✗ LEAKAGE'}")
    print(f"  Language probe: {lang_probe_acc:.1%} (chance: {lang_chance:.1%}) {'✓ INVARIANT' if lang_invariant else '✗ LEAKAGE'}")

    linear_probe_results[split_name] = {
        'time_probe_acc': float(time_probe_acc),
        'time_chance': float(time_chance),
        'time_invariant': bool(time_invariant),
        'lang_probe_acc': float(lang_probe_acc),
        'lang_chance': float(lang_chance),
        'lang_invariant': bool(lang_invariant)
    }

    del model, test_loader, test_dataset
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

os.makedirs('results', exist_ok=True)
with open('results/linear_probe_results.json', 'w') as f:
    json.dump(linear_probe_results, f, indent=2)

print()
print("="*60)
print("PROBE SUMMARY")
print("="*60)
for name, res in linear_probe_results.items():
    time_status = "✓" if res['time_invariant'] else "✗"
    lang_status = "✓" if res['lang_invariant'] else "✗"
    print(f"{name}:")
    print(f"  Time: {res['time_probe_acc']:.1%} vs {res['time_chance']:.1%} {time_status}")
    print(f"  Lang: {res['lang_probe_acc']:.1%} vs {res['lang_chance']:.1%} {lang_status}")

mark_task("Linear probe test", "done")


In [None]:
#@title 12. Final Evaluation { display-mode: "form" }
#@markdown Comprehensive results summary.

import json
import os
import time

# Cell overview
# -------------
# Combines the training metrics (`all_results`) and the probe outcomes
# (`linear_probe_results`) into a per-split report, derives an overall
# verdict from simple counts, and writes everything to
# results/final_results.json and {SAVE_DIR}/final_results.json.
# 'mixed_baseline' is the in-domain reference: it supplies the baseline F1
# and is excluded from every cross-domain transfer statistic.

mark_task("Evaluate results", "running")

print("="*60)
print("FINAL BIP EVALUATION (v8 Multilingual)")
print("="*60)
print()

# Load baselines
with open("data/splits/baselines.json", 'r') as f:
    baselines = json.load(f)

chance_bond = baselines['chance_bond']
chance_time = baselines['chance_time']
chance_lang = baselines['chance_language']

baseline_f1 = all_results.get('mixed_baseline', {}).get('bond_f1_macro', 0)

# Cross-domain splits only; the baseline is not a transfer experiment.
transfer_results = {name: res for name, res in all_results.items() if name != 'mixed_baseline'}

print("="*60)
print("CROSS-DOMAIN TRANSFER RESULTS")
print("="*60)

for split_name, res in transfer_results.items():
    print(f"\n{split_name.upper()}")
    print("-"*50)

    probe = linear_probe_results.get(split_name, {})

    # Invariance
    time_inv = "✓" if probe.get('time_invariant', False) else "✗"
    lang_inv = "✓" if probe.get('lang_invariant', False) else "✗"
    print(f"  Time invariant (probe):     {time_inv} ({probe.get('time_probe_acc', 0):.1%} vs {probe.get('time_chance', 0):.1%})")
    print(f"  Language invariant (probe): {lang_inv} ({probe.get('lang_probe_acc', 0):.1%} vs {probe.get('lang_chance', 0):.1%})")

    # Transfer
    bond_f1 = res['bond_f1_macro']
    degradation = baseline_f1 - bond_f1 if baseline_f1 > 0 else 0
    # Guard the ratio so a zero chance level in baselines.json cannot abort
    # the report.
    chance_ratio = bond_f1 / chance_bond if chance_bond > 0 else float('nan')
    print(f"  Bond F1 (macro):            {bond_f1:.3f}")
    print(f"  vs in-domain baseline:      {baseline_f1:.3f} (degradation: {degradation:+.3f})")
    print(f"  vs chance ({chance_bond:.1%}):          {chance_ratio:.1f}x")

    # Per-language
    if 'per_language_f1' in res:
        print("  Per-language F1:")
        for lang, metrics in sorted(res['per_language_f1'].items(), key=lambda x: -x[1]['n']):
            print(f"    {lang:20s}: {metrics['f1']:.3f} (n={metrics['n']:,})")

# Verdict
print()
print("="*60)
print("VERDICT")
print("="*60)

# Count successes
n_time_inv = sum(1 for r in linear_probe_results.values() if r.get('time_invariant', False))
n_lang_inv = sum(1 for r in linear_probe_results.values() if r.get('lang_invariant', False))
# Only cross-domain splits count as transfer; including the in-domain
# baseline here would inflate the verdict.
n_transfer = sum(1 for r in transfer_results.values() if r.get('bond_f1_macro', 0) > chance_bond * 1.5)

print(f"\nTime-invariant splits:     {n_time_inv}/{len(linear_probe_results)}")
print(f"Language-invariant splits: {n_lang_inv}/{len(linear_probe_results)}")
print(f"Transfer above 1.5x chance: {n_transfer}/{len(transfer_results)}")

if n_time_inv >= 2 and n_lang_inv >= 2 and n_transfer >= 3:
    verdict = "STRONGLY_SUPPORTED"
    print("\n" + "="*60)
    print("   BIP: STRONGLY SUPPORTED")
    print("   Cross-temporal AND cross-lingual transfer demonstrated")
    print("="*60)
elif n_transfer >= 2:
    verdict = "SUPPORTED"
    print("\n" + "="*60)
    print("   BIP: SUPPORTED")
    print("   Transfer demonstrated with some confound leakage")
    print("="*60)
else:
    verdict = "INCONCLUSIVE"
    print("\n" + "="*60)
    print("   BIP: INCONCLUSIVE")
    print("="*60)

# Save
total_time = time.time() - EXPERIMENT_START

final_results = {
    'model_results': all_results,
    'linear_probe_results': linear_probe_results,
    'verdict': verdict,
    'total_time_minutes': total_time / 60,
    'baselines': baselines,
    'methodology': {
        'languages': ['hebrew', 'aramaic', 'judeo_arabic', 'classical_chinese', 'arabic', 'english'],
        'label_source': 'English translations',
        'encoder': 'paraphrase-multilingual-MiniLM-L12-v2'
    }
}

# Make sure the local output directory exists before writing into it.
os.makedirs('results', exist_ok=True)
with open('results/final_results.json', 'w') as f:
    json.dump(final_results, f, indent=2, default=str)

with open(f"{SAVE_DIR}/final_results.json", 'w') as f:
    json.dump(final_results, f, indent=2, default=str)

print(f"\nTotal time: {total_time/60:.1f} minutes")
print(f"Results saved to {SAVE_DIR}")

mark_task("Evaluate results", "done")

print()
print_progress()


In [None]:
#@title 13. Download Results { display-mode: "form" }

import glob
import os
import shutil
from google.colab import files

# Collect the best checkpoint of every trained split plus the split/baseline
# JSON files into results/, zip that directory, and hand the archive to the
# browser for download.  JSON files already written into results/ by earlier
# cells are picked up by the archive as-is (copying them onto themselves, as
# the previous shell command did, was a no-op).

os.makedirs('results', exist_ok=True)

for split_name in all_results.keys():
    src = f"models/checkpoints/best_model_{split_name}.pt"
    if os.path.exists(src):
        shutil.copy(src, 'results/')

# Best effort: an absent data/splits directory simply yields no matches.
for json_path in glob.glob('data/splits/*.json'):
    shutil.copy(json_path, 'results/')

shutil.make_archive('bip_multilingual_v8', 'zip', 'results')

# Show what went into the archive.
for entry in sorted(os.listdir('results')):
    entry_size = os.path.getsize(os.path.join('results', entry))
    print(f"{entry_size:>14,}  {entry}")

files.download('bip_multilingual_v8.zip')