In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# === UD loader (CoNLL-U) ======================================================
from dataclasses import dataclass
from typing import List, Optional, Dict, Any
from pathlib import Path

@dataclass
class Sentence:
    lang: str
    words: List[str]
    lemmas: Optional[List[str]] = None
    upos: Optional[List[str]] = None
    heads: Optional[List[int]] = None  # 0..n-1; root -> self
    misc: Optional[Dict[str, Any]] = None

# Project root that contains the UD_* folders
BASE_DIR = Path("/content/drive/MyDrive/MorphWO")

# Map short codes -> UD folders
TB_MAP = {
    "en_ewt":        "UD_English-EWT",
    "de_gsd":        "UD_German-GSD",
    "es_ancora":     "UD_Spanish-AnCora",
    "ru_syntagrus":  "UD_Russian-SynTagRus",
    "tr_imst":       "UD_Turkish-IMST",
    "zh_gsd":        "UD_Chinese-GSD",
}

def _iter_conllu_sentences(conllu_path: Path):
    words, lemmas, upos, heads = [], [], [], []
    with conllu_path.open("r", encoding="utf-8") as f:
        for raw in f:
            line = raw.rstrip("\n")
            if not line:  # blank => end of sentence
                if words:
                    yield words, lemmas, upos, heads
                    words, lemmas, upos, heads = [], [], [], []
                continue
            if line.startswith("#"):
                continue
            cols = line.split("\t")
            if len(cols) < 10:
                continue
            tid = cols[0]
            if "-" in tid or "." in tid:
                continue
            try:
                int(tid)
            except ValueError:
                continue

            form, lemma, upo, head_str = cols[1], cols[2], cols[3], cols[6]
            cur_idx = len(words)
            # root -> self index; else 1-based -> 0-based
            if head_str == "0":
                head_idx = cur_idx
            else:
                try:
                    head_idx = int(head_str) - 1
                except ValueError:
                    head_idx = cur_idx

            words.append(form)
            lemmas.append(lemma if lemma != "_" else form)
            upos.append(upo if upo != "_" else "X")
            # clamp impossible heads to self
            if head_idx < 0 or head_idx >= len(words):
                head_idx = len(words) - 1
            heads.append(head_idx)

    if words:
        yield words, lemmas, upos, heads

def _treebank_files(tb_dir: Path) -> List[Path]:
    patterns = ["*dev*.conllu", "*test*.conllu", "*train*.conllu"]
    files: List[Path] = []
    for pat in patterns:
        files.extend(sorted(tb_dir.glob(pat)))
    return files

def load_ud_sentences(lang: str, limit: Optional[int] = None) -> List[Sentence]:
    tb_name = TB_MAP.get(lang, lang)
    tb_dir = BASE_DIR / tb_name
    files = _treebank_files(tb_dir)
    sents: List[Sentence] = []
    for p in files:
        for words, lemmas, upos, heads in _iter_conllu_sentences(p):
            sents.append(Sentence(lang=lang, words=words, lemmas=lemmas, upos=upos, heads=heads))
            if limit is not None and len(sents) >= limit:
                return sents
    return sents


In [None]:
# === Imports & Environment (vision-free, single source of truth) ===
from __future__ import annotations

# If on Colab, install deps once (no torchvision)
try:
    IN_COLAB = "google.colab" in str(get_ipython())
except NameError:
    IN_COLAB = False

if IN_COLAB:
    # Keep lean; no vision wheels.
    !pip -q install --upgrade torch pandas transformers --progress-bar off

# --- Standard library
import os
import sys
import json
import math
import random
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

# --- Disable TorchVision & other vision stubs BEFORE importing transformers
# This sidesteps the "operator torchvision::nms does not exist" path entirely.
sys.modules["torchvision"] = None
sys.modules["cv2"] = None  # extra belt-and-suspenders; not strictly needed

# Optional: keep tokenizers quiet / deterministic-ish
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# --- Third-party
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

# --- Torch deterministic baseline; run-specific seeds still set via set_seeds(...)
torch.manual_seed(0)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(0)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


# --- Kill TorchVision once and for all (no restart needed) ---
import os, sys, subprocess, pkgutil

# 1) Tell transformers not to import torchvision at all
os.environ["TRANSFORMERS_NO_TORCHVISION"] = "1"

# 2) If torchvision is installed, uninstall it to avoid binary-ops import paths
if pkgutil.find_loader("torchvision") is not None:
    subprocess.run(["pip", "uninstall", "-y", "torchvision"], check=False)

# 3) Belt-and-suspenders: stub the module so any stray import is a no-op
sys.modules["torchvision"] = None



[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.3.1 which is incompatible.
dask-cudf-cu12 25.6.0 requires pandas<2.2.4dev0,>=2.0, but you have pandas 2.3.1 which is incompatible.
torchaudio 2.6.0+cu124 requires torch==2.6.0, but you have torch 2.8.0 which is incompatible.
cudf-cu12 25.6.0 requires pandas<2.2.4dev0,>=2.0, but you have pandas 2.3.1 which is incompatible.
fastai 2.7.19 requires torch<2.7,>=1.10, but you have torch 2.8.0 which is incompatible.
torchvision 0.21.0+cu124 requires torch==2.6.0, but you have torch 2.8.0 which is incompatible.[0m[31m
[0m

In [None]:
# === Globals & helpers (single source of truth) ===
from __future__ import annotations
from dataclasses import dataclass
from typing import List, Dict, Optional, Tuple, Any
import random, numpy as np, torch

# ---------------- RNG ----------------
def set_seeds(seed: int = 123):
    random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    try:
        from transformers import set_seed as hf_set_seed
        hf_set_seed(seed)
    except Exception:
        pass

class RNG:
    def __init__(self, seed: int):
        self.py = random.Random(seed)
        self.np = np.random.default_rng(seed)

# ---------------- Language maps ----------------
LANG_PUNCT: Dict[str, set] = {
    "en": {".", ",", ";", ":", "!", "?", "(", ")", "[", "]", "{", "}", "'", '"', "-", "–", "—", "…"},
    "de": {".", ",", ";", ":", "!", "?", "„", "“", "‚", "‘", "(", ")", "-", "–", "—", "…"},
    "es": {".", ",", ";", ":", "¡", "!", "¿", "?", "(", ")", "-", "–", "—", "…"},
    "ru": {".", ",", ";", ":", "!", "?", "«", "»", "(", ")", "-", "–", "—", "…"},
    "tr": {".", ",", ";", ":", "!", "?", "(", ")", "-", "–", "—", "…"},
    "zh": {"。","，","；","：","！","？","（","）","、","《","》","—","–","-","…"},
}
LANG_STOP: Dict[str, set] = {
    "en": {"the","a","an","of","in","to","and","is","are","was","were","be","been","being"},
    "de": {"der","die","das","und","zu","ist","sind","war","waren","ein","eine","einer"},
    "es": {"el","la","los","las","y","de","a","es","son","fue","un","una"},
    "ru": {"и","в","на","с","к","по","что","это","тот","эта","есть","был","были"},
    "tr": {"ve","bir","bu","şu","o","de","da","ile","için","mi","mu","mü"},
    "zh": set(),
}

def lang_key(lang: str) -> str:
    return (lang or "").split("_")[0].lower()

def is_punct(tok: str, lang: str) -> bool:
    return tok in LANG_PUNCT.get(lang_key(lang), set())

def is_stop(tok: str, lang: str) -> bool:
    return (tok or "").lower() in LANG_STOP.get(lang_key(lang), set())

def default_stopword_filter(word: str, lang: str) -> bool:
    """True => DO NOT MASK this token."""
    w = word if isinstance(word, str) else str(word)
    lk = lang_key(lang)
    return (w in LANG_PUNCT.get(lk, set())) or (w.lower() in LANG_STOP.get(lk, set()))

# ---------------- Tokenization utils ----------------
def num_pieces(tokenizer, word: str) -> int:
    try:
        return len(tokenizer.encode(word, add_special_tokens=False))
    except TypeError:
        enc = tokenizer(word, add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False)
        return len(enc["input_ids"])

def words_to_pieces(tokenizer, words: List[str]) -> Tuple[List[str], List[List[int]]]:
    pieces, mapping = [], []
    idx = 0
    for w in words:
        ids = tokenizer.encode(w, add_special_tokens=False)
        toks = tokenizer.convert_ids_to_tokens(ids)
        mapping.append(list(range(idx, idx + len(toks))))
        pieces.extend(toks); idx += len(toks)
    return pieces, mapping

def decode_ids(tokenizer, ids: List[int]) -> str:
    return tokenizer.decode(ids, skip_special_tokens=True).strip()

def reconstruct_from_tokenpieces(t: str) -> str:
    # Fallback when you only have token strings (not IDs)
    return t.replace("##", "").replace("▁", "").strip()

# ---------------- Lemmas ----------------
def lemmatize_words(lang: str, words: List[str], ud_lemmas: Optional[List[str]] = None) -> List[str]:
    """UD-only; identity fallback. No Stanza, no lowercasing side-effects."""
    if ud_lemmas is not None and len(ud_lemmas) == len(words):
        return [(l if l and l != "_" else w) for w, l in zip(words, ud_lemmas)]
    return list(words)

# ---------------- Permutations ----------------
def fully_scramble_perm(n: int, rnd: random.Random) -> List[int]:
    perm = list(range(n)); rnd.shuffle(perm); return perm

def head_scramble_perm(n: int, heads: Optional[List[int]], rnd: random.Random) -> List[int]:
    if not heads or len(heads) != n:
        return fully_scramble_perm(n, rnd)
    perm = list(range(n))
    kids = {i: [] for i in range(n)}
    for dep, h in enumerate(heads):
        if dep != h and 0 <= h < n:
            kids[h].append(dep)
    for h, ch in kids.items():
        if ch:
            c = ch[0]
            perm[h], perm[c] = perm[c], perm[h]
    return perm

def apply_perm_multi(arrays: List[List], perm: List[int]) -> Tuple[List[List], Dict[int,int], Dict[int,int]]:
    n = len(perm)
    fwd = {i: j for i, j in enumerate(perm)}
    inv = {j: i for i, j in enumerate(perm)}
    outs: List[List] = []
    for arr in arrays:
        assert len(arr) == n
        out = [None]*n
        for i, j in enumerate(perm):
            out[j] = arr[i]
        assert all(x is not None for x in out)
        outs.append(out)
    return outs, fwd, inv

# ---------------- Role-aware partial scrambling ----------------
def role_aware_partial_with_perm(
    words: List[str],
    upos: Optional[List[str]],
    heads: Optional[List[int]],
    rnd: random.Random,
    frac: float = 0.35,
    content_upos: Tuple[str, ...] = ("NOUN","PROPN","VERB","ADJ","ADV"),
) -> Dict[str, Any]:
    n = len(words)
    if n == 0:
        return {"words2": [], "perm": [], "moved_indices": [], "role_counts": {}}
    frac = max(0.0, min(1.0, float(frac)))
    perm = list(range(n))
    if not upos or len(upos) != n:
        k = max(1, int(round(frac * n)))
        idxs = rnd.sample(range(n), min(k, n))
        shuffled = idxs[:]; rnd.shuffle(shuffled)
        for src, dst in zip(idxs, shuffled): perm[src] = dst
        (words2,), _, _ = apply_perm_multi([words], perm)
        return {"words2": words2, "perm": perm,
                "moved_indices": [i for i in range(n) if perm[i]!=i],
                "role_counts": {"fallback": len(idxs)}}
    cands = [i for i,u in enumerate(upos) if u in content_upos]
    if not cands:
        return {"words2": words[:], "perm": perm, "moved_indices": [], "role_counts": {"no_content": 0}}
    k = max(1, int(round(frac * len(cands))))
    chosen = rnd.sample(cands, min(k, len(cands)))
    shuffled = chosen[:]; rnd.shuffle(shuffled)
    for src, dst in zip(chosen, shuffled): perm[src] = dst
    (words2,), _, _ = apply_perm_multi([words], perm)
    role_counts: Dict[str,int] = {}
    for i in chosen: role_counts[upos[i]] = role_counts.get(upos[i],0)+1
    return {"words2": words2, "perm": perm,
            "moved_indices": [i for i in range(n) if perm[i]!=i],
            "role_counts": role_counts}

# ---------------- Variant generator (central) ----------------
def generate_variants(sent, rnd: random.Random, frac: float = 0.35) -> Dict[str, Dict[str, List]]:
    base = list(sent.words); n = len(base)
    base_lem = lemmatize_words(sent.lang, base, getattr(sent,"lemmas",None))
    out = {
        "Original": {"words": base[:], "perm": list(range(n))},
        "Original+Lemma": {"words": base_lem[:], "perm": list(range(n))},
    }
    # full
    perm = fully_scramble_perm(n, rnd)
    outs,_,_ = apply_perm_multi([base, base_lem], perm)
    out["FullyScrambled"] = {"words": outs[0], "perm": perm}
    out["FullyScrambled+Lemma"] = {"words": outs[1], "perm": perm}
    # head
    perm_h = head_scramble_perm(n, getattr(sent,"heads",None), rnd)
    (hs_words,),_,_ = apply_perm_multi([base], perm_h)
    out["HeadScrambled"] = {"words": hs_words, "perm": perm_h}
    # partial
    res = role_aware_partial_with_perm(base, getattr(sent,"upos",None), getattr(sent,"heads",None), rnd, frac=frac)
    out["PartiallyScrambled"] = {"words": res["words2"], "perm": res["perm"]}
    (ps_lem,),_,_ = apply_perm_multi([base_lem], res["perm"])
    out["PartiallyScrambled+Lemma"] = {"words": ps_lem, "perm": res["perm"]}
    return out

# ---------------- Target selection (central) ----------------
def pick_mask_target(
    words, lang, *, tokenizer, rnd,
    stopword_filter=None,
    single_piece_only: bool = True,
    max_span_pieces: Optional[int] = None,
):
    stopword_filter = stopword_filter or default_stopword_filter
    _, mapping = words_to_pieces(tokenizer, words)
    cands = []
    for i, w in enumerate(words):
        if i == 0: continue
        if stopword_filter(w, lang): continue
        L = len(mapping[i])
        if L == 0: continue
        if single_piece_only and L != 1: continue
        if (max_span_pieces is not None) and (L > max_span_pieces): continue
        cands.append(i)
    if not cands: return None, None
    idx = rnd.choice(cands)
    return idx, words[idx]


In [None]:
# === EVALUATION (SPAN-LEVEL WORD PREDICTION) =================================
from typing import List, Dict, Any, Optional
import torch

def _decode_piece_ids(tokenizer, ids: List[int]) -> str:
    return decode_ids(tokenizer, ids)

@torch.inference_mode()
def evaluate_word_span(
    model,
    tokenizer,
    words: List[str],
    mask_word_index: int,
    *,
    top_k: int = 5,
    max_span_pieces: int = 4,
    beam_size: int = 1,  # 1=greedy
) -> Dict[str, Any]:
    """
    Whole-word evaluation: mask ALL pieces of the target word, then fill them
    (greedy or tiny beam over the span positions). Returns span-level word@1/5
    via reconstructed strings.
    """
    if mask_word_index is None or mask_word_index < 0 or mask_word_index >= len(words):
        return {"error": "bad_target_index"}

    piece_tokens, mapping = words_to_pieces(tokenizer, words)
    span = mapping[mask_word_index]
    if not span:
        return {"error": "empty_piece_span"}

    if len(span) > max_span_pieces:
        return {"error": f"span_too_long_{len(span)}"}

    # gold string
    all_ids = tokenizer.convert_tokens_to_ids(piece_tokens)
    gold_piece_ids = [all_ids[i] for i in span]
    gold_word = _decode_piece_ids(tokenizer, gold_piece_ids)

    # build masked input
    mask_id = getattr(tokenizer, "mask_token_id", None)
    if mask_id is None or mask_id < 0:
        return {"error": "no_mask_token_id"}
    masked_ids = list(all_ids)
    for pos in span:
        masked_ids[pos] = mask_id

    device = next(model.parameters()).device
    model.eval()

    # Greedy / tiny beam decoding over the span positions
    import math
    candidates = [(masked_ids, 0.0)]  # (ids_list, logprob)
    beam = max(1, int(beam_size))
    for pos in span:
        new_cands = []
        for ids, score in candidates:
            input_ids = torch.tensor([ids], device=device)
            logits = model(input_ids=input_ids).logits[0, pos]
            logp = torch.log_softmax(logits, dim=-1)
            vals, idxs = torch.topk(logp, k=beam)
            for v, tid in zip(vals.tolist(), idxs.tolist()):
                nxt = list(ids); nxt[pos] = tid
                new_cands.append((nxt, score + float(v)))
        new_cands.sort(key=lambda x: x[1], reverse=True)
        candidates = new_cands[:beam]

    # Top candidate and top-K distinct words (from beams)
    top_ids = candidates[0][0]
    pred_word = _decode_piece_ids(tokenizer, [top_ids[i] for i in span])

    topk_span_words = []
    seen = set()
    for ids, _ in candidates[:max(top_k, 1)]:
        w = _decode_piece_ids(tokenizer, [ids[i] for i in span])
        if w not in seen:
            topk_span_words.append(w); seen.add(w)

    return {
        "target_index": mask_word_index,
        "gold_word": gold_word,
        "gold_num_pieces": len(span),
        "reconstructed_prediction": pred_word,
        "word_level_match": (pred_word.lower() == gold_word.lower()),
        "top_k_predictions": topk_span_words,  # span-level strings
    }


# === RUNNER ===================================================================
from dataclasses import dataclass
from pathlib import Path
from typing import Tuple
from transformers import AutoTokenizer, AutoModelForMaskedLM
import json

@dataclass
class RunParams:
    model_name: str = "bert-base-multilingual-cased"
    languages: Tuple[str, ...] = ("en_ewt",)
    seed: int = 123
    top_k: int = 5
    out_dir: str = "/content/drive/MyDrive/MorphWO/outputs"
    limit_per_lang: int = 50
    partial_frac: float = 0.35
    max_span_pieces: int = 4   # cap for evaluation
    beam_size: int = 1         # 1 = greedy

def run_experiment(params: RunParams) -> str:
    """
    Generates per-condition variants, remaps the original target index, evaluates at span level,
    and writes a FLAT JSONL (one row per (item, condition)).
    """
    set_seeds(params.seed)
    rnd = RNG(params.seed).py

    # Per-language *selection* caps (NOT eval caps). TR gets a tighter selection cap by default.
    #selection_cap_by_lang = {
    #   "tr_imst": min(3, params.max_span_pieces),  # help lift Orig@1 for TR
        # others default to None (use full eval cap)
    #}
    selection_cap_by_lang = {}

    out_root = Path(params.out_dir) / params.model_name
    out_root.mkdir(parents=True, exist_ok=True)
    #out_path = out_root / "results_items.jsonl"
    out_path = out_root / f"results_items_seed{params.seed}.jsonl"


    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tok = AutoTokenizer.from_pretrained(params.model_name)
    mdl = AutoModelForMaskedLM.from_pretrained(params.model_name).to(device).eval()

    n_written = 0
    # inside run_experiment(params) before `with out_path.open(...):`
    stamp = f"seed{params.seed}"
    out_path = out_root / f"results_items_{stamp}.jsonl"

    with out_path.open("w", encoding="utf-8") as f:
        for lang in params.languages:
            sents = load_ud_sentences(lang, limit=params.limit_per_lang)
            for s in sents:
                words = s.words
                if not words:
                    continue

                # Pick target on ORIGINAL tokens; allow multi-piece; apply per-language selection cap
                sel_cap = selection_cap_by_lang.get(lang, None)
                idx, tokstr = pick_mask_target(
                    words, lang, tokenizer=tok, rnd=rnd,
                    stopword_filter=None,
                    single_piece_only=False,
                    max_span_pieces=sel_cap
                )
                if idx is None:
                    continue

                # Build variants (+Lemma applied correctly)
                variants = generate_variants(s, rnd, frac=params.partial_frac)

                # Evaluate all conditions
                for condition, payload in variants.items():
                    w2 = payload["words"]; perm = payload["perm"]
                    # remap target index
                    if idx >= len(perm):  # defensive
                        continue
                    idx2 = perm[idx]

                    # Optional: skip "fake" partials that moved nothing (esp. for TR)
                    if lang == "tr_imst" and condition.startswith("PartiallyScrambled"):
                        moved = sum(1 for i, j in enumerate(perm) if i != j)
                        if moved < 2:
                            continue

                    res = evaluate_word_span(
                        mdl, tok, w2, idx2,
                        top_k=params.top_k,
                        max_span_pieces=params.max_span_pieces,
                        beam_size=params.beam_size,
                    )
                    if "error" in res:
                        continue

                    row = {
                        "model_name": params.model_name,
                        "lang": lang,
                        "condition": condition,
                        "words": w2,
                        "target_index": idx2,
                        "target_token": tokstr,
                        "word_level_match": bool(res["word_level_match"]),
                        "gold_word": res["gold_word"],
                        "gold_num_pieces": int(res["gold_num_pieces"]),
                        "reconstructed_prediction": res["reconstructed_prediction"],
                        "top_k_predictions": res["top_k_predictions"],
                    }
                    f.write(json.dumps(row, ensure_ascii=False) + "\n")
                    n_written += 1

    print(f"Wrote: {out_path}  (rows={n_written})")
    return str(out_path)


# === SUMMARIZATION (FLAT) =====================================================
import pandas as pd
import numpy as np

def summarize_items_flat(jsonl_path: str, balance: bool = True) -> pd.DataFrame:
    """
    Flat schema summarizer. Works for span-eval (no single-piece filtering).
    """
    rows = []
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            rec = json.loads(line)
            model = rec.get("model_name", "")
            lang  = rec.get("lang", "")
            cond  = rec.get("condition", "")
            gold  = (rec.get("gold_word") or "").strip()
            pred  = (rec.get("reconstructed_prediction") or "").strip()
            topk  = [ (t or "").strip() for t in rec.get("top_k_predictions", [])[:5] ]

            wl = rec.get("word_level_match", None)
            word_at_1 = 1.0 if isinstance(wl, bool) and wl else float(pred.lower() == gold.lower())
            word_at_5 = float(any(gold.lower() == t.lower() for t in topk))

            # Light diagnostic rates
            pred_punct = float(is_punct(pred, lang))
            pred_stop  = float(is_stop(pred, lang))

            rows.append({
                "model_name": model,
                "lang": lang,
                "condition": cond,
                "word_at_1": word_at_1,
                "word_at_5": word_at_5,
                "pred_punct_rate": pred_punct,
                "pred_stop_rate": pred_stop,
            })

    df = pd.DataFrame(rows)
    if df.empty:
        return df

    df_bal = df.copy()
    if balance:
        # Downsample per (model,lang) so all conditions have equal N
        chunks = []
        for (m, l), g in df_bal.groupby(["model_name","lang"], sort=False):
            counts = g.groupby("condition").size()
            if (counts == 0).any():
                continue
            n = int(counts.min())
            for cond, sub in g.groupby("condition", sort=False):
                chunks.append(sub.head(n))
        if chunks:
            df_bal = pd.concat(chunks, ignore_index=True)

    summary = (df_bal.groupby(["model_name","lang","condition"], sort=False)[
                  ["word_at_1", "word_at_5", "pred_punct_rate", "pred_stop_rate"]
               ].mean().reset_index())
    Ns = (df_bal.groupby(["model_name","lang","condition"])["word_at_1"]
                 .count().rename("N").reset_index())
    summary = summary.merge(Ns, on=["model_name","lang","condition"], how="left")
    return summary


# === SENSITIVITIES & ABSOLUTE DELTAS ==========================================
def compute_sensitivities(df_summary: pd.DataFrame) -> pd.DataFrame:
    wide = (df_summary.pivot_table(index=["model_name","lang"],
                                   columns="condition",
                                   values="word_at_1",
                                   aggfunc="mean")
            .reset_index())
    def col(name): return wide[name] if name in wide.columns else pd.Series([np.nan]*len(wide))
    orig = col("Original")
    den  = orig.replace(0, np.nan)

    S_full    = (orig - col("FullyScrambled"))       / den
    S_partial = (orig - col("PartiallyScrambled"))   / den
    S_morph   = (orig - col("Original+Lemma"))       / den
    I_add_raw = (orig - col("FullyScrambled+Lemma")) / den
    I_add     = I_add_raw - (S_full + S_morph)

    out = wide[["model_name","lang"]].copy()
    out["S_full"]    = S_full
    out["S_partial"] = S_partial
    out["S_morph"]   = S_morph
    out["I_add"]     = I_add
    return out

def compute_absolute_deltas(df_summary: pd.DataFrame) -> pd.DataFrame:
    wide = (df_summary.pivot_table(index=["model_name","lang"],
                                   columns="condition",
                                   values="word_at_1",
                                   aggfunc="mean")
            .reset_index())
    def g(name): return wide[name] if name in wide.columns else pd.Series([np.nan]*len(wide))
    out = wide[["model_name","lang"]].copy()
    out["D_full"]    = g("Original") - g("FullyScrambled")
    out["D_partial"] = g("Original") - g("PartiallyScrambled")
    out["D_morph"]   = g("Original") - g("Original+Lemma")
    out["D_add_raw"] = g("Original") - g("FullyScrambled+Lemma")
    out["note"]      = np.where(g("Original").fillna(0) == 0, "Orig@1==0 ⇒ S undefined", "")
    return out


# === EXECUTE ==================================================================
# Toggle span evaluation ON (this code path is built for span eval)
SPAN_EVAL = True

params = RunParams(
    model_name="bert-base-multilingual-cased",
    languages=("en_ewt","de_gsd","es_ancora","ru_syntagrus","tr_imst","zh_gsd"),
    seed=123, top_k=5,
    out_dir="/content/drive/MyDrive/MorphWO/outputs",
    limit_per_lang=300,   # bump to stabilize Orig@1 esp. for TR
    partial_frac=0.5,     # stronger partial scrambling
    max_span_pieces=6,    # eval cap
    beam_size=8,          # meaningful top-5 at span level
)

out_path = run_experiment(params)

# Summaries (balanced AND unbalanced, saved next to results)
balanced_summary   = summarize_items_flat(out_path, balance=True)
unbalanced_summary = summarize_items_flat(out_path, balance=False)

model_dir = Path(params.out_dir) / params.model_name
model_dir.mkdir(parents=True, exist_ok=True)
balanced_summary.sort_values(["model_name","lang","condition"]).to_csv(model_dir / "summary_balanced.csv", index=False)
unbalanced_summary.sort_values(["model_name","lang","condition"]).to_csv(model_dir / "summary_unbalanced.csv", index=False)

# QA prints
with open(out_path, "r", encoding="utf-8") as _f:
    rows_written = sum(1 for _ in _f)
print(f"Rows written: {rows_written}")
print("Balanced shape:", balanced_summary.shape, "Unbalanced shape:", unbalanced_summary.shape)
print("Conditions present (balanced):", sorted(balanced_summary["condition"].unique().tolist()))

# Ns per group (balanced)
Ns_bal = (balanced_summary[["model_name","lang","condition","N"]]
          .sort_values(["model_name","lang","condition"]))
print("\nNs per group (balanced):")
print(Ns_bal.to_string(index=False))

# Sensitivities (balanced & unbalanced)
sens_bal = compute_sensitivities(balanced_summary)
# Patch TR (and similar) when Orig@1==0 nukes S
if (sens_bal["S_full"].isna().any()):
    print("\n[WARN] Some balanced sensitivities undefined (Orig@1==0). Using unbalanced values for those langs.")
    sens_unbal = compute_sensitivities(unbalanced_summary)
    for col in ["S_full","S_partial","S_morph","I_add"]:
        sens_bal[col] = sens_bal[col].fillna(sens_unbal[col])


sens_unb = compute_sensitivities(unbalanced_summary)
print("\nSensitivities (balanced):")
print(sens_bal.sort_values(["model_name","lang"]).to_string(index=False))
print("\nSensitivities (unbalanced):")
print(sens_unb.sort_values(["model_name","lang"]).to_string(index=False))

# Absolute deltas (balanced & unbalanced)
abs_bal = compute_absolute_deltas(balanced_summary)
abs_unb = compute_absolute_deltas(unbalanced_summary)
print("\nAbsolute deltas (balanced):")
print(abs_bal.sort_values(["model_name","lang"]).to_string(index=False))
print("\nAbsolute deltas (unbalanced):")
print(abs_unb.sort_values(["model_name","lang"]).to_string(index=False))

# Save deltas
abs_bal.to_csv(model_dir / "abs_deltas_balanced.csv", index=False)
abs_unb.to_csv(model_dir / "abs_deltas_unbalanced.csv", index=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Wrote: /content/drive/MyDrive/MorphWO/outputs/bert-base-multilingual-cased/results_items_seed123.jsonl  (rows=11959)
Rows written: 11959
Balanced shape: (42, 8) Unbalanced shape: (42, 8)
Conditions present (balanced): ['FullyScrambled', 'FullyScrambled+Lemma', 'HeadScrambled', 'Original', 'Original+Lemma', 'PartiallyScrambled', 'PartiallyScrambled+Lemma']

Ns per group (balanced):
                  model_name         lang                condition   N
bert-base-multilingual-cased       de_gsd           FullyScrambled 300
bert-base-multilingual-cased       de_gsd     FullyScrambled+Lemma 300
bert-base-multilingual-cased       de_gsd            HeadScrambled 300
bert-base-multilingual-cased       de_gsd                 Original 300
bert-base-multilingual-cased       de_gsd           Original+Lemma 300
bert-base-multilingual-cased       de_gsd       PartiallyScrambled 300
bert-base-multilingual-cased       de_gsd PartiallyScrambled+Lemma 300
bert-base-multilingual-cased       en_ewt       

In [None]:
# === MULTI-RUN HARNESS (models × seeds, robust outputs) ======================
from copy import deepcopy
from datetime import datetime
import shutil

# Models: mBERT (baseline), XLM-R (modern), ReM-BERT (strong multilingual)
MODEL_GRID = [
    "bert-base-multilingual-cased",
    "xlm-roberta-base",
    "google/rembert",
    # "xlm-roberta-large",  # optional, heavier
]

# Seeds: 3 is usually enough for a stable mean/CI
SEEDS = [123, 456, 789]

# Languages: keep consistent across runs
LANGS = ("en_ewt","de_gsd","es_ancora","ru_syntagrus","tr_imst","zh_gsd")

def run_grid(models=MODEL_GRID, seeds=SEEDS):
    all_bal = []
    all_unb = []

    for m in models:
        for s in seeds:
            print(f"\n=== Running {m} (seed={s}) ===")
            p = deepcopy(params)
            p.model_name  = m
            p.seed        = s
            p.languages   = LANGS

            # Reasonable size (increase over 50) — sweet spot for speed × stability
            p.limit_per_lang = max(400, params.limit_per_lang)

            # Keep span settings consistent across models
            p.max_span_pieces = params.max_span_pieces
            p.beam_size       = params.beam_size
            p.partial_frac    = params.partial_frac

            # Optional trims for very heavy models
            if m in {"xlm-roberta-large"}:
                p.limit_per_lang = min(p.limit_per_lang, 150)

            try:
                # Run experiment (writes .../<model>/results_items.jsonl)
                out_path = run_experiment(p)
                print(f"[DONE] {m} seed={s} → {out_path}")

                # Copy JSONL to a per-seed file so we don't overwrite across runs
                run_dir = Path(p.out_dir) / p.model_name
                run_dir.mkdir(parents=True, exist_ok=True)
                per_seed_jsonl = run_dir / f"results_items_seed{s}.jsonl"
                try:
                    shutil.copyfile(out_path, per_seed_jsonl)
                    print(f"[SAVED] {per_seed_jsonl}")
                except Exception as e:
                    print(f"[WARN] Could not copy JSONL for archival: {e}")

                # Per-run summaries
                bal   = summarize_items_flat(out_path, balance=True)
                unbal = summarize_items_flat(out_path, balance=False)

                # Tag with run metadata
                bal["run_seed"]   = s; bal["run_model"]   = m
                unbal["run_seed"] = s; unbal["run_model"] = m

                # Save per-run summaries
                bal.sort_values(["model_name","lang","condition"]).to_csv(run_dir / f"summary_balanced_seed{s}.csv", index=False)
                unbal.sort_values(["model_name","lang","condition"]).to_csv(run_dir / f"summary_unbalanced_seed{s}.csv", index=False)

                # Diagnostics: by piece length & by target position
                try:
                    piece_bins = bucket_by_pieces(out_path)
                    piece_bins.to_csv(run_dir / f"by_piece_length_seed{s}.csv", index=False)
                    pos_bins = bucket_by_position(out_path)
                    pos_bins.to_csv(run_dir / f"by_position_seed{s}.csv", index=False)
                except Exception as e:
                    print(f"[WARN] Diagnostics failed for {m} seed={s}: {e}")

                all_bal.append(bal)
                all_unb.append(unbal)

            except Exception as e:
                print(f"[ERROR] Run failed for {m} seed={s}: {e}")
                continue

    # Combine all runs
    all_bal = pd.concat(all_bal, ignore_index=True) if all_bal else pd.DataFrame()
    all_unb = pd.concat(all_unb, ignore_index=True) if all_unb else pd.DataFrame()

    combo_dir = Path(params.out_dir) / "_combined"
    combo_dir.mkdir(parents=True, exist_ok=True)
    all_bal.to_csv(combo_dir / "all_runs_balanced.csv", index=False)
    all_unb.to_csv(combo_dir / "all_runs_unbalanced.csv", index=False)
    print("\nSaved combined CSVs to:", combo_dir)
    return all_bal, all_unb

all_bal, all_unb = run_grid()

# === CROSS-RUN AGGREGATES (mean, std, 95% CI) ================================
def agg_mean_ci(df: pd.DataFrame, group_cols=("run_model","lang","condition")) -> pd.DataFrame:
    if df.empty:
        return df
    g = (df.groupby(list(group_cols), sort=False)["word_at_1"]
           .agg(["mean","std","count"])
           .reset_index()
           .rename(columns={"mean":"word_at_1_mean","std":"word_at_1_std","count":"runs"}))
    g["ci95"] = np.where(g["runs"]>=2, 1.96 * (g["word_at_1_std"] / np.sqrt(g["runs"])), np.nan)
    return g

bal_agg = agg_mean_ci(all_bal, group_cols=("run_model","lang","condition"))
unb_agg = agg_mean_ci(all_unb, group_cols=("run_model","lang","condition"))

def sensitivities_from_agg(agg_df: pd.DataFrame) -> pd.DataFrame:
    if agg_df.empty:
        return agg_df
    wide = (agg_df
            .pivot_table(index=["run_model","lang"], columns="condition", values="word_at_1_mean")
            .reset_index())
    def col(name): return wide[name] if name in wide.columns else pd.Series([np.nan]*len(wide))
    orig = col("Original"); den = orig.replace(0, np.nan)

    out = wide[["run_model","lang"]].copy()
    out["S_full"]    = (orig - col("FullyScrambled"))       / den
    out["S_partial"] = (orig - col("PartiallyScrambled"))   / den
    out["S_morph"]   = (orig - col("Original+Lemma"))       / den
    out["I_add"]     = ((orig - col("FullyScrambled+Lemma")) / den) - (out["S_full"] + out["S_morph"])
    return out

sens_bal_runs = sensitivities_from_agg(bal_agg)
sens_unb_runs = sensitivities_from_agg(unb_agg)

# Save aggregates
combo_dir = Path(params.out_dir) / "_combined"
bal_agg.to_csv(combo_dir / "agg_balanced_mean_ci.csv", index=False)
unb_agg.to_csv(combo_dir / "agg_unbalanced_mean_ci.csv", index=False)
sens_bal_runs.to_csv(combo_dir / "sensitivities_balanced_across_runs.csv", index=False)
sens_unb_runs.to_csv(combo_dir / "sensitivities_unbalanced_across_runs.csv", index=False)

print("\n=== Cross-run sensitivities (balanced) ===")
print(sens_bal_runs.sort_values(["run_model","lang"]).to_string(index=False))

# (Optional) Quick rank table by Original word@1 across models
if not bal_agg.empty:
    orig_rank = (bal_agg[bal_agg["condition"]=="Original"]
                 .sort_values(["lang","word_at_1_mean"], ascending=[True,False]))
    orig_rank.to_csv(combo_dir / "original_acc_ranks.csv", index=False)
    print("\nSaved per-language model ranks on Original condition.")



# === CROSS-RUN AGGREGATES (mean, std, 95% CI) ================================
def agg_mean_ci(df: pd.DataFrame, group_cols=("run_model","lang","condition")) -> pd.DataFrame:
    if df.empty:
        return df
    g = (df.groupby(list(group_cols), sort=False)["word_at_1"]
           .agg(["mean","std","count"])
           .reset_index()
           .rename(columns={"mean":"word_at_1_mean","std":"word_at_1_std","count":"runs"}))
    g["ci95"] = np.where(g["runs"]>=2, 1.96 * (g["word_at_1_std"] / np.sqrt(g["runs"])), np.nan)
    return g

bal_agg = agg_mean_ci(all_bal, group_cols=("run_model","lang","condition"))
unb_agg = agg_mean_ci(all_unb, group_cols=("run_model","lang","condition"))

# Sensitivities averaged over runs (use the mean table)
def sensitivities_from_agg(agg_df: pd.DataFrame) -> pd.DataFrame:
    if agg_df.empty:
        return agg_df
    wide = (agg_df
            .pivot_table(index=["run_model","lang"], columns="condition", values="word_at_1_mean")
            .reset_index())
    def col(name): return wide[name] if name in wide.columns else pd.Series([np.nan]*len(wide))
    orig = col("Original"); den = orig.replace(0, np.nan)

    out = wide[["run_model","lang"]].copy()
    out["S_full"]    = (orig - col("FullyScrambled"))       / den
    out["S_partial"] = (orig - col("PartiallyScrambled"))   / den
    out["S_morph"]   = (orig - col("Original+Lemma"))       / den
    out["I_add"]     = ((orig - col("FullyScrambled+Lemma")) / den) - (out["S_full"] + out["S_morph"])
    return out

sens_bal_runs = sensitivities_from_agg(bal_agg)
sens_unb_runs = sensitivities_from_agg(unb_agg)

# Save aggregates
combo_dir = Path(params.out_dir) / "_combined"
bal_agg.to_csv(combo_dir / "agg_balanced_mean_ci.csv", index=False)
unb_agg.to_csv(combo_dir / "agg_unbalanced_mean_ci.csv", index=False)
sens_bal_runs.to_csv(combo_dir / "sensitivities_balanced_across_runs.csv", index=False)
sens_unb_runs.to_csv(combo_dir / "sensitivities_unbalanced_across_runs.csv", index=False)

print("\n=== Cross-run sensitivities (balanced) ===")
print(sens_bal_runs.sort_values(["run_model","lang"]).to_string(index=False))


# === (Optional) Quick rank table by Original word@1 across models =============
if not bal_agg.empty:
    orig_rank = (bal_agg[bal_agg["condition"]=="Original"]
                 .sort_values(["lang","word_at_1_mean"], ascending=[True,False]))
    orig_rank.to_csv(combo_dir / "original_acc_ranks.csv", index=False)
    print("\nSaved per-language model ranks on Original condition.")

import time
t0 = time.time()
out_path = run_experiment(p)
print(f"[DONE] {m} seed={s} → {out_path}  ({time.time()-t0:.1f}s)")



=== Running bert-base-multilingual-cased (seed=123) ===


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Wrote: /content/drive/MyDrive/MorphWO/outputs/bert-base-multilingual-cased/results_items_seed123.jsonl  (rows=15850)
[DONE] bert-base-multilingual-cased seed=123 → /content/drive/MyDrive/MorphWO/outputs/bert-base-multilingual-cased/results_items_seed123.jsonl
[WARN] Could not copy JSONL for archival: '/content/drive/MyDrive/MorphWO/outputs/bert-base-multilingual-cased/results_items_seed123.jsonl' and PosixPath('/content/drive/MyDrive/MorphWO/outputs/bert-base-multilingual-cased/results_items_seed123.jsonl') are the same file
[WARN] Diagnostics failed for bert-base-multilingual-cased seed=123: name 'bucket_by_pieces' is not defined

=== Running bert-base-multilingual-cased (seed=456) ===


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Wrote: /content/drive/MyDrive/MorphWO/outputs/bert-base-multilingual-cased/results_items_seed456.jsonl  (rows=15849)
[DONE] bert-base-multilingual-cased seed=456 → /content/drive/MyDrive/MorphWO/outputs/bert-base-multilingual-cased/results_items_seed456.jsonl
[WARN] Could not copy JSONL for archival: '/content/drive/MyDrive/MorphWO/outputs/bert-base-multilingual-cased/results_items_seed456.jsonl' and PosixPath('/content/drive/MyDrive/MorphWO/outputs/bert-base-multilingual-cased/results_items_seed456.jsonl') are the same file
[WARN] Diagnostics failed for bert-base-multilingual-cased seed=456: name 'bucket_by_pieces' is not defined

=== Running bert-base-multilingual-cased (seed=789) ===


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Wrote: /content/drive/MyDrive/MorphWO/outputs/bert-base-multilingual-cased/results_items_seed789.jsonl  (rows=15859)
[DONE] bert-base-multilingual-cased seed=789 → /content/drive/MyDrive/MorphWO/outputs/bert-base-multilingual-cased/results_items_seed789.jsonl
[WARN] Could not copy JSONL for archival: '/content/drive/MyDrive/MorphWO/outputs/bert-base-multilingual-cased/results_items_seed789.jsonl' and PosixPath('/content/drive/MyDrive/MorphWO/outputs/bert-base-multilingual-cased/results_items_seed789.jsonl') are the same file
[WARN] Diagnostics failed for bert-base-multilingual-cased seed=789: name 'bucket_by_pieces' is not defined

=== Running xlm-roberta-base (seed=123) ===


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Wrote: /content/drive/MyDrive/MorphWO/outputs/xlm-roberta-base/results_items_seed123.jsonl  (rows=15868)
[DONE] xlm-roberta-base seed=123 → /content/drive/MyDrive/MorphWO/outputs/xlm-roberta-base/results_items_seed123.jsonl
[WARN] Could not copy JSONL for archival: '/content/drive/MyDrive/MorphWO/outputs/xlm-roberta-base/results_items_seed123.jsonl' and PosixPath('/content/drive/MyDrive/MorphWO/outputs/xlm-roberta-base/results_items_seed123.jsonl') are the same file
[WARN] Diagnostics failed for xlm-roberta-base seed=123: name 'bucket_by_pieces' is not defined

=== Running xlm-roberta-base (seed=456) ===


Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Wrote: /content/drive/MyDrive/MorphWO/outputs/xlm-roberta-base/results_items_seed456.jsonl  (rows=15866)
[DONE] xlm-roberta-base seed=456 → /content/drive/MyDrive/MorphWO/outputs/xlm-roberta-base/results_items_seed456.jsonl
[WARN] Could not copy JSONL for archival: '/content/drive/MyDrive/MorphWO/outputs/xlm-roberta-base/results_items_seed456.jsonl' and PosixPath('/content/drive/MyDrive/MorphWO/outputs/xlm-roberta-base/results_items_seed456.jsonl') are the same file
[WARN] Diagnostics failed for xlm-roberta-base seed=456: name 'bucket_by_pieces' is not defined

=== Running xlm-roberta-base (seed=789) ===


Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Wrote: /content/drive/MyDrive/MorphWO/outputs/xlm-roberta-base/results_items_seed789.jsonl  (rows=15873)
[DONE] xlm-roberta-base seed=789 → /content/drive/MyDrive/MorphWO/outputs/xlm-roberta-base/results_items_seed789.jsonl
[WARN] Could not copy JSONL for archival: '/content/drive/MyDrive/MorphWO/outputs/xlm-roberta-base/results_items_seed789.jsonl' and PosixPath('/content/drive/MyDrive/MorphWO/outputs/xlm-roberta-base/results_items_seed789.jsonl') are the same file
[WARN] Diagnostics failed for xlm-roberta-base seed=789: name 'bucket_by_pieces' is not defined

=== Running google/rembert (seed=123) ===


tokenizer_config.json:   0%|          | 0.00/263 [00:00<?, ?B/s]

sentencepiece.model:   0%|          | 0.00/4.70M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/686 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.30G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.30G [00:00<?, ?B/s]

Some weights of RemBertForMaskedLM were not initialized from the model checkpoint at google/rembert and are newly initialized: ['cls.predictions.LayerNorm.bias', 'cls.predictions.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.dense.bias', 'cls.predictions.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Wrote: /content/drive/MyDrive/MorphWO/outputs/google/rembert/results_items_seed123.jsonl  (rows=15871)
[DONE] google/rembert seed=123 → /content/drive/MyDrive/MorphWO/outputs/google/rembert/results_items_seed123.jsonl
[WARN] Could not copy JSONL for archival: '/content/drive/MyDrive/MorphWO/outputs/google/rembert/results_items_seed123.jsonl' and PosixPath('/content/drive/MyDrive/MorphWO/outputs/google/rembert/results_items_seed123.jsonl') are the same file
[WARN] Diagnostics failed for google/rembert seed=123: name 'bucket_by_pieces' is not defined

=== Running google/rembert (seed=456) ===


Some weights of RemBertForMaskedLM were not initialized from the model checkpoint at google/rembert and are newly initialized: ['cls.predictions.LayerNorm.bias', 'cls.predictions.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.dense.bias', 'cls.predictions.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Wrote: /content/drive/MyDrive/MorphWO/outputs/google/rembert/results_items_seed456.jsonl  (rows=15869)
[DONE] google/rembert seed=456 → /content/drive/MyDrive/MorphWO/outputs/google/rembert/results_items_seed456.jsonl
[WARN] Could not copy JSONL for archival: '/content/drive/MyDrive/MorphWO/outputs/google/rembert/results_items_seed456.jsonl' and PosixPath('/content/drive/MyDrive/MorphWO/outputs/google/rembert/results_items_seed456.jsonl') are the same file
[WARN] Diagnostics failed for google/rembert seed=456: name 'bucket_by_pieces' is not defined

=== Running google/rembert (seed=789) ===


Some weights of RemBertForMaskedLM were not initialized from the model checkpoint at google/rembert and are newly initialized: ['cls.predictions.LayerNorm.bias', 'cls.predictions.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.dense.bias', 'cls.predictions.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Wrote: /content/drive/MyDrive/MorphWO/outputs/google/rembert/results_items_seed789.jsonl  (rows=15876)
[DONE] google/rembert seed=789 → /content/drive/MyDrive/MorphWO/outputs/google/rembert/results_items_seed789.jsonl
[WARN] Could not copy JSONL for archival: '/content/drive/MyDrive/MorphWO/outputs/google/rembert/results_items_seed789.jsonl' and PosixPath('/content/drive/MyDrive/MorphWO/outputs/google/rembert/results_items_seed789.jsonl') are the same file
[WARN] Diagnostics failed for google/rembert seed=789: name 'bucket_by_pieces' is not defined

Saved combined CSVs to: /content/drive/MyDrive/MorphWO/outputs/_combined

=== Cross-run sensitivities (balanced) ===
                   run_model         lang   S_full  S_partial   S_morph     I_add
bert-base-multilingual-cased       de_gsd 0.965936   0.409092  0.568256 -0.545546
bert-base-multilingual-cased       en_ewt 0.943102   0.443124  0.288736 -0.292802
bert-base-multilingual-cased    es_ancora 0.970968   0.438710  0.393548 -0.387097

NameError: name 'time' is not defined

In [None]:
# === POST-RUN ANALYSIS: aggregate across models & seeds, compute CIs & sensitivities ===
from pathlib import Path
import re, json, numpy as np, pandas as pd

# ---- Config ----
OUT_DIR = Path(params.out_dir)   # reuse from earlier cell
MODEL_DIRS = [p for p in (OUT_DIR).iterdir() if p.is_dir() and p.name not in {"_combined"}]

def _find_jsonls(model_dir: Path):
    return sorted(model_dir.glob("results_items_seed*.jsonl"))

def _parse_seed(path: Path) -> int | None:
    m = re.search(r"seed(\d+)", path.stem)
    return int(m.group(1)) if m else None

# Fallback summarizer, in case summarize_items_flat is not in scope
def _summarize_items_flat_fallback(jsonl_path: str, balance: bool = True) -> pd.DataFrame:
    rows = []
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            rec = json.loads(line)
            model = rec.get("model_name",""); lang = rec.get("lang",""); cond = rec.get("condition","")
            gold  = (rec.get("gold_word") or "").strip()
            pred  = (rec.get("reconstructed_prediction") or "").strip()
            topk  = [ (t or "").strip() for t in rec.get("top_k_predictions", [])[:5] ]
            wl = rec.get("word_level_match", None)
            word_at_1 = 1.0 if isinstance(wl, bool) and wl else float(pred.lower()==gold.lower())
            word_at_5 = float(any(gold.lower()==t.lower() for t in topk))
            rows.append({"model_name":model,"lang":lang,"condition":cond,
                         "word_at_1":word_at_1,"word_at_5":word_at_5})
    df = pd.DataFrame(rows)
    if df.empty: return df
    df_bal = df.copy()
    if balance:
        # downsample so each condition has equal N per (model,lang)
        chunks = []
        for (m,l), g in df_bal.groupby(["model_name","lang"], sort=False):
            counts = g.groupby("condition").size()
            if (counts==0).any(): continue
            n = int(counts.min())
            for cond, sub in g.groupby("condition", sort=False):
                chunks.append(sub.head(n))
        if chunks:
            df_bal = pd.concat(chunks, ignore_index=True)
    summary = (df_bal.groupby(["model_name","lang","condition"], sort=False)[["word_at_1","word_at_5"]]
               .mean().reset_index())
    Ns = (df_bal.groupby(["model_name","lang","condition"])["word_at_1"]
          .count().rename("N").reset_index())
    return summary.merge(Ns, on=["model_name","lang","condition"], how="left")

# Use your existing summarize_items_flat if present, else fallback
try:
    _ = summarize_items_flat
    summarize_fn = summarize_items_flat
except NameError:
    summarize_fn = _summarize_items_flat_fallback

def agg_mean_ci(df: pd.DataFrame, group_cols=("run_model","lang","condition")) -> pd.DataFrame:
    if df.empty: return df
    g = (df.groupby(list(group_cols), sort=False)["word_at_1"]
           .agg(["mean","std","count"])
           .reset_index()
           .rename(columns={"mean":"word_at_1_mean","std":"word_at_1_std","count":"runs"}))
    g["ci95"] = np.where(g["runs"]>=2, 1.96 * (g["word_at_1_std"] / np.sqrt(g["runs"])), np.nan)
    return g

def sensitivities_from_agg(agg_df: pd.DataFrame) -> pd.DataFrame:
    if agg_df.empty: return agg_df
    wide = (agg_df
            .pivot_table(index=["run_model","lang"], columns="condition", values="word_at_1_mean")
            .reset_index())
    def col(name): return wide[name] if name in wide.columns else pd.Series([np.nan]*len(wide))
    orig = col("Original"); den = orig.replace(0, np.nan)
    out = wide[["run_model","lang"]].copy()
    out["S_full"]    = (orig - col("FullyScrambled"))       / den
    out["S_partial"] = (orig - col("PartiallyScrambled"))   / den
    out["S_morph"]   = (orig - col("Original+Lemma"))       / den
    out["I_add"]     = ((orig - col("FullyScrambled+Lemma")) / den) - (out["S_full"] + out["S_morph"])
    return out

# ---- Ingest all runs (balanced & unbalanced) ----
all_bal, all_unb = [], []
for mdir in MODEL_DIRS:
    model_name = mdir.name
    jsonls = _find_jsonls(mdir)
    if not jsonls:
        print(f"[WARN] No JSONL found in {mdir}")
        continue
    for jp in jsonls:
        seed = _parse_seed(jp) or -1
        # summarize (balanced & unbalanced)
        bal   = summarize_fn(str(jp), balance=True)
        unbal = summarize_fn(str(jp), balance=False)
        for d in (bal, unbal):
            if not d.empty:
                d["run_model"] = model_name
                d["run_seed"]  = seed
        all_bal.append(bal); all_unb.append(unbal)

all_bal = pd.concat(all_bal, ignore_index=True) if all_bal else pd.DataFrame()
all_unb = pd.concat(all_unb, ignore_index=True) if all_unb else pd.DataFrame()

combo_dir = OUT_DIR / "_combined"
combo_dir.mkdir(parents=True, exist_ok=True)
all_bal.to_csv(combo_dir / "all_runs_balanced.csv", index=False)
all_unb.to_csv(combo_dir / "all_runs_unbalanced.csv", index=False)

print(f"\nCollected runs → balanced:{all_bal.shape}  unbalanced:{all_unb.shape}")

# ---- Aggregates & sensitivities across seeds ----
bal_agg = agg_mean_ci(all_bal, group_cols=("run_model","lang","condition"))
unb_agg = agg_mean_ci(all_unb, group_cols=("run_model","lang","condition"))
bal_agg.to_csv(combo_dir / "agg_balanced_mean_ci.csv", index=False)
unb_agg.to_csv(combo_dir / "agg_unbalanced_mean_ci.csv", index=False)

sens_bal = sensitivities_from_agg(bal_agg)
sens_unb = sensitivities_from_agg(unb_agg)
sens_bal.to_csv(combo_dir / "sensitivities_balanced_across_runs.csv", index=False)
sens_unb.to_csv(combo_dir / "sensitivities_unbalanced_across_runs.csv", index=False)

# ---- Quick printouts: Original acc & Sensitivities (balanced) ----
def _fmt_mean_ci(df, cond="Original"):
    sub = df[df["condition"]==cond].copy()
    if sub.empty: return pd.DataFrame()
    sub["mean_ci"] = sub.apply(
        lambda r: f"{r['word_at_1_mean']:.3f} ± {r['ci95']:.3f}" if pd.notna(r["ci95"]) else f"{r['word_at_1_mean']:.3f}",
        axis=1
    )
    return (sub[["run_model","lang","mean_ci"]]
            .sort_values(["lang","run_model"])
            .reset_index(drop=True))

orig_bal = _fmt_mean_ci(bal_agg, cond="Original")
print("\n=== Original word@1 (mean ± 95% CI) — balanced ===")
print(orig_bal.to_string(index=False) if not orig_bal.empty else "(no data)")

print("\n=== Sensitivities (balanced, mean over runs) ===")
print(sens_bal.sort_values(["run_model","lang"]).to_string(index=False) if not sens_bal.empty else "(no data)")

# ---- Optional: save compact tables ----
orig_bal.to_csv(combo_dir / "original_mean_ci_balanced.csv", index=False)

# (Optional) per-language model rank by Original
if not bal_agg.empty:
    rank_tbl = (bal_agg[bal_agg["condition"]=="Original"]
                .sort_values(["lang","word_at_1_mean"], ascending=[True,False]))
    rank_tbl.to_csv(combo_dir / "original_acc_ranks.csv", index=False)
    print("\nSaved per-language Original accuracy ranks.")


##PATCHES

In [None]:
# --- Load newest balanced summaries per model; add model_name; recompute metrics ---

import os, glob, json, pickle, zipfile
import pandas as pd
import numpy as np

# EDIT this to your path
BASE = "/content/drive/My Drive/MorphWO/outputs"   # or "/mnt/data/MorphWO/outputs"
MB_DIR   = os.path.join(BASE, "bert-base-multilingual-cased")
XLMR_DIR = os.path.join(BASE, "xlm-roberta-base")
COMB_DIR = os.path.join(BASE, "_combined")
SEARCH_ROOTS = [BASE, MB_DIR, XLMR_DIR, COMB_DIR]

def _rename_cond(c):
    mapping = {
        "Original":"Orig","original":"Orig","Orig":"Orig",
        "Original+Lemma":"Orig+M","Original+Morph":"Orig+M","Orig+Lemma":"Orig+M","Orig+M":"Orig+M",
        "FullyScrambled":"Full","FullScr":"Full","Full":"Full",
        "FullyScrambled+Lemma":"Full+M","Full+Lemma":"Full+M","Full+M":"Full+M",
        "PartiallyScrambled":"Part","PartScr":"Part","Part":"Part",
        "PartiallyScrambled+Lemma":"Part+M","Part+Lemma":"Part+M","Part+M":"Part+M",
        "HeadScrambled":"Head","HeadScr":"Head","Head":"Head",
        "HeadScrambled+Lemma":"Head+M","Head+Lemma":"Head+M","Head+M":"Head+M",
        "OrigCtx+Morph":"OrigCtx+M","FullCtx+Morph":"FullCtx+M","PartCtx+Morph":"PartCtx+M","HeadCtx+Morph":"HeadCtx+M",
    }
    return mapping.get(str(c), c)

def _standardize_columns(df, *, override_model=None):
    # Be tolerant about schema names
    ren = {
        "language":"lang","Language":"lang","lang_code":"lang",
        "model":"model_name","model_id":"model_name","checkpoint":"model_name","hf_model":"model_name",
        "cond":"condition","Condition":"condition","condition_name":"condition",
        "acc":"word_at_1","accuracy":"word_at_1","word_acc":"word_at_1","word_at_1":"word_at_1",
        "sent_id":"sentence_id","sid":"sentence_id","example_id":"sentence_id","item_id":"sentence_id","uid":"sentence_id","id":"sentence_id",
        "run":"seed",
    }
    df = df.rename(columns={k:v for k,v in ren.items() if k in df.columns})

    if "condition" in df.columns:
        df["condition"] = df["condition"].map(_rename_cond)

    if "word_at_1" not in df.columns:
        raise ValueError("Could not find accuracy column (expected 'word_at_1' or alias).")

    # supply missing sentence_id/seed if absent
    if "sentence_id" not in df.columns:
        df = df.copy()
        df["sentence_id"] = df.groupby([c for c in ["model_name","lang","condition"] if c in df.columns]).cumcount()
    if "seed" not in df.columns:
        df["seed"] = 0

    # If model_name is still missing, use override
    if "model_name" not in df.columns:
        if override_model is None:
            # last resort: single-model file → tag as 'unknown'
            override_model = "unknown"
        df["model_name"] = override_model

    # minimal required set
    req = {"model_name","lang","condition","word_at_1"}
    missing = req - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns even after standardization: {missing}")
    return df

def _newest(pattern):
    paths = glob.glob(pattern)
    return max(paths, key=os.path.getmtime) if paths else None

def load_balanced_from_subdirs():
    dfs = []
    mb = _newest(os.path.join(MB_DIR, "summary_balanced_*.csv"))
    xl = _newest(os.path.join(XLMR_DIR, "summary_balanced_*.csv"))
    used = []
    if mb:
        dfs.append(_standardize_columns(pd.read_csv(mb), override_model="bert-base-multilingual-cased"))
        used.append(("mBERT", mb, len(dfs[-1])))
    if xl:
        dfs.append(_standardize_columns(pd.read_csv(xl), override_model="xlm-roberta-base"))
        used.append(("XLM-R", xl, len(dfs[-1])))
    if dfs:
        print("Loaded per-model balanced summaries:")
        for tag,pth,n in used: print(f"  {tag}: {pth}  ({n} rows)")
        return pd.concat(dfs, ignore_index=True)

    # fallback: top-level file (inject 'unknown' model if necessary)
    top = _newest(os.path.join(BASE, "summary_balanced_*.csv"))
    if not top:
        raise RuntimeError("No summary_balanced_*.csv found in subdirs or BASE.")
    df = _standardize_columns(pd.read_csv(top), override_model="unknown")
    print(f"Loaded top-level balanced summary: {top}  ({len(df)} rows) — model_name injected as 'unknown'")
    return df

balanced_summary = load_balanced_from_subdirs()

# --------- Build (optional) unbalanced summary from any raw logs we can find ----------
def _read_any(path):
    low = path.lower()
    if low.endswith(".csv"): return pd.read_csv(path)
    if low.endswith(".parquet"): return pd.read_parquet(path)
    if low.endswith(".jsonl"):
        recs = []
        with open(path,"r",encoding="utf-8") as f:
            for line in f:
                s=line.strip()
                if s: recs.append(json.loads(s))
        return pd.json_normalize(recs)
    if low.endswith(".json"):
        with open(path,"r",encoding="utf-8") as f: data=json.load(f)
        return pd.json_normalize(data if isinstance(data,list) else [data])
    return None

RAW_PATTERNS = ["*.jsonl","*.csv","*.parquet"]
frames = []
for root in [MB_DIR, XLMR_DIR]:
    for patt in RAW_PATTERNS:
        for p in glob.glob(os.path.join(root, "**", patt), recursive=True):
            bn = os.path.basename(p)
            if bn.startswith("summary_balanced_"):  # skip summaries
                continue
            try:
                df = _read_any(p)
                if df is None or df.empty:
                    continue
                # Try to guess model from root
                override = "bert-base-multilingual-cased" if "bert-base-multilingual-cased" in root else "xlm-roberta-base"
                frames.append(_standardize_columns(df, override_model=override))
            except Exception:
                pass

if frames:
    raw = pd.concat(frames, ignore_index=True)
    unbalanced_summary = (raw.groupby(["model_name","lang","condition"], as_index=False)["word_at_1"].mean())
    print(f"Built unbalanced summary from raw: {len(raw)} rows → {len(unbalanced_summary)} (model,lang,cond) groups")
else:
    unbalanced_summary = None
    print("No raw per-item logs found for unbalanced summary; proceeding with balanced only.")

# --------- Sensitivities & Interactions in ACCURACY space ---------
def _pivot_acc(df):
    P = df.pivot_table(index=["model_name","lang"], columns="condition",
                       values="word_at_1", aggfunc="mean")
    P = P.rename(columns={c:_rename_cond(c) for c in P.columns})
    return P

def compute_sens_and_interactions_accuracy(summary_df):
    P = _pivot_acc(summary_df).copy()
    res = pd.DataFrame(index=P.index)

    def have(*cols): return all(c in P.columns for c in cols)
    if have("Orig","Full"): res["S_full"] = (P["Orig"] - P["Full"]) / P["Orig"]
    if have("Orig","Part"): res["S_part"] = (P["Orig"] - P["Part"]) / P["Orig"]
    if have("Orig","Head"): res["S_head"] = (P["Orig"] - P["Head"]) / P["Orig"]

    morph_col = "Orig+M" if "Orig+M" in P.columns else None
    if morph_col and have("Orig", morph_col):
        res["S_morph"] = (P["Orig"] - P[morph_col]) / P["Orig"]

    def inter(scr, both, name):
        if morph_col and have("Orig", scr, morph_col, both):
            res[name] = P[both] - (P[scr] + P[morph_col] - P["Orig"])

    inter("Full","Full+M","I_full")
    inter("Part","Part+M","I_part")
    inter("Head","Head+M","I_head")

    keep = [c for c in ["Orig","Full","Part","Head","Orig+M","Full+M","Part+M","Head+M"] if c in P.columns]
    out = res.join(P[keep], how="left")
    return out.reset_index()

sens_bal_acc = compute_sens_and_interactions_accuracy(balanced_summary)
print("\n=== Sensitivities & Interactions (balanced, ACCURACY space) ===")
print(sens_bal_acc.sort_values(["model_name","lang"]).to_string(index=False))

if unbalanced_summary is not None:
    sens_unb_acc = compute_sens_and_interactions_accuracy(unbalanced_summary)
    print("\n=== Sensitivities & Interactions (unbalanced, ACCURACY space) ===")
    print(sens_unb_acc.sort_values(["model_name","lang"]).to_string(index=False))

# Save CSVs for LaTeX \input{}
os.makedirs(COMB_DIR, exist_ok=True)
sens_bal_acc.to_csv(os.path.join(COMB_DIR, "sensitivities_bal_acc.csv"), index=False)
if unbalanced_summary is not None:
    sens_unb_acc.to_csv(os.path.join(COMB_DIR, "sensitivities_unbal_acc.csv"), index=False)


Loaded per-model balanced summaries:
  mBERT: /content/drive/My Drive/MorphWO/outputs/bert-base-multilingual-cased/summary_balanced_seed789.csv  (42 rows)
  XLM-R: /content/drive/My Drive/MorphWO/outputs/xlm-roberta-base/summary_balanced_seed789.csv  (42 rows)
Built unbalanced summary from raw: 378 rows → 84 (model,lang,cond) groups

=== Sensitivities & Interactions (balanced, ACCURACY space) ===
                  model_name         lang   S_full   S_part   S_head   S_morph    I_full    I_part     Orig     Full     Part     Head   Orig+M   Full+M   Part+M
bert-base-multilingual-cased       de_gsd 0.912281 0.473684 0.456140  0.578947  0.072500  0.040000 0.142500 0.012500 0.075000 0.077500 0.060000 0.002500 0.032500
bert-base-multilingual-cased       en_ewt 0.925000 0.437500 0.725000  0.237500  0.054645  0.019126 0.218579 0.016393 0.122951 0.060109 0.166667 0.019126 0.090164
bert-base-multilingual-cased    es_ancora 0.950000 0.430000 0.750000  0.430000  0.097744  0.050125 0.250627 0.0125

In [3]:
# ============================================================
# FINAL ANALYSIS BLOCK (no reruns; uses saved summaries only)
# - Scans .../outputs/* for summary_balanced.csv (fallback: summary_unbalanced.csv)
# - Normalizes schemas, computes Wilson CIs for accuracy
# - S/I via parametric bootstrap; Head vs Part delta
# - Saves *canonical* CSVs under outputs/_combined
# ============================================================
import os, glob, io, zipfile
import numpy as np
import pandas as pd

# -------- CONFIG --------
ROOT_OUT   = "/content/drive/MyDrive/MorphWO/outputs"
INPUT_GLOB = os.path.join(ROOT_OUT, "*")              # scan all subfolders
OUT_DIR    = os.path.join(ROOT_OUT, "_combined")      # canonical CSV output
N_BOOT     = 2000
RNG        = np.random.default_rng(1337)

CANON = {"Orig","Full","Part","Head","Orig+M","Full+M","Part+M","Full+M"}
COND_MAP = {
    "Original":"Orig", "Original+Lemma":"Orig+M",
    "FullyScrambled":"Full", "FullyScrambled+Lemma":"Full+M",
    "PartiallyScrambled":"Part", "PartiallyScrambled+Lemma":"Part+M",
    "HeadScrambled":"Head",
}

# -------- helpers --------
def canon_cond(x):
    if pd.isna(x): return None
    return COND_MAP.get(str(x).strip(), None)

def fix_acc_scale(p):
    try:
        p = float(p)
        return p/100.0 if (p>1.0 and p<=100.0) else p
    except Exception:
        return np.nan

def pick_first(df, names):
    for n in names:
        if n in df.columns and df[n].notna().any():
            return df[n]
    return pd.Series([np.nan]*len(df))

def wilson_ci(k,n):
    if n<=0: return (np.nan, np.nan, np.nan)
    p = k/n; z = 1.959963984540054
    denom = 1 + z*z/n
    center = (p + z*z/(2*n))/denom
    half = (z*np.sqrt(p*(1-p)/n + z*z/(4*n*n)))/denom
    return (p, max(0.0, center-half), min(1.0, center+half))

def compute_SI_from_acc(accs):
    A = {k: accs.get(k, np.nan) for k in ["Orig","Full","Part","Head","Orig+M","Full+M"]}
    out = {}
    if not np.isnan(A["Orig"]):
        out["S_full"]  = (A["Orig"] - A["Full"]) / A["Orig"] if not np.isnan(A["Full"]) else np.nan
        out["S_part"]  = (A["Orig"] - A["Part"]) / A["Orig"] if not np.isnan(A["Part"]) else np.nan
        out["S_head"]  = (A["Orig"] - A["Head"]) / A["Orig"] if not np.isnan(A["Head"]) else np.nan
        out["S_morph"] = (A["Orig"] - A["Orig+M"]) / A["Orig"] if not np.isnan(A["Orig+M"]) else np.nan
    else:
        out = {k: np.nan for k in ["S_full","S_part","S_head","S_morph"]}
    out["I_full"] = (A["Full+M"] - (A["Full"] + A["Orig+M"] - A["Orig"])) \
                    if not any(np.isnan([A["Full+M"],A["Full"],A["Orig+M"],A["Orig"]])) else np.nan
    return out

def read_csv(path):
    try:
        return pd.read_csv(path)
    except Exception:
        return None

def load_summary_csvs(root_glob):
    """Collect summary_balanced.csv (fallback summary_unbalanced.csv) from folders/zips."""
    paths = glob.glob(root_glob, recursive=False)
    dfs = []

    # plain CSVs in subfolders
    for base in paths:
        for name in ("summary_balanced.csv","summary_unbalanced.csv"):
            p = os.path.join(base, name)
            if os.path.isfile(p):
                df = read_csv(p)
                if df is not None and not df.empty: dfs.append(df)

        # search deeper if needed
        for p in glob.glob(os.path.join(base, "**", "summary_balanced.csv"), recursive=True):
            df = read_csv(p)
            if df is not None and not df.empty: dfs.append(df)
        for p in glob.glob(os.path.join(base, "**", "summary_unbalanced.csv"), recursive=True):
            df = read_csv(p)
            if df is not None and not df.empty: dfs.append(df)

    # zipped bundles (optional)
    for z in glob.glob(os.path.join(root_glob, "*.zip")):
        try:
            with zipfile.ZipFile(z) as Z:
                names = Z.namelist()
                inner = next((n for n in names if n.endswith("/summary_balanced.csv")), None)
                if inner is None:
                    inner = next((n for n in names if n.endswith("/summary_unbalanced.csv")), None)
                if inner:
                    with Z.open(inner) as f:
                        df = pd.read_csv(io.TextIOWrapper(f, encoding="utf-8"))
                        dfs.append(df)
        except Exception:
            pass
    return dfs

def collect_summaries(root_glob):
    """Return tidy (lang, model, condition, N, acc) collapsed across runs with N-weighting."""
    raw = load_summary_csvs(root_glob)
    if not raw:
        return None

    rows = []
    for df in raw:
        # Normalize required cols
        if "model" not in df.columns:
            if "model_name" in df.columns:
                df["model"] = df["model_name"]
            else:
                df["model"] = "UNK_MODEL"
        if "acc" not in df.columns:
            if "word_at_1" in df.columns:
                df["acc"] = df["word_at_1"]
            elif "word_at_5" in df.columns:
                df["acc"] = df["word_at_5"]
        if "N" not in df.columns or df["N"].isna().all():
            df["N"] = pick_first(df, ["N","n","count","num_items","samples","N_bal"])

        need = {"lang","model","condition","N","acc"}
        if not need.issubset(df.columns):
            continue

        d = df[list(need)].copy()
        d["condition"] = d["condition"].map(canon_cond)
        d = d[d["condition"].isin(CANON)]
        d["N"]   = pd.to_numeric(d["N"], errors="coerce")
        d["acc"] = pd.to_numeric(d["acc"], errors="coerce").apply(fix_acc_scale)
        d = d[d["N"].notna() & d["acc"].notna() & (d["N"]>0)]
        if d.empty:
            continue
        rows.append(d)

    if not rows:
        return None

    all_df = pd.concat(rows, ignore_index=True)
    g = (all_df.groupby(["lang","model","condition"], as_index=False)
                 .apply(lambda x: pd.Series({
                     "N": float(x["N"].sum()),
                     "acc": float(np.average(x["acc"], weights=x["N"]))
                 }),
                        include_groups=False)
                 .reset_index(drop=True))
    return g

# -------- MAIN --------
agg = collect_summaries(INPUT_GLOB)
if agg is None or agg.empty:
    raise ValueError(f"[fatal] No usable summaries found under {INPUT_GLOB}")

print("[OK] Summaries loaded. First 12 rows:")
print(agg.sort_values(["lang","model","condition"]).head(12).to_string(index=False))

records_acc, records_sens, records_inter, records_perm = [], [], [], []

for (lg, mdl), g in agg.groupby(["lang","model"]):
    g2 = g.set_index("condition")

    # Accuracy with Wilson CI
    acc_point = {}
    for cond in g2.index:
        n = float(g2.loc[cond,"N"]); p = float(g2.loc[cond,"acc"])
        k = int(round(n*p))
        m, lo, hi = wilson_ci(k, n)
        records_acc.append({
            "lang": lg, "model": mdl, "condition": cond,
            "acc": round(m,6), "acc_lo": round(lo,6), "acc_hi": round(hi,6), "N": int(n)
        })
        acc_point[cond] = m

    # Parametric bootstrap for S/I and Head vs Part delta
    boots_SI, boots_delta = [], []
    for _ in range(N_BOOT):
        accs_b = {}
        for cond in ["Orig","Full","Part","Head","Orig+M","Full+M","Part+M"]:
            if cond in g2.index:
                n = int(g2.loc[cond,"N"]); p = float(g2.loc[cond,"acc"])
                x = RNG.binomial(n, p) if n>0 else 0
                accs_b[cond] = (x / n) if n>0 else np.nan
        boots_SI.append(compute_SI_from_acc(accs_b))
        if all(c in accs_b for c in ["Orig","Head","Part"]):
            # Head – Part (positive => Head hurts more)
            d = (accs_b.get("Head", np.nan) - accs_b.get("Part", np.nan))
            boots_delta.append(d)

    if boots_SI:
        boots_df = pd.DataFrame(boots_SI)
        si_point = compute_SI_from_acc(acc_point)

        rec_s = {"lang":lg,"model":mdl}
        for k in ["S_full","S_part","S_head","S_morph"]:
            rec_s[k] = round(si_point.get(k, np.nan),6)
            rec_s[k+"_lo"] = round(np.nanpercentile(boots_df[k],2.5),6) if k in boots_df else np.nan
            rec_s[k+"_hi"] = round(np.nanpercentile(boots_df[k],97.5),6) if k in boots_df else np.nan
        records_sens.append(rec_s)

        rec_i = {"lang":lg,"model":mdl,"I_full":round(si_point.get("I_full", np.nan),6)}
        if "I_full" in boots_df:
            rec_i["I_full_lo"] = round(np.nanpercentile(boots_df["I_full"],2.5),6)
            rec_i["I_full_hi"] = round(np.nanpercentile(boots_df["I_full"],97.5),6)
        else:
            rec_i["I_full_lo"] = rec_i["I_full_hi"] = np.nan
        records_inter.append(rec_i)

    if boots_delta:
        d = np.array(boots_delta, dtype=float)
        delta_pt = float(np.nanmean(d))
        p_two = float((np.sum(np.abs(d) >= abs(delta_pt)) + 1) / (len(d) + 1))
        records_perm.append({"lang":lg,"model":mdl,"delta":delta_pt,"p_perm":p_two})

# -------- OUTPUT TABLES --------
acc_tbl   = pd.DataFrame.from_records(records_acc).sort_values(["lang","model","condition"]) if records_acc else pd.DataFrame()
sens_tbl  = pd.DataFrame.from_records(records_sens).sort_values(["lang","model"]) if records_sens else pd.DataFrame()
inter_tbl = pd.DataFrame.from_records(records_inter).sort_values(["lang","model"]) if records_inter else pd.DataFrame()
perm_tbl  = pd.DataFrame.from_records(records_perm).sort_values(["lang","model"]) if records_perm else pd.DataFrame()

os.makedirs(OUT_DIR, exist_ok=True)
if not acc_tbl.empty:   acc_tbl.to_csv(os.path.join(OUT_DIR,"acc_ci_from_summaries.csv"), index=False)
if not sens_tbl.empty:  sens_tbl.to_csv(os.path.join(OUT_DIR,"sens_ci_from_summaries.csv"), index=False)
if not inter_tbl.empty: inter_tbl.to_csv(os.path.join(OUT_DIR,"interaction_ci_from_summaries.csv"), index=False)
if not perm_tbl.empty:  perm_tbl.to_csv(os.path.join(OUT_DIR,"perm_head_vs_part_from_summaries.csv"), index=False)

with pd.option_context("display.max_rows", 200, "display.max_columns", 100):
    print("\n=== Accuracy (Wilson 95% CI) ===");        print(acc_tbl if not acc_tbl.empty else "—")
    print("\n=== Sensitivities S (parametric 95% CI) ===");  print(sens_tbl if not sens_tbl.empty else "—")
    print("\n=== Interaction I_full (parametric 95% CI) ==="); print(inter_tbl if not inter_tbl.empty else "—")
    if not perm_tbl.empty:
        print("\n=== Head vs Part (parametric; Δ = Head – Part) ==="); print(perm_tbl)

print(f"\n[done] Saved canonical CSVs to {OUT_DIR}")


[OK] Summaries loaded. First 12 rows:
  lang                        model condition      N      acc
de_gsd bert-base-multilingual-cased      Full 1200.0 0.000000
de_gsd bert-base-multilingual-cased    Full+M 1200.0 0.003333
de_gsd bert-base-multilingual-cased      Head 1200.0 0.090000
de_gsd bert-base-multilingual-cased      Orig 1200.0 0.146667
de_gsd bert-base-multilingual-cased    Orig+M 1200.0 0.066667
de_gsd bert-base-multilingual-cased      Part 1200.0 0.083333
de_gsd bert-base-multilingual-cased    Part+M 1200.0 0.036667
de_gsd             xlm-roberta-base      Full  760.0 0.005263
de_gsd             xlm-roberta-base    Full+M  760.0 0.002632
de_gsd             xlm-roberta-base      Head  760.0 0.078947
de_gsd             xlm-roberta-base      Orig  760.0 0.228947
de_gsd             xlm-roberta-base    Orig+M  760.0 0.094737

=== Accuracy (Wilson 95% CI) ===
            lang                         model condition       acc    acc_lo  \
0         de_gsd  bert-base-multilingual-c

In [4]:
# ============================
# FINAL ANALYSIS BLOCK (aligned & saving)
# - Reads canonical CSVs from _combined/
# - Writes final PNGs to outputs/figs/
# ============================
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

COMBINED_DIR = "/content/drive/MyDrive/MorphWO/outputs/_combined"  # canonical CSVs
OUT_FIGS     = "/content/drive/MyDrive/MorphWO/outputs/figs"       # final figures
os.makedirs(OUT_FIGS, exist_ok=True)

def safe_read(fname):
    path = os.path.join(COMBINED_DIR, fname)
    if not os.path.exists(path):
        print(f"[warn] Missing {fname} in {COMBINED_DIR}")
        return None
    return pd.read_csv(path)

def save_table(df, fname):
    path = os.path.join(COMBINED_DIR, fname)
    df.to_csv(path, index=False)
    print(f"[ok] Saved table {path}")

def plot_with_ci(df, x, y, lo, hi, hue=None, title=None, fname=None, ylabel=None):
    plt.figure(figsize=(8,6))
    # join=False is deprecated warning in newer seaborn; still safe.
    sns.pointplot(data=df, x=x, y=y, hue=hue, dodge=True, join=False,
                  errorbar=None, markers="o")
    # manual error bars
    for _, row in df.iterrows():
        plt.errorbar(x=row[x], y=row[y],
                     yerr=[[row[y] - row[lo]], [row[hi] - row[y]]],
                     fmt="none", c="black", capsize=3)
    if ylabel: plt.ylabel(ylabel)
    if title:  plt.title(title)
    plt.tight_layout()
    if fname:
        out = os.path.join(OUT_FIGS, fname)
        plt.savefig(out, dpi=200, bbox_inches="tight")
        print(f"[ok] Saved fig {out}")
    plt.close()

# ---------- Accuracy ----------
acc = safe_read("acc_ci_from_summaries.csv")
if acc is not None and not acc.empty:
    print("\n[acc] head:\n", acc.head())
    save_table(acc, "acc_ci_clean.csv")
    plot_with_ci(acc, x="condition", y="acc", lo="acc_lo", hi="acc_hi",
                 hue="model", title="Accuracy by Condition",
                 fname="accuracy_ci.png", ylabel="Accuracy")

# ---------- Sensitivities ----------
sens = safe_read("sens_ci_from_summaries.csv")
if sens is not None and not sens.empty:
    print("\n[sens] head:\n", sens.head())
    save_table(sens, "sens_ci_clean.csv")

    # wide ->  long for plotting
    conds = ["S_full", "S_part", "S_head", "S_morph"]
    rows = []
    for _, r in sens.iterrows():
        for c in conds:
            rows.append({
                "lang": r["lang"],
                "model": r["model"],
                "condition": c.replace("S_", ""),  # full/part/head/morph
                "mean": r[c],
                "lo": r[c + "_lo"],
                "hi": r[c + "_hi"]
            })
    sens_long = pd.DataFrame(rows)
    print("\n[sens_long] head:\n", sens_long.head())
    save_table(sens_long, "sens_ci_long.csv")

    plot_with_ci(sens_long, x="condition", y="mean", lo="lo", hi="hi",
                 hue="model", title="Sensitivity by Condition",
                 fname="sensitivity_ci.png", ylabel="Relative drop (S)")

# ---------- Interaction ----------
inter = safe_read("interaction_ci_from_summaries.csv")
if inter is not None and not inter.empty:
    print("\n[interactions] head:\n", inter.head())
    save_table(inter, "interaction_ci_clean.csv")

    inter_long = pd.DataFrame([{
        "lang": r["lang"], "model": r["model"],
        "contrast": "full", "mean": r["I_full"], "lo": r["I_full_lo"], "hi": r["I_full_hi"]
    } for _, r in inter.iterrows()])

    print("\n[inter_long] head:\n", inter_long.head())
    save_table(inter_long, "interaction_ci_long.csv")

    plot_with_ci(inter_long, x="contrast", y="mean", lo="lo", hi="hi",
                 hue="model", title="Interaction Effects",
                 fname="interaction_ci.png", ylabel="I_full")

# ---------- Permutation test: Head vs Part (Δ = Head − Part) ----------
perm = safe_read("perm_head_vs_part_from_summaries.csv")
if perm is not None and not perm.empty:
    print("\n[perm] head:\n", perm.head())
    plt.figure(figsize=(10,6))
    sns.pointplot(data=perm, x="lang", y="delta", hue="model", dodge=True,
                  join=False, errorbar=None, markers="o")
    plt.axhline(0, color="black", linestyle="--", linewidth=1)
    plt.title("Permutation Test: Head vs. Partially Scrambled")
    plt.ylabel("Δ Accuracy (Part – Head)")
    out_path = os.path.join(OUT_FIGS, "perm_head_vs_part.png")
    plt.savefig(out_path, bbox_inches="tight", dpi=200)
    print(f"[ok] Saved fig {out_path}")
    plt.close()

print("\n[done] Final analysis complete.")



[acc] head:
      lang                         model condition       acc    acc_lo  \
0  de_gsd  bert-base-multilingual-cased      Full  0.000000  0.000000   
1  de_gsd  bert-base-multilingual-cased    Full+M  0.003333  0.001297   
2  de_gsd  bert-base-multilingual-cased      Head  0.090000  0.075089   
3  de_gsd  bert-base-multilingual-cased      Orig  0.146667  0.127778   
4  de_gsd  bert-base-multilingual-cased    Orig+M  0.066667  0.053891   

     acc_hi     N  
0  0.003191  1200  
1  0.008539  1200  
2  0.107527  1200  
3  0.167810  1200  
4  0.082208  1200  
[ok] Saved table /content/drive/MyDrive/MorphWO/outputs/_combined/acc_ci_clean.csv



The `join` parameter is deprecated and will be removed in v0.15.0. You can remove the line between points with `linestyle='none'`.

  sns.pointplot(data=df, x=x, y=y, hue=hue, dodge=True, join=False,


[ok] Saved fig /content/drive/MyDrive/MorphWO/outputs/figs/accuracy_ci.png

[sens] head:
         lang                         model    S_full  S_full_lo  S_full_hi  \
0     de_gsd  bert-base-multilingual-cased  1.000000   1.000000   1.000000   
1     de_gsd              xlm-roberta-base  0.977011   0.950276   0.994766   
2     en_ewt  bert-base-multilingual-cased  0.969231   0.945736   0.988555   
3     en_ewt              xlm-roberta-base  0.967391   0.938547   0.989849   
4  es_ancora  bert-base-multilingual-cased  0.950000   0.920187   0.976096   

     S_part  S_part_lo  S_part_hi    S_head  S_head_lo  S_head_hi   S_morph  \
0  0.431818   0.280996   0.550265  0.386364   0.234129   0.517949  0.545455   
1  0.712644   0.615374   0.794286  0.655172   0.552142   0.745354  0.586207   
2  0.476923   0.363220   0.571970  0.692308   0.610037   0.757353  0.369231   
3  0.565217   0.447198   0.658038  0.695652   0.599974   0.775401  0.358696   
4  0.383333   0.255581   0.489883  0.666667   


The `join` parameter is deprecated and will be removed in v0.15.0. You can remove the line between points with `linestyle='none'`.

  sns.pointplot(data=df, x=x, y=y, hue=hue, dodge=True, join=False,


[ok] Saved fig /content/drive/MyDrive/MorphWO/outputs/figs/sensitivity_ci.png

[interactions] head:
         lang                         model    I_full  I_full_lo  I_full_hi
0     de_gsd  bert-base-multilingual-cased  0.083333   0.058333   0.106667
1     de_gsd              xlm-roberta-base  0.131579   0.096053   0.169737
2     en_ewt  bert-base-multilingual-cased  0.082759   0.050862   0.113793
3     en_ewt              xlm-roberta-base  0.081579   0.040757   0.121086
4  es_ancora  bert-base-multilingual-cased  0.083612   0.055184   0.114548
[ok] Saved table /content/drive/MyDrive/MorphWO/outputs/_combined/interaction_ci_clean.csv

[inter_long] head:
         lang                         model contrast      mean        lo  \
0     de_gsd  bert-base-multilingual-cased     full  0.083333  0.058333   
1     de_gsd              xlm-roberta-base     full  0.131579  0.096053   
2     en_ewt  bert-base-multilingual-cased     full  0.082759  0.050862   
3     en_ewt              xlm-roberta


The `join` parameter is deprecated and will be removed in v0.15.0. You can remove the line between points with `linestyle='none'`.

  sns.pointplot(data=df, x=x, y=y, hue=hue, dodge=True, join=False,


[ok] Saved fig /content/drive/MyDrive/MorphWO/outputs/figs/interaction_ci.png

[perm] head:
         lang                         model     delta    p_perm
0     de_gsd  bert-base-multilingual-cased  0.006511  0.629185
1     de_gsd              xlm-roberta-base  0.012840  0.540730
2     en_ewt  bert-base-multilingual-cased -0.048105  0.503248
3     en_ewt              xlm-roberta-base -0.032164  0.494253
4  es_ancora  bert-base-multilingual-cased -0.056944  0.479760



The `join` parameter is deprecated and will be removed in v0.15.0. You can remove the line between points with `linestyle='none'`.

  sns.pointplot(data=perm, x="lang", y="delta", hue="model", dodge=True,


[ok] Saved fig /content/drive/MyDrive/MorphWO/outputs/figs/perm_head_vs_part.png

[done] Final analysis complete.


In [6]:
# ---------- Extra plots: exclude TR, and TR-only ----------
if perm is not None and not perm.empty:
    # (Optional) ensure the sign matches your label.
    # If delta was computed as Head − Part but the y-label says Part − Head, uncomment:
    # perm["delta"] = -perm["delta"]

    # --- exclude TR ---
    perm_no_tr = perm[perm["lang"] != "tr_imst"].copy()
    if not perm_no_tr.empty:
        # keep a stable language order
        lang_order = [l for l in ["de_gsd","en_ewt","es_ancora","ru_syntagrus","zh_gsd"] if l in perm_no_tr["lang"].unique()]
        if lang_order:
            perm_no_tr["lang"] = pd.Categorical(perm_no_tr["lang"], categories=lang_order, ordered=True)

        plt.figure(figsize=(10,6))
        sns.pointplot(data=perm_no_tr, x="lang", y="delta", hue="model", dodge=True,
                      join=False, errorbar=None, markers="o")
        plt.axhline(0, color="black", linestyle="--", linewidth=1)
        plt.title("Permutation Test : Head vs. Partially Scrambled")
        plt.ylabel("Δ Accuracy (Part – Head)")
        out_path = os.path.join(OUT_FIGS, "perm_head_vs_part_noTR.png")
        plt.savefig(out_path, bbox_inches="tight", dpi=200)
        print(f"[ok] Saved fig {out_path}")
        plt.close()

    # --- TR only ---
    perm_tr = perm[perm["lang"] == "tr_imst"].copy()
    if not perm_tr.empty:
        # with a single language, show deltas by model on the x-axis
        plt.figure(figsize=(6,6))
        sns.pointplot(data=perm_tr, x="model", y="delta", dodge=True,
                      join=False, errorbar=None, markers="o")
        plt.axhline(0, color="black", linestyle="--", linewidth=1)
        plt.title("Permutation Test: Turkish (tr_imst) Only")
        plt.ylabel("Δ Accuracy (Part – Head)")
        out_path = os.path.join(OUT_FIGS, "perm_head_vs_part_TR.png")
        plt.savefig(out_path, bbox_inches="tight", dpi=200)
        print(f"[ok] Saved fig {out_path}")
        plt.close()



The `join` parameter is deprecated and will be removed in v0.15.0. You can remove the line between points with `linestyle='none'`.

  sns.pointplot(data=perm_no_tr, x="lang", y="delta", hue="model", dodge=True,


[ok] Saved fig /content/drive/MyDrive/MorphWO/outputs/figs/perm_head_vs_part_noTR.png
[ok] Saved fig /content/drive/MyDrive/MorphWO/outputs/figs/perm_head_vs_part_TR.png



The `join` parameter is deprecated and will be removed in v0.15.0. You can remove the line between points with `linestyle='none'`.

  sns.pointplot(data=perm_tr, x="model", y="delta", dodge=True,
