# AMOEval — End-to-end Colab Notebook (Template → Filling → Judging → Stats)

This notebook runs the full AMOEval pipeline from raw inputs:

1) **Template induction** from training reports (LLM reads reports → outputs a fixed multi-level entity template).  
2) **Optional report generation** from images (+keywords) using one or more multimodal generators.  
3) **Two-step template filling** (select Level-1 categories → fill Level-2/3 entities with evidence spans).  
4) **Entity-wise judging** for each doctor/model report pair with a 4-way taxonomy (**Aligned / Mismatched / Omitted / Extra**).  
5) **Scope-map analysis** (In-Scope, IS, vs Out-of-Scope, OOS, entities) + summary tables exported for the paper.

**Anonymity note (MICCAI)**: do not include author names, emails, affiliations, or private dataset links in this notebook or the repository.


In [None]:
# Install dependencies (Colab)
!pip -q install openai pandas tqdm pillow openpyxl

In [None]:
import os, json, re, base64, random, textwrap
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import pandas as pd
from tqdm import tqdm
from PIL import Image

from openai import OpenAI

# IMPORTANT: do NOT hardcode API keys.
# In Colab, you can paste your key when prompted, or set it as a secret/environment variable.
# The prompt below only runs when OPENAI_API_KEY is absent from the environment;
# getpass keeps the typed key from being echoed into the notebook output.
if "OPENAI_API_KEY" not in os.environ:
    import getpass
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter OPENAI_API_KEY: ")

# Shared API client used by every LLM call in later cells.
# NOTE(review): presumably the SDK picks the key up from OPENAI_API_KEY — confirm against the SDK docs.
client = OpenAI()

In [None]:
# =========================
# 0) USER CONFIGURATION
# =========================
# All names defined in this cell are module-level globals read by later cells.

# Required inputs (only these two are required):
IMAGES_ROOT = Path("/content/images")          # folder containing images (e.g., .png/.jpg)
REPORT_JSON_PATH = Path("/content/reports.json")  # one JSON file containing reports + image paths (+ optional keywords/splits)

# Outputs
OUTPUT_DIR = Path("/content/amo_eval_outputs")
# Created eagerly (including parents) so every later stage can write into it.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Input contract (edit to match your evaluation setting)
# - "image_only": model sees only images
# - "image_keywords": model sees images + keywords (recommended for DeepEyeNet setting described in the paper)
# NOTE(review): no code visible in this notebook reads INPUT_CONTRACT; the generation
# stage always sends image + keywords. Confirm whether it should gate keyword usage.
INPUT_CONTRACT = "image_keywords"

# Fixed evaluator (used for template induction + filling + judging)
EVAL_MODEL_TEXT = "gpt-5.2"

# Optional multimodal generators (used only if you run the generation stage)
# (You can also skip generation and provide MODEL_REPORTS_JSONL yourself.)
# Each entry is also written as the `model_tag` of the reports it generates.
GENERATOR_MODELS = ["gpt-5.2", "gpt-5-mini", "o4-mini"]

# Controls (reduce for quick sanity checks; increase for paper-scale runs)
RANDOM_SEED = 7
# Seeds Python's `random` module (used when shuffling training reports for
# template induction); the fallback 80/20 split passes RANDOM_SEED to pandas directly.
random.seed(RANDOM_SEED)

MAX_TRAIN_REPORTS_FOR_TEMPLATE = 800   # number of training reports sampled for template induction
MAX_TEST_SAMPLES = None               # e.g., 200 for quick runs; None = full test split

# File paths
# Each stage skips its work when its output file already exists, so delete a
# file here to force that stage to be recomputed.
TEMPLATE_JSON = OUTPUT_DIR / "amo_eval_template.json"
MODEL_REPORTS_JSONL = OUTPUT_DIR / "model_reports.jsonl"      # generated (or user-provided) model reports
DOCTOR_FILLED_JSONL = OUTPUT_DIR / "doctor_filled.jsonl"
MODEL_FILLED_JSONL = OUTPUT_DIR / "model_filled.jsonl"
SEMANTIC_CSV = OUTPUT_DIR / "semantic_sets_full.csv"
SUMMARY_XLSX = OUTPUT_DIR / "label_stats_summary.xlsx"

# Scope map (editable): prefixes treated as Out-of-Scope (OOS) under the chosen input contract.
# Everything else is treated as In-Scope (IS).
# NOTE: this is an explicit *user decision* that encodes the input contract.
# Matching is a plain string-prefix test on the entity path (see build_scope_group),
# so "treatments" also matches any path that merely starts with those characters.
OOS_PREFIXES = [
    "patient.demographics",
    "patient.history",
    "ocular.exam.visual_acuity",
    "treatments",
    "clinical_course",
]

In [None]:
# =========================
# 1) LOAD DATA
# =========================

def _first_nonempty(d: Dict[str, Any], keys: List[str]) -> Optional[Any]:
    for k in keys:
        if k in d and d[k] not in [None, "", []]:
            return d[k]
    return None

def load_report_json(report_json_path: Path) -> pd.DataFrame:
    """Load a single JSON file that contains {image path, doctor report, optional keywords, optional split}.

    Supports several common schemas to reduce friction when sharing anonymized code:

    - Schema A: dict with split-named lists ("train", "val"/"valid"/"validation"/"dev", "test").
    - Schema B: dict with a "data" list.
    - Schema C: a plain list of sample dicts.
    - Schema D: dict mapping sample id -> sample dict.

    Field names are resolved through alias lists (e.g. "image_path"/"image"/"path"
    for the image, "report"/"text"/"gt_report" for the reference report).

    Args:
        report_json_path: path of the JSON file to read (UTF-8).

    Returns:
        DataFrame with string columns sample_id, split, image_rel, keywords,
        doctor_report. Split names are lower-cased, stripped and canonicalized
        ("validation"/"valid"/"dev" -> "val"); if no row carries a split, all rows
        become "test".

    Raises:
        ValueError: if the top-level JSON value is neither a list nor a dict, or
            if no item has both an image path and a report.
    """
    with open(report_json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    records: List[Dict[str, Any]] = []
    skipped = 0  # dict items dropped because image path or report was missing

    def add_item(item: Dict[str, Any], split_override: Optional[str] = None, fallback_id: Optional[str] = None):
        """Normalize one sample dict and append it to `records` (or count it as skipped)."""
        nonlocal skipped
        img_rel = _first_nonempty(item, ["image_rel", "image_relpath", "image_path", "image", "img", "filepath", "path"])
        report = _first_nonempty(item, ["doctor_report", "reference_report", "report", "text", "report_text", "gt_report"])
        keywords = _first_nonempty(item, ["keywords", "keyword", "tags", "key_words"])
        split = split_override or _first_nonempty(item, ["split", "set", "subset"]) or "unknown"
        sid = _first_nonempty(item, ["sample_id", "id", "uid", "study_id", "case_id"]) or fallback_id

        if img_rel is None or report is None:
            # Best-effort loading: malformed rows are dropped, but counted so the
            # caller is told about them instead of losing data silently.
            skipped += 1
            return

        records.append({
            "sample_id": str(sid) if sid is not None else f"sample_{len(records)}",
            # strip() so values like "Validation " still canonicalize below
            "split": str(split).strip().lower(),
            "image_rel": str(img_rel),
            # keyword lists are flattened into one space-separated string
            "keywords": "" if keywords is None else (keywords if isinstance(keywords, str) else " ".join(map(str, keywords))),
            "doctor_report": str(report),
        })

    # Schema A: dict with train/val/test lists
    if isinstance(data, dict) and any(k in data for k in ["train", "test", "val", "valid", "validation", "dev"]):
        for k in ["train", "val", "valid", "validation", "dev", "test"]:
            if k in data and isinstance(data[k], list):
                split = "val" if k in ["val", "valid", "validation", "dev"] else k
                for j, item in enumerate(data[k]):
                    if isinstance(item, dict):
                        add_item(item, split_override=split, fallback_id=f"{split}_{j}")
    # Schema B: dict with a 'data' list
    elif isinstance(data, dict) and "data" in data and isinstance(data["data"], list):
        for j, item in enumerate(data["data"]):
            if isinstance(item, dict):
                add_item(item, fallback_id=f"row_{j}")
    # Schema C: list of dicts
    elif isinstance(data, list):
        for j, item in enumerate(data):
            if isinstance(item, dict):
                add_item(item, fallback_id=f"row_{j}")
    # Schema D: dict mapping id -> dict
    elif isinstance(data, dict):
        for j, (k, item) in enumerate(data.items()):
            if isinstance(item, dict):
                item2 = dict(item)  # copy so the caller's data is not mutated
                item2.setdefault("id", k)
                add_item(item2, fallback_id=str(k))
    else:
        raise ValueError("Unsupported REPORT_JSON schema. Please provide a list/dict of samples.")

    if skipped:
        print(f"[WARN] Skipped {skipped} item(s) without an image path or report text.")

    df = pd.DataFrame(records)
    if df.empty:
        raise ValueError("No valid samples found. Check REPORT_JSON_PATH and field names.")

    # If split is missing for all rows, default everything to 'test'
    if df["split"].nunique() == 1 and df["split"].iloc[0] in ["unknown", "none", ""]:
        df["split"] = "test"

    # Canonicalize split names
    df["split"] = df["split"].replace({"validation":"val", "valid":"val", "dev":"val"})

    return df

# Load every sample into one DataFrame (columns: sample_id, split, image_rel, keywords, doctor_report).
df = load_report_json(REPORT_JSON_PATH)

# If no train split exists, create a deterministic 80/20 split so template induction can run.
# NOTE: in that case EVERY row is reassigned to "train" or "test", so any other
# split label already present (e.g. "val") is overwritten.
if "train" not in set(df["split"]):
    # Shuffle reproducibly, then renumber rows 0..n-1 so the label slices below line up with positions.
    df = df.sample(frac=1.0, random_state=RANDOM_SEED).reset_index(drop=True)
    cut = int(0.8 * len(df))
    # .loc slicing is label-based and end-inclusive, hence `cut-1` for the first 80%.
    df.loc[:cut-1, "split"] = "train"
    df.loc[cut:, "split"] = "test"
    print("[WARN] No 'train' split found in REPORT_JSON; created an 80/20 split for template induction vs evaluation.")

# Resolve image paths
# Stored as plain strings; existence is not checked here.
df["image_path"] = df["image_rel"].apply(lambda p: str(IMAGES_ROOT / p))

print("Loaded samples:", len(df))
print(df["split"].value_counts())

# Only "train" and "test" rows are used below; rows with any other split value are ignored.
df_train = df[df["split"] == "train"].reset_index(drop=True)
df_test = df[df["split"] == "test"].reset_index(drop=True)

# Optional cap for quick runs: keeps the FIRST MAX_TEST_SAMPLES test rows (no resampling).
if MAX_TEST_SAMPLES is not None:
    df_test = df_test.iloc[:MAX_TEST_SAMPLES].copy()

# Last expression of the cell: shown as the notebook cell output.
df_train.head(2), df_test.head(2)

## Stage I — Template induction (fixed multi-level entity template)

The evaluator reads a subset of training reports and produces a fixed slot catalog (entity template).  
Later stages must *reuse* this template; no new slots are allowed during filling.


In [None]:
# =========================
# 2) STAGE I: TEMPLATE INDUCTION
# =========================

# System prompt shared by the initial induction call and every append-only update call.
INDUCE_SYSTEM = (
    "You are a medical NLP ontology induction system for ophthalmology clinical narrative reports. "
    "Your job is to design a fixed, reusable, fine-grained template (slot catalog) that can cover the entire corpus."
)

# User prompt for the FIRST corpus chunk. It is not passed through str.format;
# induce_template_from_reports appends the corpus text by string concatenation.
INDUCE_USER_BASE = """You will be given a corpus of ophthalmology clinical-description texts (free-text narratives).

Goal:
- Induce a FIXED template (slot catalog) that can represent ALL information in the corpus.
- Slot set must be stable and reusable across the dataset. Later extraction MUST NOT add new slots.

Hard requirements:
1) Coverage: any new report from the same dataset should be representable without inventing new fields.
2) Stability: the slot set is fixed; missing info is null/unknown, never new slots.
3) Fine granularity: prefer atomic facts (symptoms, exam findings, imaging, labs, treatments, response, follow-up timeline, referrals).
4) Uncertainty/diagnostic status must be representable: suspected/confirmed/ruled_out/unknown etc.
5) Include explicit support for multi-modal tests mentioned in text (e.g., FA/OCT/fundus photo, MRI/CT, LP/labs, etc.), referral chain, and treatment response/adjustments/recurrence.

Slot naming conventions:
- Use dot-paths (e.g., section.subsection.item), snake_case.
- For repeating items, use [] in path if needed (e.g., imaging.studies[].modality).
- Provide a few 'other_*' catch-all slots so that truly rare details are still captured without new slots.

Output:
Return JSON only with:
- template_name, template_version
- global_conventions: missing_value_policy, status_enum, certainty_enum, evidence_policy
- slots: >100 slots, each slot includes: path, type, description, allowed_values (optional), example_phrases (1–2 short)
- coverage_notes
"""

# User prompt for every LATER corpus chunk. Filled with str.format using the two
# placeholders {existing_template_json} and {new_corpus}; it contains no other
# braces, so no escaping is needed. The append-only rule stated here is also
# enforced in code by enforce_append_only, independent of what the model returns.
UPDATE_USER_BASE = """You are updating an existing FIXED slot catalog for the same dataset.

Rules (append-only):
- You may ONLY append NEW slots if strictly necessary to cover the NEW corpus chunk.
- Do NOT delete, rename, reorder, or modify existing slots (paths, types, descriptions).
- Output the FULL updated template JSON only.

Existing template JSON:
{existing_template_json}

New corpus chunk:
{new_corpus}
"""

def _strip_fences(s: str) -> str:
    s = s.strip()
    if s.startswith("```"):
        s = re.sub(r"^```[a-zA-Z]*\n", "", s).strip()
        s = re.sub(r"```\s*$", "", s).strip()
    return s

def call_llm_json(model: str, system: str, user: str, max_output_tokens: int = 8000) -> Dict[str, Any]:
    """Send one system+user exchange to the model and parse the reply as a JSON object.

    Args:
        model: model name passed to the API.
        system: system-role message.
        user: user-role message.
        max_output_tokens: output token cap passed to the API.

    Returns:
        The parsed JSON object (a dict).

    Raises:
        RuntimeError: if the reply is not valid JSON, or is valid JSON but not an
            object. The message includes the first 2000 characters of the reply.
    """
    resp = client.responses.create(
        model=model,
        input=[
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        # NOTE(review): presumably chosen to make outputs as repeatable as possible.
        temperature=0,
        top_p=1,
        max_output_tokens=max_output_tokens,
    )
    txt = _strip_fences(resp.output_text)
    try:
        obj = json.loads(txt)
    except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
        # Chain the cause so the original parse position stays in the traceback.
        raise RuntimeError(f"JSON parse failed: {e}\n\nRaw output:\n{txt[:2000]}") from e
    if not isinstance(obj, dict):
        # Every caller uses .get(...) on the result; fail here with the raw text
        # instead of an AttributeError far from the cause.
        raise RuntimeError(
            f"JSON parse failed: expected a JSON object, got {type(obj).__name__}\n\nRaw output:\n{txt[:2000]}"
        )
    return obj

def enforce_append_only(old_t: Dict[str, Any], new_t: Dict[str, Any]) -> Dict[str, Any]:
    """Merge `new_t` into `old_t` under the append-only rule.

    Old slots win: a slot from `new_t` is added only when its path is not
    already present, and each new path is added at most once (first occurrence).
    Entries that are not dicts or have no "path" are ignored on both sides.
    The merged slot list is sorted by path.

    Global fields (template_name, template_version, global_conventions,
    coverage_notes) keep the old value; they are taken from `new_t` only when
    `old_t` lacks them. Neither input is mutated.

    Args:
        old_t: current template.
        new_t: template proposed by the update call.

    Returns:
        A new template dict.
    """
    old_by_path = {
        s.get("path"): s
        for s in (old_t.get("slots") or [])
        if isinstance(s, dict) and s.get("path")
    }
    merged = list(old_by_path.values())
    seen = set(old_by_path)

    for s in (new_t.get("slots") or []):
        if not isinstance(s, dict):
            continue
        p = s.get("path")
        if not p or p in seen:
            continue
        seen.add(p)  # guards against the same new path appearing twice in new_t
        merged.append(s)

    merged.sort(key=lambda x: x.get("path", ""))

    out = dict(old_t)  # shallow copy already carries every old global field
    out["slots"] = merged

    # Only fill global fields that the old template does not define.
    for k in ["template_name", "template_version", "global_conventions", "coverage_notes"]:
        if k not in old_t and k in new_t:
            out[k] = new_t[k]
    return out

def chunk_texts(texts: List[str], max_chars: int = 120000) -> List[str]:
    """Pack texts, in order, into chunks of at most `max_chars` characters.

    Texts are stripped, blank ones are dropped, and the texts inside a chunk
    are joined with a "---" separator line. The size accounting uses the real
    separator length, so a chunk only exceeds `max_chars` when a single text is
    longer than the limit on its own (such a text becomes its own chunk; texts
    are never split).

    Args:
        texts: input strings.
        max_chars: maximum length of each joined chunk.

    Returns:
        List of joined chunk strings (empty if there is no non-blank text).
    """
    sep = "\n\n---\n\n"
    chunks = []
    cur = []
    cur_len = 0  # exact length of sep.join(cur)
    for t in texts:
        t = t.strip()
        if not t:
            continue
        if not cur:
            cur = [t]
            cur_len = len(t)
            continue
        joined_len = cur_len + len(sep) + len(t)
        if joined_len > max_chars:
            # Adding t would overflow: close the current chunk and start a new one.
            chunks.append(sep.join(cur))
            cur = [t]
            cur_len = len(t)
        else:
            cur.append(t)
            cur_len = joined_len
    if cur:
        chunks.append(sep.join(cur))
    return chunks

def induce_template_from_reports(train_reports: List[str]) -> Dict[str, Any]:
    """Induce the fixed slot template from training reports, chunk by chunk.

    The first corpus chunk produces the initial template; each later chunk is
    sent together with the current template, and the reply is merged through
    enforce_append_only so existing slots can never be changed or removed.

    Args:
        train_reports: raw training report texts.

    Returns:
        Template dict with "template_name" and "template_version" defaulted if
        the model omitted them, and "slots" sorted by path.

    Raises:
        ValueError: if `train_reports` contains no non-blank text.
        RuntimeError: propagated from call_llm_json when a reply is not usable JSON.
    """
    chunks = chunk_texts(train_reports, max_chars=120000)
    if not chunks:
        # Without this guard `template` would stay None and fail below with an
        # unhelpful AttributeError.
        raise ValueError("No non-empty training reports available for template induction.")

    template = None

    for i, corpus_chunk in enumerate(chunks):
        if template is None:
            user = INDUCE_USER_BASE + "\n\nCorpus:\n" + corpus_chunk
            template = call_llm_json(EVAL_MODEL_TEXT, INDUCE_SYSTEM, user, max_output_tokens=12000)
        else:
            user = UPDATE_USER_BASE.format(
                existing_template_json=json.dumps(template, ensure_ascii=False),
                new_corpus=corpus_chunk
            )
            updated = call_llm_json(EVAL_MODEL_TEXT, INDUCE_SYSTEM, user, max_output_tokens=12000)
            # Do not trust the model to respect append-only; enforce it in code.
            template = enforce_append_only(template, updated)

        print(f"[Template induction] processed chunk {i+1}/{len(chunks)}; slots={len(template.get('slots', []))}")

    # minimal normalization
    template.setdefault("template_name", "AMOEvalTemplate")
    template.setdefault("template_version", "v1")
    template["slots"] = sorted(template.get("slots", []), key=lambda x: x.get("path", ""))

    return template

# Run / load template
# The template is induced once and cached on disk; delete TEMPLATE_JSON to re-induce it.
if not TEMPLATE_JSON.exists():
    train_reports = df_train["doctor_report"].dropna().tolist()
    # Random subsample: shuffle (seeded earlier via random.seed), then keep the first N.
    random.shuffle(train_reports)
    train_reports = train_reports[:MAX_TRAIN_REPORTS_FOR_TEMPLATE]
    print(f"Inducing template from {len(train_reports)} training reports...")
    template = induce_template_from_reports(train_reports)
    TEMPLATE_JSON.write_text(
        json.dumps(template, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    print(f"Saved template to {TEMPLATE_JSON}")
else:
    print(f"Using existing template: {TEMPLATE_JSON}")
    template = json.loads(TEMPLATE_JSON.read_text(encoding="utf-8"))

_slots = template.get("slots", [])
print("Template slots:", len(_slots))
print("Example slot paths:", [slot.get("path") for slot in _slots[:10]])

## Optional stage — Report generation (images + keywords → model report)

If you already have generated reports, you can skip this stage and place your own `model_reports.jsonl` at `MODEL_REPORTS_JSONL`.
Each JSONL row must include: `sample_id`, `model_tag`, `report_text` (and optionally `image_rel`, `keywords`).


In [None]:
# =========================
# 3) STAGE II (OPTIONAL): GENERATE MODEL REPORTS
# =========================

# System prompt for the report generators (one call per test image and generator model).
GEN_SYSTEM = (
    "You are an ophthalmology report generation system. "
    "If a requested detail is not supported, omit it."
)

# User-side instruction; generate_one_report appends a "Keywords: ..." line to it
# (or "Keywords: (none)" when no keywords are available).
GEN_INSTRUCTION = """Write a concise ophthalmology clinical report for the given case.
- Use the provided image and keywords (if any).
- Prefer imaging-grounded findings and clear, clinically oriented language.
"""

def image_to_data_url(image_path: str) -> str:
    """Read an image file and return it as a base64 `data:` URL.

    The MIME type is derived from the file extension via the standard
    `mimetypes` table (so ".jpg" -> image/jpeg, ".tif" -> image/tiff). When the
    table has no image type for the extension, the type falls back to
    "image/<extension>", and to "image/png" when there is no extension.

    Args:
        image_path: path of the image file.

    Returns:
        String of the form "data:<mime>;base64,<payload>".

    Raises:
        OSError: if the file cannot be read.
    """
    import mimetypes  # local import keeps this cell self-contained

    with open(image_path, "rb") as f:
        b = f.read()

    ext = Path(image_path).suffix.lower().lstrip(".")
    if ext == "jpg":
        ext = "jpeg"

    guessed = mimetypes.guess_type(f"file.{ext}")[0] if ext else None
    if guessed and guessed.startswith("image/"):
        mime = guessed
    else:
        # Unknown to the MIME table: keep the extension-based guess.
        mime = f"image/{ext if ext else 'png'}"

    return f"data:{mime};base64," + base64.b64encode(b).decode("utf-8")

def generate_one_report(model_name: str, image_path: str, keywords: str) -> str:
    """Ask one generator model for a report on a single image.

    The user turn carries the image (inlined as a data URL) followed by
    GEN_INSTRUCTION plus a "Keywords:" line; "(none)" is sent when `keywords`
    is empty or whitespace-only.

    Args:
        model_name: generator model to call.
        image_path: path of the image file.
        keywords: keyword string for the case (may be empty).

    Returns:
        The model's reply text, stripped.
    """
    cleaned_keywords = (keywords or "").strip()
    if cleaned_keywords:
        keyword_line = f"\n\nKeywords: {cleaned_keywords}\n"
    else:
        keyword_line = "\n\nKeywords: (none)\n"
    user_txt = GEN_INSTRUCTION + keyword_line

    image_part = {"type": "input_image", "image_url": image_to_data_url(image_path)}
    text_part = {"type": "input_text", "text": user_txt}
    messages = [
        {"role": "system", "content": GEN_SYSTEM},
        {"role": "user", "content": [image_part, text_part]},
    ]

    resp = client.responses.create(
        model=model_name,
        input=messages,
        temperature=0,
        top_p=1,
        max_output_tokens=800,
    )
    return resp.output_text.strip()

def load_model_reports_jsonl(path: Path) -> List[Dict[str, Any]]:
    """Read a JSONL file into a list of parsed rows.

    Returns an empty list when `path` does not exist. Blank and
    whitespace-only lines are skipped; every other line must be valid JSON.
    """
    if not path.exists():
        return []
    with open(path, "r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [json.loads(text) for text in stripped_lines if text]

# Generate reports unless an existing JSONL is already present.
if MODEL_REPORTS_JSONL.exists():
    print(f"Using existing model reports: {MODEL_REPORTS_JSONL}")
else:
    # Fail fast: results are only written after the whole loop finishes, so a
    # missing image discovered mid-run would throw away every report generated
    # (and paid for) so far. Check all images before the first API call.
    for img_path in df_test["image_path"]:
        if not Path(img_path).exists():
            raise FileNotFoundError(f"Missing image: {img_path}")

    out_rows=[]
    for _, row in tqdm(df_test.iterrows(), total=len(df_test)):
        sid = row["sample_id"]
        img_path = row["image_path"]
        kw = row.get("keywords","")

        # One report per (sample, generator); the generator name doubles as model_tag.
        for m in GENERATOR_MODELS:
            txt = generate_one_report(m, img_path, kw)
            out_rows.append({
                "sample_id": sid,
                "model_tag": m,
                "image_rel": row["image_rel"],
                "keywords": kw,
                "report_text": txt,
            })

    with open(MODEL_REPORTS_JSONL, "w", encoding="utf-8") as f:
        for r in out_rows:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")
    print(f"Saved model reports to {MODEL_REPORTS_JSONL}")

# Quick sanity check
rows = load_model_reports_jsonl(MODEL_REPORTS_JSONL)
print("Model report rows:", len(rows))
print("Example row keys:", list(rows[0].keys()) if rows else None)

## Stage II — Template filling (two-step extraction)

Step (1): the evaluator reads the report + Level-1 category directory, then selects relevant Level-1 categories.  
Step (2): the evaluator reads the selected categories' Level-2/3 entities and extracts localized entity descriptions with short verbatim evidence.


In [None]:
# =========================
# 4) STAGE II: TEMPLATE FILLING
# =========================

# System prompt shared by both filling steps (category selection and entity extraction).
FILL_SYSTEM = (
    "You are an information extraction engine. "
    "You will be given a clinical report and an entity catalog."
)

# Step 1 prompt. Filled with str.format ({category_directory}, {report_text});
# the doubled braces around the JSON example render as single literal braces.
SELECT_PROMPT = """Given the report, select the relevant Level-1 categories from the directory.
Return JSON only: {{"selected_categories": [ ... ]}}.

Level-1 category directory:
{category_directory}

Report:
{report_text}
"""

# Step 2 prompt. Filled with str.format ({entity_catalog}, {report_text});
# doubled braces again stand for literal braces in the rendered JSON example.
EXTRACT_PROMPT = """Extract entity instances from the report using ONLY the provided entity catalog.

Rules:
- Strict exclusion: if NOT explicitly supported by the text, OMIT it.
- Do NOT output unknown/null/none/absent placeholders.
- Evidence must be a SHORT verbatim snippet copied from the text.
- Output JSON only as: {{"extractions": [{{"entity": str, "value": str, "evidence": str}} ... ]}}.

Entity catalog (paths + short descriptions):
{entity_catalog}

Report:
{report_text}
"""

def build_level1_categories(slots: List[Dict[str, Any]]) -> List[str]:
    """Return the sorted, de-duplicated first dot-segment of every slot path.

    Slots with a missing or empty "path" are ignored.
    """
    slot_paths = (slot.get("path", "") for slot in slots)
    top_levels = {path.split(".")[0] for path in slot_paths if path}
    return sorted(top_levels)

def build_entity_catalog_for_categories(slots: List[Dict[str, Any]], categories: List[str]) -> List[Dict[str, str]]:
    """Return {"path", "description"} entries for slots under the given Level-1 categories.

    A slot is kept when the first dot-segment of its path is in `categories`.
    Slot order is preserved; a missing description becomes "". Slots without a
    path are ignored.
    """
    wanted = set(categories)
    with_paths = ((slot, slot.get("path", "")) for slot in slots)
    return [
        {"path": path, "description": slot.get("description", "")}
        for slot, path in with_paths
        if path and path.split(".")[0] in wanted
    ]

def select_categories(report_text: str, level1_categories: List[str]) -> List[str]:
    """Filling step 1: ask the evaluator which Level-1 categories the report touches.

    Returns the model's picks as strings, in the model's order, restricted to
    names that exist in `level1_categories`. Returns [] when the reply's
    "selected_categories" field is not a list.
    """
    directory = "\n".join(f"- {name}" for name in level1_categories)
    user = SELECT_PROMPT.format(category_directory=directory, report_text=report_text)

    reply = call_llm_json(EVAL_MODEL_TEXT, FILL_SYSTEM, user, max_output_tokens=2000)
    picked = reply.get("selected_categories", [])
    if not isinstance(picked, list):
        return []

    # Drop anything the model invented that is not in the directory.
    allowed = set(level1_categories)
    return [name for name in map(str, picked) if name in allowed]

def extract_entities(report_text: str, entity_catalog: List[Dict[str, str]]) -> List[Dict[str, str]]:
    """Filling step 2: extract entity instances from the report with the evaluator.

    Args:
        report_text: report to extract from.
        entity_catalog: entries with "path" and optional "description".

    Returns:
        List of {"entity", "value", "evidence"} dicts with string values. Items
        that are not dicts, or whose entity or value is missing/empty, are
        dropped; a missing evidence becomes "". Returns [] when the reply's
        "extractions" field is not a list.
    """
    def _catalog_line(entry: Dict[str, str]) -> str:
        # Descriptions are cut to 120 characters to keep the prompt compact.
        short_desc = (entry.get("description", "") or "").strip()[:120]
        return f"- {entry['path']}: {short_desc}"

    catalog_text = "\n".join(_catalog_line(entry) for entry in entity_catalog)
    user = EXTRACT_PROMPT.format(entity_catalog=catalog_text, report_text=report_text)

    reply = call_llm_json(EVAL_MODEL_TEXT, FILL_SYSTEM, user, max_output_tokens=12000)
    raw_items = reply.get("extractions", [])
    if not isinstance(raw_items, list):
        return []

    return [
        {
            "entity": str(item.get("entity")),
            "value": str(item.get("value")),
            "evidence": "" if item.get("evidence") is None else str(item.get("evidence")),
        }
        for item in raw_items
        if isinstance(item, dict) and item.get("entity") and item.get("value")
    ]

# Level-1 directory derived once from the induced template; fill_reports_to_jsonl
# reads this module-level name for every report it processes.
level1_categories = build_level1_categories(template.get("slots", []))
print("Level-1 categories:", level1_categories)

def fill_reports_to_jsonl(df_reports: pd.DataFrame, text_col: str, out_path: Path, extra_fields: List[str]) -> None:
    """Run two-step template filling on every row and write the results as JSONL.

    For each row: select Level-1 categories, build the entity catalog for
    them from the module-level `template`, extract entities, and write one JSON
    line with sample_id, selected_categories, extractions and the requested
    extra fields.

    Rows are written to "<out_path>.partial" first and the file is renamed to
    `out_path` only after every row succeeded. The callers skip this stage
    when `out_path` exists, so an interrupted run must not leave a truncated
    file under the final name.

    Args:
        df_reports: one row per report; must have "sample_id" and `text_col`.
        text_col: name of the column holding the report text.
        out_path: destination JSONL path.
        extra_fields: additional row fields copied to each output line
            ("" when a field is absent from the row).
    """
    out_path = Path(out_path)
    partial_path = out_path.with_name(out_path.name + ".partial")

    with open(partial_path, "w", encoding="utf-8") as f:
        for _, row in tqdm(df_reports.iterrows(), total=len(df_reports)):
            report_text = row[text_col]
            sel = select_categories(report_text, level1_categories)
            catalog = build_entity_catalog_for_categories(template.get("slots", []), sel)
            exts = extract_entities(report_text, catalog)

            out = {
                "sample_id": row["sample_id"],
                "selected_categories": sel,
                "extractions": exts,
            }
            for k in extra_fields:
                out[k] = row.get(k, "")
            f.write(json.dumps(out, ensure_ascii=False) + "\n")

    # Publish the finished file under its final name (replaces any existing file).
    partial_path.replace(out_path)

# 4.1 Fill doctor reports (reference)
# Cached on disk: the filling stage only runs when the output file is absent.
if not DOCTOR_FILLED_JSONL.exists():
    # Expose the reference report under the column name the filler is told to read.
    tmp = df_test.assign(report_text=df_test["doctor_report"])
    fill_reports_to_jsonl(
        tmp,
        text_col="report_text",
        out_path=DOCTOR_FILLED_JSONL,
        extra_fields=["image_rel", "keywords"],
    )
    print(f"Saved doctor-filled JSONL: {DOCTOR_FILLED_JSONL}")
else:
    print(f"Using existing doctor-filled JSONL: {DOCTOR_FILLED_JSONL}")

# 4.2 Fill model reports (generated)
def read_jsonl(path: Path) -> List[Dict[str, Any]]:
    """Parse a JSONL file into a list of rows, skipping blank lines.

    Unlike load_model_reports_jsonl, a missing file is an error here.
    """
    with open(path, "r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [json.loads(text) for text in stripped_lines if text]

# Cached on disk: model reports are only filled when the output file is absent.
if not MODEL_FILLED_JSONL.exists():
    model_rows = read_jsonl(MODEL_REPORTS_JSONL)
    df_m = pd.DataFrame(model_rows)
    if df_m.empty:
        raise ValueError("MODEL_REPORTS_JSONL is empty. Run generation stage or provide your own model reports JSONL.")
    # model_tag is carried through so judging can attribute results to a generator.
    fill_reports_to_jsonl(
        df_m,
        text_col="report_text",
        out_path=MODEL_FILLED_JSONL,
        extra_fields=["model_tag", "image_rel", "keywords"],
    )
    print(f"Saved model-filled JSONL: {MODEL_FILLED_JSONL}")
else:
    print(f"Using existing model-filled JSONL: {MODEL_FILLED_JSONL}")

# quick stats
doc_rows = read_jsonl(DOCTOR_FILLED_JSONL)
mod_rows = read_jsonl(MODEL_FILLED_JSONL)
print("Doctor filled rows:", len(doc_rows), "Model filled rows:", len(mod_rows))
print("Example doctor extractions:", doc_rows[0]["extractions"][:2] if doc_rows else None)

## Stage III — Pairwise judging (Aligned / Mismatched / Omitted / Extra)

We judge each test case by comparing entity sets extracted from the doctor report vs the model report.  
The judge performs semantic matching, classifies doctor-side entities into {Aligned/Mismatched/Omitted}, and then marks any remaining unmatched model entities as Extra.


In [None]:
# =========================
# 5) STAGE III: ENTITY-WISE JUDGING
# =========================

# System prompt for the pairwise judge.
JUDGE_SYSTEM = (
    "You judge whether each DOCTOR entity is supported by the MODEL report, "
    "and whether the model is consistent vs conflicting. "
    "Then identify any MODEL entities not mentioned by the doctor as EXTRA."
)

# User prompt for the judge. It is rendered with str.format, so the only single
# braces are the two placeholders {doctor_entities} and {model_entities}; every
# literal brace of the JSON output example is doubled ({{ }}) and renders as a
# single brace. Left unescaped, str.format would treat the example as
# replacement fields and raise before any judging call is made.
# NOTE(review): the prompt does not say whether indices are 0- or 1-based, while
# the consuming code indexes the Python lists directly (0-based) — confirm.
JUDGE_PROMPT = """You will be given:
- doctor_entities: a list of entities extracted from the doctor report (each has entity + value)
- model_entities: a list of entities extracted from the model report (each has entity + value)

Task:
1) For EACH doctor entity, find whether the model has semantically corresponding content (even if under a different entity name).
   - If consistent: label ALIGNED
   - If contradictory on key clinical attributes: label MISMATCHED
   - If not covered by model anywhere: label OMITTED
2) After matching against all doctor entities, any remaining unmatched model entity must be labeled EXTRA.

Be strict: only mark ALIGNED when the model clearly states the same finding/diagnosis.
Output JSON only:
{{
  "doctor_labels": [
     {{"doctor_index": int, "label": "ALIGNED|MISMATCHED|OMITTED", "matched_model_indices": [int,...], "note": str}}
  ],
  "extra_model_indices": [int, ...]
}}
doctor_entities:
{doctor_entities}

model_entities:
{model_entities}
"""

def judge_one(doctor_extractions: List[Dict[str, str]], model_extractions: List[Dict[str, str]]) -> Dict[str, Any]:
    """Ask the evaluator to judge one doctor/model pair of extraction lists.

    Only "entity" and "value" of each extraction are sent (evidence is left out
    to keep the prompt compact). Returns the judge's parsed JSON reply as-is.

    NOTE(review): JUDGE_PROMPT is rendered with str.format, so any literal
    brace in that template must be doubled.
    """
    def _compact(extractions: List[Dict[str, str]]) -> str:
        # Keep list order: the judge's indices refer to positions in these lists.
        slim = [{"entity": item["entity"], "value": item["value"]} for item in extractions]
        return json.dumps(slim, ensure_ascii=False)

    user = JUDGE_PROMPT.format(
        doctor_entities=_compact(doctor_extractions),
        model_entities=_compact(model_extractions),
    )
    return call_llm_json(EVAL_MODEL_TEXT, JUDGE_SYSTEM, user, max_output_tokens=6000)

def build_scope_group(entity_path: str, oos_prefixes: List[str]) -> str:
    """Return "OOS" if `entity_path` starts with any of `oos_prefixes`, else "IS".

    The test is a plain string-prefix match, not a dot-segment match.
    """
    is_out_of_scope = any(entity_path.startswith(prefix) for prefix in oos_prefixes)
    return "OOS" if is_out_of_scope else "IS"

# Load filled JSONLs
doctor_rows = read_jsonl(DOCTOR_FILLED_JSONL)
model_rows = read_jsonl(MODEL_FILLED_JSONL)

# Index doctor by sample_id
doctor_by_id = {r["sample_id"]: r for r in doctor_rows}

out_rows=[]

# One judge call per model report; model reports without a doctor reference are skipped.
for mr in tqdm(model_rows, total=len(model_rows)):
    sid = mr["sample_id"]
    dr = doctor_by_id.get(sid)
    if dr is None:
        continue

    d_ext = dr.get("extractions", [])
    m_ext = mr.get("extractions", [])

    judged = judge_one(d_ext, m_ext)

    # The judge's reply is model output: validate shapes and index types so one
    # malformed reply cannot abort the whole run with a TypeError/AttributeError.
    doctor_labels = judged.get("doctor_labels", [])
    if not isinstance(doctor_labels, list):
        doctor_labels = []
    raw_extra = judged.get("extra_model_indices", [])
    if not isinstance(raw_extra, list):
        raw_extra = []
    # bool is a subclass of int, so exclude it explicitly.
    extra_idx = {i for i in raw_extra if isinstance(i, int) and not isinstance(i, bool)}

    # write per-doctor-entity records
    for item in doctor_labels:
        if not isinstance(item, dict):
            continue
        di = item.get("doctor_index")
        lab = item.get("label")
        mm = item.get("matched_model_indices", [])
        note = item.get("note","")
        # Indices are used directly as 0-based positions in d_ext; drop anything unusable.
        if not isinstance(di, int) or isinstance(di, bool) or di < 0 or di >= len(d_ext):
            continue
        ent = d_ext[di]["entity"]
        scope = build_scope_group(ent, OOS_PREFIXES)
        out_rows.append({
            "sample_id": sid,
            "model_tag": mr.get("model_tag",""),
            "side": "doctor",
            "entity": ent,
            "value": d_ext[di]["value"],
            "label": lab,
            "scope": scope,
            "matched_model_indices": json.dumps(mm, ensure_ascii=False),
            "note": note,
        })

    # write model-side extras (sorted for a deterministic row order)
    for mi in sorted(extra_idx):
        if mi < 0 or mi >= len(m_ext):
            continue
        ent = m_ext[mi]["entity"]
        scope = build_scope_group(ent, OOS_PREFIXES)
        out_rows.append({
            "sample_id": sid,
            "model_tag": mr.get("model_tag",""),
            "side": "model",
            "entity": ent,
            "value": m_ext[mi]["value"],
            "label": "EXTRA",
            "scope": scope,
            "matched_model_indices": "[]",
            "note": "",
        })

df_sem = pd.DataFrame(out_rows)
df_sem.to_csv(SEMANTIC_CSV, index=False)
print(f"Saved semantic CSV: {SEMANTIC_CSV}")
# Last expression of the cell: shown as the notebook cell output.
df_sem.head()

## Aggregation — Export compact tables (for paper)

This section aggregates the semantic CSV into:

- scope composition (IS vs OOS coverage in doctor references and model assertions)  
- outcome rates by scope group  
- outcome rates by model  
- where Omit / Extra come from (prefix decomposition)

All tables are saved to `label_stats_summary.xlsx`.


In [None]:
# =========================
# 6) AGGREGATION + TABLE EXPORTS
# =========================

# Reload from disk so this cell can run on its own against a previously saved CSV.
# NOTE(review): with pandas' default parsing, empty CSV fields (e.g. a blank
# model_tag or note) presumably come back as NaN rather than "" — confirm this is
# acceptable for the group-bys below.
df_sem = pd.read_csv(SEMANTIC_CSV)

def prefix1(p: str) -> str:
    """Return the first dot-segment of `p`, or "" when `p` is not a string."""
    if not isinstance(p, str):
        return ""
    head, _, _ = str(p).partition(".")
    return head

def prefix2(p: str) -> str:
    """Return the first two dot-segments of str(p) (or the single segment if there is only one)."""
    segments = str(p).split(".")
    return ".".join(segments[:2])

# 6.1 Scope composition (what fraction of entities are OOS vs IS?)
# - Doctor side: reference entities (Aligned/Mismatched/Omitted are all doctor-side)
# - Model side: extras (model-side)
# .copy() gives independent frames so later column additions do not touch df_sem.
doctor_side = df_sem[df_sem["side"]=="doctor"].copy()
model_side  = df_sem[df_sem["side"]=="model"].copy()

# Count rows per scope value, then express each count as a percentage of the side's total.
scope_comp_doctor = (doctor_side["scope"].value_counts(dropna=False).rename_axis("scope").reset_index(name="count"))
scope_comp_doctor["share_%"] = 100.0 * scope_comp_doctor["count"] / scope_comp_doctor["count"].sum()

# Same composition for model-side rows, which are exactly the EXTRA entities.
scope_comp_model_extra = (model_side["scope"].value_counts(dropna=False).rename_axis("scope").reset_index(name="count"))
scope_comp_model_extra["share_%"] = 100.0 * scope_comp_model_extra["count"] / scope_comp_model_extra["count"].sum()

# 6.2 Outcome rates by scope (doctor-side labels + model-side extras)
# For outcomes, we use a unified view:
# - doctor-side labels are ALIGNED/MISMATCHED/OMITTED
# - model-side labels are EXTRA
def outcome_table(df: pd.DataFrame, group_col: str) -> pd.DataFrame:
    """Return per-group outcome percentages for the four labels.

    Args:
        df: frame with `group_col`, "label" and "entity" columns.
        group_col: column to group by.

    Returns:
        One row per group with columns: group_col, ALIGNED, MISMATCHED,
        OMITTED, EXTRA (each a percentage of the group's labelled rows) and
        "n" (the group's total count over those four labels).
    """
    labels = ["ALIGNED", "MISMATCHED", "OMITTED", "EXTRA"]

    counts = df.pivot_table(
        index=group_col,
        columns="label",
        values="entity",
        aggfunc="count",
        fill_value=0,
    )
    # Labels that never occurred still get a (zero) column so the layout is fixed.
    for lab in labels:
        if lab not in counts.columns:
            counts[lab] = 0

    table = counts[labels].reset_index()
    table["n"] = table[labels].sum(axis=1)

    # Guard against division by zero for groups with no labelled rows.
    denom = table["n"].replace(0, 1)
    for lab in labels:
        table[lab] = 100.0 * table[lab] / denom
    return table

# scope group table: need to combine doctor + model extras and group by scope
# (df_sem already holds both sides, so it is passed whole).
scope_outcome = outcome_table(df_sem, "scope")

# model table
model_outcome = outcome_table(df_sem, "model_tag")

# 6.3 Where OMIT and EXTRA come from (prefix2 breakdown)
omit_rows = doctor_side[doctor_side["label"]=="OMITTED"].copy()
extra_rows = model_side[model_side["label"]=="EXTRA"].copy()

# Bucket each entity by the first two dot-segments of its path.
omit_rows["prefix2"] = omit_rows["entity"].apply(prefix2)
extra_rows["prefix2"] = extra_rows["entity"].apply(prefix2)

# value_counts orders buckets by descending count, so head(50) below keeps the largest ones.
omit_comp = omit_rows["prefix2"].value_counts().rename_axis("prefix2").reset_index(name="count")
omit_comp["share_%"] = 100.0 * omit_comp["count"] / omit_comp["count"].sum()

extra_comp = extra_rows["prefix2"].value_counts().rename_axis("prefix2").reset_index(name="count")
extra_comp["share_%"] = 100.0 * extra_comp["count"] / extra_comp["count"].sum()

# Save to Excel (compact captions belong in paper text, not in tables)
# One sheet per table; shares are computed over ALL buckets before truncating to the top 50.
with pd.ExcelWriter(SUMMARY_XLSX, engine="openpyxl") as w:
    scope_comp_doctor.to_excel(w, sheet_name="scope_comp_doctor", index=False)
    scope_comp_model_extra.to_excel(w, sheet_name="scope_comp_model_extra", index=False)
    scope_outcome.to_excel(w, sheet_name="outcome_by_scope", index=False)
    model_outcome.to_excel(w, sheet_name="outcome_by_model", index=False)
    omit_comp.head(50).to_excel(w, sheet_name="omit_prefix2_top50", index=False)
    extra_comp.head(50).to_excel(w, sheet_name="extra_prefix2_top50", index=False)

print(f"Saved summary tables to: {SUMMARY_XLSX}")
print("Doctor scope composition:\n", scope_comp_doctor)
print("Outcome by scope:\n", scope_outcome)