In [None]:
!nvidia-smi

In [None]:
from __future__ import annotations

#standard imports
import re
from pathlib import Path
from typing import Dict, List, Optional, Sequence, Tuple
import pandas as pd
import os

# current working directory
os.getcwd()

In [None]:
#Setup & Helpers (merged explicit + generic adjuncts)

# -------------------- Age → bucket (PII-safe) --------------------
def bucket_age(age: Optional[object]) -> Optional[str]:
    """Collapse a raw age value into a coarse, PII-safe bucket.

    Args:
        age: Anything convertible through ``float`` (number or numeric
            string); ``None`` and blank strings count as missing.

    Returns:
        ``"pediatric"`` when the truncated age is below 18, ``"adult"``
        otherwise, and ``None`` when the value is missing or cannot be
        converted.
    """
    try:
        is_blank = age is None or not str(age).strip()
        years = None if is_blank else int(float(age))
    except Exception:
        # Best-effort parsing: any conversion failure means "unknown age".
        return None

    if years is None:
        return None
    if years < 18:
        return "pediatric"
    return "adult"

def _lc(s: Optional[str]) -> str:
    return "" if s is None else str(s).strip().lower()

# Status / subtype / base TB tokens
# All patterns are case-insensitive and anchored on word boundaries, so
# "inactive" does not trigger the bare "active" alternative of RE_ATB.
RE_STB = re.compile(r"\bstb\b|\bsecondary\b", re.I) # secondary tuberculosis ("stb" or the word "secondary")
RE_ATB = re.compile(r"\batb\b|\bactive\s+tb\b|\bactive\b", re.I) # active tuberculosis ("atb", "active tb", or bare "active")
RE_NATB = re.compile(r"\bnatb\b|\binactive\s+tb\b|\binactive\b", re.I) # inactive tuberculosis ("natb", "inactive tb", or bare "inactive")
RE_TB_TOKEN = re.compile(r"\btb\b|\bptb\b|\btuberculosis\b", re.I) # any bare TB mention: "tb", "ptb" or "tuberculosis"

# Location patterns (all case-insensitive)
# Bilateral / side-only
# RE_BILAT_PTB also accepts "bilateral" and "ptb" in either order with any
# text between them (".*" does not cross a newline).
RE_BILAT_PTB = re.compile(r"\bbilateral\s+ptb\b|\bbilateral\b.*\bptb\b|\bptb\b.*\bbilateral\b", re.I)
RE_RIGHT_PTB = re.compile(r"\bright\s+ptb\b", re.I)
RE_LEFT_PTB = re.compile(r"\bleft\s+ptb\b", re.I)

# Bilateral upper fields: "ptb in the bilateral upper field(s)",
# "upper field(s) bilaterally" or "bilateral upper field(s)"
RE_BILAT_UPPER_FIELDS = re.compile(
    r"\b(ptb\s+in\s+the\s+bilateral\s+upper\s+field(s)?|upper\s+fields?\s+bilaterally|bilateral\s+upper\s+field(s)?)\b", re.I
)

# Side + “upper and middle fields”
# Group 1 is the side word where the match starts; the lazy ".*?" allows
# arbitrary text between that side word and "upper and middle field(s)".
RE_SIDE_UPPER_MIDDLE = re.compile(r"\b(left|right)\s+.*?\bupper\s+and\s+middle\s+field(s)?\b", re.I)

# Side + single lobe (upper) via “ptb in the right upper field/zone/lobe”
# or the short form "<side> upper ptb"
RE_RIGHT_UPPER_PTB = re.compile(r"\bright\s+upper\s+ptb\b|\bptb\s+in\s+the\s+right\s+upper\s+(field|zone|lobe)\b", re.I)
RE_LEFT_UPPER_PTB  = re.compile(r"\bleft\s+upper\s+ptb\b|\bptb\s+in\s+the\s+left\s+upper\s+(field|zone|lobe)\b", re.I)

# Generic “<side> <level> field/zone/lobe”
# RE_SIDE and RE_LEVEL are plain pattern fragments (not compiled); in
# RE_FIELDLOB group 1 is the side and group 2 is the level.
RE_SIDE = r"(left|right)"
RE_LEVEL = r"(upper|middle|lower)"
RE_FIELDLOB = re.compile(rf"\b{RE_SIDE}\s+{RE_LEVEL}\s+(field|zone|lobe)\b", re.I)

# Middle–lower fields (optionally side-aware)
# The pattern itself carries no side; it only matches "middle lower field(s)".
RE_MIDDLE_LOWER_FIELDS = re.compile(r"\bmiddle\s+lower\s+field(s)?\b", re.I)

# Explicit extras (adjuncts) lexicon (all case-insensitive)
# Pleurisy / pleuritis
RE_PLEURISY = re.compile(r"\bpleuritis\b|\bpleurisy\b", re.I)

# Pleural effusion: side-aware variant (group 1 = side) and a generic fallback
RE_PLEURAL_EFFUSION_SIDE = re.compile(r"\b(left|right)\s+pleural\s+effusion\b", re.I)
RE_PLEURAL_EFFUSION_ANY = re.compile(r"\bpleural\s+effusion\b", re.I)

# Pleural thickening: lobe-aware (group 1 = side, group 2 = level),
# side-aware (group 1 = side) and generic variants.
# NOTE: the lobe-aware form expects "in <side> ..." with no article, so
# "pleural thickening in the left upper lobe" only hits the generic variant.
RE_PLEURAL_THICKENING_LOBE = re.compile(
    r"\bpleural\s+thickening\s+in\s+(left|right)\s+(upper|middle|lower)\s+(lobe|field|zone)\b", re.I
)
RE_PLEURAL_THICKENING_SIDE = re.compile(r"\b(left|right)\s+pleural\s+thickening\b", re.I)
RE_PLEURAL_THICKENING_ANY = re.compile(r"\bpleural\s+thickening\b", re.I)

# Pleural adhesions: lobe-aware (group 1 = side, group 2 = level) and generic.
# NOTE: as with thickening, the lobe-aware form expects "in <side> ..." without "the".
RE_PLEURAL_ADH_LOBE = re.compile(
    r"\bpleural\s+adhesions?\s+in\s+(left|right)\s+(upper|middle|lower)\s+(lobe|field|zone)\b", re.I
)
RE_PLEURAL_ADH_ANY = re.compile(r"\bpleural\s+adhesions?\b", re.I)

# Decortication: "pleural change after decortication", optionally prefixed
# by a side (group 1 is None when the unsided alternative matches)
RE_DECORTICATION = re.compile(
    r"\b(left|right)\s+pleural\s+change\s+after\s+decortication\b|\bpleural\s+change\s+after\s+decortication\b", re.I
)

# Parenchymal patterns: "mainly <kind> lesions" versus any mention of the kind
RE_FIBROUS_MAINLY = re.compile(r"\bmainly\s+(fibrous)\s+lesions\b", re.I)
RE_FIBROUS_ANY = re.compile(r"\bfibrous\b|\bfibrotic\b|\bfibrosis\b", re.I)
RE_HYPERPL_MAINLY = re.compile(r"\bmainly\s+(hyperplastic)\s+lesions\b", re.I)
RE_HYPERPL_ANY = re.compile(r"\bhyperplastic\b|\bhyperplasia\b", re.I)

# Cavitation: lobe-aware (group 1 = cavity phrase, group 2 = side,
# group 3 = level) and a generic fallback ("cavity", "cavitary", ...)
RE_CAVITY_LOBE = re.compile(
    r"\b(cavity\s+formation|large\s+cavity|cavity)\s+in\s+(left|right)\s+(upper|middle|lower)\s+(lobe|field|zone)\b", re.I
)
RE_CAVITY_ANY = re.compile(r"\bcav(itary|ity)\b|\bcavity\s+formation\b|\blarge\s+cavity\b", re.I)

# Generic adjuncts
# Canonical adjunct labels grouped by category; every label here has a
# matching entry in ADJUNCTS_PAT.
# NOTE(review): nothing in the visible part of this notebook reads
# ADJUNCTS_ORDER; presumably it fixes the output order of generic adjuncts
# for code that is not shown here. Confirm before relying on it.
ADJUNCTS_ORDER: Sequence[str] = (
    # patterns
    "infiltrative", "nodular", "consolidation", "granulomas", "scarring", "calcifications",
    # pleural/airway
    "left pleural effusion", "right pleural effusion", "pleural effusion",
    "pleural thickening", "pleural adhesions", "pneumothorax", "atelectasis", "volume loss", "tracheal deviation",
    # course
    "relapsed", "on treatment", "improving", "worsening",
    # microbiology
    "AFB positive", "AFB negative", "culture positive", "RNA probe positive",
    # comorbid
    "COPD", "emphysema", "scoliosis", "CHF",
    # caveat
    "uncertain etiology",
)

# Maps each adjunct label to the case-insensitive pattern that detects it.
# NOTE(review): not referenced by the extras resolver visible in this
# notebook; confirm where (or whether) it is consumed.
ADJUNCTS_PAT: Dict[str, re.Pattern] = {
    # patterns
    "infiltrative": re.compile(r"\binfiltrat(e|es|ion|ory)\b", re.I),
    "nodular": re.compile(r"\bnodul(ar|es?)\b", re.I),
    "consolidation": re.compile(r"\bconsolidat(e|ion|ed)\b", re.I),
    "granulomas": re.compile(r"\bgranuloma(s)?\b", re.I),
    "scarring": re.compile(r"\bscarr?ing\b", re.I),
    "calcifications": re.compile(r"\bcalcif(ication|ications|ied)\b", re.I),
    # pleural/airway
    "left pleural effusion": re.compile(r"\bleft pleural effusion\b", re.I),
    "right pleural effusion": re.compile(r"\bright pleural effusion\b", re.I),
    "pleural effusion": re.compile(r"\bpleural effusion\b", re.I),
    # "pleural change(s)" is also counted as pleural thickening
    "pleural thickening": re.compile(r"\bpleural thickening\b|\bpleural change(s)?\b", re.I),
    # any mention of "decortication" is also counted as pleural adhesions
    "pleural adhesions": re.compile(r"\bpleural adhesions?\b|\bdecortication\b", re.I),
    "pneumothorax": re.compile(r"\bpneumothorax\b", re.I),
    "atelectasis": re.compile(r"\batelectasis\b", re.I),
    "volume loss": re.compile(r"\bvolume loss\b", re.I),
    "tracheal deviation": re.compile(r"\btracheal deviation\b", re.I),
    # course
    "relapsed": re.compile(r"\brelaps(e|ed|ing)\b", re.I),
    # matches "on treatment", "on antitubercular", and "on hr" with optional "z"/"e" suffixes
    "on treatment": re.compile(r"\bon (hrz?e?|antitubercular|treatment)\b", re.I),
    "improving": re.compile(r"\bimprov(ing|ement)\b", re.I),
    "worsening": re.compile(r"\bworsen(ing|ed)\b", re.I),
    # microbiology
    "AFB positive": re.compile(r"\bafb (smears? )?pos(itive)?\b", re.I),
    "AFB negative": re.compile(r"\bafb (smears? )?neg(ative)?\b", re.I),
    "culture positive": re.compile(r"\bculture positive\b", re.I),
    "RNA probe positive": re.compile(r"\brna probes? pos(itive)?\b", re.I),
    # comorbid
    "COPD": re.compile(r"\bcopd\b", re.I),
    "emphysema": re.compile(r"\bemphysema\b", re.I),
    "scoliosis": re.compile(r"\bscoliosis\b", re.I),
    "CHF": re.compile(r"\bchf\b", re.I),
    # caveat: requires "ntm", then "not", then "mtb" in that order on one line
    "uncertain etiology": re.compile(r"\bntm\b.*\bnot\b.*\bmtb\b", re.I),
}

# other helpers
def _mk_age_prefix(age: Optional[object]) -> str:
    """Build the sentence opener, naming the age bucket when one is known."""
    label = bucket_age(age)
    if not label:
        return "This chest radiograph"
    return f"This {label} chest radiograph"

def _dedupe(seq: List[str]) -> List[str]:
    seen, out = set(), []
    for x in seq:
        if x and x not in seen:
            out.append(x); seen.add(x)
    return out

def _join_extras(extras: List[str]) -> str:
    """Render extras as ``"with a, b, c"`` after de-duplication; ``""`` when none remain."""
    unique = _dedupe(extras)
    if unique:
        return "with " + ", ".join(unique)
    return ""

#### Generator (CSV/DataFrame → <stem>.txt + augmented + low-confidence)

Implements the shorthand mappings (stb, atb, natb, tb, ptb, pleuritis/pleurisy…).

Normalizes fields/zones → lobes and supports special cases (bilateral upper fields, side + upper&middle).

Emits PII-safe sentences (adult/pediatric only).

A TB row is flagged low_confidence = true only when its note is empty or says “no report” and it yields no status, subtype, location, extras, or bare “tb/ptb/tuberculosis” token; every other row gets low_confidence = false.

In [None]:
# Generator

#Location resolution
def _resolve_location(note_lc: str) -> str:
    """Translate location shorthand in a lower-cased note into a phrase.

    The first rule that matches wins, in this order:

    1. bilateral upper fields          -> "on both upper lung lobes"
    2. <side> ... upper and middle     -> "on the <side> lung in the upper and middle lobes"
    3. right/left upper ptb            -> "in the <side> upper lobe"
    4. bilateral ptb                   -> "on both lungs"
    5. right/left ptb                  -> "on the <side> lung"
    6. middle lower fields             -> "in the [<side> lung in the ]middle and lower lobes"
    7. any "<side> <level> field/zone/lobe" -> "in the <side> <level> lobe"

    Args:
        note_lc: The clinical note, already stripped and lower-cased.

    Returns:
        The location phrase, or ``""`` when no rule matches.
    """
    # Special: bilateral upper fields
    if RE_BILAT_UPPER_FIELDS.search(note_lc):
        return "on both upper lung lobes"

    # Special: side + "upper and middle fields"
    m_um = RE_SIDE_UPPER_MIDDLE.search(note_lc)
    if m_um:
        # The pattern allows arbitrary text between the side word and
        # "upper and middle", so the match may start at an unrelated earlier
        # side word (e.g. "left pleurisy, ptb in the right upper and middle
        # fields"). Use the side word closest to the field phrase instead.
        # The match always starts with a side word, so the list is non-empty.
        side = re.findall(r"\b(left|right)\b", m_um.group(0), re.I)[-1].lower()
        return f"on the {side} lung in the upper and middle lobes"

    # Right/Left upper lobe specific
    if RE_RIGHT_UPPER_PTB.search(note_lc):
        return "in the right upper lobe"
    if RE_LEFT_UPPER_PTB.search(note_lc):
        return "in the left upper lobe"

    # Bilateral ptb
    if RE_BILAT_PTB.search(note_lc):
        return "on both lungs"

    # Right/Left ptb
    if RE_RIGHT_PTB.search(note_lc):
        return "on the right lung"
    if RE_LEFT_PTB.search(note_lc):
        return "on the left lung"

    # Middle lower fields (optionally side-aware if side is nearby)
    if RE_MIDDLE_LOWER_FIELDS.search(note_lc):
        # try to find a side within a short window before the phrase
        side_near = re.search(r"(left|right).{0,20}middle\s+lower\s+field", note_lc, re.I)
        if side_near:
            # Several side words may fall inside the window; keep the one
            # nearest to "middle lower field", not the leftmost.
            side = re.findall(r"left|right", side_near.group(0), re.I)[-1].lower()
            return f"in the {side} lung in the middle and lower lobes"
        return "in the middle and lower lobes"

    # Fallback: any explicit "<side> <level> field/zone/lobe" → "in the <side> <level> lobe"
    m_lobe = RE_FIELDLOB.search(note_lc)
    if m_lobe:
        side, level = m_lobe.group(1).lower(), m_lobe.group(2).lower()
        return f"in the {side} {level} lobe"
    return ""  # unknown

# Status / subtype
def _resolve_status(note_lc: str) -> str:
    """Pick the disease-status prefix (with trailing space) for the sentence.

    Precedence: inactive together with a treatment mention gives
    "inactive post-treatment "; otherwise active; otherwise inactive;
    otherwise the empty string.
    """
    has_inactive = RE_NATB.search(note_lc) is not None
    has_active = RE_ATB.search(note_lc) is not None
    has_treated = re.search(r"\btreated\b|\bpost[- ]treat(ed|ment)\b", note_lc) is not None

    if has_inactive and has_treated:
        return "inactive post-treatment "
    if has_active:
        return "active "
    return "inactive " if has_inactive else ""

def _resolve_subtype(note_lc: str) -> str:
    """Return ``"secondary "`` (trailing space) when the note marks secondary TB, else ``""``."""
    if RE_STB.search(note_lc) is None:
        return ""
    return "secondary "

# Extras
def _resolve_extras(note_lc: str) -> List[str]:
    """Collect the explicit adjunct findings mentioned in a lower-cased note.

    Each family (pleurisy, effusion, thickening, adhesions, decortication,
    fibrous, hyperplastic, cavity) contributes at most one phrase, using the
    most specific wording that matches. Phrases are returned in that fixed
    family order with duplicates removed.
    """
    found: List[str] = []
    add = found.append

    # Pleurisy / pleuritis, sided when written as "<side> pleurisy"
    if RE_PLEURISY.search(note_lc):
        sided = re.search(r"\b(left|right)\s+pleur(?:isy|itis)\b", note_lc)
        add(f"{sided.group(1).lower()} pleurisy" if sided else "pleurisy")

    # Pleural effusion: sided wording beats the generic one
    effusion = RE_PLEURAL_EFFUSION_SIDE.search(note_lc)
    if effusion is not None:
        add(f"{effusion.group(1).lower()} pleural effusion")
    elif RE_PLEURAL_EFFUSION_ANY.search(note_lc) is not None:
        add("pleural effusion")

    # Pleural thickening: lobe > side > generic
    thick_lobe = RE_PLEURAL_THICKENING_LOBE.search(note_lc)
    thick_side = RE_PLEURAL_THICKENING_SIDE.search(note_lc)
    if thick_lobe:
        side, level = thick_lobe.group(1, 2)
        add(f"pleural thickening in the {side.lower()} {level.lower()} lobe")
    elif thick_side:
        add(f"{thick_side.group(1).lower()} pleural thickening")
    elif RE_PLEURAL_THICKENING_ANY.search(note_lc):
        add("pleural thickening")

    # Pleural adhesions: the lobe form deliberately has no "the" after "in"
    adh_lobe = RE_PLEURAL_ADH_LOBE.search(note_lc)
    if adh_lobe:
        side, level = adh_lobe.group(1, 2)
        add(f"pleural adhesions in {side.lower()} {level.lower()} lobe")
    elif RE_PLEURAL_ADH_ANY.search(note_lc):
        add("pleural adhesions")

    # Pleural change after decortication, optionally sided
    decort = RE_DECORTICATION.search(note_lc)
    if decort:
        decort_side = decort.group(1)
        if decort_side:
            add(f"{decort_side.lower()} pleural change after decortication")
        else:
            add("pleural change after decortication")

    # Fibrous lesions: "mainly ... lesions" wording takes priority
    if RE_FIBROUS_MAINLY.search(note_lc):
        add("mainly as fibrous lesions")
    elif RE_FIBROUS_ANY.search(note_lc):
        add("fibrous changes")

    # Hyperplastic lesions: same priority rule
    if RE_HYPERPL_MAINLY.search(note_lc):
        add("mainly as hyperplastic lesions")
    elif RE_HYPERPL_ANY.search(note_lc):
        add("hyperplastic changes")

    # Cavitation: lobe-aware wording beats the generic one
    cavity = RE_CAVITY_LOBE.search(note_lc)
    if cavity:
        side, level = cavity.group(2, 3)
        add(f"cavity formation in the {side.lower()} {level.lower()} lobe")
    elif RE_CAVITY_ANY.search(note_lc):
        add("cavity formation")

    return _dedupe(found)

# TB sentence renderer
def render_tb_sentence(note: Optional[str], age: Optional[object]) -> Tuple[str, Dict[str, str], Dict[str, str], List[str], bool]:
    """Render the one-sentence TB report for a clinical note.

    Args:
        note: Raw clinical note (may be ``None``).
        age: Raw age value; only its adult/pediatric bucket reaches the text.

    Returns:
        A 5-tuple ``(sentence, status_map, meta_map, extras, low_conf)``:
        ``status_map`` is ``{"status": "active" | "inactive" |
        "inactive post-treatment" | ""}``; ``meta_map`` holds ``"subtype"``
        (``"secondary"`` or ``""``) and ``"distribution"`` (``"both"``,
        ``"left"``, ``"right"`` or ``""``); ``extras`` lists the adjunct
        phrases; ``low_conf`` is True only when the note is empty or contains
        "no report" and nothing usable was extracted from it.
    """
    text = _lc(note)

    # Parsed components; status and subtype carry a trailing space when set.
    status = _resolve_status(text)
    subtype = _resolve_subtype(text)
    location = _resolve_location(text)
    extras = _resolve_extras(text)

    # Assemble the sentence: base, then location, then the "with ..." clause.
    sentence = f"{_mk_age_prefix(age)}, shows {status}{subtype}tuberculosis".replace("  ", " ")
    if location:
        sentence += " " + location
    if extras:
        sentence += " " + "with " + ", ".join(extras)
    if not sentence.endswith("."):
        sentence += "."

    # Coarse laterality tag for the CSV, derived from the location phrase.
    distribution = ""
    for tag, needles in (
        ("both", ("both lungs", "both upper lung lobes")),
        ("left", ("on the left lung", "in the left ")),
        ("right", ("on the right lung", "in the right ")),
    ):
        if any(needle in location for needle in needles):
            distribution = tag
            break

    # Relaxed low-confidence policy: flag only empty/no-report notes that
    # produced no status, subtype, location, extras or TB token.
    informative = bool(
        status.strip() or subtype.strip() or location or extras or RE_TB_TOKEN.search(text)
    )
    low_conf = (not informative) and (text == "" or "no report" in text)

    if status.startswith("active"):
        status_label = "active"
    elif status.startswith("inactive post-treatment"):
        status_label = "inactive post-treatment"
    elif status.startswith("inactive"):
        status_label = "inactive"
    else:
        status_label = ""

    status_map = {"status": status_label}
    meta_map = {
        "subtype": "secondary" if subtype.strip() else "",
        "distribution": distribution,
    }
    # Lobes are not exposed as separate fields; extras are returned for CSV auditing.
    return sentence, status_map, meta_map, extras, low_conf

# Normal sentence
def render_normal_sentence(age: Optional[object]) -> str:
    """Render the fixed one-sentence report for a normal study, ending in a period."""
    sentence = f"{_mk_age_prefix(age)}, shows normal lungs"
    if sentence.endswith("."):
        return sentence
    return sentence + "."

# ---- Main generator: DataFrame -> files + augmented/low_conf DFs ----
def _age_for_csv(age: Optional[object]) -> object:
    """Return the age as an int for the audit CSV, or "" when blank, NaN or unparseable."""
    if age is None or str(age).strip() == "":
        return ""
    try:
        return int(float(age))
    except (TypeError, ValueError, OverflowError):
        # NaN (how pandas represents a missing cell), infinities and
        # non-numeric text must not abort the whole run.
        return ""


def generate_reports_jupyter(df: pd.DataFrame,
                             reports_dir: Path,
                             filename_col: str = "Filename",
                             class_col: str = "Class",
                             notes_col: str = "Clinical_Notes",
                             age_col: str = "Age") -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Write one synthetic report per row and return the audit tables.

    For every row a ``<stem>.txt`` file (stem of the filename column) is
    written into ``reports_dir``. Rows whose class is "normal" (case and
    whitespace insensitive) get the normal-lungs sentence; all other rows are
    rendered from the clinical note as tuberculosis.

    Args:
        df: Input metadata.
        reports_dir: Output directory; created if it does not exist.
        filename_col: Column holding the image file name (required).
        class_col: Column holding the class label (required).
        notes_col: Column holding the clinical note; a missing column or
            missing value is treated as "no note".
        age_col: Column holding the age; a missing column is tolerated.

    Returns:
        ``(augmented_df, low_conf_df)``: one row per input row with the parsed
        fields and generated sentence, and the subset flagged low-confidence.
    """
    reports_dir.mkdir(parents=True, exist_ok=True)
    rows_all: List[Dict[str, object]] = []
    rows_low: List[Dict[str, object]] = []

    for _, r in df.iterrows():
        fn = str(r[filename_col])
        cls = str(r[class_col]).strip().lower()
        raw_note = r.get(notes_col, None)
        note = None if pd.isna(raw_note) else str(raw_note)
        age = r.get(age_col, None)

        stem = Path(fn).stem
        out_txt = reports_dir / f"{stem}.txt"

        if cls == "normal":
            sent = render_normal_sentence(age)
            status_map = {"status": ""}
            meta_map = {"subtype": "", "distribution": ""}
            extras = []
            low_conf = False
        else:
            sent, status_map, meta_map, extras, low_conf = render_tb_sentence(note, age)

        # write file (exactly one trailing newline)
        with open(out_txt, "w", encoding="utf-8") as f:
            f.write(sent + ("\n" if not sent.endswith("\n") else ""))

        row = {
            "Filename": fn,
            "Class": r[class_col],
            "Clinical_Notes": note if note is not None else "",
            # Missing or unparseable ages become "" and no longer raise.
            "Age": _age_for_csv(age),
            "status": status_map["status"],
            "subtype": meta_map["subtype"],
            "distribution": meta_map["distribution"],
            "extras": "; ".join(extras),
            "synthetic_report": sent,
            "low_confidence": "true" if low_conf else "false",
        }
        rows_all.append(row)
        if low_conf:
            rows_low.append(row)

    augmented_df = pd.DataFrame(rows_all)
    low_conf_df = pd.DataFrame(rows_low)
    return augmented_df, low_conf_df

#Run
# Load the metadata CSV and generate one report file per row.
df = pd.read_csv("/gpfs/gsfs12/users/rajaramans2/projects/omsakthi_multimodal/multimodal_shenzhen/shen_demo.csv")
reports_dir = Path("/gpfs/gsfs12/users/rajaramans2/projects/omsakthi_multimodal/multimodal_shenzhen/dataset/reports")
augmented_df, low_conf_df = generate_reports_jupyter(df, reports_dir)
# Persist the audit tables.
# NOTE(review): both CSVs are written to the filesystem root ("/"), not next
# to reports_dir; confirm the intended output directory.
augmented_df.to_csv("/augmented_reports.csv", index=False)
low_conf_df.to_csv("/low_confidence.csv", index=False)
# display() is the notebook rich-output helper; preview the first rows of each table.
display(augmented_df.head()); display(low_conf_df.head())

#### Validator (ensures structure + naming; flags PII leakage patterns)

The validator checks that:

1. A text file exists for every row as <stem>.txt.

2. Sentence starts with “This … chest radiograph”.

3. Normal rows contain “shows normal lungs”.

4. TB rows contain “shows … tuberculosis”.

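5. Flags common punctuation issues (missing trailing period, double spaces, a comma not followed by a space) and any digits, which may indicate a leaked age.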


In [None]:
# Validator

def validate_reports(augmented_df: pd.DataFrame,
                             reports_dir: Path,
                             filename_col: str = "Filename",
                             class_col: str = "Class",
                             check_digits: bool = True) -> List[str]:
    """Check every generated report file against the sentence template.

    Args:
        augmented_df: Table with one row per expected report.
        reports_dir: Directory expected to contain ``<stem>.txt`` per row.
        filename_col: Column holding the image file name.
        class_col: Column holding the class label ("normal" or anything else).
        check_digits: When True, any digit in a report is reported as a
            possible PII leak.

    Returns:
        Human-readable issue strings in row order; empty when all reports pass.
    """
    problems: List[str] = []

    for _, record in augmented_df.iterrows():
        image_name = str(record[filename_col])
        label = str(record[class_col]).strip().lower()
        report_path = reports_dir / f"{Path(image_name).stem}.txt"

        # Nothing else can be checked without the file.
        if not report_path.exists():
            problems.append(f"Missing report file: {report_path}")
            continue

        body = report_path.read_text(encoding="utf-8").strip()

        # Class-specific template requirement.
        if label == "normal":
            template_ok = "shows normal lungs" in body
            template_issue = f"Normal template mismatch: {report_path} -> {body}"
        else:
            template_ok = "shows" in body and "tuberculosis" in body
            template_issue = f"TB template mismatch: {report_path} -> {body}"

        # (passed?, message) pairs in reporting order: structure, template,
        # punctuation/spacing, then the optional digit (PII) check.
        checks = (
            (body.startswith("This"), f"Bad prefix: {report_path} -> {body}"),
            ("chest radiograph" in body, f"Missing 'chest radiograph': {report_path} -> {body}"),
            (template_ok, template_issue),
            (body.endswith("."), f"Missing trailing period: {report_path} -> {body}"),
            ("  " not in body, f"Double space: {report_path} -> {body}"),
            (re.search(r",[^\s]", body) is None, f"Comma not followed by space: {report_path} -> {body}"),
            (not (check_digits and re.search(r"\d", body)), f"Digits present (PII risk) in: {report_path} -> {body}"),
        )
        problems.extend(message for passed, message in checks if not passed)

    return problems

# Run
# Validate the directory the generator actually wrote to (reports_dir, set in
# the generator cell); a different hard-coded path would flag every report
# as missing.
issues = validate_reports(augmented_df, reports_dir)
if issues:
    print("VALIDATION ISSUES:")
    for s in issues:
        print(" -", s)
else:
    print("All reports validated cleanly.")

#### Generator for writing raw clinical notes to files (CSV/DataFrame → <stem>.txt)

1. Reads shen_demo.csv (Shenzhen Metadata file from path),

2. Creates reports_default/,

3. writes one .txt per image stem (e.g., CHNCXR_0483_1.txt) containing the exact Clinical_Notes value (verbatim),

4. Guards against missing columns and duplicate filenames,

5. Prints a concise summary.

##### Behavior details

1. For CHNCXR_0001_0.png (0: normal class index) → CHNCXR_0001_0.txt, contents will be normal\n.

2. For CHNCXR_0483_1.png (1: TB class index) → CHNCXR_0483_1.txt, contents will be ptb in the bilateral upper fields, left pleurisy\n. (verbatim)

3. If Clinical_Notes is missing/NaN, an empty file is written (newline only), matching “write the value under the corresponding Clinical_Notes” literally.

4. Set OVERWRITE=False if preserving existing files; the code will then report how many were skipped.


In [None]:
# Config
# Input metadata CSV; must provide the 'Filename' and 'Clinical_Notes' columns.
SOURCE_CSV   = Path("/shen_demo.csv")
# Root under which the output directory is created.
OUT_ROOT     = Path("/dataset")
OUT_DIR      = OUT_ROOT / "reports_default" # will be created if not present
OVERWRITE    = True # set False to skip writing files that already exist

# Helpers
def _read_metadata(csv_path: Path) -> pd.DataFrame:
    """
    Read shen_demo.csv and return a DataFrame with at least ['Filename', 'Clinical_Notes'].
    We preserve Clinical_Notes verbatim; filenames are required and must be unique.
    """
    df = pd.read_csv(csv_path)
    needed = {"Filename", "Clinical_Notes"}
    missing = needed - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns in CSV: {sorted(missing)}")

    # Keep only the columns we need for this task
    df = df[["Filename", "Clinical_Notes"]].copy()

    # Basic sanity checks
    if df["Filename"].isna().any():
        bad = df[df["Filename"].isna()].index.tolist()
        raise ValueError(f"Found rows with missing Filename at indices: {bad[:10]}")

    # Filenames should be unique (one image per patient)
    if df["Filename"].duplicated().any():
        dups = df[df["Filename"].duplicated(keep=False)]["Filename"].tolist()
        raise ValueError(f"Duplicate Filenames detected (first 10): {dups[:10]}")

    # Normalize Clinical_Notes to string; keep NaNs as empty strings
    df["Clinical_Notes"] = df["Clinical_Notes"].astype(str)
    df.loc[df["Clinical_Notes"].isin(["nan", "NaN", "None"]), "Clinical_Notes"] = ""
    return df

def _write_notes_files(df: pd.DataFrame, out_dir: Path, overwrite: bool = True) -> Tuple[int, int]:
    """
    Create <stem>.txt for each Filename, writing the verbatim Clinical_Notes.
    Returns (num_written, num_skipped).
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    written = 0
    skipped = 0

    for _, row in df.iterrows():
        stem = Path(str(row["Filename"])).stem
        out_path = out_dir / f"{stem}.txt"

        if out_path.exists() and not overwrite:
            skipped += 1
            continue

        # Write verbatim; add trailing newline
        text = (row["Clinical_Notes"] or "").rstrip("\n")
        out_path.write_text(text + "\n", encoding="utf-8")
        written += 1

    return written, skipped
    
# Create the reports_default directory and write one .txt per image stem with Clinical_Notes verbatim
df_meta = _read_metadata(SOURCE_CSV)  # raises ValueError on missing columns or duplicate Filenames
n = len(df_meta)
written, skipped = _write_notes_files(df_meta, OUT_DIR, overwrite=OVERWRITE)

# Concise run summary
print(f"Output directory: {OUT_DIR}")
print(f"Rows in CSV: {n}")
print(f"Files written: {written}")
print(f"Files skipped (exist & OVERWRITE=False): {skipped}")

# Quick verification: ensure a .txt exists for every Filename in CSV
# (only existence is checked here, not file contents)
missing = []
for fn in df_meta["Filename"]:
    p = OUT_DIR / f"{Path(fn).stem}.txt"
    if not p.exists():
        missing.append(p.name)

if missing:
    print("❌ Missing files (first 10 shown):", missing[:10])
else:
    print("✅ One .txt per Filename present in reports_default.")

## END OF CODE