# Notebook 2 — **Domain Cleanup + Medical Normalization + QA** (Enhanced)

In [8]:

# ==== CONFIG (edit these) ====
blocks_dir = "outputs/run_001/01_blocks"     # input folder from Notebook 1
output_dir = "outputs/run_001/02_cleaned"    # output folder for this step
dictionary_path = "config/medical_terms.yml" # optional YAML mapping
fuzzy_cutoff = 0.86
max_corrections = 200

# Optional: QuickUMLS install path (folder with QuickUMLS data) — leave empty if not available
quickumls_path = ""  # e.g., "/data/QuickUMLS"


In [9]:

import os, re, json, difflib, yaml, math
from pathlib import Path
from typing import Dict, Any, List, Tuple
from dataclasses import dataclass, field
import pandas as pd
import matplotlib.pyplot as plt

try:
    import spacy
    _HAS_SPACY = True
except Exception:
    _HAS_SPACY = False

try:
    import scispacy  # noqa: F401
except Exception:
    pass

_NLP = None
_LINKER = None
if _HAS_SPACY:
    for model in ["en_core_sci_lg", "en_ner_bc5cdr_md", "en_ner_bionlp13cg_md"]:
        try:
            _NLP = spacy.load(model)
            print(f"[INFO] Loaded spaCy model: {model}")
            break
        except Exception as e:
            print(f"[WARN] Could not load {model}: {e}")
    if _NLP is not None:
        try:
            _NLP.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})
            _LINKER = _NLP.get_pipe("scispacy_linker")
            print("[INFO] scispaCy UMLS linker enabled")
        except Exception as e:
            _LINKER = None
            print("[WARN] scispaCy linker not available:", e)

_MATCHER = None
QUICKUMLS_PATH = os.environ.get("QUICKUMLS_PATH", quickumls_path or "")
if QUICKUMLS_PATH and Path(QUICKUMLS_PATH).exists():
    try:
        from quickumls import QuickUMLS
        _MATCHER = QuickUMLS(QUICKUMLS_PATH)
        print(f"[INFO] QuickUMLS initialized from: {QUICKUMLS_PATH}")
    except Exception as e:
        print("[WARN] QuickUMLS not usable:", e)
        _MATCHER = None
else:
    if QUICKUMLS_PATH:
        print(f"[WARN] QuickUMLS path not found: {QUICKUMLS_PATH}")
    else:
        print("[INFO] QuickUMLS disabled (no path provided)")


[WARN] Could not load en_core_sci_lg: [E050] Can't find model 'en_core_sci_lg'. It doesn't seem to be a Python package or a valid path to a data directory.
[WARN] Could not load en_ner_bc5cdr_md: [E050] Can't find model 'en_ner_bc5cdr_md'. It doesn't seem to be a Python package or a valid path to a data directory.
[WARN] Could not load en_ner_bionlp13cg_md: [E050] Can't find model 'en_ner_bionlp13cg_md'. It doesn't seem to be a Python package or a valid path to a data directory.
[INFO] QuickUMLS disabled (no path provided)


In [10]:

blocks_dir = Path(blocks_dir).expanduser().resolve()
if not blocks_dir.exists():
    raise FileNotFoundError(f"Input folder not found: {blocks_dir}")
out_root = Path(output_dir).expanduser().resolve()
out_root.mkdir(parents=True, exist_ok=True)
qa_dir = out_root / "_qa"
qa_dir.mkdir(exist_ok=True)
print("[INFO] Input:", blocks_dir)
print("[INFO] Output:", out_root)


[INFO] Input: /Users/balijepalli/Documents/GitHub/entheory-ai/notebooks/outputs/run_001/01_blocks
[INFO] Output: /Users/balijepalli/Documents/GitHub/entheory-ai/notebooks/outputs/run_001/02_cleaned


In [11]:

BUILTIN_MAP = {
    "toabeculas": "trabeculae","trabeculas": "trabeculae","trabaculae": "trabeculae",
    "fotymoyphoys": "polymorphous","palato": "palate","pala": "palate","comozd": "composed",
    "necrosls": "necrosis","microscoplc": "microscopic","cribriforme": "cribriform",
}
try:
    yml = Path(dictionary_path)
    if yml.exists():
        ext_map = yaml.safe_load(yml.read_text(encoding="utf-8")) or {}
        if isinstance(ext_map, dict):
            BUILTIN_MAP.update(ext_map)
            print(f"[INFO] Loaded {len(ext_map)} extra dict terms from {yml}")
    else:
        print(f"[INFO] No external dictionary at {yml}")
except Exception as e:
    print("[WARN] Could not load YAML dictionary:", e)

VOCAB = sorted(set(BUILTIN_MAP.values()).union({
    "histopathology","specimen","gross","microscopic","impression","pleomorphic",
    "trabeculae","necrosis","cribriform","carcinoma","fibrosis","lymphocyte",
    "ductal","invasive","lobular","metastatic","biopsy","stroma",
    "adenocarcinoma","immunohistochemistry"
}))
STOPWORDS_LOW_VALUE = {"and","the","with","for","from","this","that","show","note","cell","cells","tissue",
    "normal","mild","severe","of","to","in","at","by","is","are"}
DO_NOT_TOUCH = {"er","pr","her2","her2neu","ki-67","ki67","mm","cm","%","score","allred"}

def apply_rules(t: str) -> str:
    t2 = re.sub(r"[•·∙●]", ".", t)
    t2 = re.sub(r"[–—−]+", "-", t2)
    t2 = re.sub(r"[×✕✖]", "x", t2)
    t2 = re.sub(r"\s{2,}", " ", t2)
    return t2.strip()

def apply_dictionary(t, mp):
    def _case_aware(repl):
        def _f(m):
            src = m.group(0)
            return repl.upper() if src.isupper() else repl.title() if src.istitle() else repl
        return _f
    count, changes = 0, {}
    out = t
    for wrong, right in sorted(mp.items(), key=lambda kv: len(kv[0]), reverse=True):
        pat = rf"\b{re.escape(wrong)}\b"
        if re.search(pat, out, flags=re.I):
            out = re.sub(pat, _case_aware(right), out, flags=re.I)
            count += 1
            changes[wrong] = right
    return out, count, changes

def apply_fuzzy(t, vocab, cutoff=0.86, max_corr=200):
    toks = re.findall(r"[A-Za-z][A-Za-z\-]{2,}", t)
    uniq = sorted({
        x for x in (tok.lower() for tok in toks)
        if x not in STOPWORDS_LOW_VALUE | DO_NOT_TOUCH and not re.fullmatch(r"[a-z]*\d+[a-z\d\-]*", x)
    })
    repl, count = {}, 0
    for tok in uniq:
        best = difflib.get_close_matches(tok, vocab, n=1, cutoff=cutoff)
        if best:
            repl[tok] = best[0]
            count += 1
            if count >= max_corr: break
    def _swap(m):
        src = m.group(0); low = src.lower()
        r = repl.get(low)
        if not r: return src
        return r.upper() if src.isupper() else r.title() if src.istitle() else r
    out = re.sub(r"\b([A-Za-z][A-Za-z\-]{3,})\b", _swap, t)
    return out, len(repl), repl

def scispacy_normalize(t: str):
    if _NLP is None: return t, []
    ents = []
    try:
        doc = _NLP(t)
        for ent in doc.ents:
            item = {"text": ent.text, "label": ent.label_, "start": ent.start_char, "end": ent.end_char}
            if _LINKER is not None and getattr(ent._, "kb_ents", None):
                if ent._.kb_ents:
                    cui, score = ent._.kb_ents[0]
                    kb = _LINKER.kb.cui_to_entity.get(cui)
                    if kb:
                        item.update({"cui": cui, "canonical": kb.canonical_name, "link_score": float(score)})
            ents.append(item)
    except Exception:
        pass
    return t, ents

def quickumls_normalize(t: str):
    if _MATCHER is None: return t, []
    results = []
    try:
        groups = _MATCHER.match(t, best_match=True, ignore_syntax=False)
        for grp in groups:
            m = grp[0]
            results.append({
                "ngram": m["ngram"], "term": m["term"], "cui": m["cui"],
                "similarity": float(m["similarity"]), "start": m["start"], "end": m["end"]
            })
    except Exception:
        pass
    return t, results

def seq_sim(a: str, b: str) -> float:
    return difflib.SequenceMatcher(None, a, b).ratio()

@dataclass
class BlockQA:
    page: int
    index: int
    text_orig: str
    text_clean: str
    rules_corr: int = 0
    dict_corr: int = 0
    fuzzy_corr: int = 0
    dict_map: Dict[str,str] = None
    fuzzy_map: Dict[str,str] = None
    scispacy_ents: List[Dict[str,Any]] = None
    quickumls_hits: List[Dict[str,Any]] = None
    sim: float = 0.0


[INFO] Loaded 8 extra dict terms from config/medical_terms.yml


In [12]:

pages = sorted(blocks_dir.glob("page_*_blocks.json"))
if not pages:
    raise FileNotFoundError(f"No page_*_blocks.json under {blocks_dir}")
print(f"[INFO] Found {len(pages)} page files")


[INFO] Found 4 page files


In [13]:

all_rows = []
per_page_summary = []

for f in pages:
    page_no = int(re.search(r"page_(\d+)_", f.name).group(1))
    blocks = json.loads(f.read_text(encoding="utf-8"))
    out_blocks = []
    qa_items = []
    rules_changed_cnt = 0
    dict_total = 0
    fuzzy_total = 0

    for bi, b in enumerate(blocks):
        t0 = (b.get("text") or "").strip()
        if not t0:
            continue
        t1 = apply_rules(t0)
        rules_changed = int(t1 != t0)
        rules_changed_cnt += rules_changed

        t2, dict_n, dict_map = apply_dictionary(t1, BUILTIN_MAP)
        dict_total += dict_n

        t3, fuzzy_n, fuzzy_map = apply_fuzzy(t2, VOCAB, cutoff=float(fuzzy_cutoff), max_corr=int(max_corrections))
        fuzzy_total += fuzzy_n

        t4, ents = scispacy_normalize(t3)
        t5, qhits = quickumls_normalize(t4)

        nb = dict(b)
        nb["text_cleaned"] = t5
        nb["norm_meta"] = {
            "rules_applied": bool(rules_changed),
            "dict_corrections": dict_n,
            "fuzzy_corrections": fuzzy_n,
            "scispacy_ents": ents,
            "quickumls_hits": qhits
        }
        out_blocks.append(nb)

        qa_items.append(
            BlockQA(
                page=page_no, index=bi,
                text_orig=t0, text_clean=t5,
                rules_corr=rules_changed, dict_corr=dict_n, fuzzy_corr=fuzzy_n,
                dict_map=dict_map, fuzzy_map=fuzzy_map,
                scispacy_ents=ents, quickumls_hits=qhits,
                sim=seq_sim(t0, t5)
            )
        )

    out_file = (out_root / f"{f.stem}.domain.json")
    out_file.write_text(json.dumps(out_blocks, indent=2, ensure_ascii=False), encoding="utf-8")

    if qa_items:
        sim_mean = sum(x.sim for x in qa_items)/len(qa_items)
        fuzzy_sum = sum(x.fuzzy_corr for x in qa_items)
        dict_sum  = sum(x.dict_corr for x in qa_items)
        ents_cnt  = sum(len(x.scispacy_ents) for x in qa_items)
        cui_cnt   = sum(1 for x in qa_items for e in x.scispacy_ents if isinstance(e, dict) and "cui" in e)
        qmatch    = sum(len(x.quickumls_hits) for x in qa_items)

        per_page_summary.append({
            "page": page_no,
            "blocks": len(qa_items),
            "similarity_mean": round(sim_mean, 4),
            "dict_corrections": dict_sum,
            "fuzzy_corrections": fuzzy_sum,
            "rules_changed": rules_changed_cnt,
            "scispacy_ents": ents_cnt,
            "scispacy_cuis": cui_cnt,
            "quickumls_hits": qmatch
        })

        df_block = pd.DataFrame([x.__dict__ for x in qa_items])
        df_block.to_csv((out_root / "_qa" / f"page_{page_no:03d}_cleanup_blocks.csv"), index=False)

    print(f"✓ page {page_no:03d}: in={len(blocks)} → out={len(out_blocks)} | "
          f"rules={rules_changed_cnt}, dict={dict_total}, fuzzy={fuzzy_total} | "
          f"sim_mean={sim_mean if qa_items else 0:.3f} → {out_file.name}")


✓ page 001: in=8 → out=8 | rules=2, dict=0, fuzzy=4 | sim_mean=1.000 → page_001_blocks.domain.json
✓ page 002: in=12 → out=12 | rules=2, dict=0, fuzzy=27 | sim_mean=0.998 → page_002_blocks.domain.json
✓ page 003: in=8 → out=8 | rules=4, dict=0, fuzzy=0 | sim_mean=0.998 → page_003_blocks.domain.json
✓ page 004: in=9 → out=9 | rules=7, dict=0, fuzzy=0 | sim_mean=0.989 → page_004_blocks.domain.json


In [14]:

if per_page_summary:
    df = pd.DataFrame(per_page_summary).sort_values("page")
    df.to_csv(out_root / "_qa_cleanup.csv", index=False)
    print("\n✅ Domain cleanup complete.")
    print("Summary saved →", out_root / "_qa_cleanup.csv")
else:
    df = pd.DataFrame(columns=["page","blocks","similarity_mean","dict_corrections","fuzzy_corrections",
                               "rules_changed","scispacy_ents","scispacy_cuis","quickumls_hits"])
    print("[WARN] No QA rows produced.")

if len(df):
    fig, ax1 = plt.subplots(figsize=(6.5,3.2))
    x = df["page"].astype(str).str.zfill(3)
    ax1.plot(x, df["similarity_mean"], marker="o")
    ax1.set_ylabel("Similarity (0–1)")
    ax1.set_xlabel("page")
    ax1.set_ylim(0.90, 1.01)

    ax2 = ax1.twinx()
    ax2.bar(x, df["fuzzy_corrections"], alpha=0.30, label="Fuzzy Corrections")
    ax2.set_ylabel("Fuzzy Corrections")

    plt.title("OCR → Domain Cleanup QA Summary")
    fig.tight_layout()
    figpath = out_root / "_qa_cleanup_summary.png"
    plt.savefig(figpath, dpi=150)
    plt.close(fig)
    print("Saved QA plot →", figpath)

df if 'df' in locals() else None



✅ Domain cleanup complete.
Summary saved → /Users/balijepalli/Documents/GitHub/entheory-ai/notebooks/outputs/run_001/02_cleaned/_qa_cleanup.csv
Saved QA plot → /Users/balijepalli/Documents/GitHub/entheory-ai/notebooks/outputs/run_001/02_cleaned/_qa_cleanup_summary.png


Unnamed: 0,page,blocks,similarity_mean,dict_corrections,fuzzy_corrections,rules_changed,scispacy_ents,scispacy_cuis,quickumls_hits
0,1,8,0.9998,0,4,2,0,0,0
1,2,12,0.9981,0,27,2,0,0,0
2,3,8,0.9983,0,0,4,0,0,0
3,4,9,0.9888,0,0,7,0,0,0
