# 05 — Merge & Validate

Merge the validated chunk JSON files into a single report, then record lineage and metrics. Outputs: the final merged JSON, a lineage file, and a metrics file.

In [5]:
# --- CONFIG ---
# Input: directory scanned for the per-chunk JSON pieces to merge.
chunks_dir = "outputs/run_001/04_jsonextracted"
# Output: directory that receives the merged report, lineage and metrics.
output_dir = "outputs/run_001/05_merged_validated"

# NOTE(review): `reference_schema` and `dedupe_keys` are not referenced by the
# merge cell visible in this notebook — confirm whether a later step uses them.
reference_schema = "config/schema_prescription.json"
dedupe_keys = [
    "drug",
    "strength",
    "frequency",
    "timing",
]

# Used to build the output file names (final_<schema_key>*.json).
schema_key = "prescription"


In [None]:

import json, time
from pathlib import Path

# Resolve the configured locations. `chunks_dir` is rebound from the config
# string to a Path; re-running the cell is harmless because wrapping a Path
# in Path() again yields an equal path.
chunks_dir = Path(chunks_dir)
out_dir = Path(output_dir)
out_dir.mkdir(parents=True, exist_ok=True)

# Sorted so the merge order is deterministic across runs.
pieces = sorted(chunks_dir.glob("final_valid_*.json"))

if not pieces:
    # Name the directory that was searched, and say when it does not exist,
    # so a wrong path or a skipped upstream step is diagnosable from the
    # message alone.
    reason = "" if chunks_dir.is_dir() else " (directory does not exist)"
    raise RuntimeError(f"No final_valid_* pieces found in {chunks_dir}{reason}")

def _dedupe_key(item):
    """Return a string identifying *item* for list de-duplication.

    JSON encoding (with sorted keys) keeps values of different types apart:
    ``1`` -> ``1`` but ``"1"`` -> ``"1"`` with quotes, ``None`` -> ``null``.
    Dicts that differ only in key order still compare equal.
    """
    try:
        return json.dumps(item, sort_keys=True)
    except (TypeError, ValueError):
        # Not JSON-serialisable (or self-referential): fall back to a
        # type-tagged repr so the merge stays best-effort instead of raising.
        return f"{type(item).__name__}:{item!r}"


def merge_dict(a, b):
    """Merge mapping *b* into mapping *a* in place and return *a*.

    Rules, applied per key of *b* (and recursively for nested dicts):

    * the key ``"metadata"`` is never copied;
    * a key missing from *a*, or whose value in *a* is empty
      (``""``, ``None``, ``[]`` or ``{}``), takes *b*'s value (stored by
      reference, not copied);
    * two dicts are merged recursively;
    * two lists are concatenated (*a*'s items first) and de-duplicated,
      keeping the first occurrence of each item;
    * in every other case the value already present in *a* is kept.

    Args:
        a: Destination mapping; mutated.
        b: Source mapping; its top level is only read.

    Returns:
        The same object as *a*.
    """
    for k, v in b.items():
        if k == "metadata":
            continue
        if k not in a or a[k] in ("", None, [], {}):
            a[k] = v
            continue
        if isinstance(a[k], dict) and isinstance(v, dict):
            a[k] = merge_dict(a[k], v)
            continue
        if isinstance(a[k], list) and isinstance(v, list):
            seen = set()
            out = []
            for it in a[k] + v:
                # Type-aware key: the previous str(it) key made 1 and "1"
                # (or None and "None") look like duplicates and dropped one.
                key = _dedupe_key(it)
                if key in seen:
                    continue
                seen.add(key)
                out.append(it)
            a[k] = out
    return a

# Fold every piece into one report. A piece that cannot be read or parsed is
# skipped with a warning (best-effort), but it is tracked so that the lineage
# and metrics files describe what actually went into the report.
merged = {}
merged_from = []  # pieces that contributed to `merged`
skipped = []      # pieces that were found but could not be merged
for p in pieces:
    try:
        d = json.loads(p.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print("[WARN] Failed to merge piece:", p, e)
        skipped.append(p)
        continue
    if not isinstance(d, dict):
        # merge_dict needs a mapping; a top-level list/scalar cannot be merged.
        print("[WARN] Failed to merge piece:", p, "top-level JSON value is not an object")
        skipped.append(p)
        continue
    merged = merge_dict(merged, d)
    merged_from.append(p)

if not merged_from:
    # Do not write an empty "final" report when nothing could be merged.
    raise RuntimeError("None of the final_valid_* pieces could be merged")

# "pieces"/"total_pieces" keep their original meaning (everything discovered);
# the extra keys record how many pieces were merged and which were skipped.
metrics = {
    "pieces": [p.name for p in pieces],
    "total_pieces": len(pieces),
    "merged_pieces": len(merged_from),
    "failed_pieces": [p.name for p in skipped],
    "timestamp": int(time.time()),
}
# Lineage lists only the files whose content is really part of the report.
lineage = {
    "sources": [str(p) for p in merged_from],
    "skipped": [str(p) for p in skipped],
}

final_path = out_dir / f"final_{schema_key}.json"
lineage_path = out_dir / f"final_{schema_key}_lineage.json"
metrics_path = out_dir / f"final_{schema_key}_metrics.json"

final_path.write_text(json.dumps(merged, ensure_ascii=False, indent=2), encoding="utf-8")
lineage_path.write_text(json.dumps(lineage, ensure_ascii=False, indent=2), encoding="utf-8")
metrics_path.write_text(json.dumps(metrics, ensure_ascii=False, indent=2), encoding="utf-8")

print("Saved:", final_path)
print("Saved:", lineage_path)
print("Saved:", metrics_path)


RuntimeError: No final_valid_* pieces found

In [None]:
import fitz  # PyMuPDF
import easyocr
import json
from pathlib import Path

# ---------------------------
# OCR + text extraction
# ---------------------------
def extract_pdf_text(pdf_path: str, use_easyocr=True):
    """Extract the text of a PDF, optionally OCR-ing its embedded images.

    For each page the text layer is read with ``page.get_text("blocks")``.
    When ``use_easyocr`` is true, every image embedded in the page is also
    run through an EasyOCR reader (English, ``gpu=False``). OCR is applied
    to all embedded images, not only when the text layer is empty.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.
        use_easyocr: When false, no OCR reader is created and embedded images
            are not extracted at all.

    Returns:
        A single string. Each page contributes its non-blank block texts
        joined by spaces, then a newline, then the OCR strings joined by
        newlines; pages are joined by newlines.
    """
    pages = []
    # Context manager closes the document even if extraction or OCR raises.
    with fitz.open(pdf_path) as doc:
        reader = None
        if use_easyocr:
            # OCR-only dependencies, imported once per call instead of once
            # per image.
            import io

            import numpy as np
            from PIL import Image

            reader = easyocr.Reader(["en"], gpu=False)

        # xref -> recognised strings. An image referenced from several pages
        # is OCR'd once and its result reused.
        ocr_cache = {}

        for page in doc:
            text_blocks = page.get_text("blocks")
            # Index 4 of a block tuple is read as its text; blank blocks are
            # skipped.
            page_text = " ".join(
                b[4] for b in text_blocks if len(b) > 4 and b[4].strip()
            )

            ocr_texts = []
            if reader is not None:
                for img in page.get_images(full=True):
                    xref = img[0]
                    if xref not in ocr_cache:
                        img_bytes = doc.extract_image(xref)["image"]
                        pil_img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
                        results = reader.readtext(np.array(pil_img))
                        # res[1] is used as the recognised string.
                        ocr_cache[xref] = [res[1] for res in results]
                    ocr_texts.extend(ocr_cache[xref])

            pages.append(page_text + "\n" + "\n".join(ocr_texts))
    return "\n".join(pages)

# ---------------------------
# Simple rule-based structuring
# ---------------------------
def parse_reports(raw_text: str) -> dict:
    """Split raw text into report sections with a naive keyword scan.

    Each line is lower-cased and checked for trigger substrings. A matching
    line switches the active section and is itself kept as part of it. Lines
    seen before the first trigger are discarded. This is a demonstration-grade
    splitter; real input would want regex / LLM cleanup.

    Args:
        raw_text: Text to split, one logical line per text line.

    Returns:
        ``{"patient": {}, "reports": [...]}``. ``reports`` holds one
        ``{"report_type", "raw_text"}`` entry per non-empty section, in the
        fixed order histopathology, immunohistochemistry, radiology.
    """
    # (section, trigger substrings), listed in precedence order: when a line
    # matches several sections, the earliest entry wins.
    triggers = (
        ("histopathology", ("histopathology",)),
        ("immunohistochemistry", ("immunohistochemistry", "ihc")),
        ("radiology", ("usg", "ultrasonography", "radiology")),
    )

    collected = {name: [] for name, _ in triggers}
    active = None
    for line in raw_text.splitlines():
        lowered = line.lower()
        hit = next(
            (name for name, words in triggers if any(w in lowered for w in words)),
            None,
        )
        if hit is not None:
            active = hit
        if active is not None:
            collected[active].append(line)

    reports = []
    for name, lines in collected.items():
        body = "\n".join(lines).strip()
        if body:
            reports.append({"report_type": name, "raw_text": body})

    # "patient" is a placeholder; regexes could later capture name/age/sex.
    return {"patient": {}, "reports": reports}

# ---------------------------
# Main driver
# ---------------------------
def process_pdf_to_json(pdf_path: str, out_path: str):
    """Extract text from a PDF, structure it, and save the result as JSON.

    Runs ``extract_pdf_text`` on *pdf_path*, passes the text through
    ``parse_reports``, writes the resulting dict to *out_path* as indented
    JSON and prints a confirmation line.

    Args:
        pdf_path: PDF file to read.
        out_path: Destination JSON file; overwritten if it exists.
    """
    raw_text = extract_pdf_text(pdf_path)
    structured = parse_reports(raw_text)
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # encoding must be pinned; the platform default is not always UTF-8.
    Path(out_path).write_text(
        json.dumps(structured, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    print(f"Saved structured JSON → {out_path}")

# Example run
if __name__ == "__main__":
    # Demo invocation with hard-coded sample paths; skipped when the module
    # is imported rather than executed.
    process_pdf_to_json(
        pdf_path="input_pdfs/ET1-Adobe Scan 10 Sept 2025.pdf",
        out_path="combined_rizwana_begum.json",
    )


Using CPU. Note: This module is much faster with a GPU.


Saved structured JSON → combined_rizwana_begum.json
