# 04b — Extraction QA (Inputs & Valid Chunks)
Checks that the input block files and the chunked valid outputs from 04a parse as JSON, and reports per-file record counts and sample keys.

In [None]:
# Root folder of the run to check; the setup cell below resolves it to an
# absolute Path and looks for the input and valid-chunk subfolders under it.
run_root = "outputs/run_001"

In [None]:
import json
from pathlib import Path

# Resolve the run root to an absolute path so the printed locations are unambiguous.
run_root = Path(run_root).expanduser().resolve()

# Candidate input folders, tried in this order; the first one that contains
# at least one page block file is used.
in_candidates = [run_root / '03_llmcleaned', run_root / '02_cleaned', run_root / '01_blocks']
in_dir = next(
    (c for c in in_candidates if any(c.glob('page_*_blocks*.json'))),
    None,
)
if in_dir is None:
    raise FileNotFoundError(f'No input blocks found under {run_root}')

valid_dir = run_root / '04_jsonextracted'
# is_dir() rather than exists(): a stray regular file with this name would
# otherwise pass the check and the later glob would silently find nothing.
if not valid_dir.is_dir():
    raise FileNotFoundError(f'Valid-chunk folder not found: {valid_dir}')

print(f'[QA] Inputs  → {in_dir}')
print(f'[QA] Valids  → {valid_dir}')


In [None]:
# Inspect every input page file: each must decode as a JSON list. Report the
# number of blocks and up to six keys of the first block; problems are
# printed per file instead of aborting the whole check.
pages = sorted(in_dir.glob('page_*_blocks*.json'))
print(f'[QA] Found {len(pages)} input page files')
for page_path in pages:
    try:
        blocks = json.loads(page_path.read_text(encoding='utf-8'))
        if isinstance(blocks, list):
            count = len(blocks)
            first = blocks[0] if count else {}
            if isinstance(first, dict):
                keys = list(first)[:6]
            else:
                # Non-dict first element: show its type name instead of keys.
                keys = [type(first).__name__]
            print(f'  ✓ {page_path.name}: {count} blocks, sample keys={keys}')
        else:
            print(f'[WARN] {page_path.name}: not a list (type={type(blocks).__name__})')
    except Exception as exc:
        print(f'[ERROR] {page_path.name}: {exc}')


In [None]:
def coerce_to_list_of_dicts(obj):
    """Normalize a payload into a list of dicts.

    - ``None`` and blank strings give an empty list.
    - A non-blank string is stripped and parsed as JSON; if parsing fails the
      stripped text is returned as ``[{'_raw': text}]``.
    - A dict is returned as a one-element list containing that same dict.
    - A list is returned element-wise, with every non-dict element wrapped
      as ``{'_value': element}``.
    - Any other value is wrapped as ``[{'_value': value}]``.
    """

    def as_record(item):
        # Dicts pass through untouched; everything else is boxed.
        return item if isinstance(item, dict) else {'_value': item}

    if isinstance(obj, str):
        text = obj.strip()
        if text == '':
            return []
        try:
            obj = json.loads(text)
        except Exception:
            return [{'_raw': text}]
    elif obj is None:
        return []

    # A parsed string continues here with whatever JSON value it decoded to.
    items = obj if isinstance(obj, list) else [obj]
    return [as_record(item) for item in items]

# Walk every valid-chunk file, normalize its payload into records, and keep a
# running total. A file that fails to load is reported and skipped, so it
# contributes nothing to the total.
valids = sorted(valid_dir.glob('*_valid_*.json'))
print(f'[QA] Found {len(valids)} valid chunk files')
total = 0
for chunk_path in valids:
    try:
        payload = json.loads(chunk_path.read_text(encoding='utf-8'))
        records = coerce_to_list_of_dicts(payload)
        total += len(records)
        first = records[0] if records else {}
        if isinstance(first, dict):
            keys = list(first)[:6]
        else:
            keys = [type(first).__name__]
        # Report the top-level JSON type of the file alongside the record count.
        print(f'  ✓ {chunk_path.name}: {type(payload).__name__}, {len(records)} records, sample keys={keys}')
    except Exception as exc:
        print(f'[ERROR] {chunk_path.name}: {exc}')
print(f'[SUMMARY] Total records across valid chunks: {total}')
