# PCS‑HELIO v4.3 — 04 · Merge ZuCo ⟂ KEC
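Left-join the ZuCo token table with the KEC metrics (`entropy`, `curvature`, `coherence`) on `token_norm`; write the merged CSV (`data/processed/zuco_kec_merged.csv`), a merge manifest and QA report (`reports/merge_manifest.json`, `reports/merge_qa.json`), and a coverage figure (`figures/metrics/F2_kec_coverage.png`).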

In [1]:
from pathlib import Path; import json, sys, os, pandas as pd, numpy as np
# Make the shared notebook fragments importable whether the kernel was started
# in the directory that contains notebooks/ or one level below it.
ROOT = Path.cwd()
for _candidate in (ROOT, ROOT.parent):
    if (_candidate/'notebooks'/'_fragments.py').exists():
        sys.path.insert(0, str(_candidate))
        break

try:
    from notebooks._fragments import (
        apply_style,
        preflight_checks,
        print_contract,
        qa_assertions,
        save_manifest,
    )
except Exception as err:
    # Best-effort: report the failure and fall back to local stand-ins so the
    # remaining cells can still run. Note that the stand-in QA check is a no-op.
    print('[preflight] Failed importing notebooks._fragments:', err)

    def apply_style():
        """Stand-in used when the shared fragments are unavailable; does nothing."""
        pass

    def preflight_checks():
        """Stand-in used when the shared fragments are unavailable; does nothing."""
        pass

    def print_contract():
        """Stand-in used when the shared fragments are unavailable; does nothing."""
        pass

    def qa_assertions(df, rules):
        """Stand-in used when the shared fragments are unavailable; performs no checks."""
        pass

    def save_manifest(path, payload):
        """Write `payload` as indented JSON to `path`, creating parent folders first."""
        target = Path(path)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(json.dumps(payload, indent=2))

apply_style()
preflight_checks()
print_contract()

# Input/output locations, all relative to the current working directory.
BASE = Path('.')
DATA = BASE/'data'
PROC = DATA/'processed'
RPTS = BASE/'reports'
FIG = BASE/'figures'/'metrics'
for _out_dir in (PROC, RPTS, FIG):
    _out_dir.mkdir(parents=True, exist_ok=True)


[STYLE] _style.css not found; proceeding.
[Preflight] Python: 3.12.11 | Platform: Linux-5.15.167.4-microsoft-standard-WSL2-x86_64-with-glibc2.35
[Preflight] pandas: 2.3.2 | numpy: 1.26.4
[Preflight] Folders ready.


In [2]:
# Load inputs
# Both inputs are optional: a missing file yields an empty frame so the notebook
# still runs end-to-end and the QA cell reports that there was nothing to merge.
kec_cols = ['token_norm','entropy','curvature','coherence']
zuco_path = PROC/'zuco_aligned.csv'
kec_path  = PROC/'kec'/'metrics_en.csv'
zuco = pd.read_csv(zuco_path) if zuco_path.exists() else pd.DataFrame()
kec  = pd.read_csv(kec_path)  if kec_path.exists()  else pd.DataFrame(columns=kec_cols)

# Ensure token_norm on KEC side: derive it from a 'word' or 'node' column when
# the file does not already carry the join key.
if not kec.empty and 'token_norm' not in kec.columns:
    src = 'word' if 'word' in kec.columns else ('node' if 'node' in kec.columns else None)
    if src is not None:
        # Imported lazily: the project normaliser is only needed on this path.
        from pcs_toolbox.common import token_norm
        kec['token_norm'] = kec[src].astype(str).map(token_norm)

# Build a clean right-hand table for the join.
if 'token_norm' in kec.columns:
    # reindex (instead of plain column selection) turns a metric column that is
    # absent from the file into an all-NaN column rather than a KeyError.
    # Rows without a key are dropped so they cannot match null ZuCo tokens.
    kec_metrics = kec.reindex(columns=kec_cols).dropna(subset=['token_norm'])
    n_dup = int(kec_metrics.duplicated(subset='token_norm').sum())
    if n_dup:
        # Duplicate keys on the right side would multiply ZuCo rows in a left join.
        print(f'[warn] KEC has {n_dup} duplicate token_norm rows; keeping the first occurrence of each.')
        kec_metrics = kec_metrics.drop_duplicates(subset='token_norm', keep='first')
else:
    print('[warn] KEC table has no usable token_norm column; KEC metrics will be empty.')
    kec_metrics = pd.DataFrame(columns=kec_cols)

# Left join: keep every ZuCo token, attach KEC metrics where the key matches.
if zuco.empty:
    merged = pd.DataFrame()
elif 'token_norm' not in zuco.columns:
    raise KeyError(f"'token_norm' column missing from {zuco_path}; cannot join with KEC metrics")
else:
    merged = zuco.merge(kec_metrics, on='token_norm', how='left', validate='many_to_one')
    assert len(merged) == len(zuco), 'left join must preserve the ZuCo row count'

print('ZuCo rows:', len(zuco), 'Merged rows:', len(merged))
display(merged.head(10))

# Save merged table and a small manifest; skipped when there is nothing to write.
out_path = PROC/'zuco_kec_merged.csv'
if not merged.empty:
    merged.to_csv(out_path, index=False)
    save_manifest(RPTS/'merge_manifest.json', {'rows': int(len(merged)), 'cols': list(merged.columns)})


ZuCo rows: 0 Merged rows: 0


In [3]:
# Coverage and QA
# Reports how many merged rows carry each KEC metric, writes the QA summary
# (reports/merge_qa.json) and the coverage bar chart, then runs the shared QA rules.
if not merged.empty:
    metric_cols = ['entropy','curvature','coherence']
    has_metric = merged[metric_cols].notna()  # computed once, reused below
    cover = has_metric.mean()*100
    print('KEC coverage % (by column):')
    print(cover.round(1).to_string())
    # A left join should yield exactly one output row per ZuCo row; a mismatch
    # points at duplicate token_norm keys on the KEC side.
    rows_match = len(merged) == len(zuco)
    if not rows_match:
        print(f'[warn] Merged rows ({len(merged)}) != ZuCo rows ({len(zuco)}); check KEC for duplicate token_norm keys.')
    # Missing by task: share of rows per Task that have none of the three metrics
    if 'Task' in merged.columns:
        miss_task = (merged.assign(_miss = ~has_metric.any(axis=1))
                      .groupby('Task')['_miss'].mean()*100).round(1).to_frame('missing_%')
        print(miss_task.to_string())
    qa = {
        'rows_zuco': int(len(zuco)),
        'rows_merged': int(len(merged)),
        'coverage_pct': cover.round(2).to_dict(),
        'rows_match': bool(rows_match),
    }
    (RPTS/'merge_qa.json').write_text(json.dumps(qa, indent=2))
    # Simple bar figure (saved to disk and closed, not shown inline)
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots(figsize=(4,3))
    cover.plot(kind='bar', ax=ax, color=['#4C78A8','#F58518','#54A24B'])
    ax.set_ylim(0,100); ax.set_ylabel('Coverage %'); ax.set_title('KEC coverage in ZuCo tokens');
    fig.tight_layout(); fig.savefig(FIG/'F2_kec_coverage.png', dpi=150); plt.close(fig)
    # QA asserts
    qa_assertions(merged, {'required_cols':['token_norm'], 'min_rows': 5})
else:
    # Name the actual cause: the file may exist but contain no rows.
    reason = 'zuco_aligned.csv has no rows' if zuco_path.exists() else 'missing zuco_aligned.csv'
    print(f'[note] Nothing to merge ({reason}).')


[note] Nothing to merge (missing zuco_aligned.csv).
