# PCS‑HELIO v4.3 — 03 · Process ZuCo (v1+v2)
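Builds a reproducible token-level table combining eye-tracking (ET) and EEG features. Data contract: writes `data/processed/zuco_aligned.csv` and `reports/zuco_loader_qa.json` (plus a `reports/zuco_manifest.json` manifest once rows are loaded).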

In [1]:
from pathlib import Path; import sys, os, json, numpy as np, pandas as pd
# Bootstrap cell: make the shared notebook helpers importable, fix the random
# seed, read the run mode, and create the output folders.

# Probe the current working directory, then its parent, for
# notebooks/_fragments.py; the first match is pushed onto sys.path so the
# import below works from either launch location.
ROOT = Path.cwd()
_fragment_home = next(
    (p for p in (ROOT, ROOT.parent) if (p / 'notebooks' / '_fragments.py').exists()),
    None,
)
if _fragment_home is not None:
    sys.path.insert(0, str(_fragment_home))

try:
    from notebooks._fragments import (
        apply_style,
        preflight_checks,
        print_contract,
        qa_assertions,
        save_manifest,
    )
except Exception as import_error:
    # Best-effort fallback: report the failure and define stand-ins so the
    # remaining cells still run without the shared helpers.
    print('[preflight] Failed importing notebooks._fragments:', import_error)

    def apply_style():
        """Stand-in for the shared helper; does nothing."""
        pass

    def preflight_checks():
        """Stand-in for the shared helper; does nothing."""
        pass

    def print_contract():
        """Stand-in for the shared helper; does nothing."""
        pass

    def qa_assertions(df, rules):
        """Stand-in for the shared helper; performs no checks."""
        pass

    def save_manifest(path, payload):
        """Write ``payload`` as indented JSON to ``path``, creating parent folders."""
        target = Path(path)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(json.dumps(payload, indent=2))

# Fixed seed so any NumPy randomness in later cells is repeatable.
np.random.seed(1234)
RUN_MODE = os.environ.get('RUN_MODE', 'sample')  # or 'full'

apply_style()
preflight_checks()
print_contract()

# Project folders, all relative to the current working directory.
BASE = Path('.')
DATA = BASE / 'data'
RAW = DATA / 'raw_public'
PROC = DATA / 'processed'
RPTS = BASE / 'reports'
for _out_dir in (PROC, RPTS):
    _out_dir.mkdir(parents=True, exist_ok=True)


[STYLE] _style.css not found; proceeding.
[Preflight] Python: 3.12.11 | Platform: Linux-5.15.167.4-microsoft-standard-WSL2-x86_64-with-glibc2.35
[Preflight] pandas: 2.3.2 | numpy: 1.26.4
[Preflight] Folders ready.


In [2]:
# Resolve the aligned ZuCo token table from the first source that yields rows:
#   1. the existing processed CSV (PROC/'zuco_aligned.csv');
#   2. auto-discovery of other processed exports, harmonised via
#      tools/auto_discover_zuco;
#   3. the full loader from src/pcs_toolbox (only when RUN_MODE == 'full');
#   4. a synthetic 5-row sample (only when RUN_MODE == 'sample').
# Sources 2 and 4 write the CSV and the QA JSON here; source 3 is asked to
# write its own outputs (write_outputs=True); source 1 leaves files untouched.
aligned = PROC/'zuco_aligned.csv'
df, qa = None, {}

# 1) Existing processed CSV. An unreadable or empty file falls through to the
#    next source instead of aborting the notebook.
if aligned.exists():
    try:
        existing = pd.read_csv(aligned)
        if len(existing) > 0:
            df = existing
            qa = {'source': 'existing', 'rows': int(len(df))}
    except Exception as e:
        print('[warn] Failed to read existing', aligned, e)

# 2) Auto-discovery: take the first candidate export that exists, harmonise it
#    and persist the result.
if df is None:
    try:
        sys.path.insert(0, str((BASE/'tools').resolve()))
        from auto_discover_zuco import load_first_existing, harmonize
        base = load_first_existing([
            PROC/'zuco_word_level_all_subjects.csv',
            PROC/'zuco_word_level_ZKW.csv',
            PROC/'zuco_v1_aligned.csv',
            PROC/'zuco_aligned_real_et_eeg.csv',
        ])
        if base is not None and not base.empty:
            df = harmonize(base)
            df.to_csv(aligned, index=False)
            qa = {'source': 'auto_discover', 'rows': int(len(df))}
            (RPTS/'zuco_loader_qa.json').write_text(json.dumps(qa, indent=2))
    except Exception as e:
        print('[warn] Auto-discover failed:', e)

# 3) Full loader (RUN_MODE == 'full' only).
if df is None and RUN_MODE == 'full':
    try:
        sys.path.insert(0, str((BASE/'src').resolve()))
        from pcs_toolbox.zuco import load_all
        df, qa = load_all(write_outputs=True)
    except Exception as e:
        print('[error] Full loader failed:', e)

# 4) Synthetic fallback (RUN_MODE == 'sample' only).
if df is None and RUN_MODE == 'sample':
    # Synthetic 5-row sample to keep CI green
    df = pd.DataFrame({
        'Dataset': ['v1']*5, 'Task': ['SR']*5, 'Subject': ['Z01']*5,
        'SentenceID': [1, 1, 1, 1, 1], 'w_pos': [1, 2, 3, 4, 5],
        'token': ['The', 'cat', 'sat', 'on', 'mat'],
        'token_norm': ['the', 'cat', 'sat', 'on', 'mat'],
        'FFD': [120, 80, 70, 60, 50], 'GD': [150, 100, 90, 70, 60],
        'TRT': [200, 150, 140, 120, 110], 'GPT': [210, 160, 150, 140, 130],
        'theta1': [np.nan]*5, 'alpha1': [np.nan]*5, 'beta1': [np.nan]*5, 'gamma1': [np.nan]*5,
    })
    df.to_csv(aligned, index=False)
    qa = {'source': 'synthetic', 'rows': int(len(df))}
    (RPTS/'zuco_loader_qa.json').write_text(json.dumps(qa, indent=2))

print('Rows:', 0 if df is None else len(df))
if df is None:
    # Reached when the full loader failed, or RUN_MODE is neither 'sample' nor
    # 'full'. Previously this fell into display(df.head(10)) and died with an
    # opaque AttributeError on None; report the situation explicitly instead.
    print(f'[error] No ZuCo table available (RUN_MODE={RUN_MODE!r}); skipping preview, QA and manifest.')
else:
    display(df.head(10))
    # QA assertions
    if len(df) > 0:
        qa_assertions(df, {
            'required_cols': ['Dataset', 'Task', 'Subject', 'SentenceID', 'w_pos', 'token', 'token_norm'],
            'min_rows': 5 if RUN_MODE == 'sample' else 50,
        })
        save_manifest(RPTS/'zuco_manifest.json', {'rows': int(len(df)), 'cols': list(df.columns)})


[warn] Auto-discover failed: No module named 'auto_discover_zuco'
Rows: 5


Unnamed: 0,Dataset,Task,Subject,SentenceID,w_pos,token,token_norm,FFD,GD,TRT,GPT,theta1,alpha1,beta1,gamma1
0,v1,SR,Z01,1,1,The,the,120,150,200,210,,,,
1,v1,SR,Z01,1,2,cat,cat,80,100,150,160,,,,
2,v1,SR,Z01,1,3,sat,sat,70,90,140,150,,,,
3,v1,SR,Z01,1,4,on,on,60,70,120,140,,,,
4,v1,SR,Z01,1,5,mat,mat,50,60,110,130,,,,


[QA] Basic checks passed.
[MANIFEST] Wrote reports/zuco_manifest.json


In [3]:
# Show a short QA summary: the QA JSON on disk wins; otherwise fall back to the
# in-memory `qa` dict (its 'summary' entry when present, else the whole dict).
# Any failure is reported as a warning rather than stopping the notebook.
try:
    qa_path = RPTS / 'zuco_loader_qa.json'
    if qa_path.exists():
        qa_text = qa_path.read_text()
    elif 'qa' in globals():
        qa_text = json.dumps(qa.get('summary', qa), indent=2)
    else:
        qa_text = None
    if qa_text is not None:
        print(qa_text)
except Exception as err:
    print('[warn] No QA available:', err)


{
  "source": "synthetic",
  "rows": 5
}
