# Notebook 03 — ZuCo Loader (v1 & v2 Unification, v4.3)

**Purpose.** Load token-level ZuCo **v1** and **v2**, harmonize schema, and export a unified file for modeling.

**Expected inputs (if available):**
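- `data/raw_public/zuco/v1/` and `data/raw_public/zuco/v2/` containing token-level files: `.csv` (comma-separated) and `.tsv`/`.txt` (tab-separated), searched recursively. `.mat` files are **not** read by this notebook; convert them to CSV/TSV first (e.g. with `scipy.io`).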

**Outputs:**
- `data/processed/zuco_v1_aligned.csv`
- `data/processed/zuco_v2_aligned.csv`
- `data/processed/zuco_aligned.csv` (unified v1+v2)
- `reports/zuco_loader_qa.json` (coverage & sanity checks)

**Reproducibility.** Deterministic IO; schema normalization included.


In [None]:
import os, time, warnings, re
from pathlib import Path
import numpy as np, pandas as pd
# Input folders that are searched for raw ZuCo token-level tables, one per release.
RAW_V1 = Path('data/raw_public/zuco/v1')
RAW_V2 = Path('data/raw_public/zuco/v2')

# Output folders. Both are created up front (including parents) so that later
# cells can write into them without checking for existence.
PROC = Path('data/processed')
PROC.mkdir(parents=True, exist_ok=True)

RPTS = Path('reports')
RPTS.mkdir(parents=True, exist_ok=True)
def heartbeat(m):
    """Print ``m`` prefixed with the current local time as ``[HH:MM:SS]``."""
    stamp = time.strftime('%H:%M:%S')
    print(f"[{stamp}] {m}")
def normcols(df):
    """Strip surrounding whitespace from every column label of ``df``.

    The labels are replaced in place on the frame that was passed in, and
    that same frame is returned so the call can be chained.
    """
    cleaned = []
    for label in df.columns:
        cleaned.append(label.strip())
    df.columns = cleaned
    return df
def norm_token(s):
    """Return a lower-cased copy of ``s`` with non-word characters and underscores removed.

    Anything that is not a ``str`` (e.g. ``None`` or a number) is returned unchanged.
    """
    if isinstance(s, str):
        # [\W_]+ matches every run of characters that is not a letter or digit.
        return re.sub(r'[\W_]+', '', s.lower())
    return s
# Columns kept in the aligned outputs. _ensure_cols selects from this list,
# so its order is also the column order of the exported files.
COMMON = [
    # identifiers
    'Subject', 'SentenceID', 'Word',
    # per-token measures (presumably eye-tracking reading times — confirm against the ZuCo docs)
    'FFD', 'GD', 'TRT', 'GPT',
    # per-token band power (presumably EEG — confirm against the ZuCo docs)
    'ThetaPower', 'AlphaPower',
    # provenance
    'Task', 'Dataset',
    # lexical predictors
    'length', 'log_freq', 'surprisal',
]
heartbeat('Env ready')


In [None]:
def _read_table_any(p: Path):
    """Best-effort read of one delimited text file into a DataFrame.

    ``.csv`` files are parsed with pandas' default separator; ``.tsv`` and
    ``.txt`` files are parsed as tab-separated. The suffix is matched
    case-insensitively.

    Returns an empty DataFrame when the suffix is not recognised or when
    reading fails; failures are reported through ``warnings.warn`` rather
    than raised.

    NOTE(review): ``read_csv`` is called with default NA handling, so literal
    tokens such as "null" or "NA" would load as missing values — confirm this
    is acceptable for word columns.
    """
    # Extra read_csv keyword arguments per supported suffix.
    reader_options = {
        '.csv': {},
        '.tsv': {'sep': '\t'},
        '.txt': {'sep': '\t'},
    }
    try:
        options = reader_options.get(p.suffix.lower())
        if options is not None:
            return pd.read_csv(p, **options)
    except Exception as exc:
        warnings.warn(f'Failed to read {p}: {exc}')
    return pd.DataFrame()
def _ensure_cols(df, dataset_label: str, default_task='NR'):
    """Coerce one raw table to the shared ZuCo schema.

    Steps, in order:
      1. strip whitespace from the column labels (``normcols`` does this in
         place on the frame that was passed in);
      2. rename the lower-case aliases ``word``, ``sentence_id`` and
         ``subject`` to ``Word``, ``SentenceID`` and ``Subject`` unless the
         canonical column already exists;
      3. derive ``length`` from ``Word`` when no ``length`` column is present;
      4. add any missing measure/predictor column filled with NaN;
      5. fill ``Task`` with ``default_task`` when absent and set ``Dataset``
         to ``dataset_label`` (overwriting any existing value);
      6. keep only the columns listed in ``COMMON``, in ``COMMON`` order.

    Args:
        df: raw table as read from disk.
        dataset_label: value written to the ``Dataset`` column.
        default_task: value used for ``Task`` when the table has no such column.

    Returns:
        A new DataFrame restricted to the ``COMMON`` columns, or ``df`` itself
        (untouched) when it is empty.
    """
    if df.empty:
        return df
    df = normcols(df)

    aliases = {'word': 'Word', 'sentence_id': 'SentenceID', 'subject': 'Subject'}
    ren = {old: new for old, new in aliases.items() if old in df and new not in df}
    df = df.rename(columns=ren)

    if 'length' not in df and 'Word' in df:
        words = df['Word']
        # astype(str) turns a missing word into the text 'nan' (length 3);
        # mask those rows so a missing word has a missing length instead.
        df['length'] = words.astype(str).str.len().where(words.notna())

    for c in ['log_freq', 'surprisal', 'ThetaPower', 'AlphaPower', 'FFD', 'GD', 'TRT', 'GPT']:
        if c not in df:
            df[c] = np.nan
    if 'Task' not in df:
        df['Task'] = default_task
    df['Dataset'] = dataset_label

    keep = [c for c in COMMON if c in df.columns]
    return df[keep]
def build_zuco_v(folder: Path, dataset_label: str, default_task='NR'):
    """Load every token-level table under ``folder`` into one aligned DataFrame.

    All ``*.csv``, ``*.tsv`` and ``*.txt`` files below ``folder`` (searched
    recursively) are read with ``_read_table_any``, passed through
    ``_ensure_cols`` and concatenated. Files that read as empty are skipped.

    Args:
        folder: root directory of one ZuCo release.
        dataset_label: value written to the ``Dataset`` column of every row.
        default_task: ``Task`` value for tables that have no ``Task`` column.

    Returns:
        The concatenated table with a fresh integer index, or an empty
        DataFrame when ``folder`` does not exist or yields no rows.
    """
    if not folder.exists():
        return pd.DataFrame()

    files = []
    for ext in ('*.csv', '*.tsv', '*.txt'):
        files += list(folder.rglob(ext))
    # rglob yields entries in filesystem order, which differs between machines
    # and runs; sort so the row order of the output is reproducible.
    files.sort()

    dfs = []
    for f in files:
        d = _read_table_any(f)
        if not d.empty:
            dfs.append(_ensure_cols(d, dataset_label, default_task))
    if not dfs:
        return pd.DataFrame()

    out = pd.concat(dfs, ignore_index=True)
    # Identifiers are forced to text so they have one dtype across files.
    # NOTE: astype(str) also turns missing identifiers into the text 'nan'.
    if 'Subject' in out:
        out['Subject'] = out['Subject'].astype(str)
    if 'SentenceID' in out:
        out['SentenceID'] = out['SentenceID'].astype(str)
    return out


In [None]:
# Build each release separately, export it, then export the union of both.
# NOTE: a file is only written when its table has rows; outputs left over from
# an earlier run are not removed when a release comes back empty.
df_v1 = build_zuco_v(RAW_V1, 'v1', 'NR')
df_v2 = build_zuco_v(RAW_V2, 'v2')

# Write through DataFrame.to_csv(path) rather than Path.write_text(df.to_csv()):
# pandas then opens the file itself, as UTF-8 and without newline translation,
# instead of relying on the platform's default text encoding and line endings.
if not df_v1.empty:
    df_v1.to_csv(PROC / 'zuco_v1_aligned.csv', index=False)
    heartbeat('Saved v1')
if not df_v2.empty:
    df_v2.to_csv(PROC / 'zuco_v2_aligned.csv', index=False)
    heartbeat('Saved v2')

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when neither release produced rows; that is what makes the warning reachable.
non_empty = [d for d in (df_v1, df_v2) if not d.empty]
df_all = pd.concat(non_empty, ignore_index=True) if non_empty else pd.DataFrame()
if df_all.empty:
    warnings.warn('No data unified.')
else:
    df_all.to_csv(PROC / 'zuco_aligned.csv', index=False)
    heartbeat('Saved unified')


In [None]:
import json

# QA report: which outputs exist, basic row/subject counts for the unified
# file, and (when a KEC metrics table is available) the percentage of rows
# whose normalized token has each KEC metric.
qa = {}
qa['files'] = {
    name: (PROC / name).exists()
    for name in ('zuco_v1_aligned.csv', 'zuco_v2_aligned.csv', 'zuco_aligned.csv')
}
if (PROC / 'zuco_aligned.csv').exists():
    df = pd.read_csv(PROC / 'zuco_aligned.csv')
    # Guarantee the columns used below exist, so the counts cannot KeyError.
    for c in ['Subject', 'SentenceID', 'Word', 'Dataset', 'Task']:
        if c not in df.columns:
            df[c] = np.nan
    qa['rows_total'] = int(len(df))
    qa['subjects'] = int(df['Subject'].nunique())
    # Cast keys and counts to built-in types so json.dumps always accepts them.
    qa['by_dataset'] = {str(key): int(n) for key, n in df.groupby('Dataset').size().items()}
    qa['by_task'] = {str(key): int(n) for key, n in df.groupby('Task').size().items()}

    # coverage vs KEC (if exists)
    kec_path = PROC / 'kec' / 'metrics_en.csv'
    if kec_path.exists():
        k = pd.read_csv(kec_path)
        for dft in (df, k):
            dft.columns = [c.strip() for c in dft.columns]

        metrics = ['entropy', 'curvature', 'coherence']
        absent = [c for c in ['word'] + metrics if c not in k.columns]
        if absent:
            warnings.warn(f'{kec_path} lacks columns {absent}; their coverage is reported as 0.')
        present = [c for c in metrics if c in k.columns]

        # Same normalization on both sides: lower-case, drop non-alphanumerics.
        token_pat = r'[\W_]+'
        df['token_norm'] = df['Word'].astype(str).str.lower().str.replace(token_pat, '', regex=True)
        if 'word' in k.columns and present:
            k['token_norm'] = k['word'].astype(str).str.lower().str.replace(token_pat, '', regex=True)
            # Collapse the KEC table to one row per normalized token (first
            # non-missing value of each metric). Without this, duplicate
            # tokens multiply rows in the left merge and bias the percentages.
            lookup = k.groupby('token_norm', sort=False)[present].first().reset_index()
            m = df.merge(lookup, on='token_norm', how='left')
        else:
            m = df.copy()
        for c in metrics:
            if c not in m.columns:
                m[c] = np.nan
        qa['kec_coverage_pct'] = {
            c: float(m[c].notna().mean() * 100.0) if len(m) else 0.0
            for c in metrics
        }
    else:
        qa['kec_coverage_pct'] = None

(RPTS / 'zuco_loader_qa.json').write_text(json.dumps(qa, indent=2))
heartbeat('Saved QA')
