# Notebook 04 — ZuCo Models (v1+v2): KEC × Dataset/Task (v4.3, Upgraded)

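This notebook fits OLS models with standard errors clustered by Subject, applies Benjamini–Hochberg FDR correction to the coefficient p-values, fits a MixedLM (grouped by Subject, with a SentenceID variance component), runs a subject-level cluster bootstrap in parallel (joblib), and draws partial plots and per-subject EEG trend plots.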


In [None]:
import os, time, warnings, re
from pathlib import Path
import numpy as np, pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from statsmodels.stats.multitest import multipletests
# Global seed: seeds NumPy's legacy RNG and is reused as ``random_state`` further down.
SEED = 42
np.random.seed(SEED)

# Output locations for figures, processed tables and reports.
FIG_DIR = Path('figures/metrics')
PROC_DIR = Path('data/processed')
RPT_DIR = Path('reports')
# Create each directory (and any missing parents); existing ones are left alone.
for _out_dir in (FIG_DIR, PROC_DIR, RPT_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)
def heartbeat(m):
    """Print *m* prefixed with the current local time formatted as ``[HH:MM:SS]``."""
    stamp = time.strftime('%H:%M:%S')
    print('[{}] {}'.format(stamp, m))
def norm_token(s):
    """Return a lower-cased copy of *s* with every non-word character and underscore removed.

    Non-string inputs are returned unchanged.
    """
    if isinstance(s, str):
        # ``[\W_]+`` matches runs of anything that is not a letter or digit.
        return re.sub(r'[\W_]+', '', s.lower())
    return s
# Log that the setup cell (imports, output directories, helpers) has finished.
heartbeat('Env ready')


In [None]:
# Load the aligned ZuCo table and the KEC metrics table; a missing file gives an empty frame.
zuco_path = PROC_DIR/'zuco_aligned.csv'
kec_path = PROC_DIR/'kec'/'metrics_en.csv'
zuco = pd.read_csv(zuco_path) if zuco_path.exists() else pd.DataFrame()
kec = pd.read_csv(kec_path) if kec_path.exists() else pd.DataFrame()
for d in (zuco, kec):
    if len(d) > 0:
        d.columns = [c.strip() for c in d.columns]

# Build the normalised join key on both sides. A table without a word column is
# reported and left without a key instead of raising on a missing column.
zuco_word = next((c for c in ('Word', 'word') if c in zuco.columns), None)
kec_word = next((c for c in ('word', 'Word') if c in kec.columns), None)
if len(zuco) > 0:
    if zuco_word is not None:
        zuco['token_norm'] = zuco[zuco_word].astype(str).map(norm_token)
    elif 'token_norm' not in zuco.columns:
        warnings.warn("ZuCo table has no 'Word'/'word' column; cannot build token_norm")
if len(kec) > 0 and kec_word is not None:
    kec['token_norm'] = kec[kec_word].astype(str).map(norm_token)

merged = pd.DataFrame()
if len(zuco) > 0 and len(kec) > 0 and 'token_norm' in zuco.columns and 'token_norm' in kec.columns:
    # Take only the KEC metric columns that are actually present.
    kec_metrics = [c for c in ('entropy', 'curvature', 'coherence') if c in kec.columns]
    # Several KEC rows can normalise to the same key; keep the first so the left join
    # annotates each ZuCo row exactly once instead of multiplying rows.
    kec_lookup = kec[['token_norm'] + kec_metrics].drop_duplicates(subset='token_norm', keep='first')
    merged = zuco.merge(kec_lookup, on='token_norm', how='left',
                        suffixes=('', '_kec'), validate='many_to_one')
heartbeat(f'Merged shape: {merged.shape if len(merged)>0 else "N/A"}')


In [None]:
# Working copy of the merged table plus the column inventories used for modelling.
df = merged.copy()
# Defined up front so they exist (as empty lists) even when there is no data.
ET_COLS, KEC_COLS, COVS = [], [], []
if len(df) > 0:
    # Only rename when it cannot collide with an existing 'log_freq' column;
    # a collision would leave two columns with the same label.
    if 'LogFreq' in df.columns and 'log_freq' not in df.columns:
        df = df.rename(columns={'LogFreq': 'log_freq'})
    # log(1 + x) transforms of the TRT and GPT columns, when present.
    for resp in ('TRT', 'GPT'):
        if resp in df:
            df[f'log_{resp}'] = np.log1p(df[resp])
    # Identifier / factor columns are handled as strings downstream.
    # NOTE(review): astype(str) turns missing values into the literal label 'nan'.
    for cat in ('Dataset', 'Task', 'Subject', 'SentenceID'):
        if cat in df:
            df[cat] = df[cat].astype(str)
    ET_COLS = [c for c in ('FFD', 'GD', 'log_TRT', 'log_GPT') if c in df.columns]
    KEC_COLS = [c for c in ('entropy', 'curvature', 'coherence') if c in df.columns]
    COVS = [c for c in ('length', 'log_freq', 'surprisal') if c in df.columns]
    heartbeat(f'ET:{ET_COLS} KEC:{KEC_COLS} COVS:{COVS}')


In [None]:
# OLS per reading measure with standard errors clustered by Subject.
reading_results = []
if len(df) > 0 and 'Subject' in df.columns:
    # The predictor set does not depend on the response, so build it once.
    base_cols = [c for c in ['entropy', 'curvature', 'coherence'] if c in df.columns]
    base_cols += [c for c in ['length', 'log_freq', 'surprisal'] if c in df.columns]
    base_cols += [c for c in ['Dataset', 'Task'] if c in df.columns]
    preds = list(base_cols)
    if 'entropy' in df.columns and 'Task' in df.columns:
        preds.append('entropy:Task')
    for resp in [r for r in ('FFD', 'GD', 'log_TRT', 'log_GPT') if r in df.columns]:
        if len(preds) < 3:
            heartbeat(f'Skip {resp}')
            continue
        formula = f"{resp} ~ " + ' + '.join(preds)
        heartbeat(f'OLS: {formula}')
        try:
            # Drop incomplete rows explicitly so the cluster labels line up
            # row-for-row with the data the model is fitted on.
            fit_df = df[[resp, 'Subject'] + base_cols].dropna()
            cluster_ids = pd.factorize(fit_df['Subject'])[0]
            # Requesting the covariance in fit() keeps params/bse/tvalues/pvalues as
            # term-labelled Series (get_robustcov_results may hand back bare arrays).
            res = smf.ols(formula, data=fit_df).fit(cov_type='cluster',
                                                    cov_kwds={'groups': cluster_ids})
            coefs = pd.DataFrame({
                'term': res.params.index,
                'coef': res.params.values,
                'se': res.bse.values,
                't': res.tvalues.values,
                'p': res.pvalues.values,
                'response': resp,
            })
            reading_results.append(coefs)
        except Exception as e:
            # Best effort: one failing response must not stop the others.
            warnings.warn(f'OLS failed for {resp}: {e}')
if reading_results:
    tbl = pd.concat(reading_results, ignore_index=True)
    tbl.to_csv(PROC_DIR/'models_reading_coeffs.csv', index=False)
else:
    heartbeat('No OLS results')


In [None]:
# Benjamini-Hochberg FDR correction of the saved OLS p-values, one family per response.
inp = PROC_DIR/'models_reading_coeffs.csv'
if inp.exists():
    dfc = pd.read_csv(inp)
    outs = []
    for resp, grp in dfc.groupby('response'):
        g = grp.copy()
        g['p_fdr_bh'] = np.nan
        g['rej_fdr_bh_0.05'] = False
        # Correct only finite p-values: a NaN inside the family would otherwise
        # propagate into every adjusted value.
        ok = g['p'].notna().to_numpy()
        if ok.any():
            rej, q, _, _ = multipletests(g.loc[ok, 'p'].to_numpy(), alpha=0.05, method='fdr_bh')
            g.loc[ok, 'p_fdr_bh'] = q
            g.loc[ok, 'rej_fdr_bh_0.05'] = rej
        outs.append(g)
    if outs:
        fdr = pd.concat(outs, ignore_index=True)
        fdr.to_csv(PROC_DIR/'models_reading_coeffs_fdr.csv', index=False)
        print('Saved FDR')
    else:
        # The coefficient file exists but holds no rows.
        print('Skip FDR')
else:
    print('Skip FDR')


In [None]:
def partial_plot(dfin, response='FFD', xvar='entropy', controls=None, fixed_cats=None, n_points=100, savepath=None):
    """Plot the OLS-predicted *response* against *xvar* over a scatter of the data.

    The model is ``response ~ xvar + controls`` plus ``C(k)`` for every key of
    *fixed_cats* whose column has object dtype. The prediction line spans the
    5th-95th percentile of *xvar*, with each control held at its median and each
    fixed category held at the given value. At most 2000 rows are drawn (seeded
    with ``SEED``) for the scatter.

    Parameters
    ----------
    dfin : DataFrame holding the response, predictor, controls and fixed columns.
    response, xvar : column names of the outcome and the predictor of interest.
    controls : optional list of numeric covariate column names.
    fixed_cats : optional mapping of column name -> value to hold fixed.
    n_points : number of grid points for the prediction line.
    savepath : optional path; the figure is saved there at 150 dpi.

    Returns
    -------
    None. Nothing is drawn when a required column is missing or when no
    complete rows remain.
    """
    controls = list(controls) if controls else []
    fixed_cats = dict(fixed_cats) if fixed_cats else {}
    need = [response, xvar] + controls + list(fixed_cats)
    if not set(need).issubset(dfin.columns):
        return
    # Drop incomplete rows once and use that same sample for the fit, the grid
    # range, the medians and the scatter.
    clean = dfin.dropna(subset=need)
    if clean.empty:
        warnings.warn(f'partial_plot: no complete rows for {response} ~ {xvar}; skipped')
        return
    terms = [xvar] + controls + [f'C({k})' for k in fixed_cats if clean[k].dtype == 'O']
    formula = response + ' ~ ' + ' + '.join(terms)
    mod = smf.ols(formula, data=clean).fit()
    xgrid = np.linspace(clean[xvar].quantile(0.05), clean[xvar].quantile(0.95), n_points)
    base = {c: float(clean[c].median()) for c in controls}
    base.update(fixed_cats)
    grid = pd.DataFrame({xvar: xgrid, **base})
    yhat = mod.predict(grid)
    samp = clean.sample(min(2000, len(clean)), random_state=SEED)
    fig, ax = plt.subplots()
    ax.scatter(samp[xvar], samp[response], alpha=0.2)
    ax.plot(xgrid, yhat)
    ttl = ', '.join([f"{k}={v}" for k, v in fixed_cats.items()]) if fixed_cats else 'overall'
    ax.set_title(f'{response} ~ {xvar} | {ttl}')
    ax.set_xlabel(xvar)
    ax.set_ylabel(response)
    fig.tight_layout()
    if savepath:
        fig.savefig(savepath, dpi=150)
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
# F2 figures: FFD against entropy, first pooled, then once per Dataset and once per Task.
if len(df) > 0 and {'FFD', 'entropy'} <= set(df.columns):
    controls = [name for name in ('length', 'log_freq') if name in df.columns]
    out_dir = Path('figures/metrics')
    # Pooled fit over every row.
    partial_plot(df, 'FFD', 'entropy', controls, {}, savepath=out_dir/'F2_reading_vs_KEC.png')
    # One figure per level of each grouping column that is present.
    for col, stem in (('Dataset', 'F2_reading_vs_KEC_'), ('Task', 'F2_reading_vs_KEC_task_')):
        if col not in df:
            continue
        for level in sorted(df[col].dropna().unique()):
            subset = df[df[col] == level]
            partial_plot(subset, 'FFD', 'entropy', controls, {col: level},
                         savepath=out_dir/f'{stem}{level}.png')


In [None]:
def per_subject_trends(dfin, eeg_col='ThetaPower', xvar='entropy', by='Dataset', saveprefix='F3_EEG_vs_KEC'):
    """Draw one scatter + OLS trend panel per subject, one figure per level of *by*.

    For each level of *by* (or once for the whole frame when *by* is falsy or not
    a column) a 4-column grid of panels is built, one per subject. Subjects with
    fewer than 5 complete rows get a titled but empty panel. Each figure is saved
    under ``figures/metrics`` as ``{saveprefix}_{by}_{level}.png`` (no suffix when
    ungrouped) at 150 dpi.

    Parameters
    ----------
    dfin : DataFrame with 'Subject', *eeg_col* and *xvar* columns.
    eeg_col : column plotted on the y axis.
    xvar : column plotted on the x axis.
    by : optional grouping column.
    saveprefix : file-name stem for the saved figures.

    Returns
    -------
    None. Nothing is drawn when a required column is missing.
    """
    import math
    if not {'Subject', eeg_col, xvar}.issubset(dfin.columns):
        return
    groups = sorted(dfin[by].dropna().unique()) if (by and by in dfin.columns) else [None]
    ncols = 4
    for g in groups:
        dsub = dfin if g is None else dfin[dfin[by] == g]
        # Convert the subject labels once per group instead of once per subject.
        subj = dsub['Subject'].astype(str)
        subs = sorted(subj.unique())
        nsub = len(subs)
        if nsub == 0:
            continue
        nrows = int(math.ceil(nsub / ncols))
        fig = plt.figure(figsize=(12, 3 * nrows))
        for i, s in enumerate(subs, 1):
            ax = fig.add_subplot(nrows, ncols, i)
            sdf = dsub[subj == s].dropna(subset=[eeg_col, xvar])
            if len(sdf) < 5:
                ax.set_title(f'{s} (few)')
                continue
            ax.scatter(sdf[xvar], sdf[eeg_col], alpha=0.3)
            try:
                m = smf.ols(f"{eeg_col} ~ {xvar}", data=sdf).fit()
                xs = np.linspace(sdf[xvar].min(), sdf[xvar].max(), 50)
                ys = m.params['Intercept'] + m.params[xvar] * xs
                ax.plot(xs, ys)
            except Exception as e:
                # Keep the panel (scatter only) but report why the trend line is missing.
                warnings.warn(f'Trend fit failed for subject {s}: {e}')
            ax.set_title(str(s))
            ax.set_xlabel(xvar)
            ax.set_ylabel(eeg_col)
        fig.tight_layout()
        tag = f'_{by}_{g}' if g is not None else ''
        out = Path('figures/metrics')/f'{saveprefix}{tag}.png'
        fig.savefig(out, dpi=150)
        plt.show()
        # Release the figure so one call over many groups does not pile up open figures.
        plt.close(fig)
# F3 figures: per-subject EEG-vs-entropy trends, using ThetaPower when present and AlphaPower otherwise.
if len(df) > 0 and 'entropy' in df.columns:
    eeg_available = [c for c in ('ThetaPower', 'AlphaPower') if c in df.columns]
    if eeg_available:
        eegc = eeg_available[0]
        per_subject_trends(df, eeg_col=eegc, xvar='entropy', by='Dataset', saveprefix='F3_EEG_vs_KEC')


In [None]:
from statsmodels.regression.mixed_linear_model import MixedLM
def fit_mixedlm_ffd(dfin):
    """Fit a linear mixed model for FFD and save its summary as text.

    Fixed effects: entropy, length, log_freq, plus treatment-coded Task and
    Dataset when those columns exist. Random effects: an intercept per Subject
    (the grouping factor) and a variance component over SentenceID.

    Parameters
    ----------
    dfin : DataFrame with at least FFD, entropy, length, log_freq, Subject and
        SentenceID columns.

    Returns
    -------
    The fitted results object, or None when a required column is missing or the
    fit fails (a warning is issued in the latter case). On success the summary
    is written to ``data/processed/mixedlm_ffd_summary.txt``.
    """
    req = {'FFD', 'entropy', 'length', 'log_freq', 'Subject', 'SentenceID'}
    if not req.issubset(dfin.columns):
        return None
    cats = [c for c in ('Task', 'Dataset') if c in dfin.columns]
    use = ['FFD', 'entropy', 'length', 'log_freq', 'Subject', 'SentenceID'] + cats
    mdf = dfin[use].dropna().copy()
    # C(...) gives the same drop-first dummy coding the model needs for the factors.
    formula = 'FFD ~ entropy + length + log_freq' + ''.join(f' + C({c})' for c in cats)
    vc = {'Sentence': '0 + C(SentenceID)'}
    try:
        # vc_formula is an argument of from_formula, not of the MixedLM constructor,
        # so the model has to be built through the formula interface.
        model = MixedLM.from_formula(formula, data=mdf, groups='Subject',
                                     re_formula='1', vc_formula=vc)
        res = model.fit(reml=True, method='lbfgs')
        out = Path('data/processed')/'mixedlm_ffd_summary.txt'
        out.write_text(str(res.summary()))
        print('Saved MixedLM summary')
        return res
    except Exception as e:
        warnings.warn(f'MixedLM failed: {e}')
        return None
# Uncomment to run: _=fit_mixedlm_ffd(df)
# Uncomment to run: _=fit_mixedlm_ffd(df)


In [None]:
from joblib import Parallel, delayed
def _boot_fit_once(groups, dfin, formula, rng, group_col='Subject'):
    """Fit *formula* by OLS on one cluster-bootstrap resample of *dfin*.

    Group labels are drawn with replacement from *groups* (as many draws as there
    are groups) and all rows of each drawn group are stacked, so a group drawn
    twice contributes its rows twice.

    Returns the fitted parameter Series, or None when the fit raises.
    """
    samp_groups = rng.choice(groups, size=len(groups), replace=True)
    # Row positions per group, computed in one pass instead of one boolean mask
    # over the whole frame per sampled group.
    rows_by_group = dfin.groupby(group_col, sort=False).indices
    no_rows = np.empty(0, dtype=np.intp)
    picked = [rows_by_group.get(g, no_rows) for g in samp_groups]
    rows = np.concatenate(picked) if picked else no_rows
    samp_df = dfin.iloc[rows].reset_index(drop=True)
    try:
        return smf.ols(formula, data=samp_df).fit().params
    except Exception:
        # Best effort: the caller counts and reports failed replicates.
        return None
def bootstrap_coefs_parallel(dfin, formula, group_col='Subject', B=1000, n_jobs=-1, seed=42):
    """Cluster-bootstrap OLS coefficients, resampling whole groups in parallel.

    Parameters
    ----------
    dfin : DataFrame holding every variable in *formula* plus *group_col*.
    formula : formula string passed to ``smf.ols``.
    group_col : column whose distinct non-missing values are resampled.
    B : number of bootstrap replicates.
    n_jobs : joblib worker count.
    seed : seed of the master generator that draws one child seed per replicate.

    Returns
    -------
    DataFrame with one row per successful replicate and one column per
    coefficient. A warning reports how many replicates failed, if any.
    """
    rng = np.random.default_rng(seed)
    # Missing labels are not a cluster; sampling them would only add empty draws.
    groups = dfin[group_col].dropna().unique()
    seeds = rng.integers(0, 2**32-1, size=B, endpoint=True)
    res = Parallel(n_jobs=n_jobs, verbose=10)(
        delayed(_boot_fit_once)(groups, dfin, formula, np.random.default_rng(int(s)), group_col)
        for s in seeds
    )
    kept = [r for r in res if r is not None]
    n_failed = len(res) - len(kept)
    if n_failed:
        warnings.warn(f'{n_failed} of {len(res)} bootstrap replicates failed and were dropped')
    return pd.DataFrame(kept)
# Example:
# if len(df)>0 and {'FFD','entropy','length','log_freq','Subject'}.issubset(df.columns):
#     boot=bootstrap_coefs_parallel(df[['FFD','entropy','length','log_freq','Subject']].dropna(),'FFD ~ entropy + length + log_freq',B=1000,n_jobs=-1)
#     boot.to_csv(Path('data/processed')/'boot_ols_ffd_entropy.csv', index=False)
#     print('Saved bootstrap')


In [None]:
import json
from datetime import datetime, timezone

# QA manifest: which columns and artefacts this run produced.
qa_cols = list(df.columns) if len(df) > 0 else []
qa_proc = Path('data/processed')
qa_figs = Path('figures/metrics')
qa = {
    # Stamp the report with the actual run time (UTC) rather than a fixed literal.
    'timestamp_utc': datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
    'merged_rows': int(len(df)),
    'et_cols': [c for c in ('FFD', 'GD', 'log_TRT', 'log_GPT') if c in qa_cols],
    'kec_cols': [c for c in ('entropy', 'curvature', 'coherence') if c in qa_cols],
    'cat_covs': [c for c in ('Dataset', 'Task') if c in qa_cols],
    # Each artefact path is recorded only when the file exists, otherwise null.
    'reading_coeffs_csv': 'data/processed/models_reading_coeffs.csv' if (qa_proc/'models_reading_coeffs.csv').exists() else None,
    'reading_coeffs_fdr_csv': 'data/processed/models_reading_coeffs_fdr.csv' if (qa_proc/'models_reading_coeffs_fdr.csv').exists() else None,
    'mixedlm_summary': 'data/processed/mixedlm_ffd_summary.txt' if (qa_proc/'mixedlm_ffd_summary.txt').exists() else None,
    # Sorted so the listing is stable from run to run.
    'fig_f2': sorted(str(p) for p in qa_figs.glob('F2_reading_vs_KEC*.png')),
    'fig_f3': sorted(str(p) for p in qa_figs.glob('F3_EEG_vs_KEC*.png')),
}
(Path('reports')/'qa_kec_models.json').write_text(json.dumps(qa, indent=2))
print('Saved QA')
