# Notebook 04 — ZuCo Models: Reading Cost & EEG vs KEC (v4.3)

**Purpose.** Validate KEC metrics (transition entropy, local curvature, meso-coherence) against ZuCo eye-tracking (FFD, GD, TRT, GPT) and EEG.

**Inputs (expected):**
- `data/processed/kec/metrics_en.csv` — per-word KEC metrics (English)
- `data/processed/zuco_aligned.csv` — per-token ZuCo features (ET+EEG)

**Outputs:**
- Figures: `figures/metrics/F2_reading_vs_KEC.png`, `figures/metrics/F3_EEG_vs_KEC.png`
- Tables: `data/processed/models_reading_coeffs.csv`, `data/processed/models_eeg_coeffs.csv` (if EEG present)
- QA JSON: `reports/qa_kec_models.json` (optional)

**Reproducibility:** fixed seeds; minimal dependencies; deterministic IO.

In [11]:
import os, json, math, warnings, hashlib, time, random
from pathlib import Path
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

SEED = 42
random.seed(SEED)
np.random.seed(SEED)

FIG_DIR = Path('../figures/metrics'); FIG_DIR.mkdir(parents=True, exist_ok=True)
PROC_DIR = Path('../data/processed'); PROC_DIR.mkdir(parents=True, exist_ok=True)
REPORTS_DIR = Path('../reports'); REPORTS_DIR.mkdir(parents=True, exist_ok=True)

def heartbeat(msg):
    print(f"[{time.strftime('%H:%M:%S')}] {msg}")
heartbeat('Environment ready')

[21:18:15] Environment ready


## 1) Load processed data and merge
Assumes the KEC table and the ZuCo-aligned table exist under `/data/processed/`.

In [20]:
kec_path = PROC_DIR/'kec'/'metrics_en.csv'
# Prefer multi-subject export; include v1 and v2 aligned if present
zuco_candidates = [
    PROC_DIR/'zuco_word_level_all_subjects.csv',
    PROC_DIR/'zuco_aligned.csv',
    PROC_DIR/'zuco_v1_aligned.csv',
    PROC_DIR/'zuco_v2_aligned.csv',
]

# Load KEC
kec_df = pd.read_csv(kec_path, low_memory=False) if kec_path.exists() else pd.DataFrame()

# Load and combine ZuCo (v1+v2 if available)
zuco_parts = []
for p in zuco_candidates:
    if p.exists():
        try:
            dfi = pd.read_csv(p, low_memory=False)
            dfi.columns = [c.strip() for c in dfi.columns]
            # ensure subject col (lowercase) for this notebook
            if 'subject' not in dfi.columns and 'Subject' in dfi.columns:
                dfi['subject'] = dfi['Subject'].astype(str)
            # ensure tsr indicator if possible
            if 'tsr' not in dfi.columns:
                if 'Task' in dfi.columns:
                    dfi['tsr'] = dfi['Task'].astype(str)
                else:
                    dfi['tsr'] = 'Unknown'
            # ensure Dataset exists
            if 'Dataset' not in dfi.columns:
                # infer from filename
                name = p.name
                if 'v2' in name:
                    dfi['Dataset'] = 'v2'
                elif 'v1' in name:
                    dfi['Dataset'] = 'v1'
                else:
                    dfi['Dataset'] = 'v1'
            zuco_parts.append(dfi)
        except Exception as e:
            warnings.warn(f'Failed reading {p}: {e}')

zuco_df = pd.concat(zuco_parts, ignore_index=True) if zuco_parts else pd.DataFrame()
heartbeat(f"Loaded KEC: {kec_df.shape if len(kec_df)>0 else 'MISSING'}; ZuCo combined: {zuco_df.shape if len(zuco_df)>0 else 'MISSING'}")

import re

def norm_token(s):
    if not isinstance(s,str): return s
    s = s.lower()
    s = re.sub(r'[\W_]+', '', s)
    return s

# normalize headers
for d in (kec_df, zuco_df):
    if len(d)>0: d.columns = [c.strip() for c in d.columns]

# token_norm sources for ZuCo
if len(zuco_df)>0:
    zuco_word_col = 'word' if 'word' in zuco_df.columns else ('Word' if 'Word' in zuco_df.columns else None)
    if zuco_word_col:
        zuco_df['token_norm'] = zuco_df[zuco_word_col].astype(str).map(norm_token)

# token_norm for KEC (prefer existing, else fallbacks)
if len(kec_df)>0:
    kec_df.columns = [c.strip() for c in kec_df.columns]
    kec_word_col = 'token_norm' if 'token_norm' in kec_df.columns else (
        'node' if 'node' in kec_df.columns else (
        'word' if 'word' in kec_df.columns else (
        'Word' if 'Word' in kec_df.columns else None)))
    if kec_word_col and 'token_norm' not in kec_df.columns:
        kec_df['token_norm'] = kec_df[kec_word_col].astype(str).map(norm_token)

# curvature alias
work_kec = kec_df.rename(columns={'avg_curvature':'curvature'}) if len(kec_df)>0 else kec_df

req = {'token_norm','entropy','coherence','curvature'}
if len(zuco_df)>0 and len(work_kec)>0 and req.issubset(work_kec.columns) and 'token_norm' in zuco_df.columns:
    merged = zuco_df.merge(work_kec[['token_norm','entropy','coherence','curvature']], on='token_norm', how='left', suffixes=('','_kec'))
else:
    merged = pd.DataFrame()
heartbeat(f"Merged shape: {merged.shape if len(merged)>0 else 'N/A'}; sources used: {[str(p) for p in zuco_candidates if p.exists()]}")
merged.head(3) if len(merged)>0 else merged

[21:21:01] Loaded KEC: (166540, 4); ZuCo combined: (112074, 21)
[21:21:01] Merged shape: (161064, 24); sources used: ['../data/processed/zuco_word_level_all_subjects.csv', '../data/processed/zuco_aligned.csv', '../data/processed/zuco_v1_aligned.csv']
[21:21:01] Merged shape: (161064, 24); sources used: ['../data/processed/zuco_word_level_all_subjects.csv', '../data/processed/zuco_aligned.csv', '../data/processed/zuco_v1_aligned.csv']


Unnamed: 0,Subject,Task,Dataset,SentenceID,Word,FFD,GD,ThetaPower,AlphaPower,token_norm,...,relation_type,WordPosition,SentenceID.1,TRT,GPT,control,sentence_id,entropy,coherence,curvature
0,ZAB,NR,v1,unknown,He,0.0,0.0,0.0,0.0,he,...,,,,,,,,4.9161,0.378378,553.340426
1,ZAB,NR,v1,unknown,won,0.0,0.0,0.0,0.0,won,...,,,,,,,,5.857516,0.452555,705.692982
2,ZAB,NR,v1,unknown,the,0.0,0.0,0.0,0.0,the,...,,,,,,,,6.219484,0.479532,964.234483


## 2) Prepare variables
- Define response variables and EEG columns (if available).
- Define predictors: KEC (`entropy`, `curvature`, `coherence`) and covariates (`length`, `log_freq`, optional `surprisal`).

In [13]:
if len(merged)==0:
    heartbeat('Merged DF empty — fill data and rerun.')

col_candidates = merged.columns.tolist()
ET_COLS = [c for c in ['duration_ms','fixation_samples','avg_gaze_x','avg_gaze_y','avg_pupil_area'] if c in col_candidates]  # Available ET metrics
EEG_COLS = [c for c in ['ThetaPower','AlphaPower'] if c in col_candidates]  # EEG if available
KEC_COLS = [c for c in ['entropy','avg_curvature','coherence'] if c in col_candidates]  # KEC metrics (note: curvature is avg_curvature)
COV_COLS = [c for c in ['length','LogFreq','log_freq','surprisal','word_idx','subject','tsr'] if c in col_candidates]
if 'LogFreq' in merged: merged = merged.rename(columns={'LogFreq':'log_freq'})

needed = set(ET_COLS + KEC_COLS + ['subject','tsr'])  # Updated column names
df = merged.copy()
if len(df)>0 and needed.issubset(set(df.columns)):
    df = df.dropna(subset=list(needed))
    heartbeat(f"Rows after NA drop: {len(df)}")
else:
    df = merged

if 'subject' in df: df['subject'] = df['subject'].astype(str)
if 'tsr' in df: df['tsr'] = df['tsr'].astype(str)

for resp in ['duration_ms','fixation_samples']:
    if resp in df:
        df[f'log_{resp}'] = np.log1p(df[resp])

heartbeat(f"ET columns: {ET_COLS}; EEG columns: {EEG_COLS}; KEC: {KEC_COLS}")

[21:18:15] ET columns: []; EEG columns: ['ThetaPower', 'AlphaPower']; KEC: ['entropy', 'coherence']


## 3) Baseline OLS with clustered robust SE (Subject)
Fit OLS for ET metrics (FFD/GD/log_TRT/log_GPT) with KEC + covariates and compute clustered robust SEs by Subject.

In [14]:
reading_results = []
if len(df)>0 and set(['subject']).issubset(df.columns):
    resp_list = [r for r in ['duration_ms','fixation_samples','log_duration_ms','log_fixation_samples'] if r in df.columns]
    for resp in resp_list:
        preds = [c for c in ['entropy','avg_curvature','coherence','word_idx'] if c in df.columns]
        if 'avg_curvature' in preds and 'curvature' not in df.columns:
            df = df.rename(columns={'avg_curvature':'curvature'})
            preds = [c if c!='avg_curvature' else 'curvature' for c in preds]
        preds = [p for p in preds if p in df.columns]
        if len(preds) < 1:
            heartbeat(f"Insufficient predictors for {resp}; skipping.")
            continue
        formula = f"{resp} ~ " + " + ".join(preds)
        heartbeat(f"Fitting OLS: {formula}")
        try:
            used = [resp] + preds + (['subject'] if 'subject' in df.columns else [])
            d = df.dropna(subset=[c for c in used if c in df.columns])
            if len(d) < 100:
                heartbeat(f"Skip {resp}: insufficient rows after dropna ({len(d)})")
                continue
            model = smf.ols(formula, data=d).fit()
            coefs = pd.DataFrame({
                'coef': model.params,
                'se': model.bse,
                't': model.tvalues,
                'p': model.pvalues
            })
            coefs['response'] = resp
            reading_results.append(coefs.reset_index().rename(columns={'index':'term'}))
            heartbeat(f"Model done for {resp}")
        except Exception as e:
            warnings.warn(f"OLS failed for {resp}: {e}")

if reading_results:
    reading_tbl = pd.concat(reading_results, ignore_index=True)
    out_csv = Path('../data/processed') / 'models_reading_coeffs.csv'
    reading_tbl.to_csv(out_csv, index=False)
    heartbeat(f"Saved reading coefficients → {out_csv}")
else:
    heartbeat('No reading models were fitted.')

[21:18:15] No reading models were fitted.


## 4) (Optional) Mixed-Effects Models
Use MixedLM for random intercepts by Subject (and optionally SentenceID) if needed. (Placeholder — disabled by default).

In [15]:
from statsmodels.regression.mixed_linear_model import MixedLM
if len(df)>0 and 'duration_ms' in df.columns:
     try:
         mdf = df[['duration_ms','entropy','avg_curvature','coherence','word_idx','subject']].dropna()
         exog = sm.add_constant(mdf[['entropy','avg_curvature','coherence','word_idx']])
         endog = mdf['duration_ms']
         groups = mdf['subject']
         mixed = MixedLM(endog, exog, groups=groups).fit()
         print(mixed.summary())
     except Exception as e:
         warnings.warn(f'MixedLM failed: {e}')

## 5) Figure F2 — Reading Cost vs KEC (partial effect)
Plot predicted relationship holding covariates constant (median).

In [16]:
def partial_plot(df, response='FFD', xvar='entropy', controls=None, n_points=100, savepath=None):
    if controls is None: controls = []
    cols_needed = [response, xvar] + controls
    if not set(cols_needed).issubset(df.columns):
        heartbeat(f"Missing columns for partial plot: {cols_needed}")
        return
    formula = response + ' ~ ' + xvar
    if controls:
        formula += ' + ' + ' + '.join(controls)
    mod = smf.ols(formula, data=df.dropna(subset=cols_needed)).fit()
    xgrid = np.linspace(df[xvar].quantile(0.05), df[xvar].quantile(0.95), n_points)
    base = {c: float(df[c].median()) for c in controls}
    grid_df = pd.DataFrame({xvar: xgrid, **base})
    yhat = mod.predict(grid_df)
    import matplotlib.pyplot as plt
    plt.figure()
    samp = df.dropna(subset=cols_needed).sample(min(2000, len(df)), random_state=SEED)
    plt.scatter(samp[xvar], samp[response], alpha=0.2)
    plt.plot(xgrid, yhat)
    plt.xlabel(xvar); plt.ylabel(response)
    ttl_ctrl = ', '.join([f"{c} @ median" for c in controls]) if controls else 'no controls'
    plt.title(f"Partial effect: {response} ~ {xvar} ({ttl_ctrl})")
    if savepath:
        plt.tight_layout(); plt.savefig(savepath, dpi=150)
        heartbeat(f"Saved figure → {savepath}")
    plt.show()

if len(df)>0 and 'duration_ms' in df.columns and 'entropy' in df.columns:
    controls = [c for c in ['word_idx'] if c in df.columns]
    partial_plot(df, response='duration_ms', xvar='entropy', controls=controls, savepath=Path('../figures/metrics')/'F2_reading_vs_KEC.png')
else:
    heartbeat('Skipping F2 (duration_ms~entropy) — required columns missing.')

[21:18:15] Skipping F2 (duration_ms~entropy) — required columns missing.


## 6) Figure F3 — EEG vs KEC (per-subject trends)

In [17]:
def per_subject_trends(df, eeg_col='avg_pupil_area', xvar='entropy', savepath=None):
    need = {'subject', eeg_col, xvar}
    if not need.issubset(df.columns):
        heartbeat(f"Missing columns for F3: {need}")
        return
    subs = sorted(df['subject'].unique())
    nsub = len(subs)
    if nsub==0:
        heartbeat('No subjects found for F3.')
        return
    import math
    import matplotlib.pyplot as plt
    ncols = 4
    nrows = int(math.ceil(nsub / ncols))
    plt.figure(figsize=(12, 3*nrows))
    for i, s in enumerate(subs, 1):
        ax = plt.subplot(nrows, ncols, i)
        sdf = df[df['subject']==s].dropna(subset=[eeg_col, xvar])
        if len(sdf)<5:
            ax.set_title(f"{s} (insufficient data)")
            continue
        ax.scatter(sdf[xvar], sdf[eeg_col], alpha=0.3)
        try:
            m = smf.ols(f"{eeg_col} ~ {xvar}", data=sdf).fit()
            xs = np.linspace(sdf[xvar].min(), sdf[xvar].max(), 50)
            ys = m.params['Intercept'] + m.params[xvar]*xs
            ax.plot(xs, ys)
        except Exception as e:
            pass
        ax.set_title(str(s)); ax.set_xlabel(xvar); ax.set_ylabel(eeg_col)
    plt.tight_layout()
    if savepath:
        plt.savefig(savepath, dpi=150)
        heartbeat(f"Saved figure → {savepath}")
    plt.show()

if len(df)>0 and {'subject','entropy'}.issubset(df.columns) and any(c in df.columns for c in ['avg_pupil_area']):
    eegc = 'avg_pupil_area'  # Use pupil area as proxy for EEG if no EEG data
    per_subject_trends(df, eeg_col=eegc, xvar='entropy', savepath=Path('../figures/metrics')/'F3_EEG_vs_KEC.png')
else:
    heartbeat('Skipping F3 — pupil area or entropy missing.')

[21:18:15] Skipping F3 — pupil area or entropy missing.


## 7) (Optional) Bootstrap CIs (clustered by Subject)
Bootstrap coefficients for `FFD ~ entropy + length + log_freq`.

In [18]:
def bootstrap_coefs(df, formula, group_col='subject', B=200, seed=42):
    rng = np.random.default_rng(seed)
    groups = df[group_col].unique()
    coefs = []
    for b in range(B):
        samp_groups = rng.choice(groups, size=len(groups), replace=True)
        samp_df = pd.concat([df[df[group_col]==g] for g in samp_groups], ignore_index=True)
        try:
            m = smf.ols(formula, data=samp_df).fit()
            coefs.append(m.params)
        except Exception:
            continue
    return pd.DataFrame(coefs)

if len(df)>0 and {'duration_ms','entropy','word_idx','subject'}.issubset(df.columns):
    formula = 'duration_ms ~ entropy + word_idx'
    heartbeat('Bootstrapping duration_ms model (B=200) ...')
    boot = bootstrap_coefs(df[['duration_ms','entropy','word_idx','subject']].dropna(), formula, group_col='subject', B=200)
    if len(boot)>0:
        outb = Path('../data/processed')/'boot_ols_duration_entropy.csv'
        boot.to_csv(outb, index=False)
        heartbeat(f"Saved bootstrap coefs → {outb}")
else:
    heartbeat('Skipping bootstrap — required columns missing.')

[21:18:15] Skipping bootstrap — required columns missing.


## 8) QA Summary

In [19]:
qa = {
  'timestamp_utc': '2025-09-01T18:23:49.464537Z',
  'n_rows_merged': int(len(merged)) if len(merged)>0 else 0,
  'et_cols': ET_COLS if 'ET_COLS' in globals() else [],
  'eeg_cols': EEG_COLS if 'EEG_COLS' in globals() else [],
  'kec_cols': KEC_COLS if 'KEC_COLS' in globals() else [],
  'reading_coeffs_csv': '../data/processed/models_reading_coeffs.csv' if Path('../data/processed/models_reading_coeffs.csv').exists() else None,
  'reading_coeffs_fdr_csv': '../data/processed/models_reading_coeffs_fdr.csv' if Path('../data/processed/models_reading_coeffs_fdr.csv').exists() else None,
  'boot_ffd_entropy': '../data/processed/boot_ols_duration_entropy.csv' if Path('../data/processed/boot_ols_duration_entropy.csv').exists() else None,
  'fig_F2': '../figures/metrics/F2_reading_vs_KEC.png' if Path('../figures/metrics/F2_reading_vs_KEC.png').exists() else None,
  'fig_F3': '../figures/metrics/F3_EEG_vs_KEC.png' if Path('../figures/metrics/F3_EEG_vs_KEC.png').exists() else None
}
with open('../reports/qa_kec_models.json', 'w') as f:
    json.dump(qa, f, indent=2)
heartbeat("Saved QA JSON → ../reports/qa_kec_models.json")

[21:18:15] Saved QA JSON → ../reports/qa_kec_models.json


### Notes
- Extend to GD, TRT (`log_TRT`), GPT (`log_GPT`) and other EEG bands as needed.
- Prefer MixedLM with random intercepts for `Subject` and `SentenceID` for confirmatory models.
- Keep seeds fixed; increase bootstrap `B` for narrower CIs (runtime ↑).
