# PCS‑HELIO v4.3 — 05 · Reading Analysis
Fits OLS models with Subject-clustered errors (and MixedLM), applies Benjamini–Hochberg FDR correction, and saves the coefficient tables and figure F2.

In [None]:
from pathlib import Path; import pandas as pd, numpy as np, json, sys
# Make `notebooks._fragments` importable whether the kernel was started in the
# directory that contains notebooks/ or one level below it.
ROOT = Path.cwd()
for _candidate in (ROOT, ROOT.parent):
    if (_candidate / 'notebooks' / '_fragments.py').exists():
        sys.path.insert(0, str(_candidate))
        break

try:
    from notebooks._fragments import (
        apply_style,
        preflight_checks,
        print_contract,
        qa_assertions,
        save_manifest,
    )
except Exception as import_error:
    # Best-effort fallback: report the failure and install stand-ins so the
    # rest of the notebook can still run.
    print('[preflight] Failed importing notebooks._fragments:', import_error)

    def apply_style():
        """Stand-in: does nothing."""
        pass

    def preflight_checks():
        """Stand-in: does nothing."""
        pass

    def print_contract():
        """Stand-in: does nothing."""
        pass

    def qa_assertions(df, rules):
        """Stand-in: accepts the same arguments and checks nothing."""
        pass

    def save_manifest(path, payload):
        """Write *payload* as indented JSON to *path*, creating parent dirs."""
        target = Path(path)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(json.dumps(payload, indent=2))

from pcs_toolbox.analysis import fit_ols_clustered, fit_mixedlm, apply_fdr

apply_style()
preflight_checks()
print_contract()

# Output locations; all are relative to the current working directory.
BASE = Path('.')
DATA = BASE / 'data'
PROC = DATA / 'processed'
RPTS = BASE / 'reports'
FIG = BASE / 'figures' / 'metrics'
for _out_dir in (PROC, RPTS, FIG):
    _out_dir.mkdir(parents=True, exist_ok=True)


In [None]:
# Choose the input table: the merged file when it exists, otherwise the
# ZuCo-only aligned file, otherwise an empty frame.
path_merged = PROC / 'zuco_kec_merged.csv'
path_zuco = PROC / 'zuco_aligned.csv'
if path_merged.exists():
    df = pd.read_csv(path_merged)
elif path_zuco.exists():
    df = pd.read_csv(path_zuco)
else:
    df = pd.DataFrame()

# Feature engineering: add a log1p-transformed copy of TRT and GPT for
# whichever of the two columns is present.
if not df.empty:
    for _raw_col, _log_col in (('TRT', 'log_TRT'), ('GPT', 'log_GPT')):
        if _raw_col in df.columns:
            df[_log_col] = np.log1p(df[_raw_col])

print('Rows:', len(df))
display(df.head(3))


In [None]:
# OLS across outcomes with KEC predictors (+ optional controls).
# For every outcome column present in `df`, fit `outcome ~ predictors + controls`
# via fit_ols_clustered(cluster='Subject') and collect one row per
# non-intercept term (estimate and p-value) in `results`.
results = []
if df.empty:
    print('[note] OLS skipped: no input rows.')
else:
    outcomes = [c for c in ['FFD','GD','log_TRT','log_GPT'] if c in df.columns]
    predictors = [c for c in ['entropy','curvature','coherence'] if c in df.columns]
    controls = [c for c in ['length','log_freq','w_pos'] if c in df.columns]
    if outcomes and predictors and 'Subject' in df.columns:
        # The right-hand side is the same for every outcome; build it once.
        rhs = predictors + controls
        for y in outcomes:
            formula = f"{y} ~ " + " + ".join(rhs)
            try:
                # Keep complete cases only. 'Subject' is included because a row
                # without a cluster label cannot be assigned to a cluster.
                # NOTE(review): assumes fit_ols_clustered takes its groups from
                # the `cluster` column of the frame it is given — confirm.
                model_rows = df.dropna(subset=[y] + rhs + ['Subject'])
                m = fit_ols_clustered(model_rows, formula, cluster='Subject')
                for term, est in m.params.items():
                    if term=='Intercept': continue
                    results.append({'outcome': y, 'term': term, 'estimate': float(est), 'p': float(m.pvalues.get(term, np.nan))})
            except Exception as e:
                # Best effort: one failing outcome must not stop the others.
                print('[warn] OLS failed for', y, e)
    else:
        # Say why nothing was fitted instead of silently producing no rows.
        print('[note] OLS skipped: need at least one outcome, one predictor and a Subject column.')
# An empty `results` list already yields an empty frame.
ols_df = pd.DataFrame(results)
print(ols_df.head())
# Save coeffs and FDR
# NOTE(review): the following cell repeats this save step; the writes are
# idempotent, so both are kept here.
if not ols_df.empty:
    fdr_df = apply_fdr(ols_df, p_col='p', group_col='outcome', alpha=0.05)
    out_csv = PROC/'models_reading_coeffs.csv'
    out_fdr = PROC/'models_reading_coeffs_fdr.csv'
    ols_df.to_csv(out_csv, index=False)
    fdr_df.to_csv(out_fdr, index=False)
    save_manifest(RPTS/'models_reading_manifest.json', {'rows': int(len(ols_df)), 'outcomes': sorted(ols_df['outcome'].unique())})


In [None]:
# Write the coefficient table, the table returned by apply_fdr (grouped by
# outcome, alpha=0.05), and a small JSON manifest describing the run.
from pcs_toolbox.analysis import apply_fdr

if not ols_df.empty:
    out_csv = PROC / 'models_reading_coeffs.csv'
    out_fdr = PROC / 'models_reading_coeffs_fdr.csv'
    # Compute the FDR table first so nothing is written if it fails.
    fdr_df = apply_fdr(ols_df, p_col='p', group_col='outcome', alpha=0.05)
    ols_df.to_csv(out_csv, index=False)
    fdr_df.to_csv(out_fdr, index=False)
    _manifest = {
        'rows': int(len(ols_df)),
        'outcomes': sorted(ols_df['outcome'].unique()),
    }
    save_manifest(RPTS / 'models_reading_manifest.json', _manifest)


In [None]:
# Validate H* after saving coefficients.
# Runs tools/validate_hstar.py with ROOT as the child's working directory,
# then prints reports/hstar_status.md when that file exists.
from pathlib import Path
import os  # kept so `os` stays bound in the notebook namespace as before
import subprocess
import sys

NB_DIR = Path.cwd()
ROOT = NB_DIR if (NB_DIR/'tools').exists() else NB_DIR.parent
script = ROOT/'tools'/'validate_hstar.py'
if script.exists():
    # Argument list + sys.executable: the path is never parsed by a shell
    # (spaces are safe) and the script runs under this kernel's interpreter.
    # cwd= applies to the child only, so the kernel's working directory is left
    # alone and the relative PROC/RPTS/FIG paths keep pointing where they did.
    _completed = subprocess.run([sys.executable, str(script)], cwd=ROOT, check=False)
    if _completed.returncode != 0:
        print(f'[warn] {script.name} exited with status {_completed.returncode}')
else:
    print('[warn] validator script not found:', script)
p = ROOT/'reports'/'hstar_status.md'
print(p.read_text() if p.exists() else 'no hstar report')


In [None]:
# Figure F2: coefficients (robust plotting)
# Draws a bar chart of the entropy/curvature/coherence estimates for one
# outcome and saves it as F2_reading_coeffs.png under FIG.
import matplotlib.pyplot as plt
from pathlib import Path as _P
_coeffs_path = PROC/'models_reading_coeffs.csv'
# Prefer in-memory results; else fall back to processed CSV.
# `results` is only defined once the OLS cell has run, so look it up
# defensively: a missing name then leads to the CSV fallback, not a NameError.
_results_df = pd.DataFrame(globals().get('results', []))
_has_results = not _results_df.empty
if _has_results:
    agg = _results_df
elif _coeffs_path.exists():
    agg = pd.read_csv(_coeffs_path)
else:
    agg = pd.DataFrame()
if not agg.empty:
    # Focus outcome: prefer log_TRT if available
    outcomes = list(agg['outcome'].unique())
    focus = 'log_TRT' if 'log_TRT' in outcomes else outcomes[0]
    # Enforce term order; terms absent for this outcome become NaN.
    terms = ['entropy','curvature','coherence']
    sub = (agg[agg['outcome']==focus]
             .set_index('term')['estimate']
             .reindex(terms))
    fig, ax = plt.subplots(figsize=(5,3))
    ax.axhline(0, color='gray', linewidth=0.8)
    sub.plot(kind='bar', ax=ax, edgecolor='black', linewidth=0.8)
    # Symmetric y-limits so near-zero shows as a thin bar.
    # min()/max() are NaN when every term is missing; treat that as 0.
    _lo, _hi = sub.min(), sub.max()
    smin = float(_lo) if pd.notna(_lo) else 0.0
    smax = float(_hi) if pd.notna(_hi) else 0.0
    span = max(abs(smin), abs(smax))
    if span < 1e-3:
        ax.set_ylim(-0.002, 0.002)
    else:
        ax.set_ylim(-(span*1.1), span*1.1)
    ax.set_title(f'F2 · OLS coefficients: {focus}')
    ax.set_ylabel('Estimate')
    # FIG is a relative path; make sure it exists under the current working
    # directory before writing, in case that directory changed since setup.
    FIG.mkdir(parents=True, exist_ok=True)
    fig.tight_layout(); fig.savefig(FIG/'F2_reading_coeffs.png', dpi=150); plt.close(fig)
else:
    print('[note] No coefficients to plot (results and CSV missing).')
# QA: only applied to coefficients produced in this session.
if _has_results:
    qa_assertions(_results_df, {'min_rows': 3})
