In [1]:
import os, pandas as pd
from src.io_utils import load_public_hbn

# (optional) set env var in-notebook if it isn't set in your shell
if "HBN_PUBLIC_CSV_URL" not in os.environ:
    os.environ["HBN_PUBLIC_CSV_URL"] = "http://fcon_1000.projects.nitrc.org/indi/cmi_healthy_brain_network/File/_pheno/HBN_R11_Pheno.csv"

df = load_public_hbn()
df.head(), df.shape

(         ï»¿EID  Sex        Age  EHQ_Total Commercial_Use Full_Pheno
 0  NDARAA773LUW    1  13.638945     100.00            Yes        Yes
 1  NDARAA940JHB    0  12.576089      93.34            Yes        Yes
 2  NDARAB282FDJ    1  11.500570      38.94            Yes        Yes
 3  NDARAB678VYW    0  20.181724     100.00            Yes        Yes
 4  NDARAC973ENV    0   5.818959     -84.47            Yes        Yes,
 (1160, 6))

In [2]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np

# pick a few available columns; adjust after you inspect df.columns
keep = [c for c in df.columns if c.lower() in {"age","sex"}]
fx = df[keep].copy()

# coerce numeric and simple encoding
fx["age"] = pd.to_numeric(fx.get("age", pd.Series([np.nan]*len(fx))), errors="coerce")
if "sex" in fx:
    fx["sex"] = (fx["sex"].astype(str).str[0].str.upper().map({"M":1,"F":0,"1":1,"0":0}))

imp = SimpleImputer(strategy="median")
X_imp = imp.fit_transform(fx)

q01 = np.nanpercentile(X_imp, 1, axis=0)
q99 = np.nanpercentile(X_imp, 99, axis=0)
X_clip = np.clip(X_imp, q01, q99)

scaler = StandardScaler()
Xz = scaler.fit_transform(X_clip)

Xz[:3], fx.columns.tolist()



(array([[ 1.25837949,  0.98810308],
        [-0.79467284,  0.67260131],
        [ 1.25837949,  0.35334062]]),
 ['Sex', 'Age', 'age'])