In [1]:
import os, pandas as pd
from src.io_utils import load_public_hbn

# (optional) set env var in-notebook if it isn't set in your shell
if "HBN_PUBLIC_CSV_URL" not in os.environ:
    os.environ["HBN_PUBLIC_CSV_URL"] = "http://fcon_1000.projects.nitrc.org/indi/cmi_healthy_brain_network/File/_pheno/HBN_R11_Pheno.csv"

df = load_public_hbn()
df.head(), df.shape

(         ï»¿EID  Sex        Age  EHQ_Total Commercial_Use Full_Pheno
 0  NDARAA773LUW    1  13.638945     100.00            Yes        Yes
 1  NDARAA940JHB    0  12.576089      93.34            Yes        Yes
 2  NDARAB282FDJ    1  11.500570      38.94            Yes        Yes
 3  NDARAB678VYW    0  20.181724     100.00            Yes        Yes
 4  NDARAC973ENV    0   5.818959     -84.47            Yes        Yes,
 (1160, 6))

In [3]:
import numpy as np, re

df = df.copy()
lower_map = {c.lower(): c for c in df.columns}

def pick(colnames):
    for name in colnames:
        if name in lower_map:
            return lower_map[name]
    return None

# Try common patterns seen in HBN tables
age_col = pick(["age","age_years","interview_age","age_in_years","age at assessment","age_years_assessed"])
sex_col = pick(["sex","gender","biological_sex","participantsex"])

print("Detected:", age_col, sex_col)

# Build feature frame
fx = pd.DataFrame(index=df.index)

# --- AGE ---
if age_col:
    age = pd.to_numeric(df[age_col], errors="coerce")
    # Occasionally age is in months or days; if you detect huge numbers, rescale
    if age.median() > 120:      # looks like days
        age = age / 365.25
    elif age.median() > 20*12:  # looks like months
        age = age / 12.0
    fx["age"] = age

# --- SEX ---
if sex_col:
    s = df[sex_col].astype(str).str.strip().str.lower()
    fx["sex"] = s.map({
        "m":1,"male":1,"1":1,"boy":1,
        "f":0,"female":0,"0":0,"girl":0
    }).fillna(np.nan)

fx.head(), fx.isna().mean()

Detected: Age Sex


(         age  sex
 0  13.638945    1
 1  12.576089    0
 2  11.500570    1
 3  20.181724    0
 4   5.818959    0,
 age    0.0
 sex    0.0
 dtype: float64)

In [4]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

imp = SimpleImputer(strategy="median")
X_imp = imp.fit_transform(fx)  # no warning now

# clip 1st–99th pct per column
q01 = np.nanpercentile(X_imp, 1, axis=0)
q99 = np.nanpercentile(X_imp, 99, axis=0)
X_clip = np.clip(X_imp, q01, q99)

scaler = StandardScaler()
Xz = scaler.fit_transform(X_clip)

In [5]:
url2 = "http://fcon_1000.projects.nitrc.org/indi/cmi_healthy_brain_network/File/_pheno/HBN_R1_1_Pheno.csv"
df2 = pd.read_csv(url2)

# Pick an ID column (first column often works; adjust once you inspect)
id1 = df.columns[0]
id2 = df2.columns[0]

merged = pd.merge(df, df2, left_on=id1, right_on=id2, how="outer", suffixes=("_r11","_r1"))
merged.shape

(1955, 12)

In [8]:
import pathlib

# 1) Make a tidy folder layout
base = pathlib.Path("data")
raw = base / "raw"
interim = base / "interim"
raw.mkdir(parents=True, exist_ok=True)
interim.mkdir(parents=True, exist_ok=True)

# 2) Save the raw downloads
df.to_csv(raw/"hbn_r11_pheno.csv", index=False)
df2.to_csv(raw/"hbn_r1_1_pheno.csv", index=False)

# 3) Save the merged table
merged.to_csv(interim/"hbn_basic_merged.csv", index=False)

# 4) Verify
print("Raw:", os.listdir(raw))
print("Interim:", os.listdir(interim))

Raw: ['hbn_r11_pheno.csv', 'hbn_r1_1_pheno.csv']
Interim: ['hbn_basic_merged.csv']
