# Setup

In [24]:
from pathlib import Path
import os, io, re, csv, json, time, requests
import pandas as pd
import numpy as np

# Local data folders, created on first run (kept out of Git if ".gitignore" includes "data/").
DATA = Path("data")
RAW = DATA / "raw"
INTERIM = DATA / "interim"
RAW.mkdir(parents=True, exist_ok=True)
INTERIM.mkdir(parents=True, exist_ok=True)

# Base URL of the phenotype files.
# Plain HTTP is used on purpose: HTTPS hits a host certificate mismatch on this server.
BASE = "http://fcon_1000.projects.nitrc.org/indi/cmi_healthy_brain_network/File/_pheno/"

# Phenotype release files to download, one entry per release.
PHENO_FILES = [
    "HBN_R1_1_Pheno.csv",
    "HBN_R2_1_Pheno.csv",
    "HBN_R3_Pheno.csv",
    "HBN_R4_Pheno.csv",
    "HBN_R5_Pheno.csv",
    "HBN_R6_Pheno.csv",
    "HBN_R7_Pheno.csv",
    "HBN_R8_Pheno.csv",
    "HBN_R9_Pheno.csv",
    "HBN_R10_Pheno.csv",
    "HBN_R11_Pheno.csv",
]

# Diagnosis table, served from the same BASE location.
DIAG_FILE = "Diagnosis_ClinicianConsensus.csv"

# Helpers: HTTP fetch, delimiter-sniffing reader, raw-file saving, EID and release-rank normalization

In [25]:
def http_text(url: str, timeout: int = 60) -> str:
    """Fetch ``url`` with a GET request and return the response body as text.

    A URL that starts with ``https://fcon_1000.projects.nitrc.org`` is rewritten
    to use ``http://`` before the request is sent.

    Args:
        url: Address to download.
        timeout: Value passed to ``requests.get`` as ``timeout``.

    Returns:
        The ``text`` attribute of the response.

    Raises:
        Whatever ``requests.get`` or ``Response.raise_for_status`` raises
        (the latter is called on every response).
    """
    https_host_prefix = "https://fcon_1000.projects.nitrc.org"
    if url.startswith(https_host_prefix):
        # Swap only the leading scheme; the rest of the URL is kept verbatim.
        url = "http://" + url[len("https://"):]
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text

def read_table_smart(url: str) -> pd.DataFrame:
    """Download a delimited text file and parse it with a detected separator.

    The separator is sniffed from the first 5000 characters among
    ``,``, ``;``, tab and ``|``. If sniffing raises, the candidate that occurs
    most often in that sample is used instead.

    Args:
        url: Address of the file; fetched through ``http_text``.

    Returns:
        The parsed DataFrame, with surrounding whitespace stripped from every
        column name.
    """
    candidate_seps = [",", ";", "\t", "|"]
    body = http_text(url)
    head = body[:5000]
    try:
        sep = csv.Sniffer().sniff(head, delimiters=candidate_seps).delimiter
    except Exception:
        # Sniffer could not decide: fall back to the most frequent candidate.
        sep = max(candidate_seps, key=head.count)
    table = pd.read_csv(io.StringIO(body), sep=sep, engine="python")
    table.columns = [col.strip() for col in table.columns]
    return table

def save_raw(df: pd.DataFrame, name: str):
    """Write ``df`` as CSV (without the index) into the ``RAW`` folder.

    Args:
        df: Table to write.
        name: File name, joined onto ``RAW``.

    Returns:
        The path the file was written to.
    """
    destination = RAW.joinpath(name)
    df.to_csv(destination, index=False)
    return destination

def norm_eid(x):
    """Normalize an identifier to upper-case ASCII letters and digits only.

    The value is converted to ``str``, stripped and upper-cased, then every
    character outside ``A-Z0-9`` (spaces, dashes, punctuation, ...) is removed.

    Returns:
        The cleaned string, or ``np.nan`` when the input is missing
        (``pd.isna``) or nothing is left after cleaning.
    """
    if pd.isna(x):
        return np.nan
    cleaned = re.sub(r"[^A-Z0-9]", "", str(x).strip().upper())
    if cleaned:
        return cleaned
    return np.nan

def release_rank_from_filename(name: str) -> float:
    """Turn a pheno release file name into a sortable number.

    Examples:
        'HBN_R1_1_Pheno.csv' -> 1.1
        'HBN_R10_Pheno.csv'  -> 10.0
        'HBN_R11_Pheno.csv'  -> 11.0

    The name must end with ``_R<major>[_<minor>]_Pheno.csv``; any other name
    yields ``0.0``. A missing minor part counts as 0.

    Note: the rank is built as the decimal string "<major>.<minor>", so a
    multi-digit minor collapses onto a shorter one ('_R1_10_' also gives 1.1).
    """
    match = re.search(r"_R(\d+)(?:_(\d+))?_Pheno\.csv$", name)
    if match is None:
        return 0.0
    major_txt, minor_txt = match.groups()
    # Go through int() so leading zeros are dropped before formatting.
    return float("%d.%d" % (int(major_txt), int(minor_txt or 0)))

In [26]:
# Download every pheno release, tag each row with its source release, and
# build one table holding the most recent row per participant.
pheno_frames = []
failed_files = []  # releases that could not be downloaded or parsed
for fname in PHENO_FILES:
    url = BASE + fname
    try:
        df = read_table_smart(url)
        # Keep a local CSV copy of the parsed table. It is re-serialized by
        # pandas, so it is not guaranteed to be byte-identical to the download.
        save_raw(df, fname)
        # Bookkeeping columns: source file and its sortable release rank.
        df["_release_file"] = fname
        df["_release_rank"] = release_rank_from_filename(fname)
        # Normalized participant ID: prefer an exact 'EID' column, otherwise the
        # first column named 'eid' / 'participant_eid' (case-insensitive).
        if "EID" in df.columns:
            idcol = "EID"
        else:
            idcol = next(
                (c for c in df.columns
                 if re.fullmatch(r"(participant_)?eid", c, flags=re.I)),
                None,
            )
        df["_EID"] = df[idcol].map(norm_eid) if idcol is not None else np.nan
        pheno_frames.append(df)
        print(f"Loaded {fname}: {df.shape}")
    except Exception as e:
        # Best effort: a single failing release must not abort the whole pull,
        # but it is recorded so the gap is visible below.
        failed_files.append(fname)
        print(f"WARNING: failed to load {fname}: {e}")

# Fail with an explicit message instead of pd.concat's opaque
# "No objects to concatenate" when nothing could be loaded.
if not pheno_frames:
    raise RuntimeError(
        "No pheno release could be loaded; check network access and BASE/PHENO_FILES."
    )
if failed_files:
    print(f"WARNING: {len(failed_files)} release(s) missing; "
          f"'latest' rows may come from older releases: {failed_files}")

# Concatenate all releases
pheno_all = pd.concat(pheno_frames, ignore_index=True)
print("pheno_all:", pheno_all.shape)

# Keep the **latest** row per participant (by _release_rank).
# Rows without a usable EID are dropped here, so the saved
# "HBN_pheno_all_concat.csv" does not contain them either.
pheno_all = pheno_all[pheno_all["_EID"].notna()]
pheno_latest = (pheno_all.sort_values(["_EID", "_release_rank"])
                          .drop_duplicates("_EID", keep="last"))
assert pheno_latest["_EID"].is_unique, "expected exactly one row per EID"
print("pheno_latest (one row per EID):", pheno_latest.shape)

# Save concatenated & latest (bookkeeping columns are dropped from the latest file)
save_raw(pheno_all,   "HBN_pheno_all_concat.csv")
save_raw(pheno_latest.drop(columns=["_release_file", "_release_rank"]), "HBN_pheno_latest.csv")

Loaded HBN_R1_1_Pheno.csv: (797, 9)
Loaded HBN_R2_1_Pheno.csv: (256, 9)
Loaded HBN_R3_Pheno.csv: (317, 9)
Loaded HBN_R4_Pheno.csv: (558, 9)
Loaded HBN_R5_Pheno.csv: (391, 9)
Loaded HBN_R6_Pheno.csv: (336, 9)
Loaded HBN_R7_Pheno.csv: (692, 9)
Loaded HBN_R8_Pheno.csv: (470, 9)
Loaded HBN_R9_Pheno.csv: (422, 9)
Loaded HBN_R10_Pheno.csv: (847, 9)
Loaded HBN_R11_Pheno.csv: (1160, 9)
pheno_all: (6246, 10)
pheno_latest (one row per EID): (3432, 10)


PosixPath('data/raw/HBN_pheno_latest.csv')

## Download & parse Diagnosis_ClinicianConsensus.csv, extract EID from “Identifiers”

In [27]:
# Download the raw diagnosis table and keep a local CSV copy of it.
diag_url = BASE + DIAG_FILE
diag = read_table_smart(diag_url)
save_raw(diag, DIAG_FILE)

# Known EIDs (from pheno_latest); used below to decide which token of an
# "Identifiers" value is the participant ID.
eid_set = set(pheno_latest["_EID"].dropna())

# The EID is extracted from the "Identifiers" column, so stop early if it is absent.
if not ("Identifiers" in diag.columns):
    raise RuntimeError("Diagnosis table has no 'Identifiers' column. Inspect columns: %s" % diag.columns.tolist())

def extract_eid_from_identifiers(val, candidates):
    """Find a known EID inside one "Identifiers" value.

    Args:
        val: Raw cell value; missing values (``pd.isna``) give ``np.nan``.
        candidates: Container of known normalized EIDs (membership is tested
            with ``in``).

    Returns:
        The first token that, after ``norm_eid``, is contained in
        ``candidates``. If no token matches, the first ``HBN...`` word found by
        regex is tried the same way. ``np.nan`` when nothing matches.
    """
    if pd.isna(val):
        return np.nan
    text = str(val).upper()

    # Pass 1: split on ';', ',', '|' and whitespace, take the first known token.
    for piece in re.split(r"[;,\|\s]+", text):
        key = norm_eid(piece)
        if key in candidates:
            return key

    # Pass 2: fallback regex for an HBN-like token anywhere in the text.
    hit = re.search(r"\bHBN[A-Z0-9]+\b", text)
    if hit is not None:
        key = norm_eid(hit.group(0))
        if key in candidates:
            return key

    return np.nan

# Attach the resolved EID as a new "_EID" column (assign returns a new frame).
diag = diag.assign(
    _EID=diag["Identifiers"].apply(lambda v: extract_eid_from_identifiers(v, eid_set))
)

# Keep rows whose EID was resolved, and only the first row per EID.
diag_keyed = diag.dropna(subset=["_EID"]).drop_duplicates("_EID")
print("Diagnosis rows with resolvable EID:", len(diag_keyed), "of", len(diag))

Diagnosis rows with resolvable EID: 2221 of 2569


# Merge latest pheno with diagnosis (inner join on EID)

In [28]:
# Inner join on the normalized EID: only participants present in both tables remain.
# Column names shared by both tables get the "_pheno" / "_dx" suffixes.
merged = pd.merge(
    pheno_latest,
    diag_keyed,
    on="_EID",
    how="inner",
    suffixes=("_pheno", "_dx"),
)
print("Merged shape:", merged.shape)

# Quick peek: ID, sex, age and the first six diagnosis ("DX_") columns.
dx_cols = [c for c in merged.columns if "DX_" in c][:6]
display(merged[["_EID", "Sex", "Age"] + dx_cols].head())

Merged shape: (2221, 174)


Unnamed: 0,_EID,Sex,Age,"Diagnosis_ClinicianConsensus,DX_01","Diagnosis_ClinicianConsensus,DX_01_ByHx","Diagnosis_ClinicianConsensus,DX_01_Cat","Diagnosis_ClinicianConsensus,DX_01_Code","Diagnosis_ClinicianConsensus,DX_01_Confirmed","Diagnosis_ClinicianConsensus,DX_01_New"
0,NDARAA075AMK,1.0,6.72804,No Diagnosis Given,0.0,No Diagnosis Given,No Diagnosis Given,,0.0
1,NDARAA112DMH,0.0,5.545744,ADHD-Combined Type,0.0,Neurodevelopmental Disorders,F90.2,,0.0
2,NDARAA117NEJ,0.0,7.475929,ADHD-Combined Type,0.0,Neurodevelopmental Disorders,F90.2,,1.0
3,NDARAA536PTU,0.0,11.998402,ADHD-Inattentive Type,0.0,Neurodevelopmental Disorders,F90.0,,0.0
4,NDARAA948VFH,1.0,7.98266,ADHD-Combined Type,0.0,Neurodevelopmental Disorders,F90.2,,0.0


In [29]:
# Write the merged table to the interim folder.
merged_path = INTERIM / "HBN_pheno_latest__with_diagnosis.csv"
merged.to_csv(merged_path, index=False)

# Provenance manifest: when the files were saved, where they came from,
# and where the outputs were written.
manifest = {"saved_at": time.strftime("%Y-%m-%d %H:%M:%S")}
manifest["pheno_sources"] = [
    {"file": name, "url": BASE + name, "release_rank": release_rank_from_filename(name)}
    for name in PHENO_FILES
]
manifest["diagnosis_source"] = {"file": DIAG_FILE, "url": BASE + DIAG_FILE}
manifest["outputs"] = {
    "pheno_all_concat": str(RAW / "HBN_pheno_all_concat.csv"),
    "pheno_latest": str(RAW / "HBN_pheno_latest.csv"),
    "merged": str(merged_path),
}

# Store the manifest next to the raw files.
with open(RAW / "MANIFEST.json", "w") as f:
    f.write(json.dumps(manifest, indent=2))

manifest

{'saved_at': '2025-11-06 15:11:08',
 'pheno_sources': [{'file': 'HBN_R1_1_Pheno.csv',
   'url': 'http://fcon_1000.projects.nitrc.org/indi/cmi_healthy_brain_network/File/_pheno/HBN_R1_1_Pheno.csv',
   'release_rank': 1.1},
  {'file': 'HBN_R2_1_Pheno.csv',
   'url': 'http://fcon_1000.projects.nitrc.org/indi/cmi_healthy_brain_network/File/_pheno/HBN_R2_1_Pheno.csv',
   'release_rank': 2.1},
  {'file': 'HBN_R3_Pheno.csv',
   'url': 'http://fcon_1000.projects.nitrc.org/indi/cmi_healthy_brain_network/File/_pheno/HBN_R3_Pheno.csv',
   'release_rank': 3.0},
  {'file': 'HBN_R4_Pheno.csv',
   'url': 'http://fcon_1000.projects.nitrc.org/indi/cmi_healthy_brain_network/File/_pheno/HBN_R4_Pheno.csv',
   'release_rank': 4.0},
  {'file': 'HBN_R5_Pheno.csv',
   'url': 'http://fcon_1000.projects.nitrc.org/indi/cmi_healthy_brain_network/File/_pheno/HBN_R5_Pheno.csv',
   'release_rank': 5.0},
  {'file': 'HBN_R6_Pheno.csv',
   'url': 'http://fcon_1000.projects.nitrc.org/indi/cmi_healthy_brain_network/File/