# Setup: Import Libraries and Define Folders

In [1]:
# Import necessary libraries
from pathlib import Path
import os
import io
import re
import csv
import json
import time
import requests
import pandas as pd
import numpy as np

# Define folder structure for organizing data
# DATA folder contains all our data files
DATA = Path("data")
RAW = DATA / "raw"              # Raw downloaded files
INTERIM = DATA / "interim"      # Processed/merged files

# Create folders if they don't exist
RAW.mkdir(parents=True, exist_ok=True)
INTERIM.mkdir(parents=True, exist_ok=True)

# Base URL for downloading pheno files (using HTTP to avoid certificate issues)
BASE = "http://fcon_1000.projects.nitrc.org/indi/cmi_healthy_brain_network/File/_pheno/"

# List of all phenotype files we want to download (Releases 1-11)
PHENO_FILES = [
    "HBN_R1_1_Pheno.csv",
    "HBN_R2_1_Pheno.csv",
    "HBN_R3_Pheno.csv",
    "HBN_R4_Pheno.csv",
    "HBN_R5_Pheno.csv",
    "HBN_R6_Pheno.csv",
    "HBN_R7_Pheno.csv",
    "HBN_R8_Pheno.csv",
    "HBN_R9_Pheno.csv",
    "HBN_R10_Pheno.csv",
    "HBN_R11_Pheno.csv"
]

# Local diagnosis file (no longer downloading from API)
DIAG_FILE = "Diagnosis_ClinicianConsensus.csv"

# Helper Functions for Data Download and Processing

In [3]:
def http_text(url, timeout=60):
    """
    Download text content from a URL.
    
    Parameters:
    - url: The web address to download from
    - timeout: How long to wait before giving up (seconds)
    
    Returns: The text content from the URL
    """
    # Force HTTP instead of HTTPS for this specific server
    if url.startswith("https://fcon_1000.projects.nitrc.org"):
        url = url.replace("https://", "http://", 1)
    
    # Download the file
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Raise error if download failed
    return response.text


def read_table_smart(url):
    """
    Download a CSV/TSV file and automatically detect the separator.
    
    Figures out whether the file uses commas, tabs, semicolons, etc. to separate values.
    
    Parameters:
    - url: The web address of the file
    
    Returns: A pandas DataFrame
    """
    # Download the text content
    text = http_text(url)
    
    # Look at the first 5000 characters to guess the separator
    sample = text[:5000]
    
    try:
        # Try to automatically detect the delimiter
        sniffer = csv.Sniffer()
        dialect = sniffer.sniff(sample, delimiters=[",", ";", "\\t", "|"])
        separator = dialect.delimiter
    except Exception:
        # If automatic detection fails, use the most common delimiter
        separators = [",", ";", "\\t", "|"]
        separator = max(separators, key=sample.count)
    
    # Read the CSV into a pandas DataFrame
    df = pd.read_csv(io.StringIO(text), sep=separator, engine="python")
    
    # Clean up column names (remove extra spaces)
    df.columns = [col.strip() for col in df.columns]
    
    return df


def save_raw(dataframe, filename):
    """
    Save a DataFrame to the raw data folder.
    
    Parameters:
    - dataframe: The pandas DataFrame to save
    - filename: Name of the file (e.g., "data.csv")
    
    Returns: The full path where the file was saved
    """
    file_path = RAW / filename
    dataframe.to_csv(file_path, index=False)
    return file_path


def normalize_eid(value):
    """
    Clean and standardize participant EID (participant ID).
    
    Examples:
    - "NDAR AA075 AMK" → "NDARAA075AMK"
    - "ndar-aa112-dmh" → "NDARAA112DMH"
    
    Parameters:
    - value: The raw EID value
    
    Returns: Cleaned EID or NaN if invalid
    """
    # Handle missing values
    if pd.isna(value):
        return np.nan
    
    # Convert to string and uppercase
    cleaned = str(value).strip().upper()
    
    # Remove all non-alphanumeric characters (spaces, dashes, etc.)
    cleaned = re.sub(r"[^A-Z0-9]", "", cleaned)
    
    # Return NaN if empty after cleaning
    return cleaned if cleaned else np.nan


def get_release_number(filename):
    """
    Extract the release version number from a filename.
    
    Examples:
    - "HBN_R1_1_Pheno.csv" → 1.1
    - "HBN_R10_Pheno.csv" → 10.0
    - "HBN_R11_Pheno.csv" → 11.0
    
    This helps us track which release is newer when merging data.
    
    Parameters:
    - filename: Name of the pheno file
    
    Returns: Release number as a float
    """
    # Look for pattern like "_R10_" or "_R1_1_"
    match = re.search(r"_R(\\d+)(?:_(\\d+))?_Pheno\\.csv$", filename)
    
    if not match:
        return 0.0
    
    # Extract major version (e.g., 10)
    major = int(match.group(1))
    
    # Extract minor version if it exists (e.g., 1 in R1_1)
    minor = int(match.group(2)) if match.group(2) else 0
    
    # Combine into single number (e.g., 10.0 or 1.1)
    return float(f"{major}.{minor}")

# Download and Process All Phenotype Files

In [4]:
# This list will store all the downloaded phenotype data
pheno_frames = []

# Loop through each phenotype file and download it
for filename in PHENO_FILES:
    # Build the full URL
    url = BASE + filename
    
    try:
        print(f"Downloading {filename}...")
        
        # Download and read the file
        df = read_table_smart(url)
        
        # Save a copy to our raw data folder
        save_raw(df, filename)
        
        # Add tracking columns to remember which release this came from
        df["_release_file"] = filename
        df["_release_rank"] = get_release_number(filename)
        
        # Normalize the participant ID (EID)
        if "EID" in df.columns:
            # If there's an explicit EID column, use it
            df["_EID"] = df["EID"].apply(normalize_eid)
        else:
            # Otherwise, try to find an ID-like column
            id_column = None
            for col in df.columns:
                # Look for columns named like "eid" or "participant_eid"
                if re.fullmatch(r"(participant_)?eid", col, flags=re.I):
                    id_column = col
                    break
            
            if id_column:
                df["_EID"] = df[id_column].apply(normalize_eid)
            else:
                df["_EID"] = np.nan
        
        # Add this DataFrame to our collection
        pheno_frames.append(df)
        print(f"  ✓ Loaded: {df.shape[0]} rows, {df.shape[1]} columns")
        
    except Exception as e:
        print(f"  ✗ WARNING: Failed to load {filename}: {e}")

print("\\n" + "="*60)

# Combine all releases into one big DataFrame
pheno_all = pd.concat(pheno_frames, ignore_index=True)
print(f"Combined all releases: {pheno_all.shape[0]} rows, {pheno_all.shape[1]} columns")

# Remove rows without a valid EID
pheno_all = pheno_all[pheno_all["_EID"].notna()]

# Keep only the LATEST row for each participant
# (If a participant appears in multiple releases, keep their newest data)
pheno_all_sorted = pheno_all.sort_values(["_EID", "_release_rank"])
pheno_latest = pheno_all_sorted.drop_duplicates("_EID", keep="last")

print(f"Latest data (one row per participant): {pheno_latest.shape[0]} participants")

# Save both versions
save_raw(pheno_all, "HBN_pheno_all_concat.csv")
save_raw(pheno_latest.drop(columns=["_release_file", "_release_rank"]), 
         "HBN_pheno_latest.csv")

print("\\n✓ Phenotype data saved successfully!")

Downloading HBN_R1_1_Pheno.csv...
  ✓ Loaded: 797 rows, 9 columns
Downloading HBN_R2_1_Pheno.csv...
  ✓ Loaded: 256 rows, 9 columns
Downloading HBN_R3_Pheno.csv...
  ✓ Loaded: 317 rows, 9 columns
Downloading HBN_R4_Pheno.csv...
  ✓ Loaded: 558 rows, 9 columns
Downloading HBN_R5_Pheno.csv...
  ✓ Loaded: 391 rows, 9 columns
Downloading HBN_R6_Pheno.csv...
  ✓ Loaded: 336 rows, 9 columns
Downloading HBN_R7_Pheno.csv...
  ✓ Loaded: 692 rows, 9 columns
Downloading HBN_R8_Pheno.csv...
  ✓ Loaded: 470 rows, 9 columns
Downloading HBN_R9_Pheno.csv...
  ✓ Loaded: 422 rows, 9 columns
Downloading HBN_R10_Pheno.csv...
  ✓ Loaded: 847 rows, 9 columns
Downloading HBN_R11_Pheno.csv...
  ✓ Loaded: 1160 rows, 9 columns
Combined all releases: 6246 rows, 10 columns
Latest data (one row per participant): 3432 participants
\n✓ Phenotype data saved successfully!


# Load Diagnosis File (LOCAL)

In [7]:
diag_path = RAW / DIAG_FILE
diag = pd.read_csv(diag_path)
print(f"✓ Loaded diagnosis file: {diag.shape[0]} rows, {diag.shape[1]} columns")

✓ Loaded diagnosis file: 4766 rows, 164 columns


  diag = pd.read_csv(diag_path)


In [11]:
# Create a set of known participant IDs for matching
known_eids = set(pheno_latest["_EID"].dropna().unique())
print(f"We have {len(known_eids)} known participant IDs from phenotype data")

def extract_eid_from_identifiers(identifier_value, known_ids):
    """
    Try to find a valid participant ID from the Identifiers field.
    
    The Identifiers field might contain multiple IDs or formatted text,
    so we need to parse it carefully.
    
    Parameters:
    - identifier_value: The raw value from the Identifiers column
    - known_ids: Set of valid participant IDs we already know about
    
    Returns: A clean participant ID or NaN if not found
    """
    # Handle missing values
    if pd.isna(identifier_value):
        return np.nan
    
    # Convert to uppercase string
    text = str(identifier_value).upper()
    
    # Split on common separators (semicolons, commas, pipes, spaces)
    tokens = re.split(r"[;,|\\s]+", text)
    
    # Strategy 1: Look for exact matches to known IDs
    for token in tokens:
        cleaned_token = normalize_eid(token)
        if cleaned_token in known_ids:
            return cleaned_token
    
    # Strategy 2: Look for HBN-style IDs using pattern matching
    match = re.search(r"\\bHBN[A-Z0-9]+\\b", text)
    if match:
        cleaned_token = normalize_eid(match.group(0))
        if cleaned_token in known_ids:
            return cleaned_token
    
    # If can't find a valid ID, return NaN
    return np.nan


# Extract EIDs from the diagnosis data
diag = diag.copy()  # Make a copy to avoid warnings
diag["_EID"] = diag["Identifiers"].apply(
    lambda val: extract_eid_from_identifiers(val, known_eids)
)

# Keep only rows where we successfully extracted an EID
diag_with_eid = diag[diag["_EID"].notna()]

# Remove duplicate rows (keep first occurrence)
diag_keyed = diag_with_eid.drop_duplicates("_EID")

print(f"✓ Successfully matched {diag_keyed.shape[0]} participants")
print(f"  (out of {diag.shape[0]} total diagnosis records)")

We have 3432 known participant IDs from phenotype data
✓ Successfully matched 3373 participants
  (out of 4766 total diagnosis records)


# Merge Phenotype and Diagnosis Data

In [19]:
# Merge the two datasets on participant ID (_EID)
# Use an "inner" join, which keeps only participants present in BOTH datasets
merged = pheno_latest.merge(
    diag_keyed,
    on="_EID",
    how="inner",
    suffixes=("_pheno", "_dx")  # Add suffixes if column names overlap
)

print(f"✓ Merged dataset: {merged.shape[0]} participants, {merged.shape[1]} columns")

# Preview the merged data
base_cols = ["_EID", "Sex", "Age"]
dx_cols = [c for c in merged.columns if "DX_" in c][:11]
preview_cols = [c for c in (base_cols + dx_cols) if c in merged.columns]
df_prev = merged.loc[:, preview_cols].head(11).copy()

if "Age" in df_prev.columns:
    df_prev["Age"] = pd.to_numeric(df_prev["Age"], errors="coerce")

display(df_prev.style.highlight_null())

✓ Merged dataset: 3373 participants, 174 columns


Unnamed: 0,_EID,Sex,Age,"Diagnosis_ClinicianConsensus,DX_01","Diagnosis_ClinicianConsensus,DX_01_ByHx","Diagnosis_ClinicianConsensus,DX_01_Cat","Diagnosis_ClinicianConsensus,DX_01_Code","Diagnosis_ClinicianConsensus,DX_01_Confirmed","Diagnosis_ClinicianConsensus,DX_01_New","Diagnosis_ClinicianConsensus,DX_01_PRem","Diagnosis_ClinicianConsensus,DX_01_Past_Doc","Diagnosis_ClinicianConsensus,DX_01_Presum","Diagnosis_ClinicianConsensus,DX_01_RC","Diagnosis_ClinicianConsensus,DX_01_Rem"
0,NDARAA075AMK,1.0,6.72804,No Diagnosis Given,0.0,No Diagnosis Given,No Diagnosis Given,,0.0,0.0,,,0,0.0
1,NDARAA112DMH,0.0,5.545744,ADHD-Combined Type,0.0,Neurodevelopmental Disorders,F90.2,,0.0,0.0,,,0,0.0
2,NDARAA117NEJ,0.0,7.475929,ADHD-Combined Type,0.0,Neurodevelopmental Disorders,F90.2,,1.0,0.0,,,0,0.0
3,NDARAA306NT2,1.0,21.216746,Generalized Anxiety Disorder,0.0,Anxiety Disorders,,1.0,,,,0.0,0,
4,NDARAA504CRN,1.0,9.165297,ADHD-Inattentive Type,0.0,Neurodevelopmental Disorders,F90.0,1.0,,,,0.0,0,
5,NDARAA536PTU,0.0,11.998402,ADHD-Inattentive Type,0.0,Neurodevelopmental Disorders,F90.0,,0.0,0.0,,,0,0.0
6,NDARAA947ZG5,0.0,13.62788,ADHD-Combined Type,0.0,Neurodevelopmental Disorders,F90.2,1.0,,,,0.0,0,
7,NDARAA948VFH,1.0,7.98266,ADHD-Combined Type,0.0,Neurodevelopmental Disorders,F90.2,,0.0,0.0,,,0,0.0
8,NDARAB055BPR,1.0,10.793862,ADHD-Combined Type,0.0,Neurodevelopmental Disorders,F90.2,1.0,,,,0.0,0,
9,NDARAB348EWR,0.0,5.805384,Other Specified Depressive Disorder,0.0,Depressive Disorders,F32.8,1.0,,,,0.0,0,


# Save Final Merged Dataset

In [22]:
merged_path = INTERIM / "HBN_pheno_with_diagnosis.csv"
merged.to_csv(merged_path, index=False)
print(f"\n✓ Saved merged dataset to: {merged_path}")

# Create a manifest file documenting what we did
manifest = {
    "created_at": time.strftime("%Y-%m-%d %H:%M:%S"),
    "description": "HBN phenotype data merged with clinical diagnosis",
    "pheno_sources": [
        {
            "file": filename,
            "url": BASE + filename,
            "release_version": get_release_number(filename)
        }
        for filename in PHENO_FILES
    ],
    "diagnosis_source": {
        "file": DIAG_FILE,
        "source": "local file (data/raw/)"
    },
    "outputs": {
        "pheno_all_concat": str(RAW / "HBN_pheno_all_concat.csv"),
        "pheno_latest": str(RAW / "HBN_pheno_latest.csv"),
        "merged_dataset": str(merged_path)
    },
    "record_counts": {
        "total_pheno_records": int(pheno_all.shape[0]),
        "unique_participants": int(pheno_latest.shape[0]),
        "diagnosis_records": int(diag.shape[0]),
        "matched_participants": int(diag_keyed.shape[0]),
        "final_merged": int(merged.shape[0])
    }
}

# Save manifest as JSON
manifest_path = RAW / "MANIFEST.json"
with open(manifest_path, "w") as f:
    json.dump(manifest, f, indent=2)

print(f"✓ Saved processing manifest to: {manifest_path}")

# Display the manifest
print("\n" + "="*60)
print("PROCESSING SUMMARY")
print("="*60)
for key, value in manifest["record_counts"].items():
    print(f"  {key.replace('_', ' ').title()}: {value:,}")


✓ Saved merged dataset to: data/interim/HBN_pheno_with_diagnosis.csv
✓ Saved processing manifest to: data/raw/MANIFEST.json

PROCESSING SUMMARY
  Total Pheno Records: 4,239
  Unique Participants: 3,432
  Diagnosis Records: 4,766
  Matched Participants: 3,373
  Final Merged: 3,373
