In [None]:
# CSF Biomarker Baseline Pipeline (CogNID-style)
# ------------------------------------------------
# Run top-to-bottom. No CLI args.
# What this does:
# - Visit priority baseline selection: bl/init > sc > m03 > m06 > m12 > m24 > later
# - One row per PTID
# - Drop rows with empty Diagnosis (target)
# - Compute CSF ratios (Tau/Aβ42, pTau/Aβ42, Aβ42/Aβ40 if present)
# - Save baseline Excel (labeled-only)
# - EDA plots (inline + saved)
# - Hybrid imputation: MICE (continuous) + Mode (categorical/binary)
# - Save single final imputed Excel


In [None]:
# Imports, helpers, tokens
import re
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# inline plots toggle
# `%matplotlib inline` is an IPython/Jupyter magic, not plain Python: this cell
# only runs inside a notebook kernel.
%matplotlib inline
# When True, every plotting helper/cell in this notebook calls plt.show()
# after saving its figure; the PNG is written either way.
SHOW_PLOTS = True

# Visit priority (CogNID-style)
# Lower number = preferred baseline visit. Keys are compared against the visit
# code after it has been lower-cased and stripped of spaces
# (see parse_visit_priority).
VISIT_PRIORITY = {
    "bl": 1, "init": 1,
    "sc": 2, "screening": 2,
    "m03": 3, "month3": 3, "3m": 3,
    "m06": 4, "month6": 4, "6m": 4,
    "m12": 5, "month12": 5, "12m": 5,
    "m24": 6, "month24": 6, "24m": 6,
}

# Mappings from earlier pipeline
# Both int and str keys are listed so a code matches whether the cell holds a
# number or its text form. Values not present in a map are left unchanged by
# the `.get(x, x)` lookups that use these maps.
GENDER_MAP = {1: "male", 2: "female", "1": "male", "2": "female"}
DIAG_MAP   = {1: "CN",   2: "MCI",    3: "DEMENTIA", "1": "CN", "2": "MCI", "3": "DEMENTIA"}

# Column finder tokens
# PTID_TOKENS / VISIT_TOKENS are used with find_exact_col: the column name is
# lower-cased, spaces are removed, and the result must equal a token.
# GENDER/DIAG/AGE tokens are used with find_contains_col: a lower-cased
# substring match, so short tokens such as "age" or "dx" can also hit
# unrelated column names that merely contain them.
PTID_TOKENS   = ["ptid", "subjectid", "subject_id", "participantid", "participant_id"]
VISIT_TOKENS  = ["visit", "visist", "viscode", "viscode2"]  # "visist": presumably tolerates a known header typo - confirm
GENDER_TOKENS = ["gender"]
DIAG_TOKENS   = ["diagnosis", "diagnoses", "diag", "dx", "dx_bl", "dxchange"]
AGE_TOKENS    = ["entry_age", "age", "ptage", "baselineage"]

# CSF tokens
# All matched as lower-cased substrings of the column name.
# NOTE: some total-Tau tokens ("tau pg/ml", "tau (") are substrings of typical
# p-Tau column names ("ptau pg/ml", "ptau ("), so a total-Tau lookup must rule
# out p-Tau columns to be unambiguous.
TAU_TOKENS     = ["total tau", "t-tau", "ttau", "tau pg/ml", "tau (", " tau ", " tau"]
PTAU_TOKENS    = ["phospho tau", "p-tau", "ptau", "ptau pg/ml", "ptau (", " p-tau"]
ABETA42_TOKENS = ["a beta 142", "abeta 1-42", "abeta42", "aβ42", "aβ42", "abeta 42", "a beta42"]
ABETA40_TOKENS = ["abeta40", "aβ40", "aβ40", "a beta 140", "abeta 40", "a beta40"]

# Helpers
def normalize_colnames(cols):
    """Return the column labels as tidy strings.

    Each label is converted with ``str``, stripped of leading/trailing
    whitespace, and every internal run of whitespace is collapsed to a
    single space. The result is a new list in the same order as *cols*.
    """
    cleaned = []
    for label in cols:
        text = str(label).strip()
        cleaned.append(re.sub(r"\s+", " ", text))
    return cleaned

def find_exact_col(df, candidate_keys):
    """Return the first column whose normalized name equals a candidate key.

    A column name is normalized by lower-casing it and removing spaces.
    Columns are scanned in DataFrame order; the first one that equals any
    entry of *candidate_keys* wins. Returns ``None`` when nothing matches.
    """
    for column in df.columns:
        squashed = column.lower().replace(" ", "")
        if any(squashed == key for key in candidate_keys):
            return column
    return None

def find_contains_col(df, token_list):
    """Return the first column whose lower-cased name contains any token.

    Columns are scanned in DataFrame order, so the earliest matching column
    is returned regardless of which token matched. Returns ``None`` when no
    column contains any of the tokens.
    """
    for name in df.columns:
        lowered = name.lower()
        if any(token in lowered for token in token_list):
            return name
    return None

def parse_visit_priority(raw):
    """Translate a raw visit code into a sortable priority (lower = earlier).

    Resolution order:
      * missing value                     -> 10_000
      * code listed in VISIT_PRIORITY     -> its mapped priority
      * code starting with ``m<digits>``  -> 7 + <digits>
      * code starting with ``v<digits>``  -> 9_000 + <digits>
      * anything else                     -> 10_000

    The code is lower-cased and stripped of spaces before matching.
    Note that month codes absent from VISIT_PRIORITY are ranked by
    ``7 + months``, so e.g. "m18" (25) sorts after "m24" (6).
    """
    unknown = 10_000
    if pd.isna(raw):
        return unknown

    code = str(raw).strip().lower().replace(" ", "")
    if code in VISIT_PRIORITY:
        return VISIT_PRIORITY[code]

    month_match = re.match(r"m(\d+)", code)
    if month_match is not None:
        try:
            return 7 + int(month_match.group(1))
        except Exception:
            return unknown

    visit_match = re.match(r"v(\d+)", code)
    if visit_match is not None:
        return 9_000 + int(visit_match.group(1))

    return unknown

def drop_empty_columns(df: pd.DataFrame, keep_cols=None, min_non_null=1):
    """Remove columns that have fewer than *min_non_null* non-null values.

    Columns named in *keep_cols* are never removed; ``None`` entries in
    *keep_cols* are ignored.

    Returns:
        (DataFrame without the sparse columns, list of dropped column names).
    """
    protected = [name for name in (keep_cols or []) if name is not None]
    too_sparse = []
    for column in df.columns:
        if column in protected:
            continue
        if df[column].notna().sum() < min_non_null:
            too_sparse.append(column)
    return df.drop(columns=too_sparse), too_sparse

def standardize_yes_no(df: pd.DataFrame, yes_tokens=("y","yes","1"), no_tokens=("n","no","0")):
    """Return a copy of *df* with yes/no-like text normalized to "Yes"/"No".

    Only object-dtype columns are touched. A cell is compared after being
    converted to ``str``, stripped and lower-cased; cells matching
    *yes_tokens* become "Yes", cells matching *no_tokens* become "No", and
    everything else is left as it was. The input frame is not modified.
    """
    result = df.copy()
    for column in result.columns:
        if result[column].dtype != object:
            continue
        lowered = result[column].astype(str).str.strip().str.lower()
        is_yes = lowered.isin(yes_tokens)
        is_no = lowered.isin(no_tokens)
        result.loc[is_yes, column] = "Yes"
        result.loc[is_no, column] = "No"
    return result

def safe_filename(name: str) -> str:
    """Return *name* as text with each run of path-unsafe characters or spaces replaced by "_"."""
    unsafe_run = r'[\\/*?:"<>| ]+'
    text = str(name)
    return re.sub(unsafe_run, "_", text)

# Simple plotting helpers (matplotlib only)
def plot_bar_counts(series, title, out_png):
    """Save a bar chart of value frequencies in *series* to *out_png*.

    Missing values get their own bar (``dropna=False``). The figure is also
    displayed when SHOW_PLOTS is true, and is always closed afterwards.
    """
    plt.figure()
    counts = series.value_counts(dropna=False)
    counts.plot(kind="bar", title=title)
    plt.tight_layout()
    plt.savefig(out_png, dpi=150)
    if SHOW_PLOTS:
        plt.show()
    plt.close()

def boxplot_by_diag(df, feat_col, diag_col, out_png):
    """Save a boxplot of *feat_col* split by the levels of *diag_col*.

    Feature values are coerced to numeric and non-numeric/missing entries
    are discarded; diagnosis levels left with no values are skipped. When
    there is nothing to draw (missing column names, empty feature, or no
    usable group) an empty figure titled "... (no data)" is saved instead.
    The figure is shown when SHOW_PLOTS is true and always closed.
    """
    plt.figure()

    samples, tick_names = [], []
    if feat_col and diag_col and df[feat_col].notna().sum() > 0:
        for level in df[diag_col].dropna().unique():
            in_group = df[diag_col] == level
            numeric = pd.to_numeric(df.loc[in_group, feat_col], errors="coerce").dropna().values
            if len(numeric) > 0:
                samples.append(numeric)
                tick_names.append(str(level))

    if samples:
        plt.boxplot(samples, labels=tick_names)
        plt.title(f"{feat_col} by {diag_col}")
        plt.xlabel(diag_col)
        plt.ylabel(feat_col)
    else:
        plt.title(f"{feat_col} by {diag_col} (no data)")

    plt.tight_layout()
    plt.savefig(out_png, dpi=150)
    if SHOW_PLOTS:
        plt.show()
    plt.close()


In [None]:
# Paths (EDIT THESE to your machine layout)
from pathlib import Path
import os

# Set project root, then input/out paths.
# The root can be overridden without editing the notebook by exporting
# ADNI_PROJECT_ROOT; otherwise the original hard-coded location is used.
PROJECT_ROOT = Path(os.environ.get("ADNI_PROJECT_ROOT", "/Users/madhurabn/Desktop/adni"))  # <-- change if needed
input_path   = PROJECT_ROOT / "data" / "raw" / "CSF biomarkers ADNI.xlsx"
outdir       = PROJECT_ROOT / "data" / "processed"
plots_dir    = outdir / "plots" / "csf"

outdir.mkdir(parents=True, exist_ok=True)
plots_dir.mkdir(parents=True, exist_ok=True)

print("Project root:", PROJECT_ROOT)
print("Input:", input_path)
print("Outdir:", outdir)
print("Plots:", plots_dir)

if not input_path.exists():
    # mild auto-discovery
    raw_dir = PROJECT_ROOT / "data" / "raw"
    # - skip "~$..." files: Excel writes these lock files next to an open
    #   workbook and they match the pattern but are not readable workbooks
    # - sort so the same file is picked on every run (rglob order is
    #   filesystem-dependent)
    candidates = sorted(
        p for p in raw_dir.rglob("*CSF*ADNI*.xlsx")
        if not p.name.startswith("~$")
    )
    if candidates:
        input_path = candidates[0]
        print("Using discovered:", input_path)
    else:
        raise FileNotFoundError("CSF biomarkers Excel not found. Update input_path.")


In [None]:
# Load and baseline-filter (one row per PTID) + tidy + mappings + drop unlabeled Diagnosis
# Context manager so the workbook's file handle is released once the first
# sheet has been parsed.
with pd.ExcelFile(input_path) as xl:
    df = xl.parse(xl.sheet_names[0])
df.columns = normalize_colnames(df.columns)

ptid_col  = find_exact_col(df, PTID_TOKENS)
visit_col = find_exact_col(df, VISIT_TOKENS)
if ptid_col is None or visit_col is None:
    raise ValueError(f"Required columns not found. PTID={ptid_col}, VISIT={visit_col}\nGot: {list(df.columns)}")

gender_col = find_contains_col(df, GENDER_TOKENS)
diag_col   = find_contains_col(df, DIAG_TOKENS)
age_col    = find_contains_col(df, AGE_TOKENS)

# Diagnosis is the target: fail before doing any work if it cannot be found.
if diag_col is None:
    raise ValueError("Diagnosis column not detected. Merge/rename so a diagnosis-like column exists before this step.")

# apply visit priority → keep earliest per PTID
work = df.copy()
work["_visit_priority"] = work[visit_col].apply(parse_visit_priority)
# Stable sort: rows with the same priority keep their original file order, so
# the row kept per PTID is deterministic (the default sort is not stable).
work_sorted = work.sort_values(by=["_visit_priority"], kind="mergesort").copy()
baseline = work_sorted.drop_duplicates(subset=[ptid_col], keep="first").copy()
baseline.drop(columns=["_visit_priority"], inplace=True)

# light tidy
baseline, dropped_empty = drop_empty_columns(baseline, keep_cols=[ptid_col, visit_col, gender_col, diag_col])

# Yes/No standardization rewrites the strings "1"/"0" (and "y"/"n") in every
# object column. The key columns hold IDs / visit codes / coded categories,
# not yes/no answers, so restore them afterwards; otherwise a text-coded
# gender or diagnosis of "1" would become "Yes" and never reach the maps below.
key_cols = [c for c in (ptid_col, visit_col, gender_col, diag_col) if c is not None]
standardized = standardize_yes_no(baseline)
for key in key_cols:
    standardized[key] = baseline[key]
baseline = standardized

# map codes (gender, diagnosis); values without a mapping are kept unchanged
if gender_col:
    baseline[gender_col] = baseline[gender_col].map(lambda x: GENDER_MAP.get(x, x))
baseline[diag_col] = baseline[diag_col].map(lambda x: DIAG_MAP.get(x, x))

# drop rows with empty Diagnosis BEFORE saving/imputation
baseline[diag_col] = baseline[diag_col].replace(["", " ", "NA", "N/A", "nan", "NaN"], pd.NA)
n0 = len(baseline)
baseline = baseline.dropna(subset=[diag_col]).copy()
print(f"Dropped {n0 - len(baseline)} rows with missing {diag_col}. Kept: {len(baseline)}")

# quick summary
print("\n=== CSF Baseline (labeled) summary ===")
print("Shape:", baseline.shape)
miss_total = int(baseline.isna().sum().sum())
print("Total missing cells:", miss_total)
vc_diag = baseline[diag_col].value_counts(dropna=False)
print("\nDiagnosis counts:\n", vc_diag)
if gender_col:
    print("\nGender counts:\n", baseline[gender_col].value_counts(dropna=False))
if age_col:
    age_series = pd.to_numeric(baseline[age_col], errors="coerce")
    print("\nAge (numeric) describe:\n", age_series.describe())


In [None]:
# Detect CSF columns and compute ratios if present

def find_col(df, tokens, exclude_tokens=()):
    """Return the first column whose lower-cased name contains a token.

    Columns whose name contains any of *exclude_tokens* are skipped. With
    the default empty *exclude_tokens* this is a plain "first column that
    contains any token" lookup. Returns ``None`` when nothing matches.
    """
    for col in df.columns:
        lc = str(col).lower()
        if any(tok in lc for tok in exclude_tokens):
            continue
        if any(tok in lc for tok in tokens):
            return col
    return None


def _safe_ratio(df, num_col, den_col):
    """Return num/den as floats, with non-numeric cells and ±inf set to NaN."""
    num = pd.to_numeric(df[num_col], errors="coerce")
    den = pd.to_numeric(df[den_col], errors="coerce")
    # A zero denominator yields ±inf, which the later MICE step rejects.
    return (num / den).replace([np.inf, -np.inf], np.nan)


col_ptau  = find_col(baseline, PTAU_TOKENS)
# Total-Tau tokens such as "tau pg/ml" or "tau (" are substrings of p-Tau
# column names, so p-Tau-looking columns must be excluded explicitly.
col_tau   = find_col(baseline, TAU_TOKENS, exclude_tokens=PTAU_TOKENS)
col_ab42  = find_col(baseline, ABETA42_TOKENS)
col_ab40  = find_col(baseline, ABETA40_TOKENS)

print("Detected:")
print(" - Total Tau:", col_tau)
print(" - p-Tau:", col_ptau)
print(" - Aβ42:", col_ab42)
print(" - Aβ40:", col_ab40)

# Create ratios safely (only when both components were detected)
if col_tau and col_ab42:
    baseline["Tau_over_Abeta42"] = _safe_ratio(baseline, col_tau, col_ab42)
if col_ptau and col_ab42:
    baseline["pTau_over_Abeta42"] = _safe_ratio(baseline, col_ptau, col_ab42)
if col_ab42 and col_ab40:
    baseline["Abeta42_over_Abeta40"] = _safe_ratio(baseline, col_ab42, col_ab40)

# Save labeled baseline with ratios
baseline_xlsx = (outdir / "clinical_csf_baseline.xlsx")
baseline.to_excel(baseline_xlsx, index=False)
print("\nSaved CSF baseline:", baseline_xlsx)


In [None]:
# EDA plots (inline + saved)
# The globals() lookups let this cell run even if an earlier cell that defines
# an optional column variable was skipped.

# 1) Age histogram (if age exists)
if 'age_col' in globals() and age_col:
    series = pd.to_numeric(baseline[age_col], errors="coerce").dropna()
    if not series.empty:
        plt.figure(); series.plot(kind="hist", bins=30, title=f"Histogram of {age_col}")
        plt.xlabel(age_col); plt.tight_layout(); plt.savefig(plots_dir / "csf_age_hist.png", dpi=150)
        if SHOW_PLOTS: plt.show()
        plt.close()

# 2) Diagnosis distribution
plot_bar_counts(baseline[diag_col], f"Diagnosis distribution ({diag_col})", plots_dir / "csf_diag_bar.png")

# 3) Gender distribution
if 'gender_col' in globals() and gender_col:
    plot_bar_counts(baseline[gender_col], f"Gender distribution ({gender_col})", plots_dir / "csf_gender_bar.png")

# 4) CSF boxplots by diagnosis (raw biomarkers & ratios if available)
for feat, label in [
    (globals().get('col_tau', None), 'Total Tau'),
    (globals().get('col_ptau', None), 'p-Tau'),
    (globals().get('col_ab42', None), 'Aβ42'),
    (globals().get('col_ab40', None), 'Aβ40'),
    ('Tau_over_Abeta42', 'Tau/Aβ42'),
    ('pTau_over_Abeta42', 'pTau/Aβ42'),
    ('Abeta42_over_Abeta40', 'Aβ42/Aβ40'),
]:
    if feat and feat in baseline.columns:
        out_name = f"csf_box_{safe_filename(label)}.png"
        boxplot_by_diag(baseline, feat, diag_col, plots_dir / out_name)

# 5) Missingness heatmap
plt.figure(figsize=(8, 6))
msk = baseline.isna()
# Gray colormap pinned to [0, 1] so that missing cells (1) really are white
# and present cells (0) black, as the title states; the default colormap
# would draw missing cells in yellow.
plt.imshow(msk.values.astype(float), aspect="auto", interpolation="nearest",
           cmap="gray", vmin=0, vmax=1)
plt.title("CSF Missingness heatmap (white = missing)")
plt.xlabel("Columns"); plt.ylabel("Rows")
plt.colorbar(); plt.tight_layout(); plt.savefig(plots_dir / "csf_missingness_heatmap.png", dpi=150)
if SHOW_PLOTS: plt.show()
plt.close()


In [None]:
# Hybrid imputation: MICE (continuous) + Mode (categorical/binary) -> single output
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer, SimpleImputer

# Two independent copies of the labeled baseline: df_before is kept as the
# pre-imputation reference, df_after is the frame that gets filled in.
df_before, df_after = baseline.copy(), baseline.copy()

# Split columns by dtype: text/categorical vs. numeric
obj_like = list(df_after.select_dtypes(include=['object', 'category']).columns)
num_like = list(df_after.select_dtypes(include=[np.number]).columns)

# helper to detect binary
def _is_binary_series(s: pd.Series) -> bool:
    vals = s.dropna().unique()
    if len(vals) == 0: return False
    if pd.api.types.is_numeric_dtype(s):
        try:
            return set(pd.Series(vals).astype(float).astype(int)).issubset({0,1})
        except Exception:
            return False
    low = pd.Series(vals).astype(str).str.strip().str.lower()
    return low.isin({"yes","no","y","n","true","false","0","1"}).all()

# Identifier and target columns are never imputed.
ptid_key = globals().get('ptid_col', None)
diag_key = globals().get('diag_col', None)

binary_cols_num = [c for c in num_like if c not in {ptid_key, diag_key} and _is_binary_series(df_after[c])]
cont_cols = [c for c in num_like if c not in {ptid_key, diag_key} and c not in binary_cols_num]
cat_cols = [c for c in obj_like if c not in {ptid_key, diag_key}]

# ±inf (e.g. from a ratio with a zero denominator) is treated as missing:
# IterativeImputer refuses input that contains infinity.
if cont_cols:
    df_after[cont_cols] = df_after[cont_cols].replace([np.inf, -np.inf], np.nan)

# impute categorical/binary with most_frequent
# Columns with no observed value at all are skipped: there is no mode to use,
# and the imputer would drop them, making the assignment below fail on shape.
cat_bin_to_impute = [
    c for c in cat_cols + binary_cols_num
    if df_after[c].isna().any() and df_after[c].notna().any()
]
if cat_bin_to_impute:
    mode_imp = SimpleImputer(strategy="most_frequent")
    df_after[cat_bin_to_impute] = mode_imp.fit_transform(df_after[cat_bin_to_impute])

# impute continuous with MICE
# Same reasoning: all-missing columns are left out (and stay missing).
mice_cols = [c for c in cont_cols if df_after[c].notna().any()]
skipped_all_missing = [c for c in cont_cols if c not in mice_cols]
if skipped_all_missing:
    print("Left unimputed (no observed values):", skipped_all_missing)
if mice_cols:
    mice = IterativeImputer(random_state=42, sample_posterior=False, max_iter=10, initial_strategy="median")
    df_after[mice_cols] = mice.fit_transform(df_after[mice_cols])

# enforce binary back to 0/1 (mode imputation returns an object array)
for c in binary_cols_num:
    df_after[c] = pd.to_numeric(df_after[c], errors="coerce").round().clip(0,1).astype("Int64").astype(int)

# save single output
imputed_path = outdir / "clinical_csf_imputed.xlsx"
df_after.to_excel(imputed_path, index=False)

# summary
print("=== CSF Imputation Summary (MICE continuous + Mode categorical) ===")
print("Rows:", df_after.shape[0], " Columns:", df_after.shape[1])
print("Missing before:", int(df_before.isna().sum().sum()))
print("Missing after :", int(df_after.isna().sum().sum()))

# simple missingness bars
for tag, d in [("before", df_before), ("after", df_after)]:
    plt.figure(figsize=(10,4))
    d.isna().sum().sort_values(ascending=False).plot(kind="bar", title=f"CSF Missing per column ({tag.upper()})")
    plt.tight_layout(); plt.savefig(plots_dir / f"csf_missing_bar_{tag}.png", dpi=150)
    if SHOW_PLOTS: plt.show()
    plt.close()

# distribution compare for a few continuous cols (only those actually imputed)
for c in mice_cols[:5]:
    plt.figure(figsize=(6,4))
    # drop ±inf as well as NaN: plt.hist cannot bin non-finite values
    before_vals = df_before[c].replace([np.inf, -np.inf], np.nan).dropna()
    plt.hist(before_vals, bins=30, alpha=0.5, label="Before", density=True)
    plt.hist(df_after[c].dropna(),  bins=30, alpha=0.5, label="After",  density=True)
    plt.title(f"CSF Distribution before vs after imputation: {c}")
    plt.legend(); plt.tight_layout()
    plt.savefig(plots_dir / f"csf_dist_compare_{safe_filename(c)}.png", dpi=150)
    if SHOW_PLOTS: plt.show()
    plt.close()

# missingness heatmaps
fig, axes = plt.subplots(1, 2, figsize=(12,6))
axes[0].imshow(df_before.isna(), aspect="auto"); axes[0].set_title("Missingness (before)")
axes[1].imshow(df_after.isna(),  aspect="auto"); axes[1].set_title("Missingness (after)")
for ax in axes: ax.set_xlabel("Columns"); ax.set_ylabel("Rows")
plt.tight_layout(); plt.savefig(plots_dir / "csf_missingness_heatmaps.png", dpi=150)
if SHOW_PLOTS: plt.show()
plt.close()

print("\nSaved baseline:", (outdir / "clinical_csf_baseline.xlsx"))
print("Saved final imputed file:", imputed_path)
print("Saved plots in:", plots_dir)
