In [None]:
# CSF Biomarker Baseline Pipeline — v2 (keeps original intact)
# -----------------------------------------------------------
# Run top-to-bottom. No CLI args.
# What this does:
# 1) Baseline selection using visit priority: bl/init > sc > m03 > m06 > m12 > m24 > anything else (all unrecognised visit codes share the same lowest priority)
# 2) One row per PTID (earliest visit kept)
# 3) NO dropping on Diagnosis (CSF file may not have labels). If a diagnosis column exists, some EDA plots are stratified by it.
# 4) Compute CSF ratios (Tau/Abeta42, pTau/Abeta42, Abeta42/Abeta40 if columns exist)
# 5) Save baseline Excel (with ratios): clinical_csf_baseline.xlsx
# 6) EDA after ratios: histogram/KDE of ratios, correlation heatmap (incl. ratios), optional scatter matrix; violin/box by diagnosis only if diagnosis exists
# 7) Imputation: MICE for numeric + IQR clipping (plausibility), mode for categorical/binary; save clinical_csf_imputed.xlsx
# 8) Plots saved under data/processed/plots/csf and also displayed inline


In [None]:
# Imports, helpers, tokens
import re
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# seaborn is optional: _HAS_SNS records whether it could be imported and configured,
# and later plotting code branches on it.
try:
    import seaborn as sns
    _HAS_SNS = True
    sns.set_context("notebook")
except Exception:
    # Any failure while importing or configuring seaborn switches the flag off.
    _HAS_SNS = False

# inline plotting
%matplotlib inline
# When True, plotting cells call plt.show() in addition to saving each figure.
SHOW_PLOTS = True

# Visit priority (CogNID-style)
# Lower number = preferred as the baseline row. Keys are compared against the
# stripped, lower-cased visit label; labels not listed here are ranked last
# by parse_visit_priority.
VISIT_PRIORITY = {
    "bl": 1, "init": 1, "baseline": 1,
    "sc": 2, "screening": 2,
    "m03": 3, "month3": 3, "3m": 3,
    "m06": 4, "month6": 4, "6m": 4,
    "m12": 5, "month12": 5, "12m": 5,
    # "24m" added so month 24 has the same spelling variants as months 3/6/12.
    "m24": 6, "month24": 6, "24m": 6,
}

# tokens for auto-detection (all lower-case; matched against lower-cased headers)
PTID_TOKENS  = ["ptid", "subject", "subject_id", "rid"]
# "visist" is kept on purpose: it tolerates a misspelled header.
VISIT_TOKENS = ["visit", "visist", "phase", "timepoint", "timepoint_label"]
GENDER_TOKENS = ["gender", "sex"]
AGE_TOKENS = ["entry_age", "age", "ptage", "baselineage"]

# CSF biomarker tokens (flexible headers), listed most specific first
TAU_TOKENS    = ["total tau", "tau pg/ml", "t-tau", "ttau", "total_tau"]
PTAU_TOKENS   = ["phospho tau", "p-tau", "ptau", "p_tau"]
ABETA42_TOKENS= ["a beta 142", "abeta42", "aβ42", "abeta 42", "ab42", "a beta 1-42"]
ABETA40_TOKENS= ["abeta40", "aβ40", "ab40", "a beta 1-40"]

# helpers

def normalize_colnames(cols):
    """Return the labels in ``cols`` as tidy strings.

    Each label is converted with ``str``, trimmed at both ends, and every run
    of whitespace inside it is collapsed to a single space. Order is preserved.
    """
    return [re.sub(r"\s+", " ", str(label).strip()) for label in cols]

def find_exact_col(df, tokens):
    """Return the column whose lower-cased name equals one of ``tokens``.

    Tokens are tried in order, so earlier tokens take precedence. If several
    columns share the same lower-cased name, the last one is the one found.
    Returns ``None`` when no token matches.
    """
    by_lower = {}
    for name in df.columns:
        by_lower[name.lower()] = name
    return next((by_lower[tok] for tok in tokens if tok in by_lower), None)

def find_contains_col(df, tokens):
    """Return the first column whose lower-cased name contains one of ``tokens``.

    Tokens are tried in list order, so an earlier (more specific) token wins
    over a later, looser one regardless of where the columns sit in the frame.
    For example ``["total tau", "tau pg/ml"]`` picks "Total Tau pg/mL" even if
    "Phospho Tau pg/mL" appears first. Within one token, the left-most matching
    column is returned.

    Parameters
    ----------
    df : object exposing ``.columns`` (e.g. a DataFrame)
    tokens : iterable of lower-case substrings, highest priority first

    Returns
    -------
    The original column label, or ``None`` when nothing matches.
    """
    # Lower-case once; str() keeps non-string labels from raising.
    lowered = [(col, str(col).lower()) for col in df.columns]
    for tok in tokens:
        for col, low in lowered:
            if tok in low:
                return col
    return None

def parse_visit_priority(v):
    """Map a visit label to its rank in ``VISIT_PRIORITY``.

    The label is stringified, stripped and lower-cased before the lookup.
    Missing values and labels absent from the table both get rank 999.
    """
    if not pd.isna(v):
        key = str(v).strip().lower()
        return VISIT_PRIORITY.get(key, 999)
    return 999

def drop_empty_columns(df, keep_cols=None):
    """Drop columns that contain only missing values.

    Columns named in ``keep_cols`` are never dropped; falsy entries in
    ``keep_cols`` (e.g. ``None``) are ignored.

    Returns a ``(new_frame, dropped_names)`` tuple. The input frame is not
    modified.
    """
    protected = {name for name in (keep_cols or []) if name}
    empty = [
        col for col in df.columns
        if col not in protected and df[col].isna().all()
    ]
    return df.drop(columns=empty), empty

def standardize_yes_no(df):
    """Normalise yes/no-style text columns to the literals "Yes" / "No".

    A column is rewritten only when it is text-like (object or string dtype)
    and every non-missing value is one of the recognised tokens
    (yes/y/true/1, no/n/false/0, case- and whitespace-insensitive).
    Columns that merely contain such a token among other values, for example
    an ID column holding "1", "2", "3", are left untouched so single cells
    are not silently turned into "Yes"/"No". Missing values stay missing.

    The frame is modified in place and also returned.
    """
    yes_tokens = ["yes", "y", "true", "1"]
    no_tokens = ["no", "n", "false", "0"]
    for c in df.columns:
        col = df[c]
        if not (pd.api.types.is_object_dtype(col) or pd.api.types.is_string_dtype(col)):
            continue
        present = col.notna()
        if not present.any():
            continue
        s = col.astype(str).str.strip().str.lower()
        mask_yes = present & s.isin(yes_tokens)
        mask_no = present & s.isin(no_tokens)
        # Skip mixed columns: converting only some cells would corrupt them.
        if not (mask_yes | mask_no)[present].all():
            continue
        df.loc[mask_yes, c] = "Yes"
        df.loc[mask_no, c] = "No"
    return df

def safe_filename(name: str) -> str:
    """Return ``name`` as text with each run of path-unsafe characters or spaces replaced by one underscore."""
    unsafe_run = re.compile(r'[\\/*?:"<>| ]+')
    return unsafe_run.sub("_", str(name))


In [None]:
# Paths
# Set the ADNI_PROJECT_ROOT environment variable to point at your own checkout;
# when it is unset the original default location below is used.
import os

PROJECT_ROOT = Path(os.environ.get("ADNI_PROJECT_ROOT", "/Users/madhurabn/Desktop/adni")).expanduser()
input_path   = PROJECT_ROOT / "data" / "raw" / "CSF biomarkers ADNI.xlsx"
outdir       = PROJECT_ROOT / "data" / "processed"
plots_dir    = outdir / "plots" / "csf"

# Fail before creating any directories if the project root is wrong.
if not input_path.is_file():
    raise FileNotFoundError(
        f"Input workbook not found: {input_path}\n"
        "Set ADNI_PROJECT_ROOT (or edit PROJECT_ROOT) to the folder that contains data/raw."
    )

outdir.mkdir(parents=True, exist_ok=True)
plots_dir.mkdir(parents=True, exist_ok=True)
print("Paths set:\n- input:", input_path, "\n- out:", outdir, "\n- plots:", plots_dir)


In [None]:
# Load, baseline-filter (one row per PTID), light tidy (NO diagnosis dropping here)
# Only the first sheet is read; the context manager closes the workbook handle.
with pd.ExcelFile(input_path) as xl:
    df = xl.parse(xl.sheet_names[0])
df.columns = normalize_colnames(df.columns)

# PTID and visit columns must match a token exactly; gender/age are substring matches.
ptid_col  = find_exact_col(df, PTID_TOKENS)
visit_col = find_exact_col(df, VISIT_TOKENS)
if ptid_col is None or visit_col is None:
    raise ValueError(f"Required columns not found. PTID={ptid_col}, VISIT={visit_col}\nGot: {list(df.columns)}")

gender_col = find_contains_col(df, GENDER_TOKENS)
age_col    = find_contains_col(df, AGE_TOKENS)

# baseline by visit priority
work = df.copy()
work["_visit_priority"] = work[visit_col].apply(parse_visit_priority)
# A stable sort keeps the file's row order among visits with equal priority,
# so the row kept per PTID is deterministic (the default sort is not stable).
work_sorted = work.sort_values(by=["_visit_priority"], kind="mergesort").copy()
baseline = (
    work_sorted
    .drop_duplicates(subset=[ptid_col], keep="first")
    .drop(columns=["_visit_priority"])
)

# light tidy: drop all-empty columns (never the id/visit/gender columns), then
# normalise yes/no text values
baseline, dropped_empty = drop_empty_columns(baseline, keep_cols=[ptid_col, visit_col, gender_col])
baseline = standardize_yes_no(baseline)

print("Baseline shape:", baseline.shape)


In [None]:
# Detect CSF columns and compute ratios if present

def find_col(df, tokens):
    """Look up a CSF column by delegating to ``find_contains_col``."""
    match = find_contains_col(df, tokens)
    return match

# Names of the derived ratio columns this cell may add to `baseline`.
_RATIO_NAMES = ["Tau_over_Abeta42", "pTau_over_Abeta42", "Abeta42_over_Abeta40"]

# Detect on a view without previously computed ratios, so re-running this cell
# can never pick a ratio column (e.g. "pTau_over_Abeta42") as a raw biomarker.
_detect_src = baseline.drop(columns=[r for r in _RATIO_NAMES if r in baseline.columns])

col_tau   = find_col(_detect_src, TAU_TOKENS)
col_ptau  = find_col(_detect_src, PTAU_TOKENS)
col_ab42  = find_col(_detect_src, ABETA42_TOKENS)
col_ab40  = find_col(_detect_src, ABETA40_TOKENS)

print("Detected CSF biomarker columns:\n - Total Tau:", col_tau, "\n - p-Tau:", col_ptau, "\n - Aβ42:", col_ab42, "\n - Aβ40:", col_ab40)

# Substring matching can map two biomarkers onto one header; make that visible.
_seen = {}
for _label, _col in {"Total Tau": col_tau, "p-Tau": col_ptau, "Aβ42": col_ab42, "Aβ40": col_ab40}.items():
    if _col is None:
        continue
    if _col in _seen:
        print(f"WARNING: column {_col!r} was detected as both {_seen[_col]} and {_label}; check the header tokens.")
    else:
        _seen[_col] = _label


def _safe_ratio(frame, num_col, den_col):
    """Return numerator / denominator as floats, with NaN instead of ±inf.

    Both columns are coerced to numeric (unparseable cells become NaN). A zero
    denominator yields NaN rather than infinity, so downstream plots and
    imputers receive ordinary missing values.
    """
    num = pd.to_numeric(frame[num_col], errors="coerce")
    den = pd.to_numeric(frame[den_col], errors="coerce")
    ratio = num / den.mask(den == 0)
    return ratio.replace([np.inf, -np.inf], np.nan)


# ratios (each computed only when both source columns were detected)
if col_tau and col_ab42:
    baseline["Tau_over_Abeta42"] = _safe_ratio(baseline, col_tau, col_ab42)
if col_ptau and col_ab42:
    baseline["pTau_over_Abeta42"] = _safe_ratio(baseline, col_ptau, col_ab42)
if col_ab42 and col_ab40:
    baseline["Abeta42_over_Abeta40"] = _safe_ratio(baseline, col_ab42, col_ab40)

# save baseline (with ratios)
baseline_xlsx = outdir / "clinical_csf_baseline.xlsx"
baseline.to_excel(baseline_xlsx, index=False)
print("Saved baseline with ratios ->", baseline_xlsx)


In [None]:
# Text summary of the baseline table once the ratio columns have been added
print("\n=== CSF Baseline Summary (after ratios) ===")
print("Shape:", baseline.shape)

# Count missing cells once per column; the grand total is the sum of those counts.
_missing_counts = baseline.isna().sum()
total_missing = int(_missing_counts.sum())
print("Total missing cells:", total_missing)

# Columns ordered from most to least missing
miss_per_col = _missing_counts.sort_values(ascending=False)
print("\nTop missing columns (first 15):")
print(miss_per_col.head(15))

# describe() for the numeric columns, one row per column
_numeric_part = baseline.select_dtypes(include=[np.number])
num_desc = _numeric_part.describe().T
print("\nNumeric describe (head):")
print(num_desc.head(12))


In [None]:
# EDA after ratios
from pandas.plotting import scatter_matrix

def _finish_figure(out_path):
    """Tighten, save (dpi=150), optionally show, and always close the current figure."""
    try:
        plt.tight_layout()
        plt.savefig(out_path, dpi=150)
        if SHOW_PLOTS:
            plt.show()
    finally:
        # Close even when saving fails so figures do not pile up in memory.
        plt.close()


def _finite_numeric(values):
    """Coerce to numeric and drop NaN and ±inf entries."""
    coerced = pd.to_numeric(values, errors="coerce")
    return coerced.replace([np.inf, -np.inf], np.nan).dropna()


# 1) Histogram/KDE of each ratio
ratio_cols = [c for c in ["Tau_over_Abeta42","pTau_over_Abeta42","Abeta42_over_Abeta40"] if c in baseline.columns]
for c in ratio_cols:
    # A zero denominator can leave ±inf in a ratio; histograms need finite data.
    series = _finite_numeric(baseline[c])
    if series.empty:
        continue
    plt.figure(figsize=(6,4))
    if _HAS_SNS:
        sns.histplot(series, bins=30, kde=True)
        plt.title(f"Histogram/KDE: {c}")
    else:
        plt.hist(series, bins=30)
        plt.title(f"Histogram: {c}")
    _finish_figure(plots_dir / f"hist_{safe_filename(c)}.png")

# 2) Correlation heatmap including ratios
num_df = baseline.select_dtypes(include=[np.number]).replace([np.inf, -np.inf], np.nan)
if not num_df.empty:
    corr = num_df.corr(numeric_only=True)
    plt.figure(figsize=(8,6))
    if _HAS_SNS:
        sns.heatmap(corr, cmap="vlag", center=0)
    else:
        # Plain-matplotlib fallback so the heatmap is not silently skipped.
        plt.imshow(corr.to_numpy(), cmap="coolwarm", vmin=-1, vmax=1, aspect="auto")
        plt.colorbar()
        plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
        plt.yticks(range(len(corr.index)), corr.index)
    plt.title("Correlation heatmap (incl. ratios)")
    _finish_figure(plots_dir / "corr_heatmap_csf.png")

# 3) Optional scatter matrix (may be heavy on large datasets)
try:
    sm_cols = ratio_cols[:4]  # limit to first few ratios
    if len(sm_cols) >= 2:
        sm_data = baseline[sm_cols].replace([np.inf, -np.inf], np.nan).dropna()
        scatter_matrix(sm_data, figsize=(8,8), diagonal='kde')
        plt.suptitle("Scatter matrix of CSF ratios", y=1.02)
        _finish_figure(plots_dir / "scatter_matrix_ratios.png")
except Exception as e:
    # Best effort: report the problem and discard any half-built figure.
    plt.close()
    print("Scatter matrix skipped:", e)

# 4) If a diagnosis-like column exists, violin/box by diagnosis
# (will auto-detect a diagnosis-like column if present, but won't error if missing)
possible_diag = ["diagnosis","diag","dx","dx_bl","diagnosis_bl"]
diag_guess = None
for cand in possible_diag:
    c0 = find_contains_col(baseline, [cand])
    if c0:
        diag_guess = c0
        break

if diag_guess:
    for c in ratio_cols:
        if _HAS_SNS:
            # Drop non-finite ratio values first; the KDE cannot handle inf.
            plot_df = baseline[[diag_guess, c]].replace([np.inf, -np.inf], np.nan).dropna(subset=[c])
            if plot_df.empty:
                continue
            plt.figure(figsize=(7,4))
            sns.violinplot(data=plot_df, x=diag_guess, y=c, inner="quartile", cut=0)
            plt.title(f"{c} by {diag_guess}")
            _finish_figure(plots_dir / f"violin_{safe_filename(c)}_by_{safe_filename(diag_guess)}.png")
        else:
            # fallback to boxplot
            groups, labels = [], []
            for dlab, sub in baseline.groupby(diag_guess):
                vals = _finite_numeric(sub[c]).values
                if len(vals):
                    groups.append(vals)
                    labels.append(str(dlab))
            if not groups:
                # Nothing to draw; no figure is created, so none is left open.
                continue
            plt.figure(figsize=(7,4))
            plt.boxplot(groups)
            # Set tick labels explicitly rather than through boxplot's `labels=`
            # keyword, which newer Matplotlib releases deprecate.
            plt.xticks(range(1, len(labels) + 1), labels)
            plt.title(f"{c} by {diag_guess}")
            _finish_figure(plots_dir / f"box_{safe_filename(c)}_by_{safe_filename(diag_guess)}.png")

print("EDA complete. Plots saved to:", plots_dir)


In [None]:
# Imputation: MICE (numeric) + IQR clipping; categorical via mode; single output Excel
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer, SimpleImputer

# Two independent snapshots of the baseline: an untouched reference and the
# working frame that the imputation steps modify.
df_before, df_after = baseline.copy(), baseline.copy()

# Split the working frame's columns by dtype: text/categorical vs. numeric
obj_like = list(df_after.select_dtypes(include=["object", "category"]).columns)
num_like = list(df_after.select_dtypes(include=[np.number]).columns)

# simple heuristic for binary-like numeric (keep for mode imputation)
def _is_binary_series(s: pd.Series) -> bool:
    vals = s.dropna().unique()
    if len(vals) == 0: return False
    if pd.api.types.is_numeric_dtype(s):
        try:
            return set(pd.Series(vals).astype(float).astype(int)).issubset({0,1})
        except Exception:
            return False
    low = pd.Series(vals).astype(str).str.strip().str.lower()
    return low.isin({"yes","no","y","n","true","false","0","1"}).all()

# Identifier and visit-label columns are bookkeeping, not measurements: they are
# never imputed or clipped (a numeric PTID would otherwise be treated as a
# continuous variable and IQR-clipped).
protected_cols = {c for c in (ptid_col, visit_col) if c is not None}

binary_cols_num = [c for c in num_like if c not in protected_cols and _is_binary_series(df_after[c])]
cont_cols = [c for c in num_like if c not in protected_cols and c not in binary_cols_num]
cat_cols = [c for c in obj_like if c not in protected_cols]

# 1) Categorical/binary -> mode imputation (most frequent value; ties resolve to
#    the smallest value because Series.mode() returns its result sorted).
#    Done per column so each column keeps its dtype; entirely empty columns
#    have no mode and are left as they are.
cat_bin_to_impute = [c for c in cat_cols + binary_cols_num if df_after[c].isna().any()]
for c in cat_bin_to_impute:
    modes = df_after[c].mode(dropna=True)
    if modes.empty:
        continue
    df_after[c] = df_after[c].fillna(modes.iloc[0])

# 2) Continuous -> MICE
if cont_cols:
    # IterativeImputer rejects infinities, so treat ±inf as missing.
    cont_values = df_after[cont_cols].astype(float).replace([np.inf, -np.inf], np.nan)
    df_after[cont_cols] = cont_values

    # Columns with no observed value are dropped by the imputer, which would make
    # the result narrower than the target; fit only on columns that have data.
    fit_cols = [c for c in cont_cols if cont_values[c].notna().any()]
    if fit_cols:
        mice = IterativeImputer(random_state=42, sample_posterior=False, max_iter=10, initial_strategy="median")
        df_after[fit_cols] = mice.fit_transform(cont_values[fit_cols])

    # --- IQR clipping to keep values clinically plausible ---
    # NOTE(review): the fences are computed after imputation and applied to every
    # value in the column, observed ones included — confirm this is intended.
    for c in cont_cols:
        series = pd.to_numeric(df_after[c], errors="coerce")
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        if pd.isna(q1) or pd.isna(q3):
            continue
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        # Non-negative biomarker hint
        nonneg_hint = any(kw in c.lower() for kw in ["tau","beta","abeta","aβ","pg_ml","csf"]) or c.lower().endswith(("42","40"))
        clipped = series.clip(lower=lower, upper=upper)
        if nonneg_hint:
            clipped = clipped.clip(lower=0)
        df_after[c] = clipped

# enforce binary back to nullable int 0/1 (coerced unconditionally so a column
# that ended up with object dtype is still converted)
for c in binary_cols_num:
    df_after[c] = pd.to_numeric(df_after[c], errors="coerce").round().clip(0, 1).astype("Int64")

# Save outputs
baseline_out = outdir / "clinical_csf_baseline.xlsx"  # written by the ratio cell; not rewritten here
imputed_out  = outdir / "clinical_csf_imputed.xlsx"

df_after.to_excel(imputed_out, index=False)
print("Saved imputed file ->", imputed_out)

# Quick post-imputation summary
print("\n=== MICE (numeric) + IQR clip + Mode (categorical) Summary ===")
print("Rows:", df_after.shape[0], "Cols:", df_after.shape[1])
print("Missing before:", int(df_before.isna().sum().sum()))
print("Missing after :", int(df_after.isna().sum().sum()))
