In [1]:
# Clinical / Cognitive / Demographic Baseline Pipeline (CogNID-style)
# ------------------------------------------------------------------
# Run cells top-to-bottom. No command-line arguments are needed.
# This notebook will:
#  1) Apply baseline visit priority: bl/init > sc > m03 > m06 > m12 > m24 > later mXX
#  2) Keep ONE row per PTID
#  3) Light tidy + normalize Yes/No + map Gender/Diagnosis codes
#  4) Save baseline Excel
#  5) Generate EDA plots
#  6) (Optional) Class-aware KNN imputation (CogNID-style) -> clinical_imputed.xlsx


In [2]:
# --- imports
import re
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# --- visit priority (CogNID-style)
VISIT_PRIORITY = {
    "bl": 1, "init": 1,
    "sc": 2, "screening": 2,
    "m03": 3, "month3": 3, "3m": 3,
    "m06": 4, "month6": 4, "6m": 4,
    "m12": 5, "month12": 5, "12m": 5,
    "m24": 6, "month24": 6, "24m": 6,
}

# --- mappings from your spec
GENDER_MAP = {1: "male", 2: "female", "1": "male", "2": "female"}
DIAG_MAP   = {1: "CN",   2: "MCI",    3: "DEMENTIA", "1": "CN", "2": "MCI", "3": "DEMENTIA"}

# --- column finder tokens
PTID_TOKENS   = ["ptid", "subjectid", "subject_id", "participantid", "participant_id"]
VISIT_TOKENS  = ["visit", "visist", "viscode", "viscode2"]
GENDER_TOKENS = ["gender"]
DIAG_TOKENS   = ["diagnosis", "diagnoses", "diag"]
AGE_TOKENS    = ["entry_age", "age", "ptage", "baselineage"]

MMSE_TOKENS   = ["mmscore", "mmse"]
CDRSB_TOKENS  = ["cdr sum of boxes", "cdrsb"]
FAQ_TOKENS    = ["faq total", "faq total score", "faq"]
ADAS_TOKENS   = ["adas13", "adas 13"]
COMORB_TOKENS = ["hypertension", "stroke", "smok", "diabet", "cardio", "t2dm"]

# --- small helpers
def normalize_colnames(cols):
    def norm(c):
        c2 = str(c).strip()
        c2 = re.sub(r"\s+", " ", c2)
        return c2
    return [norm(c) for c in cols]

def find_exact_col(df, candidate_keys):
    for c in df.columns:
        lc = c.lower().replace(" ", "")
        for cand in candidate_keys:
            if lc == cand:
                return c
    return None

def find_contains_col(df, token_list):
    for col in df.columns:
        lc = col.lower()
        for tok in token_list:
            if tok in lc:
                return col
    return None

def parse_visit_priority(raw):
    if pd.isna(raw):
        return 10_000
    s = str(raw).strip().lower().replace(" ", "")
    if s in VISIT_PRIORITY:
        return VISIT_PRIORITY[s]
    m = re.match(r"m(\d+)", s)
    if m:
        try:
            months = int(m.group(1))
            base = 7
            return base + months
        except Exception:
            return 10_000
    v = re.match(r"v(\d+)", s)
    if v:
        return 9_000 + int(v.group(1))
    return 10_000

def drop_empty_columns(df: pd.DataFrame, keep_cols=None, min_non_null=1):
    keep_cols = [k for k in (keep_cols or []) if k is not None]
    drop = [c for c in df.columns if c not in keep_cols and df[c].notna().sum() < min_non_null]
    return df.drop(columns=drop), drop

def standardize_yes_no(df: pd.DataFrame, yes_tokens=("y","yes","1"), no_tokens=("n","no","0")):
    out = df.copy()
    for c in out.columns:
        if out[c].dtype == object:
            lc = out[c].astype(str).str.strip().str.lower()
            mask_yes = lc.isin(yes_tokens)
            mask_no  = lc.isin(no_tokens)
            out.loc[mask_yes, c] = "Yes"
            out.loc[mask_no,  c] = "No"
    return out

def plot_bar_counts(series, title, out_png):
    plt.figure()
    series.value_counts(dropna=False).plot(kind="bar", title=title)
    plt.tight_layout()
    plt.savefig(out_png, dpi=150)
    plt.close()

def boxplot_by_diag(df, feat_col, diag_col, out_png):
    plt.figure()
    ok = False
    if feat_col and diag_col and df[feat_col].notna().sum() > 0:
        groups, labels = [], []
        for dlab in df[diag_col].dropna().unique():
            vals = pd.to_numeric(df.loc[df[diag_col] == dlab, feat_col], errors="coerce").dropna().values
            if len(vals) > 0:
                groups.append(vals); labels.append(str(dlab))
        if groups:
            plt.boxplot(groups, labels=labels)
            plt.title(f"{feat_col} by {diag_col}")
            plt.xlabel(diag_col); plt.ylabel(feat_col)
            ok = True
    if not ok:
        plt.title(f"{feat_col} by {diag_col} (no data)")
    plt.tight_layout(); plt.savefig(out_png, dpi=150); plt.close()

def missingness_heatmap(df, out_png):
    plt.figure(figsize=(8, 6))
    msk = df.isna()
    plt.imshow(msk.values, aspect="auto", interpolation="nearest")
    plt.title("Missingness heatmap (white = missing)")
    plt.xlabel("Columns"); plt.ylabel("Rows")
    plt.colorbar(); plt.tight_layout(); plt.savefig(out_png, dpi=150); plt.close()


In [3]:
from pathlib import Path
import pandas as pd
import os, glob

# --- set your project root (absolute) ---
PROJECT_ROOT = Path("/Users/madhurabn/Desktop/adni")

# input file (edit the name if different)
INPUT_XLSX  = PROJECT_ROOT / "data" / "raw" / "dem_cli_cog ADNI.xlsx"

# outputs
OUTDIR      = PROJECT_ROOT / "data" / "processed"
PLOTSDIR    = OUTDIR / "plots"
OUTDIR.mkdir(parents=True, exist_ok=True)
PLOTSDIR.mkdir(parents=True, exist_ok=True)

print("CWD:", os.getcwd())
print("Project root:", PROJECT_ROOT)
print("Expecting input:", INPUT_XLSX)

# helpful auto-discovery if the exact name differs
if not INPUT_XLSX.exists():
    print("\nFile not found. Searching for candidates under data/raw ...")
    candidates = list((PROJECT_ROOT / "data" / "raw").rglob("*ADNI*.xlsx"))
    for i, p in enumerate(candidates, 1):
        print(f"{i}. {p}")
    if candidates:
        INPUT_XLSX = candidates[0]
        print("\nUsing first match:", INPUT_XLSX)
    else:
        raise FileNotFoundError("Could not find an ADNI .xlsx in data/raw/. Put your file there or update INPUT_XLSX.")

# read first sheet (correct API)
xl = pd.ExcelFile(INPUT_XLSX)
print("\nSheets:", xl.sheet_names)
df = xl.parse(xl.sheet_names[0])   # or xl.parse("Sheet1") if you prefer
print("Loaded shape:", df.shape)

# make these names available to the rest of the notebook
input_path = INPUT_XLSX
outdir = OUTDIR
plots_dir = PLOTSDIR


CWD: /Users/madhurabn/Desktop/adni/pipelines
Project root: /Users/madhurabn/Desktop/adni
Expecting input: /Users/madhurabn/Desktop/adni/data/raw/dem_cli_cog ADNI.xlsx

Sheets: ['Sheet1']
Loaded shape: (9561, 17)


In [4]:
# --- load and baseline-filter (one row per PTID) + light tidy + mappings + save
xl = pd.ExcelFile(input_path)
df = xl.parse(xl.sheet_names[0])
df.columns = normalize_colnames(df.columns)

ptid_col  = find_exact_col(df, PTID_TOKENS)
visit_col = find_exact_col(df, VISIT_TOKENS)
if ptid_col is None or visit_col is None:
    raise ValueError(f"Required columns not found. PTID={ptid_col}, VISIT={visit_col}\nGot: {list(df.columns)}")

gender_col = find_contains_col(df, GENDER_TOKENS)
diag_col   = find_contains_col(df, DIAG_TOKENS)
age_col    = find_contains_col(df, AGE_TOKENS)

# apply visit priority â†’ keep earliest per PTID
work = df.copy()
work["_visit_priority"] = work[visit_col].apply(parse_visit_priority)
work_sorted = work.sort_values(by=["_visit_priority"]).copy()
baseline = work_sorted.drop_duplicates(subset=[ptid_col], keep="first").copy()
baseline.drop(columns=["_visit_priority"], inplace=True)

# light tidy
baseline, dropped_empty = drop_empty_columns(baseline, keep_cols=[ptid_col, visit_col, gender_col, diag_col])
baseline = standardize_yes_no(baseline)

# map codes
if gender_col:
    baseline[gender_col] = baseline[gender_col].map(lambda x: GENDER_MAP.get(x, x))
if diag_col:
    baseline[diag_col] = baseline[diag_col].map(lambda x: DIAG_MAP.get(x, x))

# save baseline (CogNID style: one Excel per modality)
baseline_xlsx = outdir / "clinical_cognitive_demographic_baseline.xlsx"
baseline.to_excel(baseline_xlsx, index=False)
baseline.head(5)


Unnamed: 0,PTID,visit,gender,GENOTYPE,entry_age,entry_date,education,CDR Sum of Boxes,MMSCORE,Cardiovascular disease history,Alcohol abuse history,Smoking history,Hypertension history,Stroke / TIA history,DIAGNOSIS,FAQ Total Score,Total Score (ADAS 13)
0,941_S_10002,bl,female,3/4,72.95,2023-08-15,,,,,,,,,MCI,1.0,10.0
4110,005_S_0929,bl,male,3/4,82.05,2006-09-18,,,,,,,,,DEMENTIA,10.0,20.67
4098,116_S_0834,bl,male,3/4,63.88,2006-09-19,,,,,,,,,MCI,5.0,18.0
4089,100_S_0930,bl,male,3/4,75.05,2006-09-19,,,,,,,,,MCI,1.0,25.33
4069,033_S_0922,bl,male,3/4,68.12,2006-09-15,,,,,,,,,MCI,8.0,29.33


In [5]:
# --- EDA plots (saved to PNGs)
# 1) Age histogram
if age_col:
    series = pd.to_numeric(baseline[age_col], errors="coerce").dropna()
    plt.figure()
    series.plot(kind="hist", bins=30, title=f"Histogram of {age_col}")
    plt.xlabel(age_col); plt.tight_layout()
    plt.savefig(plots_dir / "age_hist.png", dpi=150); plt.close()

# 2) Gender distribution
if gender_col:
    plot_bar_counts(baseline[gender_col], f"Gender distribution ({gender_col})", plots_dir / "gender_bar.png")

# 3) Genotype distribution
geno_col = find_contains_col(baseline, ["genotype", "apoe"])
if geno_col:
    plot_bar_counts(baseline[geno_col], f"Genotype distribution ({geno_col})", plots_dir / "genotype_bar.png")

# 4) Cognitive/functional by Diagnosis
for tokens, fname in [
    (MMSE_TOKENS, "mmse_by_diag.png"),
    (CDRSB_TOKENS, "cdrsb_by_diag.png"),
    (FAQ_TOKENS, "faq_by_diag.png"),
    (ADAS_TOKENS, "adas13_by_diag.png"),
]:
    feat_col = find_contains_col(baseline, tokens)
    boxplot_by_diag(baseline, feat_col, diag_col, plots_dir / fname)

# 5) Comorbidity prevalence (heuristic)
comorb_cols = [c for c in baseline.columns if any(tok in c.lower() for tok in COMORB_TOKENS)]
if comorb_cols:
    counts = {}
    for c in comorb_cols:
        vc = baseline[c].value_counts(dropna=False)
        pos = 0
        if 1 in vc.index: pos = max(pos, int(vc.get(1, 0)))
        if "Yes" in vc.index: pos = max(pos, int(vc.get("Yes", 0)))
        counts[c] = pos
    ser = pd.Series(counts).sort_values(ascending=False) if counts else pd.Series(dtype=int)
    plt.figure()
    if not ser.empty:
        ser.plot(kind="bar", title="Comorbidity prevalence (heuristic positives)")
    else:
        plt.title("Comorbidity prevalence (no positives found)")
    plt.tight_layout(); plt.savefig(plots_dir / "comorbidities_bar.png", dpi=150); plt.close()

# 6) Missingness heatmap
plt.figure(figsize=(8, 6))
msk = baseline.isna()
plt.imshow(msk.values, aspect="auto", interpolation="nearest")
plt.title("Missingness heatmap (white = missing)"); plt.xlabel("Columns"); plt.ylabel("Rows")
plt.colorbar(); plt.tight_layout(); plt.savefig(plots_dir / "missingness_heatmap.png", dpi=150); plt.close()

print("Saved:")
print("-", baseline_xlsx)
print("-", plots_dir)


  plt.boxplot(groups, labels=labels)
  plt.boxplot(groups, labels=labels)
  plt.boxplot(groups, labels=labels)
  plt.boxplot(groups, labels=labels)


Saved:
- /Users/madhurabn/Desktop/adni/data/processed/clinical_cognitive_demographic_baseline.xlsx
- /Users/madhurabn/Desktop/adni/data/processed/plots
