
# ADNI Diagnosis — Full Fusion Pipeline (ANOVA + ADASYN)

This notebook trains multi-class diagnostic models (CN / MCI / DEMENTIA) on your **full fusion** dataset.

**What it does**
- Loads `fusion_master.xlsx` and detects `PTID` and `Diagnosis`.
- Drops rows with missing `Diagnosis` and removes any pre-existing one-hot `diagnosis_*` columns from features to prevent leakage.
- Encodes Diagnosis → integers (CN=0, MCI=1, DEMENTIA=2).
- Preprocesses: median-impute numerics + standardize, one-hot encode categoricals.
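- **Feature selection**: ANOVA F-test (`SelectKBest(f_classif)`), fitted inside each CV fold on the resampled training data.
- **Class imbalance**: ADASYN oversampling, applied only to the training portion of each CV fold (held-out samples are never resampled, so no leakage).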
- Trains models: **RandomForest, XGBoost, SVM**, and a **Stacking Ensemble**.
- Evaluates with **Accuracy, Macro-F1, Macro ROC-AUC**, and **confusion matrices**.


In [None]:

# === User config ===
# The input workbook and output directory default to the original locations but can be
# overridden without editing the notebook by setting the environment variables
# ADNI_FUSION_XLSX and ADNI_OUTDIR before starting the kernel.
import os
from pathlib import Path

FUSION_XLSX = os.environ.get(
    "ADNI_FUSION_XLSX",
    r"/Users/madhurabn/Desktop/adni/data/processed/fusion_master.xlsx",  # <--- set to your path
)
OUTDIR = Path(
    os.environ.get("ADNI_OUTDIR", "/Users/madhurabn/Desktop/adni/out/full_fusion")
).expanduser()
# Create the output directory up front so later cells can write metric files into it.
OUTDIR.mkdir(parents=True, exist_ok=True)

RANDOM_STATE = 42  # seed shared by CV splitting, ADASYN and all models
N_FOLDS = 5        # number of cross-validation folds
K_SELECTED = 30  # number of features to keep via ANOVA

print("Output dir:", OUTDIR.resolve())


In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import re, warnings
warnings.filterwarnings("ignore")

# plotting inline & style
%matplotlib inline

# sklearn / imblearn
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline as SkPipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import ADASYN

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay

from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# xgboost is optional: when the import fails for any reason, HAS_XGB is set to False
# and the notebook carries on without the XGBoost model.
try:
    from xgboost import XGBClassifier
    HAS_XGB = True
except Exception as e:
    # Deliberately broad: any failure while importing xgboost only disables that model
    # (the reason is printed) instead of aborting the notebook.
    print("xgboost not available:", e)
    HAS_XGB = False


In [None]:

# === Load fusion master ===
# Read the workbook at FUSION_XLSX into a DataFrame and report its shape
# together with the first 15 column names as a quick sanity check.
df = pd.read_excel(FUSION_XLSX)
leading_columns = df.columns[:15].tolist()
print(f"Loaded: {df.shape}")
print(f"Columns (head): {leading_columns}")

# --- find PTID and Diagnosis columns ---
def _first_match(cols, tokens):
    tokens = [t.lower() for t in tokens]
    for c in cols:
        lc = str(c).lower()
        if any(t in lc for t in tokens):
            return c
    return None

# Locate the subject-ID and diagnosis columns by name.
ptid_col = _first_match(df.columns, ["ptid","rid","subject"])
diag_col = _first_match(df.columns, ["diagnosis","dx","dxgroup","clinical_diagnosis"])

if ptid_col is None or diag_col is None:
    raise ValueError(f"Could not find PTID or Diagnosis. Found PTID={ptid_col}, Diagnosis={diag_col}")

print("Detected PTID column:", ptid_col)
print("Detected Diagnosis column:", diag_col)

# Rows without a diagnosis cannot be used as labelled samples: filter them out
# and report how many were removed.
before = df.shape[0]
has_diagnosis = df[diag_col].notna()
df = df.loc[has_diagnosis].copy()
print(f"Dropped {before - df.shape[0]} rows with empty {diag_col}. Current shape: {df.shape}")

# One-hot diagnosis columns already present in the sheet would hand the label to
# the models, so they are removed from the frame before features are built.
import re
_diag_dummy_pattern = re.compile(r"(?i)diagnosis[_\- ]?(cn|mci|ad|dementia)")
drop_diag_dummies = []
for c in df.columns:
    if _diag_dummy_pattern.match(str(c)):
        drop_diag_dummies.append(c)
if drop_diag_dummies:
    print("Dropping pre-existing diagnosis dummy columns from features:", drop_diag_dummies)
    df = df.drop(columns=drop_diag_dummies)
# map diagnosis to integers {CN:0, MCI:1, DEMENTIA:2} (case-insensitive)
def _map_diag(v):
    if pd.isna(v): return np.nan
    s = str(v).strip().lower()
    if s in {"cn", "control", "normal"}: return 0
    if s in {"mci"}: return 1
    if s in {"ad", "dementia", "alzheimers", "alzheimer's"}: return 2
    return np.nan

# Encode the diagnosis labels; stop with an explicit error if any label is not
# covered by the mapping, rather than silently dropping or mis-coding rows.
y_int = df[diag_col].map(_map_diag)
if y_int.isna().any():
    bad = df.loc[y_int.isna(), diag_col].unique()[:10]
    raise ValueError(f"Unknown diagnosis labels encountered: {bad}. Please adjust mapping.")

y = y_int.values.astype(int)

# X = all non-target columns except obvious IDs / meta.
# Column names are compared case-insensitively so that variants such as
# "VISCODE" or "Visit" are removed as well, not only the lower-case spellings.
meta_like = {str(ptid_col).strip().lower(), str(diag_col).strip().lower(), "visit", "viscode"}
X = df.drop(columns=[c for c in df.columns if str(c).strip().lower() in meta_like]).copy()

# basic type split
# pandas' dtype helpers are used instead of np.issubdtype because the latter raises
# TypeError on pandas extension dtypes (nullable integers, categoricals, ...).
# Booleans are kept out of the numeric group, as before, so they get one-hot encoded.
num_cols = [
    c for c in X.columns
    if pd.api.types.is_numeric_dtype(X[c]) and not pd.api.types.is_bool_dtype(X[c])
]
cat_cols = [c for c in X.columns if c not in num_cols]

print(f"Feature split -> numeric: {len(num_cols)}, categorical: {len(cat_cols)}")


In [None]:

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline as SkPipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.over_sampling import ADASYN

def build_pipeline(classifier):
    """Wrap ``classifier`` in the shared preprocessing / resampling / selection pipeline.

    Steps, in order:

    1. ``pre``    - numeric columns (``num_cols``): median imputation + standardisation;
                    categorical columns (``cat_cols``): dense one-hot encoding that
                    ignores categories unseen during fit.
    2. ``adasyn`` - ADASYN oversampling, seeded with ``RANDOM_STATE``.
    3. ``sel``    - ANOVA F-test ``SelectKBest`` keeping at most ``K_SELECTED`` features.
    4. ``clf``    - the supplied classifier.

    Parameters
    ----------
    classifier : estimator
        Unfitted scikit-learn compatible classifier used as the final step.

    Returns
    -------
    imblearn.pipeline.Pipeline
        The unfitted pipeline.
    """
    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4.
    # Try the current name first and fall back to the old one for older installations.
    try:
        encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

    pre = ColumnTransformer(
        transformers=[
            ("num", SkPipeline([
                ("imp", SimpleImputer(strategy="median")),
                ("sc", StandardScaler())
            ]), num_cols),
            ("cat", encoder, cat_cols),
        ]
    )
    # k is capped by the number of input columns (and is at least 1) so a small
    # feature table does not request more features than it has columns.
    selector = SelectKBest(score_func=f_classif, k=min(K_SELECTED, max(1, len(num_cols) + len(cat_cols))))
    pipe = ImbPipeline(steps=[
        ("pre", pre),
        ("adasyn", ADASYN(random_state=RANDOM_STATE)),
        ("sel", selector),
        ("clf", classifier),
    ])
    return pipe


In [None]:

# === Define models ===
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# LogisticRegression's `multi_class="ovr"` option is deprecated in recent scikit-learn
# releases; wrapping the estimator in OneVsRestClassifier gives the same
# one-vs-rest scheme without relying on the deprecated argument.
from sklearn.multiclass import OneVsRestClassifier

# --- Stand-alone models (each is evaluated on its own) ---
rf = RandomForestClassifier(
    n_estimators=400, max_depth=None, random_state=RANDOM_STATE, n_jobs=-1
)

# probability=True is required because evaluation relies on predict_proba.
svm = SVC(
    kernel="rbf", C=2.0, gamma="scale", probability=True, random_state=RANDOM_STATE
)

if HAS_XGB:
    from xgboost import XGBClassifier
    xgb = XGBClassifier(
        n_estimators=500, max_depth=5, subsample=0.9, colsample_bytree=0.9,
        learning_rate=0.05, objective="multi:softprob", num_class=3,
        random_state=RANDOM_STATE, eval_metric="mlogloss", tree_method="hist"
    )
else:
    # Keeps the name defined so later cells can refer to it unconditionally.
    xgb = None

# --- Stacking ensemble: separate, slightly smaller base learners ---
estimators_for_stack = [
    ("rf", RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE)),
]
if HAS_XGB:
    estimators_for_stack.append(("xgb", XGBClassifier(
        n_estimators=300, max_depth=4, learning_rate=0.05, subsample=0.9,
        colsample_bytree=0.9, objective="multi:softprob", num_class=3,
        random_state=RANDOM_STATE, eval_metric="mlogloss", tree_method="hist"
    )))
estimators_for_stack.append(("svm", SVC(kernel="rbf", C=2.0, gamma="scale", probability=True, random_state=RANDOM_STATE)))

# Meta-learner: one-vs-rest logistic regression on the base learners' outputs.
stack_final = OneVsRestClassifier(LogisticRegression(max_iter=200))
stack = StackingClassifier(estimators=estimators_for_stack, final_estimator=stack_final, cv=3, stack_method="auto")


In [None]:

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt
import numpy as np

def evaluate_model(name, base_clf):
    """Cross-validate ``base_clf`` inside the shared pipeline, then plot and save its metrics.

    Out-of-fold class probabilities are collected with ``cross_val_predict`` and turned
    into accuracy, macro-F1, macro one-vs-rest ROC-AUC and a 3x3 confusion matrix.
    A two-panel figure (confusion matrix, metric bars) is shown and a text summary is
    written to ``OUTDIR / "<name>_metrics.txt"`` (name lower-cased, spaces -> underscores).

    Parameters
    ----------
    name : str
        Display name of the model; also used to derive the output file name.
    base_clf : estimator
        Unfitted classifier that supports ``predict_proba``.

    Returns
    -------
    dict
        ``{"model", "accuracy", "macro_f1", "roc_auc", "confusion_matrix"}``.
        ``roc_auc`` is NaN when it could not be computed.
    """
    pipe = build_pipeline(base_clf)

    # If a subject (PTID) contributes several rows, all of them must fall into the same
    # fold; otherwise the model is tested on subjects it has already seen during training.
    # PTIDs are stringified so that missing IDs form one ordinary group.
    groups = df[ptid_col].astype(str).to_numpy()
    if pd.Series(groups).duplicated().any():
        from sklearn.model_selection import StratifiedGroupKFold
        cv = StratifiedGroupKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    else:
        # One row per subject: plain stratified folds are sufficient.
        groups = None
        cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)

    # probability predictions for ROC-AUC
    y_proba = cross_val_predict(pipe, X, y, cv=cv, groups=groups, method="predict_proba", n_jobs=-1)
    y_pred  = y_proba.argmax(axis=1)

    acc  = accuracy_score(y, y_pred)
    f1   = f1_score(y, y_pred, average="macro")

    # macro ROC-AUC (one-vs-rest) on the binarised labels
    y_bin = label_binarize(y, classes=[0,1,2])
    try:
        auc = roc_auc_score(y_bin, y_proba, average="macro", multi_class="ovr")
    except ValueError as err:
        # Report why the score is unavailable instead of hiding the reason.
        print(f"[warn] {name}: ROC-AUC could not be computed ({err}); reporting NaN")
        auc = np.nan

    # confusion matrix (on CV preds)
    cm = confusion_matrix(y, y_pred, labels=[0,1,2])

    # plots
    fig, ax = plt.subplots(1, 2, figsize=(10,4))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["CN","MCI","DEMENTIA"])
    disp.plot(ax=ax[0], colorbar=False)
    ax[0].set_title(f"{name} — Confusion Matrix (CV)")

    metric_values = [acc, f1, auc]
    # A NaN metric is drawn as an empty bar labelled "n/a" rather than passed to matplotlib.
    bar_heights = [v if np.isfinite(v) else 0.0 for v in metric_values]
    ax[1].bar(["Accuracy","Macro-F1","ROC-AUC"], bar_heights)
    ax[1].set_ylim(0,1)
    ax[1].set_title(f"{name} — Metrics (CV)")
    for i, v in enumerate(metric_values):
        if np.isfinite(v):
            # Put the label above the bar, or inside it when the bar is near the top.
            ax[1].text(i, v+0.02 if v<=0.95 else v-0.08, f"{v:.3f}", ha="center")
        else:
            ax[1].text(i, 0.02, "n/a", ha="center")
    fig.tight_layout()
    plt.show()
    # Release the figure so repeated evaluations do not accumulate open figures.
    plt.close(fig)

    # save
    out_txt = OUTDIR / f"{name.replace(' ','_').lower()}_metrics.txt"
    with open(out_txt, "w", encoding="utf-8") as f:
        f.write(f"{name} (CV={N_FOLDS})\n")
        f.write(f"Accuracy: {acc:.4f}\nMacro-F1: {f1:.4f}\nMacro ROC-AUC: {auc:.4f}\n")
        f.write("\nConfusion matrix (rows=true, cols=pred):\n")
        f.write(str(cm))
    print("Saved metrics ->", out_txt)

    return {
        "model": name,
        "accuracy": acc,
        "macro_f1": f1,
        "roc_auc": auc,
        "confusion_matrix": cm,
    }


In [None]:

results = []

# Models are evaluated in a fixed order; a None estimator marks XGBoost as
# unavailable, in which case a skip notice is printed in its place.
evaluation_queue = [
    ("Random Forest", rf),
    ("XGBoost", xgb if HAS_XGB else None),
    ("SVM (RBF)", svm),
    ("Stacking Ensemble", stack),
]

for model_label, estimator in evaluation_queue:
    if estimator is None:
        print("\n[Skip] XGBoost not available in this environment")
        continue
    print(f"\n=== {model_label} ===")
    evaluate_model(model_label, estimator)

print("\nDone.")
