
# Full-Fusion Diagnosis Pipeline (CV + ADASYN + RF/XGB/SVM + Stacking)
This notebook:
- Loads your **fusion_master.xlsx** (or any fusion file you point to)
- Drops rows with empty **Diagnosis**
- Preprocesses features (standard-scales numeric columns; one-hot encodes any remaining categorical columns)
- Handles **class imbalance with ADASYN inside CV**
- Trains **RandomForest, XGBoost, SVM (RBF)**, and a **Stacking Ensemble**
- Reports **Accuracy, Macro-F1, ROC-AUC (OvR), Confusion Matrix**
- Saves figures, per-model classification reports, and a CV metrics summary CSV under `./diagnostics_out/`

> Tip: Run cell-by-cell the first time to confirm paths and column names.


In [None]:

# === User config ===
# Path is imported here (not only in the imports cell below) because this cell
# runs first and builds OUTDIR immediately.
from pathlib import Path

FUSION_XLSX = r"./fusion_master.xlsx"  # <- change if stored elsewhere
OUTDIR = Path("./diagnostics_out")     # all figures / reports / CSVs land here
OUTDIR.mkdir(parents=True, exist_ok=True)

# Optional: force column names for id/target if auto-detect fails
FORCE_PTID_COL = None          # e.g., "PTID"
FORCE_DIAG_COL = None          # e.g., "Diagnosis"

RANDOM_STATE = 42              # seed shared by the CV splitter, ADASYN and the models
N_FOLDS = 5                    # number of stratified CV folds

print("Output dir:", OUTDIR.resolve())


In [None]:

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Modeling / preprocessing
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Sampler (inside CV)
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import ADASYN

# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# xgboost is optional: HAS_XGB records whether the import succeeded so that
# later cells can skip the XGB model instead of failing.
try:
    from xgboost import XGBClassifier
    HAS_XGB = True
except Exception as e:
    # NOTE(review): the broad catch looks deliberate (any import-time failure
    # simply disables XGB) — confirm before narrowing it to ImportError.
    print("xgboost not available:", e)
    HAS_XGB = False

# Metrics
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report
)

import seaborn as sns
sns.set_context("notebook")


In [None]:

# === Load fusion file ===
# Read the spreadsheet at FUSION_XLSX into a DataFrame.
df = pd.read_excel(FUSION_XLSX)

# --- normalize column names (keep original too) ---
# orig_cols keeps the headers exactly as read; df.columns gets the
# stringified, whitespace-trimmed versions used for matching below.
orig_cols = df.columns.tolist()
df.columns = [str(header).strip() for header in df.columns]

print("Loaded fusion:", df.shape)

# --- detect PTID and Diagnosis ---
def find_col(cands, cols):
    """Return the first column matching one of the candidate tokens, else None.

    Matching is case-insensitive. Candidates are tried in the order given; for
    each candidate an exact name match is preferred, and only if none exists
    is a substring match accepted. An earlier candidate therefore wins over a
    later one even when the earlier one matches only by substring.

    Parameters
    ----------
    cands : iterable of str
        Tokens to look for, highest priority first.
    cols : iterable
        Column labels to search. Non-string labels are compared via ``str()``.

    Returns
    -------
    The matching label exactly as it appears in ``cols``, or ``None``.

    Note: substring matching is a heuristic; a short token such as ``"id"``
    can match unrelated columns, so check the printed detection result.
    """
    # Lower-case every label once instead of on every comparison.
    lowered = [(col, str(col).lower()) for col in cols]
    for tok in cands:
        tok = str(tok).lower()
        # Pass 1: exact (case-insensitive) name match.
        for col, low in lowered:
            if tok == low:
                return col
        # Pass 2: token appears anywhere inside the name.
        for col, low in lowered:
            if tok in low:
                return col
    return None

# A forced name from the config cell takes precedence over auto-detection.
ptid_col = FORCE_PTID_COL or find_col(["ptid","rid","subject","id"], df.columns)
diag_col = FORCE_DIAG_COL or find_col(["diagnosis","dx","diag","group"], df.columns)

print("Detected PTID:", ptid_col)
print("Detected Diagnosis:", diag_col)

if ptid_col is None or diag_col is None:
    raise ValueError("Could not detect PTID or Diagnosis column. Set FORCE_PTID_COL / FORCE_DIAG_COL.")

# Drop rows with empty diagnosis
# "Empty" covers both missing values (NaN) and blank / whitespace-only text:
# a blank string is not NaN and would otherwise survive and later be encoded
# as a class of its own.
before = df.shape[0]
diag_values = df[diag_col]
is_empty = diag_values.isna() | diag_values.astype(str).str.strip().eq("")
df = df[~is_empty].copy()
after = df.shape[0]
print(f"Dropped rows with empty Diagnosis: {before-after}")

# Quick peek
df.head(3)


In [None]:

# === Split X/y and build preprocessors ===
# Local import: only this cell needs the imputer.
from sklearn.impute import SimpleImputer

y_raw = df[diag_col].astype(str)

# Encode labels (CN/MCI/Dementia -> 0/1/2 etc.)
le = LabelEncoder()
y = le.fit_transform(y_raw)
# Integer code -> original label text; used for plot ticks and reports.
label_map = {i: cls for i, cls in enumerate(le.classes_)}
print("Label mapping:", label_map)

# Drop obvious meta columns from X
drop_like = {diag_col, ptid_col, "visit", "VISCODE", "viscode", "VisCode"}
X = df.drop(columns=[c for c in df.columns if c in drop_like], errors="ignore")

# Identify dtypes
# Object / category / bool columns are one-hot encoded; every other column is
# treated as numeric.
cat_cols = X.select_dtypes(include=["object","category","bool"]).columns.tolist()
num_cols = [c for c in X.columns if c not in cat_cols]

preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
        ("num", Pipeline(steps=[
            # Fill missing numeric values first: the ADASYN sampler and the SVM
            # downstream do not accept NaN. The imputer is fitted inside each
            # CV training fold (it is part of the pipeline), and it leaves
            # complete data unchanged.
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ]), num_cols),
    ],
    remainder="drop"
)

print(f"X shape: {X.shape} -> numeric:{len(num_cols)} | categorical:{len(cat_cols)}")


In [None]:

# === Helpers: evaluation & plotting ===
def evaluate_cv(model_name, estimator, X, y, cv):
    """Cross-validate ``estimator`` and report out-of-fold metrics.

    Side effects: writes ``<model_name>_confusion_matrix.png``,
    ``<model_name>_classification_report.txt`` and, when scores are available,
    ``<model_name>_roc_ovr.png`` into the module-level ``OUTDIR``. Class names
    for plots and reports come from the module-level ``label_map``.

    Parameters
    ----------
    model_name : str
        Label used in plot titles and output file names.
    estimator : estimator object
        Unfitted estimator / pipeline; it is fitted per fold by
        ``cross_val_predict``.
    X, y : features and integer-encoded labels.
    cv : cross-validation splitter passed through to ``cross_val_predict``.

    Returns
    -------
    dict
        Keys ``model``, ``accuracy``, ``macro_f1`` and ``roc_auc_ovr_macro``
        (``None`` when no score could be computed).
    """
    from sklearn.metrics import RocCurveDisplay
    from sklearn.utils.extmath import softmax

    y = np.asarray(y)
    classes = np.unique(y)
    class_names = [label_map[c] for c in classes]

    # Out-of-fold hard predictions. ``method`` must be a method *name*; passing
    # None is not accepted by cross_val_predict.
    y_pred = cross_val_predict(estimator, X, y, cv=cv, method="predict", n_jobs=-1)

    # For ROC-AUC we need per-class scores: probabilities if available,
    # otherwise decision values squashed into a probability-like range.
    if hasattr(estimator, "predict_proba"):
        y_score = cross_val_predict(estimator, X, y, cv=cv, method="predict_proba", n_jobs=-1)
    elif hasattr(estimator, "decision_function"):
        y_score = cross_val_predict(estimator, X, y, cv=cv, method="decision_function", n_jobs=-1)
        if y_score.ndim == 1:
            # Binary margin -> two-column pseudo-probabilities (sigmoid).
            positive = 1.0 / (1.0 + np.exp(-y_score))
            y_score = np.column_stack([1.0 - positive, positive])
        else:
            y_score = softmax(y_score)
    else:
        y_score = None

    acc = accuracy_score(y, y_pred)
    f1m = f1_score(y, y_pred, average="macro")

    roc = None
    if y_score is not None:
        try:
            if len(classes) == 2:
                # Binary targets need a 1-D score for the positive class.
                roc = roc_auc_score(y, y_score[:, 1])
            else:
                roc = roc_auc_score(y, y_score, multi_class="ovr", average="macro")
        except (ValueError, IndexError) as exc:
            # Keep the run going, but say why the metric is missing.
            print(f"[{model_name}] ROC-AUC not computed: {exc}")

    # Confusion matrix
    cm = confusion_matrix(y, y_pred, labels=classes)

    # Save confusion matrix plot
    fig, ax = plt.subplots(figsize=(4.5, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=class_names, yticklabels=class_names, ax=ax)
    ax.set_title(f"{model_name} — Confusion Matrix (CV)")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    fig.tight_layout()
    fig.savefig(OUTDIR / f"{model_name}_confusion_matrix.png", dpi=150)
    plt.show()
    plt.close(fig)

    # Save classification report
    report = classification_report(y, y_pred, labels=classes, target_names=class_names)
    with open(OUTDIR / f"{model_name}_classification_report.txt", "w", encoding="utf-8") as f:
        f.write(report)

    # ROC curves, one-vs-rest, all classes drawn on ONE shared axes. Without an
    # explicit ``ax`` every from_predictions call opens a new figure, so the
    # saved file would contain only the last class.
    if y_score is not None and y_score.ndim > 1 and y_score.shape[1] == len(classes):
        fig, ax = plt.subplots(figsize=(6, 5))
        for i, cls in enumerate(classes):
            # Explicit one-vs-rest indicator; this also works for two classes,
            # where a binarized label matrix would have a single column.
            is_cls = (y == cls).astype(int)
            try:
                RocCurveDisplay.from_predictions(
                    is_cls, y_score[:, i], name=f"Class {label_map[cls]}", ax=ax
                )
            except ValueError as exc:
                print(f"[{model_name}] ROC curve skipped for class {label_map[cls]}: {exc}")
        ax.set_title(f"{model_name} — ROC (OvR, CV)")
        fig.tight_layout()
        fig.savefig(OUTDIR / f"{model_name}_roc_ovr.png", dpi=150)
        plt.show()
        plt.close(fig)

    metrics = {
        "model": model_name,
        "accuracy": acc,
        "macro_f1": f1m,
        "roc_auc_ovr_macro": roc
    }
    return metrics


In [None]:

# === CV & estimators (ADASYN inside pipeline) ===
# Shuffled, seeded stratified folds shared by every model evaluation.
cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)


def _adasyn_pipeline(clf):
    """Wrap ``clf`` as: shared preprocessor -> ADASYN oversampling -> classifier."""
    return ImbPipeline(steps=[
        ("prep", preprocess),
        ("adasyn", ADASYN(random_state=RANDOM_STATE)),
        ("clf", clf),
    ])


estimators = {
    # RandomForest
    "RF": _adasyn_pipeline(RandomForestClassifier(
        n_estimators=400, max_depth=None, n_jobs=-1, random_state=RANDOM_STATE, class_weight=None
    )),
    # SVM (RBF); probability=True so the pipeline exposes predict_proba
    "SVM_RBF": _adasyn_pipeline(
        SVC(C=5.0, kernel="rbf", gamma="scale", probability=True, random_state=RANDOM_STATE)
    ),
}

# XGB (if available)
if not HAS_XGB:
    print("XGB not available: skipping XGB model.")
else:
    estimators["XGB"] = _adasyn_pipeline(XGBClassifier(
        n_estimators=600, max_depth=4, learning_rate=0.05, subsample=0.9, colsample_bytree=0.9,
        objective="multi:softprob", eval_metric="mlogloss", random_state=RANDOM_STATE
    ))


In [None]:

# === Stacking Ensemble (RF + SVM + optional XGB) ===
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Base learners are the pipelines already registered in ``estimators``
# (each one carries its own preprocessing and ADASYN step).
base_estimators = [
    ("rf", estimators["RF"]),
    ("svm", estimators["SVM_RBF"]),
]
if "XGB" in estimators:
    base_estimators += [("xgb", estimators["XGB"])]

# Meta-learner: logistic regression fitted on the base learners'
# predicted class probabilities (stack_method="predict_proba").
# NOTE(review): the ``multi_class`` argument may be deprecated in newer
# scikit-learn releases — confirm against the installed version.
stack_final = LogisticRegression(max_iter=200, multi_class="ovr")

stack = StackingClassifier(
    estimators=list(base_estimators),
    final_estimator=stack_final,
    stack_method="predict_proba",
    passthrough=False,   # meta-learner sees only base predictions, not raw features
    n_jobs=-1,
)

estimators["STACK"] = stack
print("Models prepared:", list(estimators))


In [None]:

# === Evaluate all models ===
# Run the CV evaluation for every registered model, collecting one metrics
# dict per model.
all_metrics = []
for name, est in estimators.items():
    print(f"\n=== Evaluating: {name} ===")
    m = evaluate_cv(name, est, X, y, cv)
    print(m)
    all_metrics += [m]

# Rank models: best macro-F1 first, accuracy as the tie-breaker.
metrics_df = pd.DataFrame(all_metrics)
metrics_df = metrics_df.sort_values(by=["macro_f1","accuracy"], ascending=False)
display(metrics_df)

summary_csv = OUTDIR / "cv_metrics_summary.csv"
metrics_df.to_csv(summary_csv, index=False)
print("\nSaved metrics to:", summary_csv)
