
# Multimodal Diagnosis Pipeline (All Modalities)


In [1]:

# Paths
# Every location is derived from a single project root so the notebook can be
# pointed at another machine/checkout by setting the ADNI_ROOT environment
# variable instead of editing each path by hand.
import os
from pathlib import Path

# Falls back to the original machine-specific location when ADNI_ROOT is unset
# or empty, so existing runs behave exactly as before.
ADNI_ROOT = Path(os.environ.get("ADNI_ROOT") or "/Users/madhurabn/Desktop/adni").expanduser()

# Input workbook with the fused multimodal table.
FUSION_XLSX = ADNI_ROOT / "data" / "processed" / "fusion_master.xlsx"
# Destination for outputs of this notebook.
OUTDIR = ADNI_ROOT / "output" / "Diagnosis_All modalities"
# Plots use the same directory as the other outputs (both paths were identical).
PLOTDIR = OUTDIR


print("Output dir:", OUTDIR.resolve())
print("Plots dir :", PLOTDIR.resolve())


Output dir: /Users/madhurabn/Desktop/adni/output/Diagnosis_All modalities
Plots dir : /Users/madhurabn/Desktop/adni/output/Diagnosis_All modalities


In [2]:


# Standard library warnings module plus the numeric, tabular and plotting stack.
import warnings, numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
# Use seaborn's "notebook" plotting context.
sns.set_context("notebook")
# NOTE(review): this installs a filter that ignores every warning category for
# the rest of the session — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

# Cross-validation, metrics, preprocessing, pipelines, resampling, imputation
# and feature selection (scikit-learn and imbalanced-learn).
# Both Pipeline classes are imported under distinct aliases to avoid a name clash.
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline as SkPipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import ADASYN
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif

# Classifier estimators.
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression




In [3]:

#  Load fusion master
# Reads the fused table from the Excel workbook at FUSION_XLSX and reports its size.
# NOTE(review): the captured preview shows an "Unnamed: 0" column, which looks
# like a row index saved with the sheet — confirm it is excluded from features.
df = pd.read_excel(FUSION_XLSX)
print(f"Loaded fusion shape: {df.shape}")
print(f"Columns: {df.shape[1]}")
df.head(3)


Loaded fusion shape: (745, 118)
Columns: 118


Unnamed: 0,ptid,gender,entry_age,education,cdr_sum_of_boxes,mmscore,cardiovascular_disease_history,alcohol_abuse_history,smoking_history,hypertension_history,...,cortical_volume_aparc_stats_of_leftentorhinal,cortical_volume_aparc_stats_of_rightparsorbitalis,thickness_stardard_deviation_aparc_stats_of_leftprecuneus,thickness_average_aparc_stats_of_leftsuperiortemporal,thickness_average_aparc_stats_of_leftbankssts,thickness_average_aparc_stats_of_leftsuperiorfrontal,thickness_average_aparc_stats_of_leftcaudalmiddlefrontal,cortical_volume_aparc_stats_of_rightinsula,subcortical_volume_aseg_stats_of_leftaccumbensarea,cortical_volume_aparc_stats_of_righttransversetemporal
0,116_S_0834,0,63.88,16.058926,1,28.043849,0,0,0,0,...,2052.0,2691,0.592,2.335,2.131,2.63,2.467,6207.0,331.7,640
1,100_S_0930,0,75.05,17.158902,1,27.498431,0,0,0,0,...,1157.0,2653,0.534,2.546,1.972,2.451,2.21,5923.0,237.0,750
2,100_S_0892,1,72.8,16.695437,1,27.511328,0,0,0,0,...,1767.0,2405,0.57,2.746,2.296,2.583,2.439,7144.0,371.8,856


In [4]:

#  Identify PTID and Diagnosis; clean target 
def find_col(cols, tokens):
    """Return the column whose name best matches one of ``tokens``.

    Matching is case-insensitive and ignores surrounding whitespace in the
    column names. ``tokens`` is treated as a priority list: earlier tokens win
    over later ones, and an exact name match always wins over a substring
    match. This keeps a short fallback token such as ``"id"`` from selecting
    an unrelated earlier column when a column named exactly ``"ptid"`` exists.

    Parameters
    ----------
    cols : iterable
        Column labels to search (e.g. ``df.columns``). Labels are converted
        with ``str`` for comparison; the original label is returned.
    tokens : iterable of str
        Candidate names/substrings, most specific first.

    Returns
    -------
    The first matching column label, or ``None`` when nothing matches.
    """
    # Normalise once so both passes compare against the same cleaned names.
    candidates = [(c, str(c).strip().lower()) for c in cols]
    wanted = [str(tok).lower() for tok in tokens]

    # Pass 1: exact name match, honouring token priority.
    for tok in wanted:
        for c, lc in candidates:
            if lc == tok:
                return c

    # Pass 2: substring fallback, still honouring token priority.
    for tok in wanted:
        for c, lc in candidates:
            if tok in lc:
                return c

    return None

# Resolve the participant-ID and diagnosis columns from the header names.
ptid_col = find_col(df.columns, ["ptid", "subject", "rid", "id"])
diag_col = find_col(df.columns, ["diagnosis", "dx", "label", "class"])

# Both columns are required; abort and report what was detected otherwise.
if not (ptid_col is not None and diag_col is not None):
    raise ValueError(f"Could not find PTID/Diagnosis. Detected PTID={ptid_col}, DIAG={diag_col}")

# drop empty diagnosis
before = len(df)
df = df.loc[df[diag_col].notna()].copy()
print(f"Dropped {before - len(df)} rows with empty {diag_col}")

# Normalize diagnosis labels to {CN, MCI, DEMENTIA}
def norm_diag(x):
    """Collapse a raw diagnosis value onto ``CN``, ``MCI`` or ``DEMENTIA``.

    The value is converted with ``str``, stripped and upper-cased before it is
    compared with the known aliases. Values that match no alias are returned
    in that cleaned (stripped, upper-cased) form.
    """
    label = str(x).strip().upper()
    # Canonical label -> accepted spellings, checked in this order.
    alias_groups = (
        ("CN", ("CN", "CONTROL", "COGNITIVELY NORMAL")),
        ("MCI", ("MCI", "EMCI", "LMCI")),
        ("DEMENTIA", ("AD", "DEMENTIA", "ALZHEIMER", "ALZHEIMER'S DISEASE")),
    )
    for canonical, aliases in alias_groups:
        if label in aliases:
            return canonical
    return label

# Replace the raw diagnosis values with their normalised labels.
df[diag_col] = df[diag_col].apply(norm_diag)

# Encode to 0/1/2
label_map = dict(zip(("CN", "MCI", "DEMENTIA"), range(3)))
y = df[diag_col].map(label_map)

# Any label missing from label_map maps to NaN; list those labels and stop.
unmapped = y.isna()
if unmapped.any():
    bad = df.loc[unmapped, diag_col].value_counts()
    print("Unmapped diagnosis values found:\n", bad)
    raise ValueError("Please fix unmapped diagnosis labels.")

# Show per-class counts using the label names instead of the integer codes.
class_names = {code: name for name, code in label_map.items()}
print("Class counts:\n", y.value_counts().rename(index=class_names))


Dropped 0 rows with empty diagnosis
Class counts:
 diagnosis
MCI         375
DEMENTIA    185
CN          185
Name: count, dtype: int64
