In [3]:
import os
import joblib
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, roc_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [2]:
import os
dataset_path = 'data/'  
csv_file = os.path.join(dataset_path, 'alzheimers_prediction_dataset.csv')  
df = pd.read_csv(csv_file)
print(df.head())

        Country  Age  Gender  Education Level   BMI Physical Activity Level  \
0         Spain   90    Male                1  33.0                  Medium   
1     Argentina   72    Male                7  29.9                  Medium   
2  South Africa   86  Female               19  22.9                    High   
3         China   53    Male               17  31.2                     Low   
4        Sweden   58  Female                3  30.0                    High   

  Smoking Status Alcohol Consumption Diabetes Hypertension  ...  \
0          Never        Occasionally       No           No  ...   
1         Former               Never       No           No  ...   
2        Current        Occasionally       No          Yes  ...   
3          Never           Regularly      Yes           No  ...   
4         Former               Never      Yes           No  ...   

  Dietary Habits Air Pollution Exposure  Employment Status Marital Status  \
0        Healthy                   High      

In [13]:
default_models_params = {
    'lr': {
        'model_cls': LogisticRegression,
        'init_params': {'class_weight': 'balanced', 'random_state': 42, 'max_iter': 1000},
        'param_grid': {'C': [0.01, 0.1, 1, 10]}
    },
    'rf': {
        'model_cls': RandomForestClassifier,
        'init_params': {'class_weight': 'balanced', 'random_state': 42},
        'param_grid': {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]}
    },
    #'svm': {
    #    'model_cls': SVC,
    #    'init_params': {'probability': True, 'class_weight': 'balanced', 'random_state': 42},
    #    'param_grid': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
    #},
    'gb': {
        'model_cls': GradientBoostingClassifier,
        'init_params': {'random_state': 42},
        'param_grid': {'n_estimators': [50, 100], 'max_depth': [3, 5]}
    },
    'knn': {
        'model_cls': KNeighborsClassifier,
        'init_params': {},
        'param_grid': {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']}
    },
    'xgb': {
        'model_cls': XGBClassifier,
        'init_params': {'random_state': 42, 'eval_metric': 'auc'},
        'param_grid': {'n_estimators': [50, 100], 'max_depth': [3, 5], 'learning_rate': [0.01, 0.1]}
    },
    'mlp': {
        'model_cls': MLPClassifier,
        'init_params': {'random_state': 42, 'max_iter': 1000},
        'param_grid': {'hidden_layer_sizes': [(50,), (100,)], 'activation': ['relu', 'tanh'], 'alpha': [0.0001, 0.001]}
    }
}

os.makedirs('models', exist_ok=True)
os.makedirs('results', exist_ok=True)
def train_and_save_models(df, models_params=None, validation=True, verbose=True):
    """Train models from scratch and save them. Handle multiple models, validation split, and performance logging."""
        
    if models_params is None:
        models_params = default_models_params
    
    X = df.drop('Alzheimer’s Diagnosis', axis=1)
    y = df['Alzheimer’s Diagnosis'].map({'No': 0, 'Yes': 1})
    
    num_cols = X.select_dtypes(include=['int64', 'float64']).columns
    cat_cols = X.select_dtypes(include=['object']).columns
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='mean')),
                ('scaler', StandardScaler())
            ]), num_cols),
            ('cat', Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
            ]), cat_cols)
        ])
    
    X_processed = preprocessor.fit_transform(X)
    
    if validation:
        # Split into train (60%), val (20%), test (20%)
        X_temp, X_test, y_temp, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42, stratify=y)
        X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp)
        if verbose:
            print("Data split: Train/Val/Test")
    else:
        X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42, stratify=y)
        X_val, y_val = None, None
        if verbose:
            print("Data split: Train/Test (no validation)")
    
    trained_models = {}
    
    for model_name, model_info in models_params.items():
        if verbose:
            print(f"\nTraining {model_name}...")
        
        model_cls = model_info['model_cls']
        init_params = model_info.get('init_params', {}).copy()
        
        if model_name == 'xgb':
            scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum() if (y_train == 1).sum() > 0 else 1
            init_params['scale_pos_weight'] = scale_pos_weight
            if verbose:
                print(f"Set scale_pos_weight to {scale_pos_weight} for XGBoost")
        
        model = model_cls(**init_params)
        grid = GridSearchCV(model, model_info['param_grid'], cv=5, scoring='roc_auc', n_jobs=-1)
        grid.fit(X_train, y_train)
        
        trained_models[model_name] = grid
        
        model_path = os.path.join('models', f'alzheimers_{model_name}_model.pkl')
        joblib.dump(grid.best_estimator_, model_path)

        #grid_path = os.path.join('models', f'alzheimers_{model_name}_gridsearch.pkl')
        #joblib.dump(grid, grid_path)
        
        if verbose:
            print(f"Best params for {model_name}: {grid.best_params_}")
        
        train_metrics = evaluate_model(X_train, y_train, model_type=model_name, processed_data=True, filepath=os.path.join('results', 'train_model_performances.csv'))
        if verbose:
            print(f"{model_name} Train ROC-AUC: {train_metrics.get('roc_auc', 'N/A')}")
        
        if validation and X_val is not None:
            val_metrics = evaluate_model(X_val, y_val, model_type=model_name, processed_data=True, filepath=os.path.join('results', 'val_model_performances.csv'))
            if verbose:
                print(f"{model_name} Val ROC-AUC: {val_metrics.get('roc_auc', 'N/A')}")
    
    preprocessor_path = os.path.join('models', 'preprocessor.pkl')
    joblib.dump(preprocessor, preprocessor_path)

    feature_info_path = os.path.join('models', 'feature_info.pkl')
    joblib.dump({
        'num_cols': list(num_cols),
        'cat_cols': list(cat_cols),
        'all_features': list(X.columns)
    }, feature_info_path)
    
    if verbose:
        print("\nModels, preprocessor, and feature info saved successfully!")
        print(f"Models saved in: {os.path.relpath('models')}")
        print(f"Results saved in: {os.path.relpath('results')}")
    
    return trained_models, preprocessor, X_train, X_val, X_test, y_train, y_val, y_test

def load_models_and_preprocessor(model_types=None):
    """Load previously saved models and preprocessor."""
    if model_types is None:
        model_types = list(default_models_params.keys()) 
    
    if isinstance(model_types, str):
        model_types = [model_types]
    
    models = {}
    for mt in model_types:
        try:
            model_path = os.path.join('models', f'alzheimers_{mt}_model.pkl')
            models[mt] = joblib.load(model_path)
            print(f"Loaded {mt} model from: {os.path.relpath(model_path)}")
        except FileNotFoundError:
            print(f"Warning: {mt} model file not found in models folder.")
            alt_path = f'alzheimers_{mt}_model.pkl'
            if os.path.exists(alt_path):
                models[mt] = joblib.load(alt_path)
                print(f"Loaded {mt} model from alternative location: {alt_path}")
    
    try:
        preprocessor_path = os.path.join('models', 'preprocessor.pkl')
        preprocessor = joblib.load(preprocessor_path)
        
        feature_info_path = os.path.join('models', 'feature_info.pkl')
        feature_info = joblib.load(feature_info_path)
        
        print(f"Loaded preprocessor and feature info from models folder")
    except FileNotFoundError as e:
        print(f"Error loading preprocessor or feature info: {e}")
        try:
            preprocessor = joblib.load('preprocessor.pkl')
            feature_info = joblib.load('feature_info.pkl')
            print("Loaded from alternative locations")
        except:
            preprocessor = None
            feature_info = None
    
    return models, preprocessor, feature_info

def evaluate_model(new_X, new_y, model_type='rf', processed_data=False, filepath=None):
    """
    Evaluate model performance on new data.
    """
    if filepath is None:
        filepath = os.path.join('results', 'model_performances.csv')
    elif not os.path.dirname(filepath):  
        filepath = os.path.join('results', filepath)

    if isinstance(model_type, list):
        models, preprocessor, _ = load_models_and_preprocessor(model_type)
        metrics_dict = {}
        for mt in model_type:
            if mt not in models:
                continue
            model = models[mt]
            metrics_dict[mt] = _compute_metrics(new_X, new_y, model, preprocessor, processed_data)
        
        if metrics_dict:
            metrics_df = pd.DataFrame.from_dict(metrics_dict, orient='index')
            metrics_df.to_csv(filepath, index_label='model')
            print(f"Performances saved to {filepath}")
            
            best_model = metrics_df['roc_auc'].idxmax()
            best_score = metrics_df['roc_auc'].max()
            print(f"Best model: {best_model} with ROC-AUC {best_score}")
            return metrics_dict, best_model
        else:
            return {}, None
    
    else:
        models, preprocessor, _ = load_models_and_preprocessor([model_type])
        model = models.get(model_type)
        if model is None:
            print(f"Error: Model {model_type} not loaded.")
            return {}

        metrics = _compute_metrics(new_X, new_y, model, preprocessor, processed_data)

        if filepath:
            timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
            individual_path = os.path.join('results', f'{model_type}_performance_{timestamp}.csv')
            pd.DataFrame([metrics]).to_csv(individual_path, index=False)
        
        return metrics

def _compute_metrics(new_X, new_y, model, preprocessor, processed_data):
    """Helper to compute metrics"""

    if not processed_data:
        new_X = preprocessor.transform(new_X)
    
    y_pred = model.predict(new_X)
    metrics = {
        'accuracy': accuracy_score(new_y, y_pred),
        'precision': precision_score(new_y, y_pred, zero_division=0),
        'recall': recall_score(new_y, y_pred, zero_division=0),
        'f1_score': f1_score(new_y, y_pred, zero_division=0)
    }

    if hasattr(model, 'predict_proba'):
        y_pred_proba = model.predict_proba(new_X)[:, 1]
        metrics['roc_auc'] = roc_auc_score(new_y, y_pred_proba)

    cm = confusion_matrix(new_y, y_pred)
    metrics['tn'], metrics['fp'], metrics['fn'], metrics['tp'] = cm.ravel()

    return metrics

In [14]:
trained_models, preprocessor, X_train, X_val, X_test, y_train, y_val, y_test = train_and_save_models(df, models_params=default_models_params)

Data split: Train/Val/Test

Training lr...
Best params for lr: {'C': 0.1}
Loaded lr model from: models/alzheimers_lr_model.pkl
Error loading preprocessor or feature info: [Errno 2] No such file or directory: 'models/preprocessor.pkl'
lr Train ROC-AUC: 0.7917843261176399
Loaded lr model from: models/alzheimers_lr_model.pkl
Error loading preprocessor or feature info: [Errno 2] No such file or directory: 'models/preprocessor.pkl'
lr Val ROC-AUC: 0.7859223021842926

Training rf...
Best params for rf: {'max_depth': 10, 'n_estimators': 200}
Loaded rf model from: models/alzheimers_rf_model.pkl
Error loading preprocessor or feature info: [Errno 2] No such file or directory: 'models/preprocessor.pkl'
rf Train ROC-AUC: 0.8519246963570855
Loaded rf model from: models/alzheimers_rf_model.pkl
Error loading preprocessor or feature info: [Errno 2] No such file or directory: 'models/preprocessor.pkl'
rf Val ROC-AUC: 0.7950601140270571

Training gb...
Best params for gb: {'max_depth': 3, 'n_estimators'

In [15]:
metrics, best = evaluate_model(X_test, y_test, model_type=['lr', 'rf', 'gb', 'knn', 'xgb', 'mlp'], processed_data=True, filepath='test_model_performances.csv')

Loaded lr model from: models/alzheimers_lr_model.pkl
Loaded rf model from: models/alzheimers_rf_model.pkl
Loaded gb model from: models/alzheimers_gb_model.pkl
Loaded knn model from: models/alzheimers_knn_model.pkl
Loaded xgb model from: models/alzheimers_xgb_model.pkl
Loaded mlp model from: models/alzheimers_mlp_model.pkl
Loaded preprocessor and feature info from models folder
Performances saved to results/test_model_performances.csv
Best model: xgb with ROC-AUC 0.808035747811577


In [None]:
def predict_alzheimers(new_data_dict, model_type='rf', 
                       model=None, preprocessor=None, 
                       feature_info=None, threshold=0.5):
    """
    Predict Alzheimer's diagnosis for new sample
    
    Parameters:
    -----------
    new_data_dict : dict
        Dictionary containing feature values
    model_type : str
        Model type key, e.g., 'rf', 'xgb', etc.
    model : trained model or dict of models
        If None, loads from saved file
    preprocessor : fitted preprocessor
        If None, loads from saved file
    feature_info : dict
        Feature metadata, loaded if None
    threshold : float
        Decision threshold
    
    Returns:
    --------
    dict: Prediction results
    """
    
    if model is None or preprocessor is None or feature_info is None:
        models, preprocessor, feature_info = load_models_and_preprocessor(model_types=model_type)
        if not models or model_type not in models:
            return {"error": f"Model {model_type} not loaded"}
        model = models[model_type]
    elif isinstance(model, dict):
        if model_type in model:
            model = model[model_type]
        else:
            return {"error": f"Model {model_type} not found in provided dict"}
    
    new_df = pd.DataFrame([new_data_dict])
    
    expected_features = feature_info['all_features']
    missing_features = set(expected_features) - set(new_df.columns)
    extra_features = set(new_df.columns) - set(expected_features)
    
    if missing_features:
        print(f"Warning: Missing features: {missing_features}")
        for feat in missing_features:
            new_df[feat] = np.nan
    
    if extra_features:
        print(f"Warning: Extra features provided: {extra_features}")
        new_df = new_df[expected_features]
    
    try:
        new_processed = preprocessor.transform(new_df)
    except Exception as e:
        return {"error": f"Preprocessing failed: {str(e)}"}
    
    try:
        prob = model.predict_proba(new_processed)[0][1]
        prediction = 1 if prob >= threshold else 0
        confidence = prob if prediction == 1 else 1 - prob
        
        feature_importance = None
        if hasattr(model, 'feature_importances_'):
            importance_dict = dict(zip(
                preprocessor.get_feature_names_out(),
                model.feature_importances_
            ))

            top_features = sorted(importance_dict.items(), 
                                 key=lambda x: x[1], 
                                 reverse=True)[:10]
            feature_importance = dict(top_features)
        
        result = {
            'diagnosis': 'Yes' if prediction == 1 else 'No',
            'probability': float(prob),
            'confidence': float(confidence),
            'threshold_used': float(threshold),
            'model_used': model_type.upper(),
            'feature_importance': feature_importance,
            'risk_level': categorize_risk(prob)
        }
        
        return result
        
    except Exception as e:
        return {"error": f"Prediction failed: {str(e)}"}

def categorize_risk(probability):
    """Categorize risk based on probability"""
    if probability < 0.3:
        return "Low Risk"
    elif probability < 0.6:
        return "Moderate Risk"
    elif probability < 0.8:
        return "High Risk"
    else:
        return "Very High Risk"

In [19]:
def predict_batch(new_data_df, model_type='rf', threshold=0.5):
    """
    Predict for multiple samples at once
    
    Parameters:
    -----------
    new_data_df : pandas DataFrame
        DataFrame containing multiple samples
    model_type : str
        Model type key, e.g., 'rf', 'xgb', etc.
    threshold : float
        Decision threshold
    
    Returns:
    --------
    DataFrame with predictions
    """
    
    models, preprocessor, feature_info = load_models_and_preprocessor(model_types=model_type)
    if not models or model_type not in models:
        print(f"Error: Model {model_type} not loaded.")
        return pd.DataFrame()
    
    model = models[model_type]
    
    expected_features = feature_info['all_features']
    missing_features = set(expected_features) - set(new_data_df.columns)
    
    if missing_features:
        print(f"Warning: Adding missing features: {missing_features}")
        for feat in missing_features:
            new_data_df[feat] = np.nan
    
    new_data_df = new_data_df[expected_features]
    
    try:
        new_processed = preprocessor.transform(new_data_df)
        probabilities = model.predict_proba(new_processed)[:, 1]
        predictions = (probabilities >= threshold).astype(int)
        
        results_df = new_data_df.copy()
        results_df['Predicted_Diagnosis'] = ['Yes' if p == 1 else 'No' for p in predictions]
        results_df['Probability'] = probabilities
        results_df['Risk_Level'] = [categorize_risk(p) for p in probabilities]
        results_df['Confidence'] = np.where(
            predictions == 1, 
            probabilities, 
            1 - probabilities
        )
        
        return results_df
        
    except Exception as e:
        print(f"Batch prediction failed: {e}")
        return pd.DataFrame()

In [23]:
sample_patient = {
    'Country': 'Spain',
    'Age': 90,
    'Gender': 'Male',
    'Education Level': 1,
    'BMI': 33.0,
    'Physical Activity Level': 'Medium',
    'Smoking Status': 'Never',
    'Alcohol Consumption': 'Occasionally',
    'Diabetes': 'No',
    'Hypertension': 'No',
    'Cholesterol Level': 'Normal',
    'Family History of Alzheimer’s': 'No',
    'Cognitive Test Score': 90,
    'Depression Level': 'Low',
    'Sleep Quality': 'Poor',
    'Dietary Habits': 'Healthy',
    'Air Pollution Exposure': 'High',
    'Employment Status': 'Retured',
    'Marital Status': 'Single',
    'Genetic Risk Factor (APOE-ε4 allele)': 'No',
    'Social Engagement Level': 'Low',
    'Income Level': 'Medium',
    'Stress Levels': 'High',
    'Urban vs Rural Living': 'Urban'
}

# Alzheimer’s Diagnosis ==> No
sample_patient2 = {
        'Country': 'USA',
        'Age': 30,
        'Gender': 'Male',
        'Education Level': 12,
        'BMI': 26.5,
        'Physical Activity Level': 'Medium',
        'Smoking Status': 'Former',
        'Alcohol Consumption': 'Moderate',
        'Diabetes': 'No',
        'Hypertension': 'Yes',
        'Cholesterol Level': 'High',
        'Family History of Alzheimer’s': 'Yes',
        'Genetic Risk Factor (APOE-ε4 allele)': 'Yes',
        'Cognitive Test Score': 22,
        'Depression Level': 'Low',
        'Sleep Quality': 'Poor',
        'Dietary Habits': 'Average',
        'Employment Status': 'Retired',
        'Marital Status': 'Married',
        'Social Engagement Level': 'Medium',
        'Income Level': 'Middle',
        'Stress Levels': 'Medium',
        'Urban vs Rural Living': 'Urban',
        'Air Pollution Exposure': 'Medium'
    }
    
model, preprocessor, feature_info = load_models_and_preprocessor(model_types='rf')

result = predict_alzheimers(
        sample_patient,
        model_type='rf',
        model=model, 
        preprocessor=preprocessor, 
        feature_info=feature_info,
        threshold=0.6
    )
    
print(f"Prediction Result: {result}")
    

Loaded rf model from: models/alzheimers_rf_model.pkl
Loaded preprocessor and feature info from models folder
Prediction Result: {'diagnosis': 'No', 'probability': 0.5783215743096121, 'confidence': 0.42167842569038794, 'threshold_used': 0.6, 'model_used': 'RF', 'feature_importance': {'num__Age': np.float64(0.532863160495897), 'cat__Genetic Risk Factor (APOE-ε4 allele)_Yes': np.float64(0.07499109500973443), 'cat__Genetic Risk Factor (APOE-ε4 allele)_No': np.float64(0.06706637613135394), 'cat__Family History of Alzheimer’s_Yes': np.float64(0.036600016802217306), 'cat__Family History of Alzheimer’s_No': np.float64(0.034773984724978585), 'num__BMI': np.float64(0.024464707897405007), 'num__Cognitive Test Score': np.float64(0.021825002180314853), 'num__Education Level': np.float64(0.017660896145939874), 'cat__Country_Russia': np.float64(0.005330747689847775), 'cat__Country_India': np.float64(0.004808221185982292)}, 'risk_level': 'Moderate Risk'}


## Would the model performance improve if top 10 feature importance were used to train the models?