In [29]:
# ==============================================================================
# SCRIPT: 02 - Model Training, Comparison, and Selection (Notebook Version)
# AUTHOR: [Your Name]
# DATE:   17-Sep-2025
# ==============================================================================

# ==================================================
# 1. SETUP AND IMPORTS
# ==================================================
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import sys
import os
import joblib
import numpy as np

# Scikit-learn imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import lightgbm as lgb

# Imbalanced-learn imports
from imblearn.combine import SMOTETomek

# Other libraries
import shap
import seaborn as sns

# Jupyter display
from IPython.display import display

# ==================================================
# 2. PROJECT ROOT HANDLING (Notebook Safe)
# ==================================================
PROJECT_ROOT = Path(os.getcwd()).resolve().parent.parent
sys.path.append(str(PROJECT_ROOT))

# Import utility
try:
    from src.utils.data_loader import load_processed_data
except ImportError:
    def load_processed_data():
        """Fallback function to load processed data"""
        processed_data_path = PROJECT_ROOT / 'data' / 'processed' / 'processed_diabetes_data.csv'
        if processed_data_path.exists():
            return pd.read_csv(processed_data_path)
        return pd.DataFrame()

# --- Config / Paths ---
sns.set(style="whitegrid")
BEST_MODEL_FIG_DIR = PROJECT_ROOT / 'reports' / 'figures' / 'best_model_analysis'
BEST_MODEL_FIG_DIR.mkdir(parents=True, exist_ok=True)
MODELS_DIR = PROJECT_ROOT / 'models'
MODELS_DIR.mkdir(parents=True, exist_ok=True)

print(">> Setup complete.")

# ==================================================
# 3. LOAD PROCESSED DATA
# ==================================================
df = load_processed_data()
if df.empty:
    raise FileNotFoundError("Processed data not found. Run preprocessing first.")
else:
    print(f">> Data loaded successfully. Shape: {df.shape}")

# ==================================================
# 4. PREPARE MODELING DATAFRAME
# ==================================================
X_model = df.drop(["target", "id",'Systolic_BP', 'Diastolic_BP'], axis=1).copy()
y = df["target"].copy()

# --- Encoding ---
ordinal_cols = ["Cholesterol_Level", "Glucose_Level", "Smoking_Status",
                "Alcohol_Intake", "Physical_Activity", "BP_level"]
nominal_cols = ["Sex"]

encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
X_model[ordinal_cols] = encoder.fit_transform(X_model[ordinal_cols])
X_model.rename(columns={col: f"ordinal_{col}" for col in ordinal_cols}, inplace=True)

ohe = OneHotEncoder(drop='first', sparse_output=False, handle_unknown="ignore")
encoded_features = ohe.fit_transform(X_model[nominal_cols])
encoded_df = pd.DataFrame(encoded_features, columns=ohe.get_feature_names_out(nominal_cols), index=X_model.index)
X_model = X_model.drop(nominal_cols, axis=1)
X_model = pd.concat([X_model, encoded_df], axis=1)

print(">> Feature encoding complete. Final shape:", X_model.shape)

# ==================================================
# 5. TRAIN-TEST SPLIT AND SMOTETOMEK
# ==================================================
X_train, X_test, y_train, y_test = train_test_split(
    X_model, y, test_size=0.2, random_state=42, stratify=y
)

smote_tomek = SMOTETomek(sampling_strategy='auto', random_state=42)
X_train_res, y_train_res = smote_tomek.fit_resample(X_train, y_train)
print(">> Train-test split and SMOTETomek completed.")
print(f"   Train shape: {X_train_res.shape}, Test shape: {X_test.shape}")

# ==================================================
# 6. DEFINE GENERIC TRAINING FUNCTION WITH EARLY STOPPING
# ==================================================
def train_with_early_stopping(model, X_train, y_train, X_val, y_val, early_stopping_rounds=20, verbose=True):
    model_name = type(model).__name__

    # XGBoost
    if model_name == "XGBClassifier":
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            early_stopping_rounds=early_stopping_rounds,
            verbose=verbose if verbose else 0
        )
        return model

    # LightGBM
    elif model_name == "LGBMClassifier":
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            callbacks=[lgb.early_stopping(stopping_rounds=early_stopping_rounds, verbose=verbose)]
        )
        return model

    # SVM large dataset sampling
    elif model_name == "SVC":
        if len(X_train) > 10000:
            sample_size = min(10000, len(X_train))
            idx = np.random.choice(len(X_train), sample_size, replace=False)
            X_sample, y_sample = X_train.iloc[idx], y_train.iloc[idx]
            model.fit(X_sample, y_sample)
        else:
            model.fit(X_train, y_train)
        return model

    # Other models
    else:
        model.fit(X_train, y_train)
        return model

# ==================================================
# 7. DEFINE MODELS (no class_weight, use SMOTETomek only)
# ==================================================
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000, n_jobs=-1),
    'XGBoost': XGBClassifier(eval_metric="logloss", random_state=42, verbosity=0, n_jobs=-1, n_estimators=200),
    'AdaBoost': AdaBoostClassifier(random_state=42, n_estimators=200),
    'LightGBM': LGBMClassifier(n_estimators=200, random_state=42, n_jobs=-1, verbosity=-1),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    'SVM': SVC(kernel='rbf', probability=True, random_state=42)
}

# ==================================================
# 8. TRAIN MODELS
# ==================================================
results = []
trained_models = {}

for name, model in models.items():
    print(f"--- Training {name} ---")
    try:
        trained_model = train_with_early_stopping(model, X_train_res, y_train_res, X_test, y_test)
        trained_models[name] = trained_model

        y_pred = trained_model.predict(X_test)

        try:
            if hasattr(trained_model, "predict_proba"):
                y_pred_proba = trained_model.predict_proba(X_test)[:, 1]
                roc_auc = roc_auc_score(y_test, y_pred_proba)
            else:
                y_pred_proba = np.nan
                roc_auc = np.nan
        except Exception as e:
            print(f"Warning: Could not calculate probabilities for {name}. Error: {str(e)}")
            y_pred_proba = np.nan
            roc_auc = np.nan

        results.append({
            'Model': name,
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred),
            'Recall': recall_score(y_test, y_pred),
            'F1-Score': f1_score(y_test, y_pred),
            'ROC AUC': roc_auc
        })
    except Exception as e:
        print(f"Error training {name}: {str(e)}")
        continue

# ==================================================
# 9. COMPARE MODELS AND SELECT BEST (ROC AUC first, then Recall)
# ==================================================
results_df = pd.DataFrame(results).sort_values(
    by=['ROC AUC', 'Recall'], ascending=False
).reset_index(drop=True)

print("\n>> Model Performance Summary:")
display(results_df)

best_model_name = results_df.iloc[0]['Model']
print(f"\n>> Best Model Selected (based on ROC AUC, then Recall): {best_model_name}")

>> Setup complete.
Loading processed data from: C:\Users\91833\end to end projects\diabetes prediction\data\processed\processed_heart_disease.csv
>> Data loaded successfully. Shape: (68361, 14)
>> Feature encoding complete. Final shape: (68361, 10)
>> Train-test split and SMOTETomek completed.
   Train shape: (46344, 10), Test shape: (13673, 10)
--- Training Logistic Regression ---
--- Training XGBoost ---
Error training XGBoost: XGBClassifier.fit() got an unexpected keyword argument 'early_stopping_rounds'
--- Training AdaBoost ---
--- Training LightGBM ---
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[25]	valid_0's binary_logloss: 0.552043
--- Training Random Forest ---
--- Training SVM ---

>> Model Performance Summary:


Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score,ROC AUC
0,LightGBM,0.724786,0.745591,0.674346,0.708181,0.791805
1,AdaBoost,0.719374,0.752496,0.645695,0.695016,0.783467
2,Logistic Regression,0.690997,0.706723,0.642741,0.673215,0.757674
3,SVM,0.690631,0.702858,0.650126,0.675464,0.752764
4,Random Forest,0.687998,0.685199,0.684389,0.684794,0.746188



>> Best Model Selected (based on ROC AUC, then Recall): LightGBM


In [30]:
from sklearn.metrics import classification_report, confusion_matrix, roc_curve
# --- A. Classification Report ---
y_pred_best = trained_models[best_model_name].predict(X_test)
print("Classification Report for the Best Model:")
print(classification_report(y_test, y_pred_best))

# Get probability predictions for ROC curve
y_pred_proba_best = trained_models[best_model_name].predict_proba(X_test)[:, 1]

# --- B. Confusion Matrix ---
cm = confusion_matrix(y_test, y_pred_best)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No Disease', 'Disease'], yticklabels=['No Disease', 'Disease'])
plt.title(f'Confusion Matrix for {best_model_name}')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.savefig(BEST_MODEL_FIG_DIR / '1_confusion_matrix.png', bbox_inches='tight')
plt.close()
print(f"Confusion matrix plot saved.")

# Calculate ROC curve
fpr, tpr, _ = roc_curve(y_test, y_pred_proba_best)
auc = roc_auc_score(y_test, y_pred_proba_best)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f"AUC = {auc:.2f}")
plt.plot([0, 1], [0, 1], 'k--')
plt.title(f'ROC Curve for {best_model_name}')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.savefig(BEST_MODEL_FIG_DIR / '2_roc_auc_curve.png', bbox_inches='tight')
plt.close()
print(f"ROC AUC curve plot saved.")

Classification Report for the Best Model:
              precision    recall  f1-score   support

           0       0.71      0.77      0.74      6902
           1       0.75      0.67      0.71      6771

    accuracy                           0.72     13673
   macro avg       0.73      0.72      0.72     13673
weighted avg       0.73      0.72      0.72     13673

Confusion matrix plot saved.
ROC AUC curve plot saved.


In [31]:
# --- D. SHAP Analysis ---
# UPDATE: Added LGBM and AdaBoost to the TreeExplainer check
if isinstance(trained_models[best_model_name], (XGBClassifier, RandomForestClassifier, LGBMClassifier, AdaBoostClassifier)):
    explainer = shap.TreeExplainer(trained_models[best_model_name])
    shap_values = explainer.shap_values(X_test)
    print("Calculating SHAP values with TreeExplainer...")
else:
    explainer = shap.KernelExplainer(trained_models[best_model_name].predict_proba, shap.sample(X_train_res, 100))
    shap_values = explainer.shap_values(X_test)[1]
    print("Calculating SHAP values with KernelExplainer (this may be slow)...")

shap.summary_plot(shap_values, X_test, show=False)
plt.title(f"SHAP Summary for {best_model_name}")
plt.savefig(BEST_MODEL_FIG_DIR / '3_shap_summary.png', bbox_inches='tight')
plt.close()
print(f"SHAP summary plot saved.")



Calculating SHAP values with TreeExplainer...
SHAP summary plot saved.


In [33]:
# ==================================================
# 8. SAVE THE BEST MODEL
# ==================================================
print("\n" + "="*50)
print("8. Saving the Best Model")
print("="*50)

model_path = MODELS_DIR / f'best_model_{best_model_name.lower().replace(" ", "_")}.joblib'
joblib.dump(best_model_name, model_path)
print(f" Best model saved to: {model_path}")

print("\nModel Training Script Finished Successfully.")


8. Saving the Best Model
 Best model saved to: C:\Users\91833\end to end projects\diabetes prediction\models\best_model_lightgbm.joblib

Model Training Script Finished Successfully.
