<a href="https://www.kaggle.com/code/aabdollahii/diabetes-prediction?scriptVersionId=269770477" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Uncomment the next line to install/upgrade imbalanced-learn (provides SMOTE) if it is missing:
#pip install -U imbalanced-learn

In [None]:
# --- 1. Import Necessary Libraries ---
# All third-party dependencies for the notebook are pulled in here so that a
# missing package fails early, before any data is loaded.
print("--- 1. Importing Libraries ---")
# pandas / numpy are re-imported so this cell can run on its own.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

# Preprocessing and Feature Selection
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Models
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Metrics and Imbalance Handling
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
from imblearn.over_sampling import SMOTE

# Model Interpretation
import shap
import lime
import lime.lime_tabular

# Suppress warnings for cleaner output
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation warnings raised by the ML libraries.
warnings.filterwarnings('ignore')
print("Libraries imported successfully.\n" + "="*80)

# --- 2. Load and Explore Primary Dataset ---
# Reads the primary CSV into `df_primary`, prints a quick overview and drops
# the rows whose gender is 'Other'.
print("\n--- 2. Loading and Exploring the Primary Dataset ---")
primary_dataset_path = '/kaggle/input/diabetes-prediction-dataset/diabetes_prediction_dataset.csv'
try:
    df_primary = pd.read_csv(primary_dataset_path)
except FileNotFoundError as err:
    # Report the path that was actually tried and stop with a real exception:
    # the interactive `exit()` helper is not meant for programs and would end
    # a script with a success status.
    raise FileNotFoundError(
        f"Error: '{primary_dataset_path}' not found. "
        "Please ensure the dataset is attached to the notebook."
    ) from err

print("Primary dataset loaded. Shape:", df_primary.shape)
print(df_primary.head())
df_primary.info()

# Handle the 'Other' category in gender by removing it for simplicity
# (.copy() gives an independent frame so later column assignments do not
# operate on a slice of the original).
df_primary = df_primary[df_primary['gender'] != 'Other'].copy()
print("\n'Other' gender category removed.")
print("\n" + "="*80)

# --- 3. Feature Engineering on Primary Dataset ---
# Adds two categorical columns to `df_primary`: 'bmi_category' and 'age_group'.
print("\n--- 3. Performing Feature Engineering ---")
# a) Binning BMI into categories
# Left-closed bins ([low, high)) with the conventional cut-offs:
#   Underweight < 18.5 <= Normal < 25 <= Overweight < 30 <= Obese.
# Using 24.9 / 29.9 as right-closed edges would put e.g. 24.95 in 'Overweight'.
# The open upper edge keeps very large values from turning into NaN, which
# one-hot encoding with drop_first would silently merge into the baseline.
bins_bmi = [0, 18.5, 25, 30, np.inf]
labels_bmi = ['Underweight', 'Normal', 'Overweight', 'Obese']
df_primary['bmi_category'] = pd.cut(df_primary['bmi'], bins=bins_bmi, labels=labels_bmi, right=False)

# b) Binning Age into groups
# Left-closed bins: Child 0-12, Teenager 13-19, Adult 20-39, Middle_Aged 40-59,
# Senior 60+. Age 0 and ages above 100 are kept instead of becoming NaN.
bins_age = [0, 13, 20, 40, 60, np.inf]
labels_age = ['Child', 'Teenager', 'Adult', 'Middle_Aged', 'Senior']
df_primary['age_group'] = pd.cut(df_primary['age'], bins=bins_age, labels=labels_age, right=False)

print("New categorical features 'bmi_category' and 'age_group' created.")
print(df_primary[['age', 'age_group', 'bmi', 'bmi_category']].head())
print("\n" + "="*80)

# --- 4. Data Preparation for Modeling ---
# Separates the target from the predictors and one-hot encodes the
# categorical predictors.
print("\n--- 4. Preparing Data in Different Formats for Modeling ---")
y = df_primary['diabetes']
X = df_primary.drop(columns=['diabetes'])

# Data with one-hot encoding for all models.
# drop_first=True omits the first level of each feature, which then acts as
# the implicit baseline.
categorical_features = [
    'gender',
    'smoking_history',
    'bmi_category',
    'age_group',
]
X_one_hot = pd.get_dummies(
    X,
    columns=categorical_features,
    drop_first=True,
)

print("Data preparation complete.")
print("\n" + "="*80)

In [None]:
# --- 5. Data Splitting and SMOTE ---
# Holds out a stratified 20% test set, then oversamples the training portion
# only, so the test set keeps its original class distribution.
print("\n--- 5. Splitting Data and Handling Class Imbalance with SMOTE ---")
X_train_oh, X_test_oh, y_train, y_test = train_test_split(
    X_one_hot,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Original training data distribution:\n{y_train.value_counts(normalize=True)}")

# Apply SMOTE to the numerical (one-hot encoded) training data.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_oh, y_train)

print(f"\nSMOTE-balanced training data distribution:\n{y_train_smote.value_counts(normalize=True)}")
print("\n" + "="*80)

# --- 6. Model Training and Hyperparameter Tuning ---
# Tunes RandomForest and XGBoost with a randomized search, then fits an SVM
# pipeline and a CatBoost model. Every fitted model is stored in
# `best_estimators` under its display name.
print("\n--- 6. Training and Tuning Models on Primary Dataset ---")
# Local import: the imbalanced-learn pipeline applies SMOTE to the training
# part of each CV split only.
from imblearn.pipeline import Pipeline as ImbPipeline

models_to_tune = {
    "RandomForest": (RandomForestClassifier(random_state=42), {
        'n_estimators': [100, 200], 'max_depth': [10, 20], 'min_samples_split': [2, 5]
    }),
    # `use_label_encoder` is no longer passed: XGBoost has deprecated it and
    # recent releases only warn about it.
    # NOTE(review): XGBoost 2.x also deprecates tree_method='gpu_hist' in
    # favour of tree_method='hist', device='cuda' - confirm the installed
    # version before switching.
    "XGBoost": (XGBClassifier(random_state=42, eval_metric='logloss', tree_method='gpu_hist'), {
        'n_estimators': [100, 200], 'max_depth': [3, 5], 'learning_rate': [0.05, 0.1]
    })
}
best_estimators = {}
for name, (model, params) in models_to_tune.items():
    print(f"\nTuning {name}...")
    # Cross-validating on data that was already oversampled puts synthetic
    # points (interpolated from training-fold rows) into the validation folds
    # and inflates the CV score. Resampling inside the pipeline keeps every
    # validation fold made of real, untouched rows.
    tuning_pipeline = ImbPipeline([('smote', SMOTE(random_state=42)), ('model', model)])
    pipeline_params = {f'model__{key}': values for key, values in params.items()}
    search = RandomizedSearchCV(tuning_pipeline, pipeline_params, n_iter=4, cv=3, scoring='roc_auc', random_state=42, n_jobs=-1, verbose=1)
    search.fit(X_train_oh, y_train)
    # The refit pipeline trained its final step on the SMOTE-balanced full
    # training set; keep only that estimator so downstream code (evaluation,
    # SHAP, LIME) still receives a plain classifier.
    best_estimators[name] = search.best_estimator_.named_steps['model']
    tuned_params = {key.split('__', 1)[1]: value for key, value in search.best_params_.items()}
    print(f"Best parameters for {name}: {tuned_params}")

print("\nTraining SVM (on CPU)...")
# The scaler is part of the pipeline so the same scaling is applied at
# prediction time.
svm_pipeline = Pipeline([('scaler', StandardScaler()), ('svm', SVC(kernel='rbf', probability=True, random_state=42))])
svm_pipeline.fit(X_train_smote, y_train_smote)
best_estimators["SVM"] = svm_pipeline

print("\nTraining CatBoost (on GPU)...")
cat_model = CatBoostClassifier(random_state=42, verbose=0, task_type='GPU')
cat_model.fit(X_train_smote, y_train_smote)
best_estimators["CatBoost"] = cat_model
print("\n" + "="*80)



In [None]:
# --- 8. Final Model Evaluation on Primary Test Set ---
# Scores every fitted model on the held-out test set and keeps the predicted
# probabilities so the ROC curves can be drawn together afterwards.
print("\n--- 8. Final Evaluation on Primary Dataset's Test Set ---")
roc_data = {}
for name, model in best_estimators.items():
    print(f"\n--- {name} Performance ---")
    y_pred = model.predict(X_test_oh)
    # Second probability column is used as the score for the positive class.
    y_proba = model.predict_proba(X_test_oh)[:, 1]
    roc_data[name] = (y_proba, y_test)

    test_accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {test_accuracy:.4f}")

    fpr, tpr, _ = roc_curve(y_test, y_proba)
    test_auc = auc(fpr, tpr)
    print(f"ROC AUC Score: {test_auc:.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

# --- ROC Curve Comparison ---
# One curve per model plus the diagonal that marks a random classifier.
plt.figure(figsize=(12, 10))
for name, (y_proba, y_true) in roc_data.items():
    fpr, tpr, _ = roc_curve(y_true, y_proba)
    roc_auc = auc(fpr, tpr)
    curve_label = f'{name} (AUC = {roc_auc:.3f})'
    plt.plot(fpr, tpr, label=curve_label)
plt.plot([0, 1], [0, 1], 'k--', label='Chance (AUC = 0.50)')
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.title('ROC Curve Comparison on Primary Dataset', fontsize=16)
plt.legend(loc='lower right')
plt.grid(True)
plt.show()
print("\n" + "="*80)


# --- 9. Model Interpretation (SHAP & LIME) ---
print("\n--- 9. Interpreting the Best Model (XGBoost) ---")
# SHAP for Global Interpretation
# Best-effort: any failure is reported and the notebook carries on.
try:
    best_xgb_model = best_estimators['XGBoost']
    # Ask XGBoost to predict on the CPU before handing the model to SHAP.
    best_xgb_model.set_params(predictor='cpu_predictor')

    explainer = shap.TreeExplainer(best_xgb_model)
    shap_values = explainer.shap_values(X_test_oh)

    print("\nSHAP Feature Importance:")
    # show=False keeps the figure open so a title can be added before display.
    shap.summary_plot(
        shap_values,
        X_test_oh,
        plot_type="bar",
        show=False,
    )
    plt.title("SHAP Feature Importance (XGBoost)")
    plt.show()
except Exception as shap_error:
    print(f"Could not generate SHAP plots. Error: {shap_error}")

# LIME for Local Interpretation
# Explains one correctly predicted diabetic test case with the tuned XGBoost
# model. Best-effort: any failure is reported and the notebook carries on.
try:
    lime_feature_names = X_train_smote.columns.tolist()
    # LIME works on plain numeric arrays; converting explicitly avoids an
    # object-dtype array when the frame mixes bool, int and float columns.
    lime_training_data = X_train_smote.to_numpy(dtype=float)

    def predict_fn_xgb(x):
        """Return XGBoost class probabilities for an array of LIME samples."""
        # Re-attach the column names so the model receives the same feature
        # layout it was fitted on.
        return best_estimators['XGBoost'].predict_proba(pd.DataFrame(x, columns=lime_feature_names))

    # A column is categorical for LIME only if it is a 0/1 indicator (one-hot
    # dummies and binary flags). Testing for '_' in the name would also flag
    # continuous columns such as 'blood_glucose_level' and 'HbA1c_level'.
    is_indicator_column = np.isin(lime_training_data, [0.0, 1.0]).all(axis=0)
    categorical_features_indices = np.flatnonzero(is_indicator_column).tolist()
    # LIME expects {column index: [name of value 0, name of value 1, ...]}.
    categorical_names = {i: ['No', 'Yes'] for i in categorical_features_indices}

    explainer = lime.lime_tabular.LimeTabularExplainer(
        training_data=lime_training_data,
        feature_names=lime_feature_names,
        class_names=['Not Diabetic', 'Diabetic'],
        categorical_features=categorical_features_indices,
        categorical_names=categorical_names,
        mode='classification'
    )
    true_positives = X_test_oh[(y_test == 1) & (best_estimators['XGBoost'].predict(X_test_oh) == 1)]
    if not true_positives.empty:
        instance_to_explain_tp = true_positives.iloc[0]
        print("\nLIME Explanation for a True Positive case:")
        explanation_tp = explainer.explain_instance(
            instance_to_explain_tp.to_numpy(dtype=float), predict_fn_xgb, num_features=10
        )
        explanation_tp.as_pyplot_figure()
        plt.suptitle("LIME: True Positive Explanation", y=1.02)
        plt.show()
    else:
        print("\nNo true positive case found in the test set; skipping LIME explanation.")
except Exception as e:
    print(f"Could not generate LIME plots. Error: {e}")
print("\n" + "="*80)
