In [1]:
import os
from collections import Counter

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
from imblearn.over_sampling import SMOTE
from scipy.stats import zscore
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix, classification_report, roc_auc_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [2]:
# Load the diabetes health-indicators CSV from the local Data/ folder.
original_df = pd.read_csv('Data/diabetes_012_health_indicators_BRFSS2015.csv')

# Count the rows in each class of the target column to show the class balance.
target_counts = original_df["Diabetes_012"].value_counts()

print('Distribution for the values of for the target variable\n')
print(target_counts)

Distribution for the values of for the target variable

Diabetes_012
0.0    213703
2.0     35346
1.0      4631
Name: count, dtype: int64


In [3]:
# Target: the Diabetes_012 label column; predictors: every remaining column.
y = original_df["Diabetes_012"]
X = original_df.drop(columns="Diabetes_012")

# Show the first predictor row as a quick sanity check.
X.head(1)

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0


In [4]:
# Predictors treated as continuous; these are the only columns that get standardized.
numerical_cols = ['BMI', 'MentHlth', 'PhysHlth', 'Age']

# Predictors treated as categorical; each one is one-hot encoded below.
categorical_cols = [
    'HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke', 'HeartDiseaseorAttack',
    'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare',
    'NoDocbcCost', 'GenHlth', 'DiffWalk', 'Sex', 'Education', 'Income'
]

# One-hot encode the categorical features, dropping the first level of each.
# get_dummies passes every column NOT listed in `columns` through unchanged,
# so X_cat already contains the numerical columns as well.
X_cat = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Numerical features on their own (kept as a separate frame for inspection).
X_num = X[numerical_cols]

# Final design matrix: numerical columns first, then everything else.
# Selecting from X_cat gives the same frame as concatenating X_num with X_cat
# and then dropping the duplicated numerical columns, without ever creating
# the duplicates.
# NOTE: this rebinds X, so this cell cannot be re-run on its own; re-run the
# cell that defines X first.
other_cols = [col for col in X_cat.columns if col not in numerical_cols]
X = X_cat[numerical_cols + other_cols]

# Hold out 20% of the rows as the test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Take explicit copies before assigning into the splits: writing columns into
# the frames returned by train_test_split can raise SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training rows only (it stores their mean and std),
# then apply those same statistics to the test rows to avoid leakage.
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# Training features + target in one frame, with a fresh 0..n-1 index on both
# parts so the rows line up positionally.
df_train = pd.concat([X_train.reset_index(drop=True), y_train.reset_index(drop=True)], axis=1)

In [5]:
# 'MentHlth' is left out of the outlier screen: per the original author, a
# single valid value of that column was being flagged as an outlier.
numerical_cols2 = ['BMI', 'PhysHlth', 'Age']

# Z-score cut-off. For reference: ±1 SD ≈ 68.3% of data, ±2 SD ≈ 95.5%, ±3 SD ≈ 99.7%.
threshold = 3

# These columns of df_train were standardized when the train/test split was
# scaled, so their absolute values are already absolute z-scores.
z_scores = df_train[numerical_cols2].abs()

# A row is an outlier when ANY screened feature reaches the threshold.
outlier_mask = z_scores.ge(threshold).any(axis=1)

# Keep the flagged rows for inspection and build the cleaned training frame
# (re-indexed 0..n-1) from the rest.
outliers = df_train.loc[outlier_mask]
df_train_cleaned = df_train.loc[~outlier_mask].reset_index(drop=True)

# Report how many rows were dropped.
print(f"\nRemoved {len(outliers)} outliers from training data.")


Removed 2366 outliers from training data.


In [6]:
# Fit a fresh scaler on the outlier-free training rows. Their numerical columns
# were already standardized once, so this is a second standardization on top.
scaler = StandardScaler().fit(df_train_cleaned[numerical_cols])
df_train_cleaned[numerical_cols] = scaler.transform(df_train_cleaned[numerical_cols])

# Put the test rows through the same second transformation.
# NOTE: this overwrites X_test in place. From here on X_test is on the scale of
# df_train_cleaned and no longer on the scale of df_train.
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# Side-by-side summaries: training data before outlier removal vs. the
# cleaned and re-standardized training data.
for heading, frame in (
    ("Original Standardized Training Data:", df_train),
    ("\nCleaned + Re-standardized Training Data:", df_train_cleaned),
):
    print(heading)
    print(frame[numerical_cols].describe())

Original Standardized Training Data:
                BMI      MentHlth      PhysHlth           Age
count  2.029440e+05  2.029440e+05  2.029440e+05  2.029440e+05
mean   2.687503e-16  4.513016e-17 -7.142400e-18 -1.776497e-16
std    1.000002e+00  1.000002e+00  1.000002e+00  1.000002e+00
min   -2.475946e+00 -4.299017e-01 -4.871967e-01 -2.303520e+00
25%   -6.611649e-01 -4.299017e-01 -4.871967e-01 -6.658286e-01
50%   -2.074697e-01 -4.299017e-01 -4.871967e-01 -1.075203e-02
75%    3.974572e-01 -1.606003e-01 -1.431950e-01  6.443245e-01
max    1.052998e+01  3.609619e+00  2.952820e+00  1.626939e+00

Cleaned + Re-standardized Training Data:
                BMI      MentHlth      PhysHlth           Age
count  2.005780e+05  2.005780e+05  2.005780e+05  2.005780e+05
mean   8.785340e-18 -9.201581e-17  2.897745e-17 -3.276790e-17
std    1.000002e+00  1.000002e+00  1.000002e+00  1.000002e+00
min   -2.855967e+00 -4.273583e-01 -4.833907e-01 -2.305616e+00
25%   -7.155151e-01 -4.273583e-01 -4.833907e-01 -6.69

In [7]:
# SMOTE and Counter are imported in the notebook's import cell at the top.

# --- Oversample the cleaned training set (outliers removed) ---
X_train_smote = df_train_cleaned.drop('Diabetes_012', axis=1)
y_train_smote = df_train_cleaned['Diabetes_012']

# Class counts before resampling.
print("Before SMOTE oversampling:", Counter(y_train_smote))

# Synthesize minority-class rows (fixed seed for reproducibility).
# NOTE(review): SMOTE interpolates every column, including the one-hot dummy
# columns, so synthetic rows can carry non-binary values there. SMOTENC may be
# a better fit for this feature mix -- confirm before relying on these models.
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train_smote, y_train_smote)

# Class counts after resampling.
print("After SMOTE oversampling:", Counter(y_smote))

# --- Optional comparison: oversample the training set WITHOUT outlier removal ---
X_train_smote_raw = df_train.drop('Diabetes_012', axis=1)
y_train_smote_raw = df_train['Diabetes_012']

smote_raw = SMOTE(random_state=42)
X_smote_raw, y_smote_raw = smote_raw.fit_resample(X_train_smote_raw, y_train_smote_raw)

print("After SMOTE on raw (no outlier removal):", Counter(y_smote_raw))


Before SMOTE oversampling: Counter({0.0: 169449, 2.0: 27521, 1.0: 3608})
After SMOTE oversampling: Counter({0.0: 169449, 2.0: 169449, 1.0: 169449})
After SMOTE on raw (no outlier removal): Counter({0.0: 170908, 2.0: 170908, 1.0: 170908})


In [8]:
# SVC, RandomForestClassifier and GradientBoostingClassifier are imported in
# the notebook's import cell at the top.

# One-vs-rest wrapper around an L1-penalized logistic regression whose
# regularization strength is chosen by 3-fold CV on ROC AUC. liblinear is used
# as the solver for the L1 penalty.
ovr_logreg_cv = OneVsRestClassifier(
    LogisticRegressionCV(
        class_weight='balanced',
        cv=3,
        max_iter=200,
        penalty='l1',
        random_state=42,
        scoring='roc_auc',
        solver='liblinear'
    )
)

# Candidate classifiers, keyed by the name used in printed reports and in the
# saved-model file names. All use random_state=42 for reproducibility.
models = {
    # probability=True enables predict_proba, but makes fit noticeably slower
    # because scikit-learn runs an internal 5-fold cross-validation for it.
    'SVC': SVC(probability=True, class_weight='balanced', random_state=42),
    'RandomForest': RandomForestClassifier(class_weight='balanced', random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
    'OneVsRest_LogRegCV': ovr_logreg_cv,
}

In [9]:
def evaluate_models(models, X_train, y_train, X_test, y_test, label=""):
    """
    Fit, save, and score every classifier in ``models``.

    Each estimator is fitted on the training data (in place, so the objects in
    ``models`` are fitted afterwards), written with joblib to
    ``saved_models/<label>/<name>_<label>.joblib`` (spaces in ``label`` are
    replaced by underscores), and then scored on the test data. Accuracy, the
    classification report, and a labelled confusion matrix are printed.
    Nothing is returned.

    Parameters:
    - models: dict of model name → model object
    - X_train, y_train: training features and labels
    - X_test, y_test: test features and labels
    - label: string label to tag the evaluation (e.g., 'SMOTE_cleaned');
      it appears in the printed headers and in the output paths
    """
    # File-system-friendly form of the label, shared by the directory and file names.
    label_slug = label.replace(' ', '_')

    # Create a directory to store models
    save_dir = f"saved_models/{label_slug}"
    os.makedirs(save_dir, exist_ok=True)

    for name, model in models.items():
        print(f"\n🔹 Model: {name} ({label})")

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Save the fitted model
        filename = os.path.join(save_dir, f"{name}_{label_slug}.joblib")
        joblib.dump(model, filename)
        print(f"Saved model to: {filename}")

        # Evaluate
        acc = accuracy_score(y_test, y_pred)
        print(f"Accuracy: {acc:.4f}")
        print("\nClassification Report:\n", classification_report(y_test, y_pred))

        # Use the union of true and predicted classes for BOTH the matrix and
        # its axis labels. confusion_matrix's default label set is that union,
        # so labelling the axes from y_test alone fails with a shape mismatch
        # whenever the model predicts a class that is absent from y_test.
        class_labels = np.union1d(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred, labels=class_labels)
        cm_df = pd.DataFrame(cm, index=[f'Actual {cls}' for cls in class_labels],
                                columns=[f'Predicted {cls}' for cls in class_labels])
        print("\nConfusion Matrix:\n", cm_df)


In [10]:
# Evaluate using outlier-removed training data.
# X_test was re-standardized in place with the scaler refit on the cleaned
# training data, so it is on the same scale as X_smote.
evaluate_models(models, X_smote, y_smote, X_test, y_test, label="SMOTE with cleaned data")

# Evaluate using outlier-inclusive training data.
# X_smote_raw comes from df_train, which was standardized only once, whereas
# X_test has since been transformed a second time by the refit `scaler`.
# Undo that second transformation on a copy so the test features are on the
# same scale as the data this model is trained on.
# This assumes the notebook was run top to bottom once, i.e. `scaler` is the
# refit scaler and has been applied to X_test exactly once.
X_test_raw = X_test.copy()
X_test_raw[numerical_cols] = scaler.inverse_transform(X_test[numerical_cols])
evaluate_models(models, X_smote_raw, y_smote_raw, X_test_raw, y_test, label="SMOTE with raw data")


🔹 Model: SVC (SMOTE with cleaned data)
Saved model to: saved_models/SMOTE_with_cleaned_data\SVC_SMOTE_with_cleaned_data.joblib
Accuracy: 0.7096

Classification Report:
               precision    recall  f1-score   support

         0.0       0.93      0.74      0.82     42795
         1.0       0.03      0.12      0.04       944
         2.0       0.34      0.60      0.43      6997

    accuracy                           0.71     50736
   macro avg       0.43      0.48      0.43     50736
weighted avg       0.83      0.71      0.76     50736


Confusion Matrix:
             Predicted 0.0  Predicted 1.0  Predicted 2.0
Actual 0.0          31717           3356           7722
Actual 1.0            415            110            419
Actual 2.0           2002            821           4174

🔹 Model: RandomForest (SMOTE with cleaned data)
Saved model to: saved_models/SMOTE_with_cleaned_data\RandomForest_SMOTE_with_cleaned_data.joblib
Accuracy: 0.8086

Classification Report:
               pre

In [11]:
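# NOTE: superseded, commented-out earlier version of evaluate_models (no model saving); the active definition is in the cell that defines evaluate_models above.
# def evaluate_models(models, X_train, y_train, X_test, y_test, label=""):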
#     """
#     Trains and evaluates a dictionary of classification models.

#     For each model, it prints the accuracy, classification report, and a labeled confusion matrix
#     using the provided training and test datasets. 
    
#     Parameters:
#     - models: dict of model name → model object
#     - X_train, y_train: training features and labels
#     - X_test, y_test: test features and labels
#     - label: optional string to tag the evaluation (e.g., 'with outliers', 'without outliers')
#     """
#     for name, model in models.items():
#         print(f"\n🔹 Model: {name} ({label})")

#         model.fit(X_train, y_train)
#         y_pred = model.predict(X_test)

#         acc = accuracy_score(y_test, y_pred)
#         print(f"Accuracy: {acc:.4f}")
#         print("\nClassification Report:\n", classification_report(y_test, y_pred))

#         cm = confusion_matrix(y_test, y_pred)
#         cm_df = pd.DataFrame(cm, index=[f'Actual {i}' for i in np.unique(y_test)],
#                                 columns=[f'Predicted {i}' for i in np.unique(y_test)])
#         print("\nConfusion Matrix:\n", cm_df)