# Evaluation of Model Performance on Prediabetes Cases

In this notebook we analyze how our models perform specifically on Prediabetes cases compared to the overall test dataset. Depending on the model, roughly 56–65% (approximately 60%) of the individuals with Prediabetes are classified as "Diabetes". Because we merged Prediabetes into the Diabetes class, such a prediction counts as "correct" in our context, which means that more than half of the Prediabetes cases are classified correctly. Therefore, our initial assumption to merge the two classes was, at the very least, not incorrect.

In [1]:
# imports
import os
import sys

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler  # KBinsDiscretizer

sys.path.append(os.path.abspath("../scripts"))

import joblib
from binning import BinningTransformer

In [2]:
# Load the raw health-indicator CSV; its Diabetes_012 column is recoded in the
# next step
raw_csv_path = "../data/raw/diabetes_012_health_indicators_BRFSS2015.csv"
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [3]:
# Derive two columns from the original Diabetes_012 label:
#   Prediabetes -> 1 only where the label equals 1, otherwise 0
#   Diabetes    -> label 2 is mapped to 1; every other label is kept as-is
# The original label column is removed afterwards.
target_raw = df["Diabetes_012"]
df["Prediabetes"] = target_raw.eq(1).astype("int64")
df["Diabetes"] = target_raw.where(target_raw != 2, 1)
df = df.drop(columns=["Diabetes_012"])
df.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Prediabetes,Diabetes
0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,...,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0,0,0.0
1,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0,0,0.0
2,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,0.0,...,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0,0,0.0
3,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,...,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0,0,0.0
4,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,1.0,...,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0,0,0.0


We apply the same preprocessing pipeline as in our preprocessing notebook.

In [4]:
# Target is the merged Diabetes label; every other column (including the
# Prediabetes helper column) stays in the feature frame for now
y = df["Diabetes"]
X = df.drop(columns=["Diabetes"])

# First split: 70% training data, 30% held out, stratified on the target
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Second split: 66% of the held-out part becomes the test set and the
# remaining 34% the validation set (about 19.8% / 10.2% of all rows)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.66,
    stratify=y_temp,
    random_state=42,
)

In [5]:
# Number of prediabetes rows that ended up in the test split
print(len(X_test.loc[X_test["Prediabetes"] == 1]))

883


In [6]:
# Feature groups, one list per kind of preprocessing

# Treated as binary flags: passed through without further preprocessing
binary_features = [
    "HighBP", "HighChol", "CholCheck", "Smoker", "Stroke",
    "HeartDiseaseorAttack", "PhysActivity", "Fruits", "Veggies",
    "HvyAlcoholConsump", "AnyHealthcare", "NoDocbcCost", "DiffWalk", "Sex",
    "Prediabetes",
]

# Treated as ordinal: passed through without further preprocessing
ordinal_features = ["GenHlth", "Age", "Education", "Income"]

# Numerical: will be normalized
numerical_features = ["MentHlth", "PhysHlth"]

# Will be binned to 0-3
binned_features = ["BMI"]

In [7]:
# BMI bin edges: 0, 18.5, 25, 30 and an upper edge one above the largest BMI in
# the whole dataframe, giving four bins labelled 0..3.
# NOTE(review): whether bins are left- or right-closed is decided inside
# BinningTransformer - confirm there.
bmi_upper_edge = df["BMI"].max() + 1
bin_edges = [0, 18.5, 25, 30, bmi_upper_edge]
num_bins = len(bin_edges) - 1
labels = [*range(num_bins)]

# Building blocks of the preprocessing pipeline
numerical_pipeline = Pipeline([("scaler", StandardScaler())])
binning_transformer = BinningTransformer(bins=bin_edges, labels=labels)

# Scale the numerical columns, bin BMI, and pass binary/ordinal columns through.
# The order of the entries below is the column order of the transformed output.
preprocessor = ColumnTransformer(
    [
        ("num", numerical_pipeline, numerical_features),
        ("binned", binning_transformer, binned_features),
        ("binary", "passthrough", binary_features),
        ("ordinal", "passthrough", ordinal_features),
    ]
)

In [8]:
# Fit the preprocessor on the training split only ...
X_train_prep = preprocessor.fit_transform(X_train)

# ... and reuse the fitted preprocessor for the validation and test splits
X_val_prep, X_test_prep = (
    preprocessor.transform(split) for split in (X_val, X_test)
)

In [9]:
# Column labels in the same order as the transformers listed in the preprocessor
column_names = [
    *numerical_features,
    *binned_features,
    *binary_features,
    *ordinal_features,
]

# Wrap the transformed test data in a DataFrame that keeps the original row index
X_test_prep = pd.DataFrame(
    data=X_test_prep,
    index=X_test.index,
    columns=column_names,
)

In [10]:
# 0/1 indicator per test row: 1 where the row is a prediabetes instance
is_prediabetes = X_test_prep["Prediabetes"].eq(1)
prediabetes_array = is_prediabetes.astype(int).to_numpy()
prediabetes_array

array([0, 0, 0, ..., 0, 0, 0])

In [11]:
# Boolean mask of the prediabetes rows; it is positional, so it lines up with
# both X_test_prep and y_test
prediabetes_mask = prediabetes_array == 1

# whole test set, without the Prediabetes helper column
# (presumably the models were trained without it - TODO confirm).
# errors="ignore" keeps this cell re-runnable: X_test_prep is overwritten here,
# so on a second execution the column is already gone and a plain drop would
# raise a KeyError.
X_test_prep = X_test_prep.drop(columns=["Prediabetes"], errors="ignore")

# test set only containing prediabetes instances
prediabetes_x_test = X_test_prep.loc[prediabetes_mask]
prediabetes_y_test = y_test.loc[prediabetes_mask]

# Features and labels of the subset must describe exactly the same rows
assert prediabetes_x_test.index.equals(prediabetes_y_test.index)

print(prediabetes_x_test.shape)
print(prediabetes_y_test.shape)

(883, 21)
(883,)


Loading the trained models

In [12]:
# NOTE: joblib.load unpickles arbitrary Python objects. Only load model files
# that come from a trusted source (see the scikit-learn model persistence docs).

# logistic regression
lr_model_filename = (
    "../models/logistic_regression/lg_model_cv_sampling_pca20241129_165533.pkl"
)

# naive bayes
naive_bayes_filename = (
    "../models/naive_bayes/nb_model_cv_f1_20241129_111525.pkl"
)

# support vector machine
svm_model_filename = "../models/support_vector_machine/svm_model_best-f1_20241129_100911.pkl"

# decision tree
dc_filename = "../models/decision_trees_ensembles/dt_best_model_cv_sampling_pca20241129_184533.pkl"

# random forest
# FIXME: this path is identical to dc_filename, so the "Random Forest" entry
# loads the decision-tree pickle and will report exactly the decision tree's
# scores. Replace it with the path of the saved random-forest model.
rf_filename = "../models/decision_trees_ensembles/dt_best_model_cv_sampling_pca20241129_184533.pkl"

# knn
knn_filename = "../models/knn/k=100_best_f1_postiv_20241128_204259.pkl"

# Nearest Centroid
nc_filename = "../models/nearest_centroid/best_20241127_201059.pkl"

# Display name -> pickle path, in the order the models are reported
model_files = {
    "Logistic Regression": lr_model_filename,
    "Decision Tree": dc_filename,
    "Random Forest": rf_filename,
    "Support Vector Machine": svm_model_filename,
    "K-Nearest Neighbors": knn_filename,
    "Nearest Centroid": nc_filename,
    "Naive Bayes": naive_bayes_filename,
}

# Make copy-paste mistakes visible: two display names pointing at the same
# file would silently produce duplicated rows in the results table.
first_name_for_path = {}
for display_name, pkl_path in model_files.items():
    if pkl_path in first_name_for_path:
        print(
            f"WARNING: '{display_name}' loads the same file as "
            f"'{first_name_for_path[pkl_path]}' ({pkl_path}); "
            "their results will be identical."
        )
    else:
        first_name_for_path[pkl_path] = display_name

# all models
model_dict = {name: joblib.load(path) for name, path in model_files.items()}

# Individual handles, kept for any cell that refers to a model directly
logistic_regression = model_dict["Logistic Regression"]
naive_bayes = model_dict["Naive Bayes"]
svm = model_dict["Support Vector Machine"]
dc = model_dict["Decision Tree"]
rf = model_dict["Random Forest"]
knn = model_dict["K-Nearest Neighbors"]
nc = model_dict["Nearest Centroid"]

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [13]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [14]:
# For every loaded model, compare the accuracy on the whole test set with the
# accuracy on the prediabetes-only subset.
# Every label in the prediabetes subset is 1, so accuracy on that subset equals
# the recall of class 1; the class-0 row and the macro average of the printed
# report are artefacts of that (support 0, zero_division=0).
results_list = []
for model_name, model in model_dict.items():
    # Predictions for the complete test set
    y_test_pred = model.predict(X_test_prep)

    print(f"\n=== Model: {model_name} ===\n")
    accuracy_overall = accuracy_score(y_test, y_test_pred)

    # Predictions and report for the prediabetes rows only
    prediabetes_y_test_pred = model.predict(prediabetes_x_test)
    report = classification_report(
        prediabetes_y_test,
        prediabetes_y_test_pred,
        digits=4,
        zero_division=0,
    )
    print("Classification Report:\n", report)

    # Positive difference: the model scores higher on the prediabetes subset
    accuracy_prediabetes = accuracy_score(prediabetes_y_test, prediabetes_y_test_pred)
    accuracy_diff = accuracy_prediabetes - accuracy_overall
    print(f"Accuracy Difference: {accuracy_diff:.4f}")

    # One summary row per model
    model_result = {
        "Model": model_name,
        "Overall Accuracy": accuracy_overall,
        "Prediabetes Accuracy": accuracy_prediabetes,
        "Accuracy Difference": accuracy_diff,
    }
    results_list.append(model_result)


=== Model: Logistic Regression ===

Classification Report:
               precision    recall  f1-score   support

         0.0     0.0000    0.0000    0.0000         0
         1.0     1.0000    0.6195    0.7650       883

    accuracy                         0.6195       883
   macro avg     0.5000    0.3097    0.3825       883
weighted avg     1.0000    0.6195    0.7650       883

Accuracy Difference: -0.1085

=== Model: Decision Tree ===

Classification Report:
               precision    recall  f1-score   support

         0.0     0.0000    0.0000    0.0000         0
         1.0     1.0000    0.6512    0.7888       883

    accuracy                         0.6512       883
   macro avg     0.5000    0.3256    0.3944       883
weighted avg     1.0000    0.6512    0.7888       883

Accuracy Difference: -0.0476

=== Model: Random Forest ===

Classification Report:
               precision    recall  f1-score   support

         0.0     0.0000    0.0000    0.0000         0
        

In [15]:
# Convert the list to a dataframe
results_df = pd.DataFrame(results_list)

results_df

Unnamed: 0,Model,Overall Accuracy,Prediabetes Accuracy,Accuracy Difference
0,Logistic Regression,0.727986,0.619479,-0.108507
1,Decision Tree,0.698799,0.651189,-0.04761
2,Random Forest,0.698799,0.651189,-0.04761
3,Support Vector Machine,0.479245,0.643262,0.164017
4,K-Nearest Neighbors,0.743196,0.563986,-0.17921
5,Nearest Centroid,0.690219,0.642129,-0.04809
6,Naive Bayes,0.735551,0.590034,-0.145517
