In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix, classification_report, roc_auc_score, log_loss
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from collections import Counter
import matplotlib.pyplot as plt
import joblib

original_df = pd.read_csv('Data/diabetes_012_health_indicators_BRFSS2015.csv')

y = original_df.Diabetes_012
X = original_df.drop('Diabetes_012', axis=1)
X.head(1)

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0


In [2]:
# Numerical Predictors
numerical_cols = ['BMI', 'MentHlth', 'PhysHlth', 'Age']
# numerical_cols = list(dict.fromkeys(numerical_cols))  # Remove duplicates just in case

# Categorical Predictors
categorical_cols = [
    'HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke', 'HeartDiseaseorAttack',
    'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare',
    'NoDocbcCost', 'GenHlth', 'DiffWalk', 'Sex', 'Education', 'Income'
]

# One-hot encode categorical features and drop the first category of each
# Feature Engineering - Turns categorical data into a numerical format that a model can understand
X_cat = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Extract the numerical features
X_num = X[numerical_cols]

# Concatenate numerical and encoded categorical features
X = pd.concat([X_num, X_cat], axis=1)

# Remove any duplicated columns that may result from concat
X = X.loc[:, ~X.columns.duplicated()]

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaler will only fit on training data so, calculates Mean & STD and stores this on the scaler
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols]) # Uses scaler that has only seen the traning data -> standardize 

# Combine transformed training group +  assigned clean, matching row indices
df_train = pd.concat([X_train.reset_index(drop=True), y_train.reset_index(drop=True)], axis=1)

In [3]:
# Removing 'MentHlth' from being an outlier as a single unique value is being treated as an outlier but, is a valid data point
numerical_cols2 = ['BMI', 'Age']

# Set a z-score threshold: ±1 SD → ~68.3% of data, ±2 SD → ~95.5% of data, ±3 SD → ~99.7% of data
threshold = 3    

# Calculate absolute z-scores for numerical columns
z_scores = np.abs(df_train[numerical_cols2])

# Mask: rows where any numerical feature has a z-score >= threshold of 3
outlier_mask = (z_scores >= threshold).any(axis=1)

# Get the outlier rows
outliers = df_train[outlier_mask]

# Get the cleaned training data (rows that are NOT outliers)
df_train_cleaned = df_train[~outlier_mask].reset_index(drop=True)

# Show the outliers
# print("Outlier rows removed from training data:")
# print(outliers)

# Print how many were removed
print(f"\nRemoved {len(outliers)} outliers from training data.")


Removed 2366 outliers from training data.


In [4]:
# Refit scaler on cleaned training data
scaler = StandardScaler()
df_train_cleaned[numerical_cols] = scaler.fit_transform(df_train_cleaned[numerical_cols])

# Apply same transformation to test data
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

In [5]:
model = joblib.load("models/SVC_SMOTE_with_raw_data.joblib")

thresholds = [0.5, 0.4, 0.3, 0.2]
probs = model.predict_proba(X_test)[:, 1]

for t in thresholds:
    preds = (probs > t).astype(int)
    print(f"\nThreshold: {t}")
    print(classification_report(y_test, preds, digits=3))

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations



Threshold: 0.5
              precision    recall  f1-score   support

         0.0      0.847     0.945     0.893     42795
         1.0      0.026     0.082     0.039       944
         2.0      0.000     0.000     0.000      6997

    accuracy                          0.799     50736
   macro avg      0.291     0.342     0.311     50736
weighted avg      0.715     0.799     0.754     50736


Threshold: 0.4
              precision    recall  f1-score   support

         0.0      0.849     0.922     0.884     42795
         1.0      0.026     0.115     0.042       944
         2.0      0.000     0.000     0.000      6997

    accuracy                          0.780     50736
   macro avg      0.292     0.346     0.309     50736
weighted avg      0.716     0.780     0.746     50736


Threshold: 0.3
              precision    recall  f1-score   support

         0.0      0.851     0.889     0.870     42795
         1.0      0.025     0.161     0.043       944
         2.0      0.000    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


- Plot Precision-Recall Curve- Use it to find the threshold that gives you the best balance:
- This tells you how well your model handles the trade-offs and shows whether tuning the threshold actually has value — or if it’s all noise.

In [6]:
from sklearn.metrics import precision_recall_curve

prec, rec, thresh = precision_recall_curve(y_test, probs)
f1 = 2 * (prec * rec) / (prec + rec + 1e-10)

# Find best F1 score threshold
best_idx = np.argmax(f1)
best_threshold = thresh[best_idx]
print(f"Best threshold by F1: {best_threshold:.2f}")

ValueError: multiclass format is not supported