In [10]:
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score, classification_report, accuracy_score, precision_recall_curve, precision_score, recall_score
from sklearn.metrics import matthews_corrcoef

In [2]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
#from sklearnex import patch_sklearn 
from IPython.display import clear_output
#patch_sklearn()

In [3]:
filename = os.path.join("framingham.csv")
df = pd.read_csv(filename)

In [22]:
m_d = 15
min_m_d = 15
max_m_d = 30
inc_m_d = 1
r_s = 21
m_l_n = 16
min_m_l_n = 5
max_m_l_n = 30
inc_m_l_n = 1
n_j = -1 #const
n_e = 20
min_n_e = 20
max_n_e = 200
inc_n_e = 5
l_r = .56
min_l_r = 0.02
max_l_r = 1.00
inc_l_r = 0.02

highest_pra_sofar = [0,0,0]
bests=[]
near_bests=[]
best_sofar = [0,0,0,0,0]

In [23]:
# Impute missing values
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Impute numerical columns with median
for col in numerical_cols:
    df[col].fillna(df[col].median(), inplace=True)

# Impute categorical columns with mode
for col in categorical_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Split the dataset 80/20
train_df, test_df = train_test_split(df, test_size=0.20, random_state=42)

# Isolate the features and target
features_train = train_df.drop(columns="diabetes")
target_train = train_df["diabetes"]
features_test = test_df.drop(columns="diabetes")
target_test = test_df["diabetes"]

# SMOTE
smote = SMOTE(random_state=r_s)
features_train_resampled, target_train_resampled = smote.fit_resample(features_train, target_train)

In [24]:
def runThroughHP(m_d, n_e, m_l_n, l_r):
    # Initialize and train model
    tree = []
    tree.append(DecisionTreeClassifier(max_depth=m_d, random_state=r_s))
    tree.append(RandomForestClassifier(n_estimators=n_e, max_leaf_nodes=m_l_n, n_jobs=-1, random_state=r_s))
    #svc=SVC(probability=True)
    
    # Create adaboost classifer object
    abc = AdaBoostClassifier(n_estimators=n_e, estimator=tree[1], learning_rate=l_r, random_state=r_s)
    model = abc.fit(features_train,target_train)
    
    
    # Cross-validation using the resampled training data
    cv = StratifiedKFold(n_splits=8, shuffle=True, random_state=r_s)
    roc_auc_scores = cross_val_score(model, features_train_resampled, target_train_resampled, cv=cv, scoring="roc_auc")
    
    # Train the model on the resampled training data
    model.fit(features_train_resampled, target_train_resampled)
    
    # Evaluate on the test set
    y_pred_proba = model.predict_proba(features_test)[:, 1]
    roc_auc = roc_auc_score(target_test, y_pred_proba)
    y_pred = model.predict(features_test)
    ps = precision_score(target_test, y_pred)
    rs = recall_score(target_test, y_pred)
    a_s = accuracy_score(target_test, y_pred)
    if(ps > .75 and rs == 1.00):
        bests.append([a_s, m_d, n_e, m_l_n, l_r])
    elif(ps > .6 and rs >= 0.875):
        near_bests.append([a_s, m_d, n_e, m_l_n, l_r])
    global highest_pra_sofar
    global best_sofar
    if((rs) > highest_pra_sofar[1] or (ps > highest_pra_sofar[0] and rs == highest_pra_sofar[1])):
        highest_pra_sofar = [ps,rs,a_s]
        best_sofar = [a_s, m_d, n_e, m_l_n, l_r]

In [None]:
count = 0
for i in range(min_m_d, max_m_d, inc_m_d):
    for j in range(min_n_e, max_n_e, inc_n_e):
        for k in range(min_m_l_n, max_m_l_n, inc_m_l_n):
            for l in np.arange(min_l_r, max_l_r, inc_l_r):
                runThroughHP(i,j,k,l)
                clear_output(wait=True)
                count+=1
                print(f" Count: {count}\n max_depth: {i}\n n_estimators: {j}\n max_leaf_nodes: {k}\n learning_rate {l}\n Best P,R,A so far: {highest_pra_sofar}\n best so far: {best_sofar}\n Near Bests: {len(near_bests)}\n Bests: {len(bests)}")
print("bests:")
print(bests)
print("near bests:")
print(near_bests)

best_acc = []
if len(bests) > 0:
    for i in bests:
        best_acc.append(i[0])
    np_bests = np.array(best_acc)
    
    highestacc_index = np.where(a == np_bests.max())
    
    m_d = bests[highestacc_index][1]
    n_e = bests[highestacc_index][2]
    m_l_n = bests[highestacc_index][3]
    l_r = bests[highestacc_index][4]
elif len(near_bests) > 0:
    for i in near_bests:
        best_acc.append(i[0])
    np_bests = np.array(best_acc)
    
    highestacc_index = np.where(a == np_bests.max())
    
    m_d = near_bests[highestacc_index][1]
    n_e = near_bests[highestacc_index][2]
    m_l_n = near_bests[highestacc_index][3]
    l_r = near_bests[highestacc_index][4]


 Count: 5854
 max_depth: 15
 n_estimators: 40
 max_leaf_nodes: 24
 learning_rate 0.46
 Best P,R,A so far: [0.36363636363636365, 0.75, 0.9705188679245284]
 best so far: [0.9705188679245284, 15, 30, 28, 0.02]
 Near Bests: 0
 Bests: 0


In [25]:
# Initialize and train model
tree = []
tree.append(DecisionTreeClassifier(max_depth=m_d, random_state=r_s))
tree.append(RandomForestClassifier(n_estimators=n_e, max_leaf_nodes=m_l_n, n_jobs=-1, random_state=r_s))
#svc=SVC(probability=True)

# Create adaboost classifer object
abc = AdaBoostClassifier(n_estimators=n_e, estimator=tree[1], learning_rate=l_r, random_state=r_s)
model = abc.fit(features_train,target_train)


# Cross-validation using the resampled training data
cv = StratifiedKFold(n_splits=8, shuffle=True, random_state=r_s)
roc_auc_scores = cross_val_score(model, features_train_resampled, target_train_resampled, cv=cv, scoring="roc_auc")
print(f"Mean ROC AUC from CV: {roc_auc_scores.mean():.4f}")

# Train the model on the resampled training data
model.fit(features_train_resampled, target_train_resampled)

# Evaluate on the test set
y_pred_proba = model.predict_proba(features_test)[:, 1]
roc_auc = roc_auc_score(target_test, y_pred_proba)
y_pred = model.predict(features_test)
print(f"MCC: {matthews_corrcoef(target_test, y_pred):.4f}")
print(f"Test ROC AUC: {roc_auc:.4f}")
print("Classification Report:\n", classification_report(target_test, y_pred, target_names=["Non-diabetic", "Diabetic"]))

# Display feature importances
importances = model.feature_importances_
for i, j in enumerate(importances):
    print(f"Feature {model.feature_names_in_[i]}: {j:.4f}")

# Display comparative results of actual v. false v. true predictions
actual_positives = test_df[target_test == 1]
false_positive_rows = test_df[(target_test == 0) & (y_pred == 1)]
true_positive_rows = test_df[(target_test == 1) & (y_pred == 1)]
false_negative_rows = test_df[(target_test == 1) & (y_pred == 0)]
true_negative_rows = test_df[(target_test == 0) & (y_pred == 0)]

print(f"\nActual positives:\n{len(actual_positives)}\n")
print(f"False Positives:\n{len(false_positive_rows)}\n")
print(f"True positives:\n{len(true_positive_rows)}\n")
print(f"False negatives:\n{len(false_negative_rows)}\n")
print(f"True negatives:\n{len(true_negative_rows)}\n")

Mean ROC AUC from CV: 0.9990
MCC: 0.5356
Test ROC AUC: 0.8724
Classification Report:
               precision    recall  f1-score   support

Non-diabetic       0.99      0.99      0.99       832
    Diabetic       0.48      0.62      0.54        16

    accuracy                           0.98       848
   macro avg       0.73      0.81      0.77       848
weighted avg       0.98      0.98      0.98       848

Feature male: 0.0137
Feature age: 0.0770
Feature education: 0.1498
Feature currentSmoker: 0.0396
Feature cigsPerDay: 0.0775
Feature BPMeds: 0.0172
Feature prevalentStroke: 0.0023
Feature prevalentHyp: 0.0074
Feature totChol: 0.0828
Feature sysBP: 0.0883
Feature diaBP: 0.0679
Feature BMI: 0.0618
Feature heartRate: 0.0859
Feature glucose: 0.2180
Feature TenYearCHD: 0.0109

Actual positives:
16

False Positives:
11

True positives:
10

False negatives:
6

True negatives:
821



In [17]:
y_scores = model.predict_proba(features_test)[:, 1]

# Get precision-recall values for different thresholds
precision, recall, thresholds = precision_recall_curve(target_test, y_scores)

# Find the threshold where recall is maximized (this is a naive approach, and in real applications you'd want a balance)
optimal_threshold = thresholds[np.argmax(recall)]

# Classify using the new threshold
y_pred_optimal = np.where(y_scores > optimal_threshold, 1, 0)

print("Classification Report with Optimized Threshold:\n", classification_report(target_test, y_pred_optimal))


# Display comparative results of actual v. false v. true predictions
actual_positives = test_df[target_test == 1]
false_positive_rows = test_df[(target_test == 0) & (y_pred_optimal == 1)]
true_positive_rows = test_df[(target_test == 1) & (y_pred_optimal == 1)]
false_negative_rows = test_df[(target_test == 1) & (y_pred_optimal == 0)]
true_negative_rows = test_df[(target_test == 0) & (y_pred_optimal == 0)]

print(f"\nActual positives:\n{len(actual_positives)}\n")
print(f"False Positives:\n{len(false_positive_rows)}\n")
print(f"True positives:\n{len(true_positive_rows)}\n")
print(f"False negatives:\n{len(false_negative_rows)}\n")
print(f"True negatives:\n{len(true_negative_rows)}\n")

Classification Report with Optimized Threshold:
               precision    recall  f1-score   support

           0       1.00      0.00      0.00       832
           1       0.02      1.00      0.04        16

    accuracy                           0.02       848
   macro avg       0.51      0.50      0.02       848
weighted avg       0.98      0.02      0.00       848


Actual positives:
16

False Positives:
831

True positives:
16

False negatives:
0

True negatives:
1

