In [1]:
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score, classification_report, accuracy_score, precision_recall_curve
from xgboost import XGBClassifier
from sklearn.metrics import matthews_corrcoef

In [2]:
filename = os.path.join("framingham.csv")
df = pd.read_csv(filename)

In [9]:
# Impute missing values
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Impute numerical columns with median
for col in numerical_cols:
    df[col].fillna(df[col].median(), inplace=True)

# Impute categorical columns with mode
for col in categorical_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Split the dataset 80/20
train_df, test_df = train_test_split(df, test_size=0.20, random_state=42)

# Isolate the features and target
features_train = train_df.drop(columns="diabetes")
target_train = train_df["diabetes"]
features_test = test_df.drop(columns="diabetes")
target_test = test_df["diabetes"]

# Set up ratio

# SMOTE
smote = SMOTE(random_state=42)
features_train_resampled, target_train_resampled = smote.fit_resample(features_train, target_train)

class_counts = target_train.value_counts()
scale_pos_weight = class_counts[0] / class_counts[1]
# Define model parameters focusing on high recall
params = {
    'max_depth': 15,
    'learning_rate': 0.02,
    'objective': 'binary:logistic',
    'n_estimators': 10,
    'scale_pos_weight' : scale_pos_weight
}

# Initialize and train model
model = XGBClassifier(**params)

# Cross-validation using the resampled training data
cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)
roc_auc_scores = cross_val_score(model, features_train_resampled, target_train_resampled, cv=cv, scoring="roc_auc")
print(f"Mean ROC AUC from CV: {roc_auc_scores.mean():.4f}")
s = 1
for i in roc_auc_scores:
    print(f"{s} ROC AUC from CV: {i:4f}")
    s+=1
print(f"STD ROC AUC from CV: {roc_auc_scores.std():.4f}")

# Train the model on the resampled training data
model.fit(features_train_resampled, target_train_resampled)

# Evaluate on the test set
y_pred_proba = model.predict_proba(features_test)[:, 1]
roc_auc = roc_auc_score(target_test, y_pred_proba)
y_pred = model.predict(features_test)
print(f"MCC: {matthews_corrcoef(target_test, y_pred):.4f}")
print(f"Test ROC AUC: {roc_auc:.4f}")
print("Classification Report:\n", classification_report(target_test, y_pred))

# Display feature importances
importances = model.feature_importances_
for i, j in enumerate(importances):
    print(f"Feature {model.feature_names_in_[i]}: {j:.4f}")

# Display comparative results of actual v. false v. true predictions
actual_positives = test_df[target_test == 1]
false_positive_rows = test_df[(target_test == 0) & (y_pred == 1)]
true_positive_rows = test_df[(target_test == 1) & (y_pred == 1)]
false_negative_rows = test_df[(target_test == 1) & (y_pred == 0)]
true_negative_rows = test_df[(target_test == 0) & (y_pred == 0)]

print(f"\nActual positives:\n{len(actual_positives)}\n")
print(f"False Positives:\n{len(false_positive_rows)}\n")
print(f"True positives:\n{len(true_positive_rows)}\n")
print(f"False negatives:\n{len(false_negative_rows)}\n")
print(f"True negatives:\n{len(true_negative_rows)}\n")

Mean ROC AUC from CV: 0.9699
1 ROC AUC from CV: 0.977187
2 ROC AUC from CV: 0.972658
3 ROC AUC from CV: 0.975058
4 ROC AUC from CV: 0.971882
5 ROC AUC from CV: 0.969751
6 ROC AUC from CV: 0.961387
7 ROC AUC from CV: 0.961346
STD ROC AUC from CV: 0.0058
MCC: 0.2153
Test ROC AUC: 0.8730
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.84      0.91       832
           1       0.08      0.75      0.15        16

    accuracy                           0.84       848
   macro avg       0.54      0.80      0.53       848
weighted avg       0.98      0.84      0.90       848

Feature male: 0.0841
Feature age: 0.1492
Feature education: 0.1639
Feature currentSmoker: 0.0036
Feature cigsPerDay: 0.0729
Feature BPMeds: 0.0000
Feature prevalentStroke: 0.0000
Feature prevalentHyp: 0.0068
Feature totChol: 0.0597
Feature sysBP: 0.0259
Feature diaBP: 0.0364
Feature BMI: 0.1540
Feature heartRate: 0.0658
Feature glucose: 0.1209
Feature TenYearCH

In [13]:


# Impute missing values
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Impute numerical columns with median
for col in numerical_cols:
    df[col].fillna(df[col].median(), inplace=True)

# Impute categorical columns with mode
for col in categorical_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Split the dataset 80/20
train_df, test_df = train_test_split(df, test_size=0.20, random_state=42)

# Isolate the features and target
features_train = train_df.drop(columns="diabetes").drop(columns="totChol")
target_train = train_df["diabetes"]
features_test = test_df.drop(columns="diabetes").drop(columns="totChol")
target_test = test_df["diabetes"]

# Set up ratio

# SMOTE
smote = SMOTE(random_state=42)
features_train_resampled, target_train_resampled = smote.fit_resample(features_train, target_train)

class_counts = target_train.value_counts()
scale_pos_weight = class_counts[0] / class_counts[1]
# Define model parameters focusing on high recall
params = {
    'max_depth': 15,
    'learning_rate': 0.02,
    'objective': 'binary:logistic',
    'n_estimators': 10,
    'scale_pos_weight' : scale_pos_weight
}

# Initialize and train model
model = XGBClassifier(**params)

# Cross-validation using the resampled training data
cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)
roc_auc_scores = cross_val_score(model, features_train_resampled, target_train_resampled, cv=cv, scoring="roc_auc")
print(f"Mean ROC AUC from CV: {roc_auc_scores.mean():.4f}")
s = 1
for i in roc_auc_scores:
    print(f"{s} ROC AUC from CV: {i:4f}")
    s+=1
print(f"STD ROC AUC from CV: {roc_auc_scores.std():.4f}")

# Train the model on the resampled training data
model.fit(features_train_resampled, target_train_resampled)

# Evaluate on the test set
y_pred_proba = model.predict_proba(features_test)[:, 1]
roc_auc = roc_auc_score(target_test, y_pred_proba)
y_pred = model.predict(features_test)
print(f"MCC: {matthews_corrcoef(target_test, y_pred):.4f}")
print(f"Test ROC AUC: {roc_auc:.4f}")
print("Classification Report:\n", classification_report(target_test, y_pred))

# Display feature importances
importances = model.feature_importances_
for i, j in enumerate(importances):
    print(f"Feature {model.feature_names_in_[i]}: {j:.4f}")

# Display comparative results of actual v. false v. true predictions
actual_positives = test_df[target_test == 1]
false_positive_rows = test_df[(target_test == 0) & (y_pred == 1)]
true_positive_rows = test_df[(target_test == 1) & (y_pred == 1)]
false_negative_rows = test_df[(target_test == 1) & (y_pred == 0)]
true_negative_rows = test_df[(target_test == 0) & (y_pred == 0)]

print(f"\nActual positives:\n{len(actual_positives)}\n")
print(f"False Positives:\n{len(false_positive_rows)}\n")
print(f"True positives:\n{len(true_positive_rows)}\n")
print(f"False negatives:\n{len(false_negative_rows)}\n")
print(f"True negatives:\n{len(true_negative_rows)}\n")

Mean ROC AUC from CV: 0.9715
1 ROC AUC from CV: 0.972932
2 ROC AUC from CV: 0.977716
3 ROC AUC from CV: 0.971115
4 ROC AUC from CV: 0.973971
5 ROC AUC from CV: 0.973233
6 ROC AUC from CV: 0.965579
7 ROC AUC from CV: 0.966009
STD ROC AUC from CV: 0.0041
MCC: 0.2235
Test ROC AUC: 0.8793
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.88      0.93       832
           1       0.10      0.69      0.17        16

    accuracy                           0.87       848
   macro avg       0.54      0.78      0.55       848
weighted avg       0.98      0.87      0.92       848

Feature male: 0.0072
Feature age: 0.1516
Feature education: 0.2197
Feature currentSmoker: 0.0000
Feature cigsPerDay: 0.1641
Feature BPMeds: 0.0016
Feature prevalentStroke: 0.0000
Feature prevalentHyp: 0.0000
Feature sysBP: 0.1522
Feature diaBP: 0.0674
Feature BMI: 0.0239
Feature heartRate: 0.0274
Feature glucose: 0.0937
Feature TenYearCHD: 0.0913

Actual positi