In [1]:
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score, classification_report, accuracy_score, precision_recall_curve
from xgboost import XGBClassifier
from sklearn.metrics import matthews_corrcoef

In [2]:
filename = os.path.join("framingham.csv")
df = pd.read_csv(filename)

In [6]:
# Impute missing values
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Impute numerical columns with median
for col in numerical_cols:
    df[col].fillna(df[col].median(), inplace=True)

# Impute categorical columns with mode
for col in categorical_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Split the dataset 80/20
train_df, test_df = train_test_split(df, test_size=0.20, random_state=42)

# Isolate the features and target
features_train = train_df.drop(columns="diabetes")
target_train = train_df["diabetes"]
features_test = test_df.drop(columns="diabetes")
target_test = test_df["diabetes"]

# Set up ratio

# SMOTE
smote = SMOTE(random_state=42)
features_train_resampled, target_train_resampled = smote.fit_resample(features_train, target_train)

class_counts = target_train.value_counts()
scale_pos_weight = class_counts[0] / class_counts[1]
# Define model parameters focusing on high recall
params = {
    'max_depth': 15,
    'learning_rate': 0.02,
    'objective': 'binary:logistic',
    'n_estimators': 10,
    'scale_pos_weight' : scale_pos_weight
}

# Initialize and train model
model = XGBClassifier(**params)

# Cross-validation using the resampled training data
cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)
roc_auc_scores = cross_val_score(model, features_train_resampled, target_train_resampled, cv=cv, scoring="roc_auc")
print(f"Mean ROC AUC from CV: {roc_auc_scores.mean():.4f}")
s = 1
for i in roc_auc_scores:
    print(f"{s} ROC AUC from CV: {i:4f}")
    s+=1
print(f"STD ROC AUC from CV: {roc_auc_scores.std():.4f}")

# Train the model on the resampled training data
model.fit(features_train_resampled, target_train_resampled)

# Evaluate on the test set
y_pred_proba = model.predict_proba(features_test)[:, 1]
roc_auc = roc_auc_score(target_test, y_pred_proba)
y_pred = model.predict(features_test)
print(f"MCC: {matthews_corrcoef(target_test, y_pred):.4f}")
print(f"Test ROC AUC: {roc_auc:.4f}")
print("Classification Report:\n", classification_report(target_test, y_pred))

# Display feature importances
importances = model.feature_importances_
for i, j in enumerate(importances):
    print(f"Feature {model.feature_names_in_[i]}: {j:.4f}")

# Display comparative results of actual v. false v. true predictions
actual_positives = test_df[target_test == 1]
false_positive_rows = test_df[(target_test == 0) & (y_pred == 1)]
true_positive_rows = test_df[(target_test == 1) & (y_pred == 1)]
false_negative_rows = test_df[(target_test == 1) & (y_pred == 0)]
true_negative_rows = test_df[(target_test == 0) & (y_pred == 0)]

print(f"\nActual positives:\n{len(actual_positives)}\n")
print(f"False Positives:\n{len(false_positive_rows)}\n")
print(f"True positives:\n{len(true_positive_rows)}\n")
print(f"False negatives:\n{len(false_negative_rows)}\n")
print(f"True negatives:\n{len(true_negative_rows)}\n")

Mean ROC AUC from CV: 0.9899
1 ROC AUC from CV: 0.983971
2 ROC AUC from CV: 0.986022
3 ROC AUC from CV: 0.994139
4 ROC AUC from CV: 0.990225
5 ROC AUC from CV: 0.992053
6 ROC AUC from CV: 0.988913
7 ROC AUC from CV: 0.993716
STD ROC AUC from CV: 0.0036
MCC: 0.3364
Test ROC AUC: 0.8453
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98       832
           1       0.25      0.50      0.33        16

    accuracy                           0.96       848
   macro avg       0.62      0.74      0.66       848
weighted avg       0.98      0.96      0.97       848

Feature male: 0.0063
Feature age: 0.0428
Feature education: 0.0918
Feature currentSmoker: 0.0487
Feature cigsPerDay: 0.0568
Feature BPMeds: 0.0696
Feature prevalentStroke: 0.0000
Feature prevalentHyp: 0.0102
Feature totChol: 0.0175
Feature sysBP: 0.0488
Feature diaBP: 0.0168
Feature BMI: 0.0171
Feature heartRate: 0.0259
Feature glucose: 0.5344
Feature TenYearCH

In [None]:
class_counts = target_train.value_counts()
scale_pos_weight = class_counts[0] / class_counts[1]

In [5]:
y_scores = model.predict_proba(features_test)[:, 1]

# Get precision-recall values for different thresholds
precision, recall, thresholds = precision_recall_curve(target_test, y_scores)

# Find the threshold where recall is maximized (this is a naive approach, and in real applications you'd want a balance)
optimal_threshold = thresholds[np.argmax(recall)]

# Classify using the new threshold
y_pred_optimal = np.where(y_scores > optimal_threshold, 1, 0)

print("Classification Report with Optimized Threshold:\n", classification_report(target_test, y_pred_optimal))


# Display comparative results of actual v. false v. true predictions
actual_positives = test_df[target_test == 1]
false_positive_rows = test_df[(target_test == 0) & (y_pred_optimal == 1)]
true_positive_rows = test_df[(target_test == 1) & (y_pred_optimal == 1)]
false_negative_rows = test_df[(target_test == 1) & (y_pred_optimal == 0)]
true_negative_rows = test_df[(target_test == 0) & (y_pred_optimal == 0)]

print(f"\nActual positives:\n{len(actual_positives)}\n")
print(f"False Positives:\n{len(false_positive_rows)}\n")
print(f"True positives:\n{len(true_positive_rows)}\n")
print(f"False negatives:\n{len(false_negative_rows)}\n")
print(f"True negatives:\n{len(true_negative_rows)}\n")

Classification Report with Optimized Threshold:
               precision    recall  f1-score   support

           0       1.00      0.11      0.19       832
           1       0.02      1.00      0.04        16

    accuracy                           0.12       848
   macro avg       0.51      0.55      0.12       848
weighted avg       0.98      0.12      0.19       848


Actual positives:
16

False Positives:
744

True positives:
16

False negatives:
0

True negatives:
88



In [6]:
# Impute missing values
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Impute numerical columns with median
for col in numerical_cols:
    df[col].fillna(df[col].median(), inplace=True)

# Impute categorical columns with mode
for col in categorical_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Split the dataset 80/20
train_df, test_df = train_test_split(df, test_size=0.20, random_state=42)

# Isolate the features and target
features_train = train_df.drop(columns="diabetes").drop(columns="totChol")
target_train = train_df["diabetes"]
features_test = test_df.drop(columns="diabetes").drop(columns="totChol")
target_test = test_df["diabetes"]

# Set up ratio

# SMOTE
smote = SMOTE(random_state=42)
features_train_resampled, target_train_resampled = smote.fit_resample(features_train, target_train)

# Define model parameters focusing on high recall
params = {
    'max_depth': 15,
    'learning_rate': 0.02,
    'objective': 'binary:logistic',
    'n_estimators': 40
}

# Initialize and train model
model = XGBClassifier(**params)

# Cross-validation using the resampled training data
cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)
roc_auc_scores = cross_val_score(model, features_train_resampled, target_train_resampled, cv=cv, scoring="roc_auc")
print(f"Mean ROC AUC from CV: {roc_auc_scores.mean():.4f}")
s = 1
for i in roc_auc_scores:
    print(f"{s} ROC AUC from CV: {i:4f}")
    s+=1
print(f"STD ROC AUC from CV: {roc_auc_scores.std():.4f}")

# Train the model on the resampled training data
model.fit(features_train_resampled, target_train_resampled)

# Evaluate on the test set
y_pred_proba = model.predict_proba(features_test)[:, 1]
roc_auc = roc_auc_score(target_test, y_pred_proba)
y_pred = model.predict(features_test)
print(f"MCC: {matthews_corrcoef(target_test, y_pred):.4f}")
print(f"Test ROC AUC: {roc_auc:.4f}")
print("Classification Report:\n", classification_report(target_test, y_pred))

# Display feature importances
importances = model.feature_importances_
for i, j in enumerate(importances):
    print(f"Feature {model.feature_names_in_[i]}: {j:.4f}")

# Display comparative results of actual v. false v. true predictions
actual_positives = test_df[target_test == 1]
false_positive_rows = test_df[(target_test == 0) & (y_pred == 1)]
true_positive_rows = test_df[(target_test == 1) & (y_pred == 1)]
false_negative_rows = test_df[(target_test == 1) & (y_pred == 0)]
true_negative_rows = test_df[(target_test == 0) & (y_pred == 0)]

print(f"\nActual positives:\n{len(actual_positives)}\n")
print(f"False Positives:\n{len(false_positive_rows)}\n")
print(f"True positives:\n{len(true_positive_rows)}\n")
print(f"False negatives:\n{len(false_negative_rows)}\n")
print(f"True negatives:\n{len(true_negative_rows)}\n")

Mean ROC AUC from CV: 0.9933
1 ROC AUC from CV: 0.988449
2 ROC AUC from CV: 0.994773
3 ROC AUC from CV: 0.992816
4 ROC AUC from CV: 0.995077
5 ROC AUC from CV: 0.992186
6 ROC AUC from CV: 0.993890
7 ROC AUC from CV: 0.996069
STD ROC AUC from CV: 0.0023
MCC: 0.4508
Test ROC AUC: 0.8663
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.98       832
           1       0.34      0.62      0.44        16

    accuracy                           0.97       848
   macro avg       0.67      0.80      0.71       848
weighted avg       0.98      0.97      0.97       848

Feature male: 0.0318
Feature age: 0.0234
Feature education: 0.1247
Feature currentSmoker: 0.0814
Feature cigsPerDay: 0.0802
Feature BPMeds: 0.0095
Feature prevalentStroke: 0.0000
Feature prevalentHyp: 0.0279
Feature sysBP: 0.0561
Feature diaBP: 0.0177
Feature BMI: 0.0196
Feature heartRate: 0.0306
Feature glucose: 0.4780
Feature TenYearCHD: 0.0192

Actual positi