In [14]:
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score, classification_report, accuracy_score, precision_recall_curve
from xgboost import XGBClassifier

In [15]:
filename = os.path.join("framingham.csv")
df = pd.read_csv(filename)

In [16]:
# Impute missing values
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Impute numerical columns with median
for col in numerical_cols:
    df[col].fillna(df[col].median(), inplace=True)

# Impute categorical columns with mode
for col in categorical_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Split the dataset 80/20
train_df, test_df = train_test_split(df, test_size=0.20, random_state=42)

# Isolate the features and target
features_train = train_df.drop(columns="diabetes")
target_train = train_df["diabetes"]
features_test = test_df.drop(columns="diabetes")
target_test = test_df["diabetes"]

# Set up ratio


# SMOTE
smote = SMOTE(random_state=42)
features_train_resampled, target_train_resampled = smote.fit_resample(features_train, target_train)

# Define model parameters focusing on high recall
params = {
    'max_depth': 15,
    'learning_rate': 0.02,
    'objective': 'binary:logistic',
    'n_estimators': 40
}

# Initialize and train model
model = XGBClassifier(**params)

# Cross-validation using the resampled training data
cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)
roc_auc_scores = cross_val_score(model, features_train_resampled, target_train_resampled, cv=cv, scoring="roc_auc")
print(f"Mean ROC AUC from CV: {roc_auc_scores.mean():.4f}")

# Train the model on the resampled training data
model.fit(features_train_resampled, target_train_resampled)

# Evaluate on the test set
y_pred_proba = model.predict_proba(features_test)[:, 1]
roc_auc = roc_auc_score(target_test, y_pred_proba)
y_pred = model.predict(features_test)
print(f"Test ROC AUC: {roc_auc:.4f}")
print("Classification Report:\n", classification_report(target_test, y_pred))

# Display feature importances
importances = model.feature_importances_
for i, j in enumerate(importances):
    print(f"Feature {i}: {j:.4f}")

# Display true positives and false negatives
true_positive_rows = test_df[(target_test == 1) & (y_pred == 1)]
false_negative_rows = test_df[(target_test == 1) & (y_pred == 0)]
false_positive_rows = test_df[(target_test == 0) & (y_pred == 1)]
actual_positives = test_df[target_test == 1]

print(f"\nActual positives:\n{len(actual_positives)}\n")
print(f"False Positives:\n{len(false_positive_rows)}\n")
print(f"True positives:\n{len(true_positive_rows)}\n")
print(f"False negatives:\n{len(false_negative_rows)}\n")


Mean ROC AUC from CV: 0.9948
Test ROC AUC: 0.8828
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98       832
           1       0.28      0.56      0.38        16

    accuracy                           0.96       848
   macro avg       0.64      0.77      0.68       848
weighted avg       0.98      0.96      0.97       848

Feature 0: 0.0151
Feature 1: 0.0582
Feature 2: 0.1131
Feature 3: 0.0337
Feature 4: 0.0369
Feature 5: 0.0718
Feature 6: 0.0000
Feature 7: 0.0141
Feature 8: 0.0219
Feature 9: 0.0419
Feature 10: 0.0154
Feature 11: 0.0190
Feature 12: 0.0265
Feature 13: 0.4907
Feature 14: 0.0418

Actual positives:
16

False Positives:
23

True positives:
9

False negatives:
7



In [8]:
y_scores = model.predict_proba(features_test)[:, 1]

# Get precision-recall values for different thresholds
precision, recall, thresholds = precision_recall_curve(target_test, y_scores)

# Find the threshold where recall is maximized (this is a naive approach, and in real applications you'd want a balance)
optimal_threshold = thresholds[np.argmax(recall)]

# Classify using the new threshold
y_pred_optimal = np.where(y_scores > optimal_threshold, 1, 0)

print("Classification Report with Optimized Threshold:\n", classification_report(target_test, y_pred_optimal))

# Display true positives and false negatives
true_positive_rows = test_df[(target_test == 1) & (y_pred_optimal == 1)]
false_negative_rows = test_df[(target_test == 1) & (y_pred_optimal == 0)]
false_positive_rows = test_df[(target_test == 0) & (y_pred_optimal == 1)]
actual_positives = test_df[target_test == 1]

print(f"Actual positives:\n{len(actual_positives)}\n")
print(f"False Positives:\n{len(false_positive_rows)}\n")
print(f"True positives:\n{len(true_positive_rows)}\n")
print(f"False negatives:\n{len(false_negative_rows)}\n")

Classification Report with Optimized Threshold:
               precision    recall  f1-score   support

           0       1.00      0.11      0.19       832
           1       0.02      1.00      0.04        16

    accuracy                           0.12       848
   macro avg       0.51      0.55      0.12       848
weighted avg       0.98      0.12      0.19       848

