In [1]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score, classification_report, accuracy_score, precision_recall_curve
from imblearn.over_sampling import SMOTE
from sklearn.metrics import matthews_corrcoef

In [2]:
filename = os.path.join("framingham.csv")
df = pd.read_csv(filename)

In [3]:
# Impute missing values
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Impute numerical columns with median
for col in numerical_cols:
    df[col].fillna(df[col].median(), inplace=True)

# Impute categorical columns with mode
for col in categorical_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Split the dataset 80/20
train_df, test_df = train_test_split(df, test_size=0.20, random_state=20)

# Isolate the features and target
features_train = train_df.drop(columns="diabetes")
target_train = train_df["diabetes"]
features_test = test_df.drop(columns="diabetes")
target_test = test_df["diabetes"]

# SMOTE
smote = SMOTE(random_state=20)
features_train_resampled, target_train_resampled = smote.fit_resample(features_train, target_train)

# Initialize and train model
model = RandomForestClassifier(n_estimators=2500, min_samples_leaf=150, max_leaf_nodes=500, n_jobs=-1, random_state=20)

# Cross-validation using the resampled training data
cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=20)
roc_auc_scores = cross_val_score(model, features_train_resampled, target_train_resampled, cv=cv, scoring="roc_auc")
print(f"Mean ROC AUC from CV: {roc_auc_scores.mean():.4f}")

# Train the model on the resampled training data
model.fit(features_train_resampled, target_train_resampled)

# Evaluate on the test set
y_pred_proba = model.predict_proba(features_test)[:, 1]
roc_auc = roc_auc_score(target_test, y_pred_proba)
y_pred = model.predict(features_test)
print(f"Test ROC AUC: {roc_auc:.4f}")
print("Classification Report:\n", classification_report(target_test, y_pred))

# Display feature importances
importances = model.feature_importances_
for i, j in enumerate(importances):
    print(f"Feature {model.feature_names_in_[i]}: {j:.4f}")

# Display comparative results of actual v. false v. true predictions
actual_positives = test_df[target_test == 1]
false_positive_rows = test_df[(target_test == 0) & (y_pred == 1)]
true_positive_rows = test_df[(target_test == 1) & (y_pred == 1)]
false_negative_rows = test_df[(target_test == 1) & (y_pred == 0)]
true_negative_rows = test_df[(target_test == 0) & (y_pred == 0)]

print(f"\nActual positives:\n{len(actual_positives)}\n")
print(f"False Positives:\n{len(false_positive_rows)}\n")
print(f"True positives:\n{len(true_positive_rows)}\n")
print(f"False negatives:\n{len(false_negative_rows)}\n")
print(f"True negatives:\n{len(true_negative_rows)}\n")
print(f"MCC: {matthews_corrcoef(target_test, y_pred):.4f}")

s = 1
for i in roc_auc_scores:
    print(f"{s} ROC AUC from CV: {i:4f}")
    s+=1
print(f"STD ROC AUC from CV: {roc_auc_scores.std():.4f}")

Mean ROC AUC from CV: 0.9726
Test ROC AUC: 0.9227
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.96      0.98       825
           1       0.35      0.83      0.49        23

    accuracy                           0.95       848
   macro avg       0.67      0.89      0.73       848
weighted avg       0.98      0.95      0.96       848

Feature male: 0.0135
Feature age: 0.0866
Feature education: 0.0537
Feature currentSmoker: 0.0999
Feature cigsPerDay: 0.0678
Feature BPMeds: 0.0104
Feature prevalentStroke: 0.0000
Feature prevalentHyp: 0.0035
Feature totChol: 0.0158
Feature sysBP: 0.0602
Feature diaBP: 0.0137
Feature BMI: 0.0298
Feature heartRate: 0.0313
Feature glucose: 0.5127
Feature TenYearCHD: 0.0010

Actual positives:
23

False Positives:
35

True positives:
19

False negatives:
4

True negatives:
790

MCC: 0.5213
1 ROC AUC from CV: 0.967952
2 ROC AUC from CV: 0.971602
3 ROC AUC from CV: 0.966500
4 ROC AUC from CV: 0.9766

In [4]:
# Impute missing values
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Impute numerical columns with median
for col in numerical_cols:
    df[col].fillna(df[col].median(), inplace=True)

# Impute categorical columns with mode
for col in categorical_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Split the dataset 80/20
train_df, test_df = train_test_split(df, test_size=0.20, random_state=20)

# Isolate the features and target
features_train = train_df.drop(columns="diabetes").drop(columns="totChol")
target_train = train_df["diabetes"]
features_test = test_df.drop(columns="diabetes").drop(columns="totChol")
target_test = test_df["diabetes"]

# SMOTE
smote = SMOTE(random_state=20)
features_train_resampled, target_train_resampled = smote.fit_resample(features_train, target_train)

# Initialize and train model
model = RandomForestClassifier(n_estimators=2500, min_samples_leaf=150, max_leaf_nodes=500, n_jobs=-1, random_state=20)

# Cross-validation using the resampled training data
cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=20)
roc_auc_scores = cross_val_score(model, features_train_resampled, target_train_resampled, cv=cv, scoring="roc_auc")
print(f"Mean ROC AUC from CV: {roc_auc_scores.mean():.4f}")

# Train the model on the resampled training data
model.fit(features_train_resampled, target_train_resampled)

# Evaluate on the test set
y_pred_proba = model.predict_proba(features_test)[:, 1]
roc_auc = roc_auc_score(target_test, y_pred_proba)
y_pred = model.predict(features_test)
print(f"Test ROC AUC: {roc_auc:.4f}")
print("Classification Report:\n", classification_report(target_test, y_pred))

# Display feature importances
importances = model.feature_importances_
for i, j in enumerate(importances):
    print(f"Feature {model.feature_names_in_[i]}: {j:.4f}")

# Display comparative results of actual v. false v. true predictions
actual_positives = test_df[target_test == 1]
false_positive_rows = test_df[(target_test == 0) & (y_pred == 1)]
true_positive_rows = test_df[(target_test == 1) & (y_pred == 1)]
false_negative_rows = test_df[(target_test == 1) & (y_pred == 0)]
true_negative_rows = test_df[(target_test == 0) & (y_pred == 0)]

print(f"\nActual positives:\n{len(actual_positives)}\n")
print(f"False Positives:\n{len(false_positive_rows)}\n")
print(f"True positives:\n{len(true_positive_rows)}\n")
print(f"False negatives:\n{len(false_negative_rows)}\n")
print(f"True negatives:\n{len(true_negative_rows)}\n")
print(f"MCC: {matthews_corrcoef(target_test, y_pred):.4f}")

s = 1
for i in roc_auc_scores:
    print(f"{s} ROC AUC from CV: {i:4f}")
    s+=1
print(f"STD ROC AUC from CV: {roc_auc_scores.std():.4f}")

Mean ROC AUC from CV: 0.9733
Test ROC AUC: 0.9465
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.95      0.97       825
           1       0.33      0.83      0.47        23

    accuracy                           0.95       848
   macro avg       0.66      0.89      0.72       848
weighted avg       0.98      0.95      0.96       848

Feature male: 0.0104
Feature age: 0.0935
Feature education: 0.0608
Feature currentSmoker: 0.0883
Feature cigsPerDay: 0.0518
Feature BPMeds: 0.0094
Feature prevalentStroke: 0.0000
Feature prevalentHyp: 0.0035
Feature sysBP: 0.0686
Feature diaBP: 0.0167
Feature BMI: 0.0375
Feature heartRate: 0.0268
Feature glucose: 0.5317
Feature TenYearCHD: 0.0011

Actual positives:
23

False Positives:
38

True positives:
19

False negatives:
4

True negatives:
787

MCC: 0.5060
1 ROC AUC from CV: 0.967262
2 ROC AUC from CV: 0.974491
3 ROC AUC from CV: 0.972556
4 ROC AUC from CV: 0.972986
5 ROC AUC from CV: 0.