In [2]:
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score, classification_report, accuracy_score, precision_recall_curve

In [31]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearnex import patch_sklearn 

patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [4]:
filename = os.path.join("framingham.csv")
df = pd.read_csv(filename)

In [57]:
# Impute missing values
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Impute numerical columns with median
for col in numerical_cols:
    df[col].fillna(df[col].median(), inplace=True)

# Impute categorical columns with mode
for col in categorical_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Split the dataset 80/20
train_df, test_df = train_test_split(df, test_size=0.20, random_state=42)

# Isolate the features and target
features_train = train_df.drop(columns="diabetes")
target_train = train_df["diabetes"]
features_test = test_df.drop(columns="diabetes")
target_test = test_df["diabetes"]

# Set up ratio


# SMOTE
smote = SMOTE(random_state=42)
features_train_resampled, target_train_resampled = smote.fit_resample(features_train, target_train)

# Initialize and train model
tree = DecisionTreeClassifier(max_depth=1)
#svc=SVC(probability=True)

# Create adaboost classifer object
abc = AdaBoostClassifier(n_estimators=1, estimator=tree, learning_rate=0.005, random_state=42)
model = abc.fit(features_train,target_train)

In [58]:
# Cross-validation using the resampled training data
cv = StratifiedKFold(n_splits=8, shuffle=True, random_state=42)
roc_auc_scores = cross_val_score(model, features_train_resampled, target_train_resampled, cv=cv, scoring="roc_auc")
print(f"Mean ROC AUC from CV: {roc_auc_scores.mean():.4f}")

# Train the model on the resampled training data
model.fit(features_train_resampled, target_train_resampled)

# Evaluate on the test set
y_pred_proba = model.predict_proba(features_test)[:, 1]
roc_auc = roc_auc_score(target_test, y_pred_proba)
y_pred = model.predict(features_test)
print(f"Test ROC AUC: {roc_auc:.4f}")
print("Classification Report:\n", classification_report(target_test, y_pred))

# Display feature importances
importances = model.feature_importances_
for i, j in enumerate(importances):
    print(f"Feature {i}: {j:.4f}")

# Display comparative results of actual v. false v. true predictions
actual_positives = test_df[target_test == 1]
false_positive_rows = test_df[(target_test == 0) & (y_pred == 1)]
true_positive_rows = test_df[(target_test == 1) & (y_pred == 1)]
false_negative_rows = test_df[(target_test == 1) & (y_pred == 0)]
true_negative_rows = test_df[(target_test == 0) & (y_pred == 0)]

print(f"\nActual positives:\n{len(actual_positives)}\n")
print(f"False Positives:\n{len(false_positive_rows)}\n")
print(f"True positives:\n{len(true_positive_rows)}\n")
print(f"False negatives:\n{len(false_negative_rows)}\n")
print(f"True negatives:\n{len(true_negative_rows)}\n")

Mean ROC AUC from CV: 0.8739
Test ROC AUC: 0.8287
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98       832
           1       0.31      0.69      0.42        16

    accuracy                           0.96       848
   macro avg       0.65      0.83      0.70       848
weighted avg       0.98      0.96      0.97       848

Feature 0: 0.0000
Feature 1: 0.0000
Feature 2: 0.0000
Feature 3: 0.0000
Feature 4: 0.0000
Feature 5: 0.0000
Feature 6: 0.0000
Feature 7: 0.0000
Feature 8: 0.0000
Feature 9: 0.0000
Feature 10: 0.0000
Feature 11: 0.0000
Feature 12: 0.0000
Feature 13: 1.0000
Feature 14: 0.0000

Actual positives:
16

False Positives:
25

True positives:
11

False negatives:
5

True negatives:
807



In [108]:
y_scores = model.predict_proba(features_test)[:, 1]

# Get precision-recall values for different thresholds
precision, recall, thresholds = precision_recall_curve(target_test, y_scores)

# Find the threshold where recall is maximized (this is a naive approach, and in real applications you'd want a balance)
optimal_threshold = thresholds[np.argmax(recall)]

# Classify using the new threshold
y_pred_optimal = np.where(y_scores > optimal_threshold, 1, 0)

print("Classification Report with Optimized Threshold:\n", classification_report(target_test, y_pred_optimal))


# Display comparative results of actual v. false v. true predictions
actual_positives = test_df[target_test == 1]
false_positive_rows = test_df[(target_test == 0) & (y_pred_optimal == 1)]
true_positive_rows = test_df[(target_test == 1) & (y_pred_optimal == 1)]
false_negative_rows = test_df[(target_test == 1) & (y_pred_optimal == 0)]
true_negative_rows = test_df[(target_test == 0) & (y_pred_optimal == 0)]

print(f"\nActual positives:\n{len(actual_positives)}\n")
print(f"False Positives:\n{len(false_positive_rows)}\n")
print(f"True positives:\n{len(true_positive_rows)}\n")
print(f"False negatives:\n{len(false_negative_rows)}\n")
print(f"True negatives:\n{len(true_negative_rows)}\n")

Classification Report with Optimized Threshold:
               precision    recall  f1-score   support

           0       1.00      0.00      0.00       832
           1       0.02      1.00      0.04        16

    accuracy                           0.02       848
   macro avg       0.51      0.50      0.02       848
weighted avg       0.98      0.02      0.00       848


Actual positives:
16

False Positives:
831

True positives:
16

False negatives:
0

True negatives:
1

