In [3]:
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import lightgbm
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score, classification_report, accuracy_score, precision_recall_curve

In [4]:
filename = os.path.join("framingham.csv")
df = pd.read_csv(filename)

In [20]:
# Impute missing values
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Impute numerical columns with median
for col in numerical_cols:
    df[col].fillna(df[col].median(), inplace=True)

# Impute categorical columns with mode
for col in categorical_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Split the dataset 80/20
train_df, test_df = train_test_split(df, test_size=0.20, random_state=42)

# Isolate the features and target
features_train = train_df.drop(columns="diabetes")
target_train = train_df["diabetes"]
features_test = test_df.drop(columns="diabetes")
target_test = test_df["diabetes"]

# Set up ratio
train_data = lightgbm.Dataset(train_df, label=target_train)
valid_data = lightgbm.Dataset(test_df, label=target_test)

# SMOTE
smote = SMOTE(random_state=42)
features_train_resampled, target_train_resampled = smote.fit_resample(features_train, target_train)

# Define model parameters focusing on high recall
params = {
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': 'true',
    'boosting': 'gbdt',
    'num_leaves': 63,
    #'feature_fraction': 0.5,
    #'bagging_fraction': 0.5,
    #'bagging_freq': 20,
    'verbose': -1,
    #'max_depth': 15,
    'learning_rate': 0.02,
    #'objective': 'binary:logistic',
    #'n_estimators': 40
}

# Initialize and train model
# model_lgbm = lightgbm.train(params, train_data, num_boost_round=5000) #, early_stopping_rounds =50)
model = lightgbm.LGBMClassifier(**params)

#y_train_pred = model_lgbm.predict(train_df)
#y_valid_pred = model_lgbm.predict(test_df)
#print("AUC Train: {:.4f}\nAUC Valid:  {:.4f}".format(roc_auc_score(target_train, y_train_pred),
            #                                         roc_auc_score(target_test, y_valid_pred)))

# Cross-validation using the resampled training data
cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)
roc_auc_scores = cross_val_score(model, features_train_resampled, target_train_resampled, cv=cv, scoring="roc_auc")
print(f"Mean ROC AUC from CV: {roc_auc_scores.mean():.4f}")

# Train the model on the resampled training data
model.fit(features_train_resampled, target_train_resampled)

# Evaluate on the test set
y_pred_proba = model.predict_proba(features_test)[:, 1]
roc_auc = roc_auc_score(target_test, y_pred_proba)
y_pred = model.predict(features_test)
print(f"Test ROC AUC: {roc_auc:.4f}")
print("Classification Report:\n", classification_report(target_test, y_pred))

# Display feature importances
importances = model.feature_importances_
jsum = 0
for i, j in enumerate(importances):
    jsum = j+jsum
for i, j in enumerate(importances):
    print(f"Feature {model.feature_name_[i]}: {j/jsum:.4f}")

# Display comparative results of actual v. false v. true predictions
actual_positives = test_df[target_test == 1]
false_positive_rows = test_df[(target_test == 0) & (y_pred == 1)]
true_positive_rows = test_df[(target_test == 1) & (y_pred == 1)]
false_negative_rows = test_df[(target_test == 1) & (y_pred == 0)]
true_negative_rows = test_df[(target_test == 0) & (y_pred == 0)]

print(f"\nActual positives:\n{len(actual_positives)}\n")
print(f"False Positives:\n{len(false_positive_rows)}\n")
print(f"True positives:\n{len(true_positive_rows)}\n")
print(f"False negatives:\n{len(false_negative_rows)}\n")
print(f"True negatives:\n{len(true_negative_rows)}\n")

Mean ROC AUC from CV: 0.9972
Test ROC AUC: 0.9046
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.98       832
           1       0.33      0.62      0.43        16

    accuracy                           0.97       848
   macro avg       0.66      0.80      0.71       848
weighted avg       0.98      0.97      0.97       848

Feature male: 0.0090
Feature age: 0.0950
Feature education: 0.1190
Feature currentSmoker: 0.0282
Feature cigsPerDay: 0.0673
Feature BPMeds: 0.0129
Feature prevalentStroke: 0.0000
Feature prevalentHyp: 0.0100
Feature totChol: 0.1306
Feature sysBP: 0.1082
Feature diaBP: 0.0948
Feature BMI: 0.0823
Feature heartRate: 0.1145
Feature glucose: 0.1177
Feature TenYearCHD: 0.0103

Actual positives:
16

False Positives:
20

True positives:
10

False negatives:
6

True negatives:
812



In [9]:
y_scores = model.predict_proba(features_test)[:, 1]

# Get precision-recall values for different thresholds
precision, recall, thresholds = precision_recall_curve(target_test, y_scores)

# Find the threshold where recall is maximized (this is a naive approach, and in real applications you'd want a balance)
optimal_threshold = thresholds[np.argmax(recall)]

# Classify using the new threshold
y_pred_optimal = np.where(y_scores > optimal_threshold, 1, 0)

print("Classification Report with Optimized Threshold:\n", classification_report(target_test, y_pred_optimal))


# Display comparative results of actual v. false v. true predictions
actual_positives = test_df[target_test == 1]
false_positive_rows = test_df[(target_test == 0) & (y_pred_optimal == 1)]
true_positive_rows = test_df[(target_test == 1) & (y_pred_optimal == 1)]
false_negative_rows = test_df[(target_test == 1) & (y_pred_optimal == 0)]
true_negative_rows = test_df[(target_test == 0) & (y_pred_optimal == 0)]

print(f"\nActual positives:\n{len(actual_positives)}\n")
print(f"False Positives:\n{len(false_positive_rows)}\n")
print(f"True positives:\n{len(true_positive_rows)}\n")
print(f"False negatives:\n{len(false_negative_rows)}\n")
print(f"True negatives:\n{len(true_negative_rows)}\n")

Classification Report with Optimized Threshold:
               precision    recall  f1-score   support

           0       1.00      0.11      0.19       832
           1       0.02      1.00      0.04        16

    accuracy                           0.12       848
   macro avg       0.51      0.55      0.12       848
weighted avg       0.98      0.12      0.19       848


Actual positives:
16

False Positives:
744

True positives:
16

False negatives:
0

True negatives:
88



  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
