In [60]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args

In [30]:
df = pd.read_csv("combined_data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,CHCKDNY2,_DRDXAR2,SDHBILLS,SDHSTRE1,_SMOKER3,SDLONELY,SDHFOOD1,_CASTHM1,SDHEMPLY,...,_TOTINDA,_CHLDCNT,_RFBING6,MARITAL,_INCOMG1,EMTSUPRT,_SEX,_EDUCAG,ADDEPEV3,year
0,0,2.0,2.0,2.0,5.0,4.0,5.0,5.0,1.0,2.0,...,2.0,1.0,1.0,1.0,9.0,1.0,2.0,3.0,2.0,2023
1,1,2.0,1.0,2.0,5.0,4.0,3.0,5.0,1.0,2.0,...,1.0,1.0,1.0,2.0,9.0,2.0,2.0,3.0,1.0,2023
2,2,2.0,1.0,1.0,3.0,3.0,3.0,5.0,2.0,2.0,...,1.0,1.0,1.0,3.0,1.0,4.0,2.0,2.0,2.0,2023
3,3,2.0,1.0,2.0,5.0,4.0,3.0,5.0,1.0,2.0,...,1.0,1.0,1.0,1.0,9.0,1.0,2.0,3.0,1.0,2023
4,4,2.0,1.0,2.0,2.0,4.0,2.0,4.0,1.0,2.0,...,1.0,1.0,1.0,3.0,5.0,2.0,2.0,3.0,1.0,2023


In [31]:
df_clean = df.copy()

df_clean = df_clean.replace([9, 7], float('nan'))

df_clean = df_clean.dropna()

print(f"Original shape: {df.shape}")
print(f"Shape after cleaning: {df_clean.shape}")

df_clean.head()

Original shape: (878455, 28)
Shape after cleaning: (191010, 28)


Unnamed: 0.1,Unnamed: 0,CHCKDNY2,_DRDXAR2,SDHBILLS,SDHSTRE1,_SMOKER3,SDLONELY,SDHFOOD1,_CASTHM1,SDHEMPLY,...,_TOTINDA,_CHLDCNT,_RFBING6,MARITAL,_INCOMG1,EMTSUPRT,_SEX,_EDUCAG,ADDEPEV3,year
4,4.0,2.0,1.0,2.0,2.0,4.0,2.0,4.0,1.0,2.0,...,1.0,1.0,1.0,3.0,5.0,2.0,2.0,3.0,1.0,2023
6,6.0,2.0,1.0,2.0,5.0,3.0,4.0,5.0,1.0,2.0,...,2.0,1.0,1.0,3.0,4.0,1.0,1.0,2.0,2.0,2023
10,10.0,2.0,1.0,2.0,4.0,4.0,4.0,4.0,1.0,1.0,...,1.0,1.0,1.0,1.0,5.0,1.0,1.0,3.0,2.0,2023
15,15.0,2.0,1.0,2.0,5.0,4.0,3.0,5.0,1.0,2.0,...,1.0,1.0,1.0,3.0,2.0,1.0,2.0,2.0,2.0,2023
32,32.0,2.0,1.0,2.0,4.0,3.0,3.0,3.0,1.0,2.0,...,1.0,1.0,1.0,2.0,3.0,1.0,2.0,4.0,2.0,2023


In [32]:
df_clean = df_clean[df_clean['year'] != 2022]

In [33]:
df_clean.drop(columns=['Unnamed: 0', 'year'], inplace=True)

In [34]:
df_clean['ADDEPEV3'] = df_clean['ADDEPEV3'].replace(2.0, 0.0)

In [35]:
X = df_clean.drop('ADDEPEV3', axis=1)
y = df_clean['ADDEPEV3']

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

## SMOTE

In [37]:
smote = SMOTE(sampling_strategy="auto", random_state=42)

In [38]:
df_clean.shape

(93726, 26)

In [39]:
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

## GradientBoostingClassifier

In [41]:
gb_model = GradientBoostingClassifier(random_state=42)

In [43]:
gb_model.fit(X_train_balanced, y_train_balanced)

In [44]:
y_pred_gb = gb_model.predict(X_test)
y_pred_proba_gb = gb_model.predict_proba(X_test)[:, 1]

In [45]:
gb_accuracy = accuracy_score(y_test, y_pred_gb)
roc_auc = roc_auc_score(y_test, y_pred_proba_gb)

In [46]:
print(f"\nGradient Boosting with SMOTE results:")
print(f"Accuracy: {gb_accuracy:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_gb))


Gradient Boosting with SMOTE results:
Accuracy: 0.8055
ROC AUC: 0.8187

Classification Report:
              precision    recall  f1-score   support

         0.0       0.84      0.92      0.88     17683
         1.0       0.65      0.44      0.53      5749

    accuracy                           0.81     23432
   macro avg       0.74      0.68      0.70     23432
weighted avg       0.79      0.81      0.79     23432



## HistGradientBoostingClassifier

In [49]:
hgb_model = HistGradientBoostingClassifier(random_state=42, class_weight='balanced')

hgb_model.fit(X_train_balanced, y_train_balanced)

y_pred_hgb = hgb_model.predict(X_test)
y_pred_proba_hgb = hgb_model.predict_proba(X_test)[:, 1]

In [50]:
hgb_accuracy = accuracy_score(y_test, y_pred_hgb)
hgb_roc_auc = roc_auc_score(y_test, y_pred_proba_hgb)

print(f"\nHistogram-based Gradient Boosting with SMOTE results:")
print(f"Accuracy: {hgb_accuracy:.4f}")
print(f"ROC AUC: {hgb_roc_auc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_hgb))


Histogram-based Gradient Boosting with SMOTE results:
Accuracy: 0.8079
ROC AUC: 0.8204

Classification Report:
              precision    recall  f1-score   support

         0.0       0.83      0.93      0.88     17683
         1.0       0.67      0.42      0.52      5749

    accuracy                           0.81     23432
   macro avg       0.75      0.68      0.70     23432
weighted avg       0.79      0.81      0.79     23432



## RandomForest + Weighted Class

In [52]:
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')
rf_model.fit(X_train, y_train)

In [53]:
y_pred = rf_model.predict(X_test)

In [55]:
rf_accuracy = accuracy_score(y_test, y_pred)
rf_roc_auc = roc_auc_score(y_test, rf_model.predict_proba(X_test)[:, 1])

print(f"\nRandom Forest with SMOTE results:")
print(f"Accuracy: {rf_accuracy:.4f}")
print(f"ROC AUC: {rf_roc_auc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Random Forest with SMOTE results:
Accuracy: 0.8006
ROC AUC: 0.8013

Classification Report:
              precision    recall  f1-score   support

         0.0       0.82      0.94      0.88     17683
         1.0       0.67      0.37      0.47      5749

    accuracy                           0.80     23432
   macro avg       0.75      0.65      0.68     23432
weighted avg       0.78      0.80      0.78     23432



## Random Forest + SMOTE

In [56]:
rfs_model = RandomForestClassifier(random_state=42)
rfs_model.fit(X_train_balanced, y_train_balanced)
y_pred_rfs = rfs_model.predict(X_test)

In [57]:
rfs_accuracy = accuracy_score(y_test, y_pred_rfs)
rfs_roc_auc = roc_auc_score(y_test, rfs_model.predict_proba(X_test)[:, 1])

print(f"\nRandom Forest with SMOTE results:")
print(f"Accuracy: {rfs_accuracy:.4f}")
print(f"ROC AUC: {rfs_roc_auc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rfs))


Random Forest with SMOTE results:
Accuracy: 0.7986
ROC AUC: 0.8000

Classification Report:
              precision    recall  f1-score   support

         0.0       0.83      0.92      0.87     17683
         1.0       0.64      0.41      0.50      5749

    accuracy                           0.80     23432
   macro avg       0.73      0.67      0.69     23432
weighted avg       0.78      0.80      0.78     23432



In [58]:
#Summary of results
results_summary = pd.DataFrame({
    'Model': ['Gradient Boosting', 'Histogram-based Gradient Boosting', 'Random Forest', 'Random Forest with SMOTE'],
    'Accuracy': [gb_accuracy, hgb_accuracy, rf_accuracy, rfs_accuracy],
    'ROC AUC': [roc_auc, hgb_roc_auc, rf_roc_auc, rfs_roc_auc]
})

print("\nSummary of Results:")
print(results_summary)


Summary of Results:
                               Model  Accuracy   ROC AUC
0                  Gradient Boosting  0.805480  0.818740
1  Histogram-based Gradient Boosting  0.807870  0.820443
2                      Random Forest  0.800615  0.801302
3           Random Forest with SMOTE  0.798566  0.800042


In [None]:
gb_space = [
    Integer(50, 300, name='n_estimators'),
    Real(0.01, 0.3, name='learning_rate'),
    Integer(3, 10, name='max_depth'),
    Real(0.1, 1.0, name='subsample'),
    Integer(2, 10, name='min_samples_split'),
    Integer(2, 10, name='min_samples_leaf')
]

hgb_space = [
    Integer(50, 300, name='max_iter'),
    Real(0.01, 0.3, name='learning_rate'),
    Integer(3, 10, name='max_depth'),
    Real(0.1, 1.0, name='subsample'),
    Integer(1, 20, name='min_samples_leaf')
]

rf_space = [
    Integer(50, 300, name='n_estimators'),
    Integer(3, 15, name='max_depth'),
    Integer(2, 20, name='min_samples_split'),
    Integer(1, 10, name='min_samples_leaf'),
    Real(0.1, 1.0, name='max_samples')
]

In [62]:
# Gradient Boosting optimization
@use_named_args(gb_space)
def gb_objective(**params):
    model = GradientBoostingClassifier(random_state=42, **params)
    return -np.mean(cross_val_score(model, X_train_balanced, y_train_balanced, cv=3, scoring='roc_auc'))

print("Optimizing Gradient Boosting...")
gb_result = gp_minimize(gb_objective, gb_space, n_calls=50, random_state=42)

# Histogram-based Gradient Boosting optimization
@use_named_args(hgb_space)
def hgb_objective(**params):
    model = HistGradientBoostingClassifier(random_state=42, class_weight='balanced', **params)
    return -np.mean(cross_val_score(model, X_train_balanced, y_train_balanced, cv=3, scoring='roc_auc'))

print("Optimizing Histogram-based Gradient Boosting...")
hgb_result = gp_minimize(hgb_objective, hgb_space, n_calls=50, random_state=42)

# Random Forest optimization
@use_named_args(rf_space)
def rf_objective(**params):
    model = RandomForestClassifier(random_state=42, class_weight='balanced', **params)
    return -np.mean(cross_val_score(model, X_train_balanced, y_train_balanced, cv=3, scoring='roc_auc'))

Optimizing Gradient Boosting...


ValueError: 
All the 3 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\yusra\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\yusra\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "c:\Users\yusra\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\yusra\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'min_samples_split' parameter of GradientBoostingClassifier must be an int in the range [2, inf) or a float in the range (0.0, 1.0]. Got np.int64(1) instead.


In [None]:


print("Optimizing Random Forest...")
rf_result = gp_minimize(rf_objective, rf_space, n_calls=50, random_state=42)

best_gb_params = dict(zip([dim.name for dim in gb_space], gb_result.x))
best_hgb_params = dict(zip([dim.name for dim in hgb_space], hgb_result.x))
best_rf_params = dict(zip([dim.name for dim in rf_space], rf_result.x))

print("\nBest Parameters:")
print(f"Gradient Boosting: {best_gb_params}")
print(f"Histogram-based GB: {best_hgb_params}")
print(f"Random Forest: {best_rf_params}")