# Lab 2: Boosting

In [76]:
#| echo: False
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, matthews_corrcoef
from xgboost import XGBClassifier

## Data
First we will begin by importing the dataset. For this lab, I decided to choose the 2022 dataset as it had more data.

In [2]:
# Using the 2022 data set without NAN values
# data = pd.read_csv("/Users/ben/Documents/GitHub/AdvancedMachineLearning/Data/HeartDiseaseData/heart_2022_no_nans.csv")
data = pd.read_csv("/Users/Bnkes/Desktop/GitHub/AdvancedMachineLearning/Data/HeartDiseaseData/heart_2022_no_nans.csv")

data

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,4.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.60,71.67,27.99,No,No,Yes,Yes,"Yes, received Tdap",No,No
1,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,6.0,None of them,No,...,1.78,95.25,30.13,No,No,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,No
2,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,"6 or more, but not all",No,...,1.85,108.86,31.66,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
3,Alabama,Female,Fair,5.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.70,90.72,31.32,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
4,Alabama,Female,Good,3.0,15.0,Within past year (anytime less than 12 months ...,Yes,5.0,1 to 5,No,...,1.55,79.38,33.07,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246017,Virgin Islands,Male,Very good,0.0,0.0,Within past 2 years (1 year but less than 2 ye...,Yes,6.0,None of them,No,...,1.78,102.06,32.28,Yes,No,No,No,"Yes, received tetanus shot but not sure what type",No,No
246018,Virgin Islands,Female,Fair,0.0,7.0,Within past year (anytime less than 12 months ...,Yes,7.0,None of them,No,...,1.93,90.72,24.34,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,Yes
246019,Virgin Islands,Male,Good,0.0,15.0,Within past year (anytime less than 12 months ...,Yes,7.0,1 to 5,No,...,1.68,83.91,29.86,Yes,Yes,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,Yes
246020,Virgin Islands,Female,Excellent,2.0,2.0,Within past year (anytime less than 12 months ...,Yes,7.0,None of them,No,...,1.70,83.01,28.66,No,Yes,Yes,No,"Yes, received tetanus shot but not sure what type",No,No


In [22]:
X_full = data.drop("HadHeartAttack", axis = 1)
y_full = data["HadHeartAttack"]

X_train, X_test, y_train, y_test = train_test_split(X_full, y_full)
y_train = LabelEncoder().fit_transform(y_train)
y_test = LabelEncoder().fit_transform(y_test)
y_full = LabelEncoder().fit_transform(y_full)

In [5]:
# Create smaller samples for training the model
random_sample = data.sample(n=50000, random_state=1)
X = random_sample.drop("HadHeartAttack", axis = 1)
y = random_sample["HadHeartAttack"]
y = LabelEncoder().fit_transform(y)

## Adaboost
We will start out by attempting to use adaboost. To choose an estimator for the model we will cross validate decision trees and random forests. While other models can be used, they require an extremely long time to run, making them impractical.

In [6]:
ct = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output = False, handle_unknown='ignore', drop="first"), make_column_selector(dtype_include=object)),
        ("standardize", StandardScaler(), make_column_selector(dtype_include=np.number))
    ]
)

In [70]:
my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("adaboost", AdaBoostClassifier(algorithm="SAMME"))
    ]
)

parameters = {
    "adaboost__estimator": [DecisionTreeClassifier(), RandomForestClassifier()]
}

gscv = GridSearchCV(my_pipeline, parameters, cv = 5, scoring='matthews_corrcoef', n_jobs=-1, verbose = 2)
gscv_fitted = gscv.fit(X, y)
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_

Fitting 5 folds for each of 2 candidates, totalling 10 fits


In [13]:
pd.DataFrame(gscv_fitted.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_adaboost__estimator,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,2.317905,0.09116,0.167678,0.019075,DecisionTreeClassifier(),{'adaboost__estimator': DecisionTreeClassifier()},0.238536,0.238775,0.219747,0.251017,0.211011,0.231817,0.014428,1
1,25.433757,8.777437,0.603862,0.167221,RandomForestClassifier(),{'adaboost__estimator': RandomForestClassifier()},0.19811,0.164248,0.259693,0.207612,0.123093,0.190551,0.045544,2


Examining the results of the grid search, it appears that a Decision Tree is the best option for an Adaboost model for this data. Next we will tune this model to achieve the best possible results.

In [71]:
# Adaboost Decision Tree Tuning
my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("adaboost", AdaBoostClassifier(algorithm="SAMME", estimator = DecisionTreeClassifier()))
    ]
)

parameters = {
    "adaboost__estimator__min_samples_split": [2, 5, 10, 25],
    "adaboost__estimator__min_samples_leaf": [1, 5, 10, 25],
    "adaboost__estimator__ccp_alpha": [1e-3, 1e-2, 1e-1]
}

gscv = GridSearchCV(my_pipeline, parameters, cv = 5, scoring='matthews_corrcoef', n_jobs=-1, verbose = 2)
gscv_fitted = gscv.fit(X, y)
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_

Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [29]:
pd.DataFrame(gscv_fitted.cv_results_).sort_values(ascending=True, by = "rank_test_score").head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_adaboost__estimator__ccp_alpha,param_adaboost__estimator__min_samples_leaf,param_adaboost__estimator__min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
15,70.19726,36.600924,0.675767,0.218348,0.001,25,25,"{'adaboost__estimator__ccp_alpha': 0.001, 'ada...",0.31159,0.326752,0.328092,0.289217,0.36877,0.324884,0.026033,1
14,70.974528,36.826693,0.724599,0.215124,0.001,25,10,"{'adaboost__estimator__ccp_alpha': 0.001, 'ada...",0.31159,0.326752,0.328092,0.289217,0.36877,0.324884,0.026033,1
13,71.722752,37.746225,0.739206,0.144516,0.001,25,5,"{'adaboost__estimator__ccp_alpha': 0.001, 'ada...",0.31159,0.326752,0.328092,0.289217,0.36877,0.324884,0.026033,1
12,71.87359,37.903859,0.724816,0.096655,0.001,25,2,"{'adaboost__estimator__ccp_alpha': 0.001, 'ada...",0.31159,0.326752,0.328092,0.289217,0.36877,0.324884,0.026033,1
11,97.634475,41.179058,0.837864,0.170899,0.001,10,25,"{'adaboost__estimator__ccp_alpha': 0.001, 'ada...",0.313397,0.312203,0.287804,0.352057,0.322653,0.317623,0.020724,5


After cross validating, the best parameter model will now be created, trained on a subset of the data, and tested against the whole dataset

In [52]:
my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("adaboost", AdaBoostClassifier(algorithm="SAMME", estimator = DecisionTreeClassifier(ccp_alpha=.001, min_samples_leaf=25, min_samples_split=25)))
    ]
)

fitted_pipeline = my_pipeline.fit(X_train, y_train)

y_pred = fitted_pipeline.predict(X_full)

In [53]:
matthews_corrcoef(y_true=y_full, y_pred=y_pred)

0.3061407901453192

In [54]:
cm = confusion_matrix(y_true = y_full, y_pred = y_pred)

cm_df = pd.DataFrame(cm, index=["Actual No Heart Disease", "Actual Heart Disease"], columns=["Predicted No Heart Disease", "Predicted Heart Disease"])

cm_df

Unnamed: 0,Predicted No Heart Disease,Predicted Heart Disease
Actual No Heart Disease,230181,2406
Actual Heart Disease,10709,2726


As can be seen above, the model does a good job accurately classifying those without heart disease, but it struggles to classify those with heart disease. To try to fix this, we will attempt to use a differnt model.

## XGBoost

In [13]:
# XGBoost tuning pt 1
my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("xgb", XGBClassifier())
    ]
)

parameters = {
    "xgb__eta": [.1, .5, 1],
    "xgb__gamma": [0, 1, 5, 25],
    "xgb__lambda": [.1, 1, 5, 10],
    "xgb__alpha": [.1, .5, 1, 5]
}

gscv = GridSearchCV(my_pipeline, parameters, cv = 5, scoring='matthews_corrcoef', n_jobs=-1, verbose = 2)
gscv_fitted = gscv.fit(X, y)
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_

Fitting 5 folds for each of 192 candidates, totalling 960 fits


In [14]:
pd.DataFrame(gscv_fitted.cv_results_).sort_values(ascending=True, by = "rank_test_score").head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_xgb__alpha,param_xgb__eta,param_xgb__gamma,param_xgb__lambda,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
103,2.49624,0.054193,0.187833,0.006875,1.0,0.1,1,10,"{'xgb__alpha': 1, 'xgb__eta': 0.1, 'xgb__gamma...",0.347398,0.358871,0.361606,0.383992,0.338919,0.358157,0.015269,1
3,2.311407,0.034894,0.183933,0.009919,0.1,0.1,0,10,"{'xgb__alpha': 0.1, 'xgb__eta': 0.1, 'xgb__gam...",0.356544,0.344179,0.338395,0.399421,0.351871,0.358082,0.021591,2
99,2.542748,0.075849,0.196434,0.018675,1.0,0.1,0,10,"{'xgb__alpha': 1, 'xgb__eta': 0.1, 'xgb__gamma...",0.344802,0.355954,0.348274,0.397533,0.336864,0.356685,0.021326,3
55,2.301405,0.01571,0.173531,0.002367,0.5,0.1,1,10,"{'xgb__alpha': 0.5, 'xgb__eta': 0.1, 'xgb__gam...",0.34511,0.360793,0.34511,0.387987,0.342931,0.356386,0.017049,4
95,1.729504,0.020298,0.155327,0.015276,0.5,1.0,25,10,"{'xgb__alpha': 0.5, 'xgb__eta': 1, 'xgb__gamma...",0.373499,0.365119,0.352023,0.336027,0.354074,0.356148,0.01271,5


After examing the results of the first cross validation search, I have decided to do a second round of cross validation, narrowing down the parameter values to try.

In [15]:
# XGBoost tuning pt 2
my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("xgb", XGBClassifier())
    ]
)

parameters = {
    "xgb__eta": [.1, .2, .3],
    "xgb__gamma": [0, 1, 2, 5],
    "xgb__lambda": [10, 15, 25],
    "xgb__alpha": [.5, .75, 1, 2]
}

gscv = GridSearchCV(my_pipeline, parameters, cv = 5, scoring='matthews_corrcoef', n_jobs=-1, verbose = 2)
gscv_fitted = gscv.fit(X, y)
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_

Fitting 5 folds for each of 144 candidates, totalling 720 fits


In [16]:
pd.DataFrame(gscv_fitted.cv_results_).sort_values(ascending=True, by = "rank_test_score").head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_xgb__alpha,param_xgb__eta,param_xgb__gamma,param_xgb__lambda,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
84,2.376618,0.047703,0.183032,0.009609,1.0,0.2,0,10,"{'xgb__alpha': 1, 'xgb__eta': 0.2, 'xgb__gamma...",0.353488,0.360159,0.365649,0.389908,0.334713,0.360783,0.017918,1
63,1.968146,0.052588,0.160628,0.008824,0.75,0.3,1,10,"{'xgb__alpha': 0.75, 'xgb__eta': 0.3, 'xgb__ga...",0.373119,0.365671,0.338476,0.3826,0.342036,0.36038,0.017322,2
76,2.391921,0.046544,0.190834,0.0174,1.0,0.1,1,15,"{'xgb__alpha': 1, 'xgb__eta': 0.1, 'xgb__gamma...",0.349155,0.347538,0.365119,0.387128,0.346093,0.359007,0.015643,3
36,2.543148,0.083491,0.16823,0.005768,0.75,0.1,0,10,"{'xgb__alpha': 0.75, 'xgb__eta': 0.1, 'xgb__ga...",0.358078,0.358968,0.352733,0.380169,0.344927,0.358975,0.011716,4
101,1.889033,0.019305,0.157228,0.007161,1.0,0.3,1,25,"{'xgb__alpha': 1, 'xgb__eta': 0.3, 'xgb__gamma...",0.341077,0.371057,0.359287,0.383382,0.337266,0.358414,0.017503,5


In [26]:
# Final XGBoost Model
my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("xgb", XGBClassifier(alpha=1, eta=.2, gamma=0, reg_lambda=10, n_jobs = -1))
    ]
)

fitted_pipeline = my_pipeline.fit(X_train, y_train)

y_pred = fitted_pipeline.predict(X_full)

In [27]:
matthews_corrcoef(y_true=y_full, y_pred=y_pred)

0.42795640127393625

In [28]:
cm = confusion_matrix(y_true = y_full, y_pred = y_pred)

cm_df = pd.DataFrame(cm, index=["Actual No Heart Disease", "Actual Heart Disease"], columns=["Predicted No Heart Disease", "Predicted Heart Disease"])

cm_df

Unnamed: 0,Predicted No Heart Disease,Predicted Heart Disease
Actual No Heart Disease,230741,1846
Actual Heart Disease,9493,3942


## Random Forest
To compare the xgboost and adaboost models, I also am including a third model. I am choosing to use a random forest due to its ability to classify imbalanced datasets accurately

In [29]:
my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("forest", RandomForestClassifier())
    ]
)

parameters = {
    "forest__n_estimators": [25, 50, 100, 200],
    "forest__min_samples_leaf": [1, 2, 3, 4, 5, 10, 25],
    "forest__min_samples_split": [2, 3, 4, 5, 10, 25],
    "forest__ccp_alpha": [1, 0, 1e-1, 1e-2, 1e-3]
}

gscv = GridSearchCV(my_pipeline, parameters, cv = 5, scoring='matthews_corrcoef', n_jobs=-1, verbose = 2)
gscv_fitted = gscv.fit(X, y)
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_

Fitting 5 folds for each of 840 candidates, totalling 4200 fits


In [63]:
pd.DataFrame(gscv_fitted.cv_results_).sort_values(ascending=True, by = "rank_test_score").head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_forest__ccp_alpha,param_forest__min_samples_leaf,param_forest__min_samples_split,param_forest__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
172,2.506241,0.04335,0.179632,0.00853,0,1,3,25,"{'forest__ccp_alpha': 0, 'forest__min_samples_...",0.247156,0.257783,0.261444,0.239788,0.232069,0.247648,0.010934,1
177,5.307034,0.246975,0.259646,0.016032,0,1,4,50,"{'forest__ccp_alpha': 0, 'forest__min_samples_...",0.254484,0.279456,0.230255,0.252378,0.200277,0.24337,0.026593,2
176,2.924615,0.061963,0.214838,0.015721,0,1,4,25,"{'forest__ccp_alpha': 0, 'forest__min_samples_...",0.249141,0.212262,0.261633,0.254484,0.235965,0.242697,0.017382,3
208,2.374518,0.011206,0.177531,0.006732,0,2,10,25,"{'forest__ccp_alpha': 0, 'forest__min_samples_...",0.269961,0.233483,0.205093,0.253267,0.246993,0.241759,0.021763,4
168,2.692074,0.029993,0.179932,0.005045,0,1,2,25,"{'forest__ccp_alpha': 0, 'forest__min_samples_...",0.20409,0.23568,0.230391,0.26625,0.211957,0.229674,0.021646,5


In [31]:
# Final Random Forest Model
my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("forest", RandomForestClassifier(ccp_alpha=0, min_samples_leaf=1, min_samples_split=3, n_estimators = 25, n_jobs=-1))
    ]
)

fitted_pipeline = my_pipeline.fit(X_train, y_train)

y_pred = fitted_pipeline.predict(X_full)

In [32]:
matthews_corrcoef(y_true=y_full, y_pred=y_pred)

0.7987703543400756

In [33]:
cm = confusion_matrix(y_true = y_full, y_pred = y_pred)

cm_df = pd.DataFrame(cm, index=["Actual No Heart Disease", "Actual Heart Disease"], columns=["Predicted No Heart Disease", "Predicted Heart Disease"])

cm_df

Unnamed: 0,Predicted No Heart Disease,Predicted Heart Disease
Actual No Heart Disease,232238,349
Actual Heart Disease,4341,9094


In [69]:
importance = fitted_pipeline.named_steps["forest"].feature_importances_
feature_names = fitted_pipeline.named_steps["preprocessing"].named_transformers_["dummify"].get_feature_names_out()

feature_importance = zip(feature_names, importance)
feature_importance = pd.DataFrame(feature_importance)
feature_importance.columns = ["Feature Name", "Feature Importance"]

feature_importance.sort_values(ascending=False, by = "Feature Importance").head(5)

Unnamed: 0,Feature Name,Feature Importance
65,HadAngina_Yes,0.156087
66,HadStroke_Yes,0.017082
88,ChestScan_Yes,0.014861
74,HadDiabetes_Yes,0.012541
72,HadArthritis_Yes,0.011692


## Model Comparison

After completing all three models, I was surprised to find that the Random Forest model performed the best. This is good for multiple reasons, Random Forest models are interpretable, something that is not possible with boosting models. Also, the Random Forest required significantly less time to run. This allowed for further tuning and better results. The Random Forest model did predict a large group of people with heart disease not having it, which would decrease its value, but did not produce many false positives. Examining the top five features from the model, we see that Angina, Stroke, Chest Scan, Diabetes, and Arthritis are all very important features for predicting heart disease. This makes a large amount of sense as Angina (chest pain), Stroke, Diabetes, and Arthritis are all linked with heart disease in medical literature. 