# Lab 2: Boosting

In [38]:
import os

import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, matthews_corrcoef
from xgboost import XGBClassifier

## Data

In [3]:
# Load the 2022 data set without NaN values.
# The CSV location can be overridden through the HEART_DATA_CSV environment
# variable so the notebook is not tied to one machine; when the variable is
# unset, the original local path is used unchanged.
DATA_PATH = os.environ.get(
    "HEART_DATA_CSV",
    "/Users/ben/Documents/GitHub/AdvancedMachineLearning/Data/HeartDiseaseData/heart_2022_no_nans.csv",
)
data = pd.read_csv(DATA_PATH)

data

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,HadAngina,HadStroke,HadAsthma,HadSkinCancer,HadCOPD,HadDepressiveDisorder,HadKidneyDisease,HadArthritis,HadDiabetes,DeafOrHardOfHearing,BlindOrVisionDifficulty,DifficultyConcentrating,DifficultyWalking,DifficultyDressingBathing,DifficultyErrands,SmokerStatus,ECigaretteUsage,ChestScan,RaceEthnicityCategory,AgeCategory,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,4.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,No,No,No,No,No,No,No,Yes,No,No,No,No,No,No,No,Former smoker,Never used e-cigarettes in my entire life,No,"White only, Non-Hispanic",Age 65 to 69,1.60,71.67,27.99,No,No,Yes,Yes,"Yes, received Tdap",No,No
1,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,6.0,None of them,No,No,No,No,No,No,No,No,Yes,Yes,No,No,No,No,No,No,Former smoker,Never used e-cigarettes in my entire life,No,"White only, Non-Hispanic",Age 70 to 74,1.78,95.25,30.13,No,No,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,No
2,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,"6 or more, but not all",No,No,No,No,No,No,No,No,Yes,No,No,Yes,No,Yes,No,No,Former smoker,Never used e-cigarettes in my entire life,Yes,"White only, Non-Hispanic",Age 75 to 79,1.85,108.86,31.66,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
3,Alabama,Female,Fair,5.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,No,No,No,Yes,No,Yes,No,Yes,No,No,No,No,Yes,No,No,Never smoked,Never used e-cigarettes in my entire life,No,"White only, Non-Hispanic",Age 80 or older,1.70,90.72,31.32,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
4,Alabama,Female,Good,3.0,15.0,Within past year (anytime less than 12 months ...,Yes,5.0,1 to 5,No,No,No,No,No,No,No,No,Yes,No,No,No,No,No,No,No,Never smoked,Never used e-cigarettes in my entire life,No,"White only, Non-Hispanic",Age 80 or older,1.55,79.38,33.07,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246017,Virgin Islands,Male,Very good,0.0,0.0,Within past 2 years (1 year but less than 2 ye...,Yes,6.0,None of them,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Never smoked,Never used e-cigarettes in my entire life,No,"White only, Non-Hispanic",Age 60 to 64,1.78,102.06,32.28,Yes,No,No,No,"Yes, received tetanus shot but not sure what type",No,No
246018,Virgin Islands,Female,Fair,0.0,7.0,Within past year (anytime less than 12 months ...,Yes,7.0,None of them,No,No,No,No,No,No,Yes,No,No,No,No,No,No,No,No,No,Never smoked,Never used e-cigarettes in my entire life,No,"Black only, Non-Hispanic",Age 25 to 29,1.93,90.72,24.34,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,Yes
246019,Virgin Islands,Male,Good,0.0,15.0,Within past year (anytime less than 12 months ...,Yes,7.0,1 to 5,No,No,Yes,No,No,No,No,No,Yes,Yes,No,No,No,No,No,No,Never smoked,Never used e-cigarettes in my entire life,No,"Multiracial, Non-Hispanic",Age 65 to 69,1.68,83.91,29.86,Yes,Yes,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,Yes
246020,Virgin Islands,Female,Excellent,2.0,2.0,Within past year (anytime less than 12 months ...,Yes,7.0,None of them,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Never smoked,Never used e-cigarettes in my entire life,No,"Black only, Non-Hispanic",Age 50 to 54,1.70,83.01,28.66,No,Yes,Yes,No,"Yes, received tetanus shot but not sure what type",No,No


In [32]:
# Separate the predictors from the target column (HadHeartAttack).
X_full = data.drop("HadHeartAttack", axis = 1)
y_full = data["HadHeartAttack"]

# Hold out a test set (default 75/25 split).
# random_state makes the split reproducible across kernel restarts, and
# stratify keeps the class proportions of the target the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X_full, y_full, random_state=1, stratify=y_full
)

In [49]:
# Draw a fixed-seed 50,000-row subsample so the grid searches below finish in
# reasonable time.
# NOTE(review): the sample is taken from `data`, i.e. it can contain rows that
# were held out in X_test by the train/test split.
random_sample = data.sample(n=50000, random_state=1)

# Predictors: every column except the target.
X = random_sample.drop(columns="HadHeartAttack")

# Target: the HadHeartAttack labels converted to integer codes.
y = LabelEncoder().fit_transform(random_sample["HadHeartAttack"])

## Adaboost
We will start by using AdaBoost. To choose a base estimator for the model, we will cross-validate decision trees and random forests. While other models can be used, they take an extremely long time to run, making them impractical.

In [7]:
# Preprocessing step reused by the pipelines in this notebook:
#   * object-dtype columns are one-hot encoded (first level dropped, unseen
#     levels ignored at transform time),
#   * numeric columns are standardized.
categorical_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore", drop="first")
numeric_scaler = StandardScaler()

ct = ColumnTransformer(
    transformers=[
        ("dummify", categorical_encoder, make_column_selector(dtype_include=object)),
        ("standardize", numeric_scaler, make_column_selector(dtype_include=np.number)),
    ]
)

In [12]:
# Compare two candidate base learners for AdaBoost (a decision tree and a
# random forest, both with default settings) using 5-fold cross-validation
# scored by the Matthews correlation coefficient.
my_pipeline = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("adaboost", AdaBoostClassifier(algorithm="SAMME")),
    ]
)

parameters = dict(
    adaboost__estimator=[DecisionTreeClassifier(), RandomForestClassifier()],
)

gscv = GridSearchCV(
    estimator=my_pipeline,
    param_grid=parameters,
    cv=5,
    scoring="matthews_corrcoef",
    n_jobs=-1,  # use all available cores
    verbose=2,
)
gscv_fitted = gscv.fit(X, y)
test_scores = gscv_fitted.cv_results_["mean_test_score"]

# Display the refitted best pipeline.
gscv_fitted.best_estimator_

Fitting 5 folds for each of 2 candidates, totalling 10 fits


In [13]:
pd.DataFrame(gscv_fitted.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_adaboost__estimator,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,2.317905,0.09116,0.167678,0.019075,DecisionTreeClassifier(),{'adaboost__estimator': DecisionTreeClassifier()},0.238536,0.238775,0.219747,0.251017,0.211011,0.231817,0.014428,1
1,25.433757,8.777437,0.603862,0.167221,RandomForestClassifier(),{'adaboost__estimator': RandomForestClassifier()},0.19811,0.164248,0.259693,0.207612,0.123093,0.190551,0.045544,2


Examining the results of the grid search, it appears that a decision tree is the best base estimator for an AdaBoost model on this data. Next, we will tune this model to achieve the best possible results.

In [21]:
# AdaBoost with a decision-tree base learner: tune the tree's split/leaf size
# limits and its cost-complexity pruning strength (4 x 4 x 3 = 48 candidates).
boosted_tree = AdaBoostClassifier(algorithm="SAMME", estimator=DecisionTreeClassifier())
my_pipeline = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("adaboost", boosted_tree),
    ]
)

# Keys address the tree nested inside the "adaboost" pipeline step.
parameters = dict(
    adaboost__estimator__min_samples_split=[2, 5, 10, 25],
    adaboost__estimator__min_samples_leaf=[1, 5, 10, 25],
    adaboost__estimator__ccp_alpha=[1e-3, 1e-2, 1e-1],
)

gscv = GridSearchCV(
    estimator=my_pipeline,
    param_grid=parameters,
    cv=5,
    scoring="matthews_corrcoef",
    n_jobs=-1,
    verbose=2,
)
gscv_fitted = gscv.fit(X, y)
test_scores = gscv_fitted.cv_results_["mean_test_score"]

# Display the refitted best pipeline.
gscv_fitted.best_estimator_

Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [29]:
pd.DataFrame(gscv_fitted.cv_results_).sort_values(ascending=True, by = "rank_test_score").head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_adaboost__estimator__ccp_alpha,param_adaboost__estimator__min_samples_leaf,param_adaboost__estimator__min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
15,70.19726,36.600924,0.675767,0.218348,0.001,25,25,"{'adaboost__estimator__ccp_alpha': 0.001, 'ada...",0.31159,0.326752,0.328092,0.289217,0.36877,0.324884,0.026033,1
14,70.974528,36.826693,0.724599,0.215124,0.001,25,10,"{'adaboost__estimator__ccp_alpha': 0.001, 'ada...",0.31159,0.326752,0.328092,0.289217,0.36877,0.324884,0.026033,1
13,71.722752,37.746225,0.739206,0.144516,0.001,25,5,"{'adaboost__estimator__ccp_alpha': 0.001, 'ada...",0.31159,0.326752,0.328092,0.289217,0.36877,0.324884,0.026033,1
12,71.87359,37.903859,0.724816,0.096655,0.001,25,2,"{'adaboost__estimator__ccp_alpha': 0.001, 'ada...",0.31159,0.326752,0.328092,0.289217,0.36877,0.324884,0.026033,1
11,97.634475,41.179058,0.837864,0.170899,0.001,10,25,"{'adaboost__estimator__ccp_alpha': 0.001, 'ada...",0.313397,0.312203,0.287804,0.352057,0.322653,0.317623,0.020724,5


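After cross-validating, the model with the best parameters is created, trained on a subset of the data, and tested against the whole dataset. Note that the whole dataset includes the rows used for training, so the score below is an optimistic estimate of performance on unseen data.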

In [52]:
# Final AdaBoost model using the top-ranked tree settings from the tuning
# grid search (ccp_alpha=0.001, min_samples_leaf=25, min_samples_split=25).
tuned_tree = DecisionTreeClassifier(
    ccp_alpha=.001,
    min_samples_leaf=25,
    min_samples_split=25,
)
my_pipeline = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("adaboost", AdaBoostClassifier(algorithm="SAMME", estimator=tuned_tree)),
    ]
)

# Fit on the training split only.
fitted_pipeline = my_pipeline.fit(X_train, y_train)

# NOTE(review): X_full contains the X_train rows the model was just fitted on,
# so metrics computed from y_pred are not a pure held-out estimate.
y_pred = fitted_pipeline.predict(X_full)

In [53]:
# Matthews correlation coefficient of the predictions over the full data set.
matthews_corrcoef(y_full, y_pred)

0.3061407901453192

In [54]:
# Confusion matrix over the full data set: rows are the actual classes,
# columns are the predicted classes.
cm = confusion_matrix(y_full, y_pred)

row_labels = ["Actual No Heart Disease", "Actual Heart Disease"]
column_labels = ["Predicted No Heart Disease", "Predicted Heart Disease"]
cm_df = pd.DataFrame(cm, index=row_labels, columns=column_labels)

cm_df

Unnamed: 0,Predicted No Heart Disease,Predicted Heart Disease
Actual No Heart Disease,230181,2406
Actual Heart Disease,10709,2726


As can be seen above, the model does a good job of accurately classifying those without heart disease, but it struggles to classify those with heart disease. To try to fix this, we will attempt to use a different model.

## XGBoost

In [None]:
# XGBoost tuning: exhaustive grid over learning rate (eta), split-loss
# threshold (gamma), L2 (lambda) and L1 (alpha) regularization, and the tree
# construction method.
# NOTE(review): the grid holds 9 * 7 * 5 * 5 * 4 = 6,300 candidates, i.e.
# 31,500 fits with cv=5 -- expect a very long run time.
my_pipeline = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("xgb", XGBClassifier()),
    ]
)

parameters = {
    "xgb__eta": [step / 10 for step in range(1, 10)],  # 0.1, 0.2, ..., 0.9
    "xgb__gamma": [0, 1, 2, 5, 10, 15, 25],
    "xgb__lambda": [.1, .5, 1, 5, 10],
    "xgb__alpha": [0, .1, .5, 1, 10],
    "xgb__tree_method": ["auto", "exact", "approx", "hist"],
}

gscv = GridSearchCV(
    estimator=my_pipeline,
    param_grid=parameters,
    cv=5,
    scoring="matthews_corrcoef",
    n_jobs=-1,
)
gscv_fitted = gscv.fit(X, y)
test_scores = gscv_fitted.cv_results_["mean_test_score"]

# Display the refitted best pipeline.
gscv_fitted.best_estimator_

