# XGBoost model

In [1]:
# Initial imports.
import pandas as pd
import numpy as np
from path import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [2]:
import xgboost; print(xgboost.__version__)

1.5.1


In [3]:
# Importing the input dataset
heart_df = pd.read_csv('Resources/heart_undersampled.csv')
print(heart_df.shape)
heart_df.head()

(54746, 54)


Unnamed: 0,HeartDisease,PhysicalHealth,MentalHealth,SleepTime,Smoking_No,Smoking_Yes,AlcoholDrinking_No,AlcoholDrinking_Yes,Stroke_No,Stroke_Yes,...,Race_White,Diabetic_No,"Diabetic_No, borderline diabetes",Diabetic_Yes,Diabetic_Yes (during pregnancy),GenHealth_Excellent,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good
0,0,0,0,7,0,1,0,1,1,0,...,1,1,0,0,0,1,0,0,0,0
1,0,0,0,8,1,0,1,0,1,0,...,0,1,0,0,0,0,0,1,0,0
2,0,21,0,6,1,0,1,0,1,0,...,0,1,0,0,0,0,1,0,0,0
3,0,2,2,6,0,1,1,0,1,0,...,0,1,0,0,0,0,1,0,0,0
4,0,0,0,6,0,1,1,0,1,0,...,1,1,0,0,0,0,0,1,0,0


In [4]:
# Removing HeartDisease target from features data
y = heart_df.loc[:,"HeartDisease"]
X = heart_df.drop(columns="HeartDisease")

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [5]:
print('Training Features Shape:', X_train.shape)
print('Training Target Shape:', y_train.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Target Shape:', y_test.shape)

Training Features Shape: (41059, 53)
Training Target Shape: (41059,)
Testing Features Shape: (13687, 53)
Testing Target Shape: (13687,)


In [6]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

print(X_train_scaled.shape)
print(X_test_scaled.shape)

(41059, 53)
(13687, 53)


### Train the XGBoost Model

In [7]:
# import XGBoost classifier
from xgboost import XGBClassifier

In [8]:
# fit model
model = XGBClassifier()
model.fit(X_train_scaled, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

### Make Predictions with XGBoost Model

In [9]:
# make predictions for test data
y_pred = model.predict(X_test_scaled)
predictions = [round(value) for value in y_pred]

### Evaluate Predictions

In [10]:
# Calculate the Balanced Accuracy Score
from sklearn.metrics import balanced_accuracy_score
acc_score = balanced_accuracy_score(y_test, y_pred)

In [11]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, predictions)

In [12]:
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual True", "Actual False"],
    columns=["Predicted True", "Predicted False"])

In [13]:
# Displaying results
print("Confusion Matrix")
display(cm_df)

# Displaying calculated accuracy score
print("Accuracy Score: %.2f%%" % (acc_score * 100.0))

# Print the imbalanced classification report
print("Classification Report")
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted True,Predicted False
Actual True,4947,1896
Actual False,1413,5431


Accuracy Score: 75.82%
Classification Report
              precision    recall  f1-score   support

           0       0.78      0.72      0.75      6843
           1       0.74      0.79      0.77      6844

    accuracy                           0.76     13687
   macro avg       0.76      0.76      0.76     13687
weighted avg       0.76      0.76      0.76     13687

