## Baseline XGBoost Model

Tree-based models, specifically gradient-boosted decision trees, are generally considered the gold standard for working with tabular data. While the primary aim of the project is bias mitigation applied to deep learning, it would be dismissive not to compare model performance against such models as a baseline.

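Gradient boosting methods take the values of the input features and build a large number of decision trees in sequence in order to minimise a loss function. They do this by combining weak models: at each iteration the data are re-weighted according to the errors of the current ensemble (more precisely, the gradient of the loss), and the weak model that performs best on this re-weighted problem is added to the ensemble, so the combined model improves incrementally.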

In [2]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import shap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import utilities
import global_variables as gv

from sklearn import metrics
from sklearn.preprocessing import QuantileTransformer, RobustScaler, StandardScaler,MinMaxScaler

### load in data into a pandas DataFrame

In [3]:
# Show every column when a DataFrame is displayed in the notebook.
pd.set_option('display.max_columns', None)

# Read the dataset from the location configured in global_variables.
# A local copy can be used instead: pd.read_csv('CVD_data.csv')
df = pd.read_csv(gv.data_link)

# 'Unnamed: 0' is presumably the index column written out when the CSV was
# saved -- TODO confirm. It is not a feature, so remove it.
df = df.drop(columns='Unnamed: 0')

### preprocess input features

In [4]:
# Build a train/test split for the last outcome listed in gv.outcomes.
# NOTE(review): the exact semantics of one_hot / val are defined in
# utilities.process_features -- confirm there.
X_train, X_test, y_train, y_test = utilities.process_features(
    df,
    gv.outcomes[-1],
    StandardScaler(),
    one_hot=True,
    val=False,
)

# Resample the training split only; the test split is left as returned above.
# 'under' presumably selects under-sampling of the majority class -- TODO confirm.
X_train, y_train = utilities.resample_data(X_train, y_train, 'under')

### build & evaluate model (one per outcome)

In [None]:
for outcome in gv.outcomes:

    X_train, X_test, y_train, y_test = utilities.process_features(df, outcome, StandardScaler(), one_hot=True, val=False)
    X_train, y_train= utilities.resample_data(X_train, y_train, 'under')

    clf=xgb.XGBClassifier(max_depth=10, learning_rate=0.1, n_estimators=800, objective='binary:logistic', booster='gbtree')

    #Printing all the parameters of XGBoost
    print(clf)

    #Creating the model on Training Data
    XGB=clf.fit(X_train,y_train)
    prediction=XGB.predict(X_test)

    #Measuring accuracy on Testing Data
    print(metrics.classification_report(y_test, prediction))
    print(metrics.confusion_matrix(y_test, prediction))

    #Plotting the feature importance for Top 10 most important columns
    %matplotlib inline
    feature_importances = pd.Series(XGB.feature_importances_, index=X_train.columns.to_list())
    feature_importances.nlargest(10).plot(kind='barh')
    plt.title(outcome)

XGBClassifier(base_score=None, booster='gbtree', callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, gamma=None,
              gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=10,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=800, n_jobs=None,
              num_parallel_tree=None, predictor=None, random_state=None,
              reg_alpha=None, reg_lambda=None, ...)
              precision    recall  f1-score   support

           0       0.83      0.62      0.71     12898
           1       0.37      0.62      0.46      4493

    accuracy                           0.62     17391
   macro avg       0.60      0.62      0

In [None]:
# Another version: compare XGBoost with default settings against a hand-tuned
# configuration.
#
# Uses the X_train / X_test / y_train / y_test splits produced by the cells
# above (no xTrain / xTest / yTrain / yTest are defined in the visible part of
# this notebook), and reaches the estimator and metrics through the modules
# the notebook already imports (xgb, metrics).
from sklearn.model_selection import cross_val_score

# verbosity=0 silences XGBoost's messages; `verbose` is not an XGBClassifier
# parameter.
xgbc = xgb.XGBClassifier(verbosity=0).fit(X_train, y_train)
predict = xgbc.predict(X_test)
print("==============Results from XGB Classifier before tuning==============")
print("Accuracy Score: ", metrics.accuracy_score(y_test, predict))
# NOTE(review): this cross-validates on the *test* split (the model is re-fitted
# on folds of X_test), and the value is the mean of the estimator's default
# score, not an R^2 despite the variable name -- confirm this is intended.
R2CV = cross_val_score(xgbc, X_test, y_test, cv=10).mean()
print("Cross Validation Score: ", R2CV)
# 0.84
# NOTE(review): RMSE is computed on predicted class labels -- confirm this
# metric is wanted for a classifier.
error = metrics.mean_squared_error(y_test, predict)
print("Root Mean Squared Error: ", np.sqrt(error))


# `min_samples_split` is a scikit-learn GradientBoosting parameter that XGBoost
# does not have, so it has been dropped here.
xgbctuned = xgb.XGBClassifier(
    learning_rate=0.01,
    max_depth=6,
    n_estimators=100,
    subsample=0.8,
).fit(X_train, y_train)
print("==============Results from XGB Classifier after tuning==============")
predicttuned = xgbctuned.predict(X_test)
print("Accuracy Score: ", metrics.accuracy_score(y_test, predicttuned))
# Same cross-validation caveat as for the untuned model above.
R2CVtuned = cross_val_score(xgbctuned, X_test, y_test, cv=10).mean()
print("Cross Validation Score: ", R2CVtuned)
# 0.82
errortuned = metrics.mean_squared_error(y_test, predicttuned)
print("Root Mean Squared Error: ", np.sqrt(errortuned))