# XGBoost

- Extreme Gradient Boosting
- Objective function made up of training loss and regularization
    - Training loss measures how predictive our model is on training data (MSE or Logistic loss for regression and classification respectively)
    - Regularization term controls the complexity of the model, which helps in avoiding overfitting
- Usually has better performance than gradient boosting

In [6]:
import xgboost as xgb
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [7]:
df = sns.load_dataset('titanic')
df.dropna(inplace=True)
X = df[['pclass', 'sex', 'age']].copy()
le = preprocessing.LabelEncoder()
X['sex'] = le.fit_transform(df['sex'])
y = df['survived'].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [8]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, roc_auc_score

def printScore(clf, X_train, X_test, y_train, y_test, train=True):
    lb = preprocessing.LabelBinarizer()
    lb.fit(y_train)
    if train:
        res = clf.predict(X_train)
        print('Train Results:\n')
        print('Accuracy: %.2f\n' % accuracy_score(y_train, res))
        print('Classification Report: \n {} \n'.format(classification_report(y_train, res)))
        print('Confusion Matrix: \n {} \n'.format(confusion_matrix(y_train, res)))
        print('ROC AUC: {0:.4f}\n'.format(roc_auc_score(lb.transform(y_train), lb.transform(res))))
    else:
        res_test = clf.predict(X_test)
        print('Test Results:\n')
        print('Accuracy: %.2f\n' % accuracy_score(y_test, res_test))
        print('Classification Report: \n {} \n'.format(classification_report(y_test, res_test)))
        print('Confusion Matrix: \n {} \n'.format(confusion_matrix(y_test, res_test)))
        print('ROC AUC: {0:.4f}\n'.format(roc_auc_score(lb.transform(y_test), lb.transform(res_test))))

In [11]:
xgb_clf = xgb.XGBClassifier(
    max_depth=5, n_estimators=10000, learning_rate=0.3, n_jobs=-1
)
xgb_clf.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.3, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=10000, n_jobs=-1, num_parallel_tree=1,
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [15]:
printScore(xgb_clf, X_train, X_test, y_train, y_test)
printScore(xgb_clf, X_train, X_test, y_train, y_test, train=False)

Train Results:

Accuracy: 0.94

Classification Report: 
               precision    recall  f1-score   support

           0       0.94      0.87      0.91        55
           1       0.94      0.97      0.95       108

    accuracy                           0.94       163
   macro avg       0.94      0.92      0.93       163
weighted avg       0.94      0.94      0.94       163
 

Confusion Matrix: 
 [[ 48   7]
 [  3 105]] 

ROC AUC: 0.9225

Test Results:

Accuracy: 0.74

Classification Report: 
               precision    recall  f1-score   support

           0       0.40      0.50      0.44         4
           1       0.86      0.80      0.83        15

    accuracy                           0.74        19
   macro avg       0.63      0.65      0.64        19
weighted avg       0.76      0.74      0.75        19
 

Confusion Matrix: 
 [[ 2  2]
 [ 3 12]] 

ROC AUC: 0.6500

