### Example workflow of XGBoost in python sci-kit learn

XGBoost is widely used in Kaggle

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.pipeline import Pipeline  # explain this next time
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder

#### Data preparation

In [2]:
os.chdir('C:/Users/Bangda/Desktop/kaggle/titanic')
train = pd.read_csv('train_cleaned.csv')
test  = pd.read_csv('test_cleaned.csv')
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,num_family,last_name,title,first_name,imputed_age,adult,ticket_letter_length,ticket_digit_length,re_ticket,family_id
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,2.110213,...,1,Braund,Mr,Owen Harris,22.0,1,1,6,A6,735
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,4.280593,...,1,Cumings,Mrs,John Bradley (Florence Briggs Thayer),38.0,1,1,5,PC5,690
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,2.188856,...,0,Heikkinen,Miss,Laina,26.0,1,2,8,STONO8,446
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,3.990834,...,1,Futrelle,Mrs,Jacques Heath (Lily May Peel),35.0,1,0,6,6,707
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,2.202765,...,0,Allen,Mr,William Henry,35.0,1,0,6,6,758


In [3]:
all_data = [train, test]
for df in all_data:
    df['labeled_sex'] = df['Sex'].map({'male': 0, 'female': 1})
    df[['Miss', 'Mr', 'Mrs', 'other']] = pd.get_dummies(train['title'], drop_first = True)
    df['Master'] = 0

In [4]:
predictors = ['Pclass', 'labeled_sex', 'imputed_age', 'Fare', 'num_family', 'Master', 'Miss', 'Mr', 'Mrs', 'other']
X_train = train[predictors].values
y_train = train['Survived'].values
X_test  = test[predictors].values
X_train.shape, X_test.shape

((891L, 10L), (418L, 10L))

In [5]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 0.2, random_state = 233)
X_train.shape, X_valid.shape, X_test.shape

((712L, 10L), (179L, 10L), (418L, 10L))

#### Prepare for XGBoost

In [6]:
import xgboost as xgb



In [7]:
dtrain = xgb.DMatrix(X_train, label = y_train)
dvalid = xgb.DMatrix(X_valid, label = y_valid)
dtest  = xgb.DMatrix(X_test)

#### Set parameters

In [8]:
params = {'n_estimators': 50,
          'max_depth': 4,
          'min_child_weight': 2,
          'gamma': 0.1,
          'subsample': 0.7,
          'colsample_bytree': 0.7,
          'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'nthread': 4}

In [9]:
params_lst = params.items()
params_lst

[('n_estimators', 50),
 ('subsample', 0.7),
 ('colsample_bytree', 0.7),
 ('gamma', 0.1),
 ('eval_metric', 'auc'),
 ('objective', 'binary:logistic'),
 ('max_depth', 4),
 ('min_child_weight', 2),
 ('nthread', 4)]

In [10]:
num_round = 50
eval_lst = [(dvalid, 'eval'), (dtrain, 'train')]

In [11]:
bst_default = xgb.train(params_lst, dtrain, num_round, eval_lst)

[0]	eval-auc:0.848566	train-auc:0.85393
[1]	eval-auc:0.860815	train-auc:0.872265
[2]	eval-auc:0.85999	train-auc:0.87363
[3]	eval-auc:0.856309	train-auc:0.890012
[4]	eval-auc:0.850724	train-auc:0.894726
[5]	eval-auc:0.850152	train-auc:0.902999
[6]	eval-auc:0.8492	train-auc:0.90488
[7]	eval-auc:0.845329	train-auc:0.91298
[8]	eval-auc:0.846662	train-auc:0.916468
[9]	eval-auc:0.854532	train-auc:0.918273
[10]	eval-auc:0.855674	train-auc:0.920581
[11]	eval-auc:0.858213	train-auc:0.921871
[12]	eval-auc:0.851993	train-auc:0.924999
[13]	eval-auc:0.856118	train-auc:0.927464
[14]	eval-auc:0.855547	train-auc:0.930089
[15]	eval-auc:0.860244	train-auc:0.932186
[16]	eval-auc:0.860371	train-auc:0.933661
[17]	eval-auc:0.854087	train-auc:0.935302
[18]	eval-auc:0.855103	train-auc:0.93739
[19]	eval-auc:0.857578	train-auc:0.939588
[20]	eval-auc:0.854278	train-auc:0.94158
[21]	eval-auc:0.854532	train-auc:0.942831
[22]	eval-auc:0.856182	train-auc:0.942899
[23]	eval-auc:0.855801	train-auc:0.944247
[24]	eval-a

In [12]:
bst_pred_valid_prob = bst_default.predict(dvalid)
bst_pred_valid = [1. if proba > .5 else 0 for proba in bst_pred_valid_prob]
confusion_matrix(y_valid, bst_pred_valid)

array([[93,  8],
       [18, 60]])

#### Alternative way (sklearn interface)

In [13]:
xgb_default = xgb.XGBClassifier(n_estimators = 50,
                                max_depth = 4, 
                                min_child_weight = 2,
                                gamma = 0.1,
                                subsample = 0.7,
                                colsample_bytree = 0.7, 
                                objective = 'binary:logistic',
                                nthread = 4)
xgb_default.fit(X_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.7,
       gamma=0.1, learning_rate=0.1, max_delta_step=0, max_depth=4,
       min_child_weight=2, missing=None, n_estimators=50, nthread=4,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.7)

In [14]:
bst_pred_valid = xgb_default.predict(X_valid)
confusion_matrix(y_valid, bst_pred_valid)

array([[93,  8],
       [18, 60]])

#### Parameters tuning

In [15]:
xgb_tuned = xgb.XGBClassifier()
cv_params = {'n_estimators': [50, 100],
             'max_depth': [4, 5],
             'min_child_weight': [2],
             'gamma': [0.01, 0.015, 0.02],
             'subsample': [0.6, 0.65, 0.7],
             'colsample_bytree': [0.7],
             'objective': ['binary:logistic'],
             'nthread': [4]}

In [16]:
xgb_cv = GridSearchCV(xgb_tuned, cv_params, cv = 3)

In [17]:
xgb_cv.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [50, 100], 'subsample': [0.6, 0.65, 0.7], 'colsample_bytree': [0.7], 'gamma': [0.01, 0.015, 0.02], 'objective': ['binary:logistic'], 'max_depth': [4, 5], 'min_child_weight': [2], 'nthread': [4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [18]:
xgb_cv.best_params_

{'colsample_bytree': 0.7,
 'gamma': 0.015,
 'max_depth': 5,
 'min_child_weight': 2,
 'n_estimators': 50,
 'nthread': 4,
 'objective': 'binary:logistic',
 'subsample': 0.65}

In [19]:
xgb_cv_pred_valid = xgb_cv.predict(X_valid)
confusion_matrix(y_valid, xgb_cv_pred_valid)

array([[91, 10],
       [18, 60]])