In [4]:
pip install xgboost

Collecting xgboostNote: you may need to restart the kernel to use updated packages.

  Downloading xgboost-2.0.3-py3-none-win_amd64.whl (99.8 MB)
Installing collected packages: xgboost
Successfully installed xgboost-2.0.3


In [5]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier

In [6]:
import warnings

warnings.filterwarnings('ignore')

import pandas as pd 
import numpy as np

from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

from mypipes import *

In [None]:
import os
from pathlib import Path

# Folder holding rg_train.csv / rg_test.csv. Defaults to the original author's
# download folder; set the RG_DATA_DIR environment variable to run elsewhere.
data_dir = Path(os.environ.get('RG_DATA_DIR', r'C:\Users\prera\Downloads'))

# Kept as plain strings, as before, so any later use of these names is unaffected.
train_file = str(data_dir / 'rg_train.csv')
test_file = str(data_dir / 'rg_test.csv')

bd_train = pd.read_csv(train_file)
bd_test = pd.read_csv(test_file)

In [None]:
# Names of every non-object column in the training frame.
num_vars = bd_train.select_dtypes(exclude=['object']).columns.tolist()

In [5]:
# Keep REF_NO and Revenue.Grid (the column y_train is built from) out of the predictors.
num_vars = [col for col in num_vars if col not in ('REF_NO', 'Revenue.Grid')]

In [6]:
# Names of every object-dtype column in the training frame.
cat_vars = bd_train.select_dtypes(include=['object']).columns.tolist()

In [7]:
# These columns are not dummy-encoded: age_band, family_income and children get
# their own pipelines further down; post_code and post_area are left out here.
cat_vars = [
    col for col in cat_vars
    if col not in ('children', 'age_band', 'post_code', 'post_area', 'family_income')
]

In [8]:
# Numeric branch of the feature pipeline: keep the columns listed in num_vars,
# then pass them through DataFrameImputer.
# NOTE(review): pdPipeline, VarSelector and DataFrameImputer are not defined in
# this notebook — presumably they come from `from mypipes import *`; confirm the
# imputation strategy there.
p1=pdPipeline([
    ('var_select',VarSelector(num_vars)),
    ('missing_trt',DataFrameImputer())
])

In [9]:
# Categorical branch: keep the columns listed in cat_vars, pass them through
# DataFrameImputer, then through get_dummies_Pipe.
# NOTE(review): the 70 passed to get_dummies_Pipe is presumably a minimum category
# frequency for creating a dummy column — confirm against mypipes.
p2=pdPipeline([
    ('var_select',VarSelector(cat_vars)),
    ('missing_trt',DataFrameImputer()),
    ('create_dummies',get_dummies_Pipe(70))
])

In [10]:
# Dedicated branches for three of the columns that were excluded from cat_vars.

# age_band -> custom_age_band -> DataFrameImputer.
# NOTE(review): the step is named 'custom_fico' although it handles age_band —
# looks like a name carried over from elsewhere; confirm nothing looks the step up
# by name before renaming it.
p3=pdPipeline([
    ('var_select',VarSelector(['age_band'])),
    ('custom_fico',custom_age_band()),
    ('missing_trt',DataFrameImputer())
])

# family_income -> custom_family_income -> DataFrameImputer
# (same 'custom_fico' step-name caveat as for p3).
p4=pdPipeline([
    ('var_select',VarSelector(['family_income'])),
    ('custom_fico',custom_family_income()),
    ('missing_trt',DataFrameImputer())
])

# children: string_clean is configured to replace 'Zero' with '0' and '4+' with '4',
# after which the column goes through convert_to_numeric and DataFrameImputer.
p5=pdPipeline([
    ('var_select',VarSelector(['children'])),
    ('string_clean1',string_clean(replace_it='Zero',replace_with='0')),
    ('string_clean2',string_clean(replace_it='4+',replace_with='4')),
    ('convert_to_numeric',convert_to_numeric()),
    ('missing_trt',DataFrameImputer())
])

In [11]:
# Combine the five branches into one transformer. FeatureUnion applies every branch
# to the same input and concatenates their outputs side by side.
data_pipe=FeatureUnion([
    ('num',p1),
    ('obj_to_dum',p2),
    ('age_band',p3),
    ('family_income',p4),
    ('children',p5)
])

In [12]:
# Fit the feature pipeline on the training data and wrap the result in a DataFrame
# labelled with the pipeline's feature names.
# NOTE(review): FeatureUnion.get_feature_names() was deprecated in scikit-learn 1.0
# and removed in 1.2 in favour of get_feature_names_out(); confirm the installed
# version and whether the mypipes transformers support the newer method.
x_train=pd.DataFrame(data=data_pipe.fit_transform(bd_train),
                     columns=data_pipe.get_feature_names())


In [13]:
# Apply the pipeline to the test data with transform() only (no refit), and label
# the columns with the pipeline's feature names.
# NOTE(review): FeatureUnion.get_feature_names() was deprecated in scikit-learn 1.0
# and removed in 1.2 in favour of get_feature_names_out(); confirm the installed
# version and whether the mypipes transformers support the newer method.
x_test=pd.DataFrame(data=data_pipe.transform(bd_test),
                     columns=data_pipe.get_feature_names())

In [14]:
# Binary target: 1 where Revenue.Grid equals 1, otherwise 0.
y_train = bd_train['Revenue.Grid'].eq(1).astype(int)

In [15]:
x_train.shape

(8124, 71)

In [None]:
# Candidate values for the randomized search over GradientBoostingClassifier.
# min_samples_split and min_samples_leaf (candidates [2, 5, 10, 20] each) are
# currently excluded from the search.
gbm_params = {
    'n_estimators': [50, 100, 200, 500, 700],
    'learning_rate': [0.01, 0.05, 0.1, 0.4, 0.8, 1],
    'max_depth': [1, 2, 3, 4, 5, 6],
    'subsample': [0.5, 0.8, 1],
    'max_features': [5, 10, 15, 20, 30, 45, 55, 65],
}


In [None]:
# Base estimator for the search. random_state is fixed because the searched
# subsample / max_features values make tree building stochastic; without a seed
# the same parameters can score differently from run to run.
gbm = GradientBoostingClassifier(random_state=42)

In [None]:
# Randomized search over gbm_params: 10 sampled candidates, each scored by
# 10-fold cross-validated ROC AUC, using all cores.
# random_state fixes which candidates are sampled, so a rerun evaluates the same
# parameter combinations.
random_search = RandomizedSearchCV(
    gbm,
    param_distributions=gbm_params,
    n_iter=10,
    cv=10,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=20,
    random_state=42,
)

In [None]:
random_search.fit(x_train,y_train)


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.05, loss='deviance', max_depth=5,
              max_features=20, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=700,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=0.5, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
              
Use the above result in class; it is the result of a previous run and can definitely differ on a rerun. Using it saves time in class, so that you don't have to wait for the randomized search to finish.

In [None]:
def report(results, n_top=3):
    """Print a summary of the best-ranked candidates of a hyper-parameter search.

    Parameters
    ----------
    results : mapping
        Search results indexed like the ``cv_results_`` of a fitted search
        object; must provide 'rank_test_score', 'mean_test_score',
        'std_test_score' and 'params'.
    n_top : int, default 3
        Highest rank to print. Candidates tied on a rank are all printed.
    """
    score_line = "Mean validation score: {0:.3f} (std: {1:.5f})"
    for rank in range(1, n_top + 1):
        # Positions of every candidate holding this rank (more than one on ties).
        tied = np.flatnonzero(results['rank_test_score'] == rank)
        for pos in tied:
            print("Model with rank: {0}".format(rank))
            print(score_line.format(results['mean_test_score'][pos],
                                    results['std_test_score'][pos]))
            print("Parameters: {0}".format(results['params'][pos]))
            print("")

In [None]:
report(random_search.cv_results_,5)

The top 5 classifiers from the previous run were as follows:

Model with rank: 1

Mean validation score: 0.925 (std: 0.00188)

Parameters: {'max_features': 20, 'max_depth': 3, 'subsample': 0.8, 'learning_rate': 0.4, 'n_estimators': 100}

~~~~~~~~~~

Model with rank: 2

Mean validation score: 0.924 (std: 0.00121)

Parameters: {'max_features': 15, 'max_depth': 4, 'subsample': 1, 'learning_rate': 0.4, 'n_estimators': 100}

~~~~~~~~~~

Model with rank: 3

Mean validation score: 0.923 (std: 0.00250)

Parameters: {'max_features': 5, 'max_depth': 4, 'subsample': 0.5, 'learning_rate': 0.05, 'n_estimators': 500}

~~~~~~~~~~

Model with rank: 4

Mean validation score: 0.914 (std: 0.00290)

Parameters: {'max_features': 10, 'max_depth': 5, 'subsample': 1, 'learning_rate': 0.05, 'n_estimators': 50}

~~~~~~~~~~

Model with rank: 5

Mean validation score: 0.913 (std: 0.00174)

Parameters: {'max_features': 30, 'max_depth': 5, 'subsample': 0.8, 'learning_rate': 0.4, 'n_estimators': 200}

Tentative performance: 0.925 for the best classifier.

**Note: you can call `predict` / `predict_proba` directly on the fitted random search object to make predictions, because `RandomizedSearchCV` automatically refits the best candidate on the complete training data. If you want to look into feature importances etc., fit the best estimator separately.**

# Xgboost

In [None]:
# Search space for the all-at-once randomized search over XGBClassifier.
xgb_params = {
    "learning_rate": [0.01, 0.05, 0.1, 0.3, 0.5],
    "gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
    "max_depth": [2, 3, 4, 5, 6, 7, 8],
    "min_child_weight": [1, 2, 5, 10],
    "max_delta_step": [0, 1, 2, 5, 10],
    "subsample": [0.5, 0.6, 0.7, 0.8, 0.9],
    "colsample_bytree": [0.5, 0.6, 0.7, 0.8, 0.9],
    "colsample_bylevel": [0.5, 0.6, 0.7, 0.8, 0.9],
    "reg_lambda": [1e-5, 1e-2, 0.1, 1, 100],
    "reg_alpha": [1e-5, 1e-2, 0.1, 1, 100],
    "scale_pos_weight": [1, 2, 3, 4, 5, 6, 7, 8, 9],
    "n_estimators": [100, 500, 700, 1000],
}


In [None]:
import math

# Number of distinct combinations in the full xgb_params grid, derived from the
# dict itself so it stays correct when a candidate list changes
# (393,750,000 for the lists defined above).
math.prod(len(candidates) for candidates in xgb_params.values())

In [None]:
xgb=XGBClassifier(objective='binary:logistic')

In [None]:
# Number of parameter combinations sampled from xgb_params.
n_iter = 10

# Randomized search over the XGBoost space, scored by 10-fold cross-validated
# ROC AUC on all cores. random_state fixes which candidates are sampled, so a
# rerun evaluates the same parameter combinations.
random_search = RandomizedSearchCV(
    xgb,
    param_distributions=xgb_params,
    n_iter=n_iter,
    cv=10,
    scoring='roc_auc',
    n_jobs=-1,
    random_state=42,
)

In [None]:
random_search.fit(x_train,y_train)

In [None]:
report(random_search.cv_results_,5)

The top 5 classifiers from the previous run:

Model with rank: 1

Mean validation score: 0.928 (std: 0.00232)

Parameters: {'reg_lambda': 1e-05, 'subsample': 0.9, 'reg_alpha': 1, 'max_depth': 8, 'min_child_weight': 10, 'n_estimators': 1000, 'gamma': 0.1, 'colsample_bylevel': 0.8, 'scale_pos_weight': 2, 'colsample_bytree': 0.5, 'learning_rate': 0.01, 'max_delta_step': 10}

____

Model with rank: 2

Mean validation score: 0.927 (std: 0.00160)

Parameters: {'reg_lambda': 1, 'subsample': 0.6, 'reg_alpha': 0.1, 'max_depth': 4, 'min_child_weight': 2, 'n_estimators': 1000, 'gamma': 0.3, 'colsample_bylevel': 0.9, 'scale_pos_weight': 2, 'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_delta_step': 5}

____

Model with rank: 3

Mean validation score: 0.926 (std: 0.00101)

Parameters: {'reg_lambda': 0.1, 'subsample': 0.7, 'reg_alpha': 1e-05, 'max_depth': 5, 'min_child_weight': 5, 'n_estimators': 500, 'gamma': 0.2, 'colsample_bylevel': 0.5, 'scale_pos_weight': 3, 'colsample_bytree': 0.9, 'learning_rate': 0.1, 'max_delta_step': 1}

____

Model with rank: 4

Mean validation score: 0.925 (std: 0.00104)

Parameters: {'reg_lambda': 0.1, 'subsample': 0.9, 'reg_alpha': 0.01, 'max_depth': 6, 'min_child_weight': 2, 'n_estimators': 1000, 'gamma': 0.2, 'colsample_bylevel': 0.8, 'scale_pos_weight': 8, 'colsample_bytree': 0.5, 'learning_rate': 0.05, 'max_delta_step': 0}

____

Model with rank: 5

Mean validation score: 0.920 (std: 0.00278)

Parameters: {'reg_lambda': 1, 'subsample': 0.8, 'reg_alpha': 0.1, 'max_depth': 5, 'min_child_weight': 5, 'n_estimators': 500, 'gamma': 0.0, 'colsample_bylevel': 0.5, 'scale_pos_weight': 8, 'colsample_bytree': 0.5, 'learning_rate': 0.01, 'max_delta_step': 5}

____

tentative performance of best estimator : 0.928


In [None]:
random_search.best_estimator_

best estimator from the previous run can be copied from here :

XGBClassifier(base_score=0.5, colsample_bylevel=0.8, colsample_bytree=0.5,
       gamma=0.1, learning_rate=0.01, max_delta_step=10, max_depth=8,
       min_child_weight=10, missing=None, n_estimators=1000, nthread=-1,
       objective='binary:logistic', reg_alpha=1, reg_lambda=1e-05,
       scale_pos_weight=2, seed=0, silent=True, subsample=0.9)

**Note: you can call `predict` / `predict_proba` directly on the fitted random search object to make predictions, because `RandomizedSearchCV` automatically refits the best candidate on the complete training data. If you want to look into feature importances etc., fit the best estimator separately.**

## Sequential Parameter tuning for xgboost

If we tune all the parameters together, there is a good chance that our results will be far from the best. There are many parameters whose variation doesn't impact performance much, and we can tune them later, once we have fixed the values of the parameters to which performance is most sensitive.

As a general strategy, you can start by tuning the number of trees, i.e. n_estimators. In boosting machines, learning_rate is directly related to n_estimators: a very low learning_rate needs a high n_estimators. We can start with a decent fixed learning rate and tune n_estimators for it.

Everything else can be left at its default for now, except subsample, colsample_bytree and colsample_bylevel: these default to 1, and we'll use a more conservative value of 0.8.

In [None]:
XGBClassifier?

In [16]:
# Stage 1 of the sequential tuning: only the number of trees is searched.
xgb_params = {"n_estimators": [100, 500, 700, 900, 1000, 1200, 1500]}

In [17]:
# Stage 1 base model. learning_rate is pinned to 0.1 explicitly: the later stages
# (xgb2 onward) and the narrative assume 0.1, while the library default is not the
# same across xgboost releases, so relying on it would tune n_estimators for a
# different learning rate than the one used afterwards.
xgb1 = XGBClassifier(
    learning_rate=0.1,
    subsample=0.8,
    colsample_bylevel=0.8,
    colsample_bytree=0.8,
)

In [18]:
import os

# Sets KMP_DUPLICATE_LIB_OK for this process.
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime" abort seen
# with xgboost on some setups; it only silences the check, so confirm it is still
# needed — and note it is set after xgboost has already been imported above.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [19]:
from sklearn.model_selection import GridSearchCV

In [20]:
# Exhaustive search over the n_estimators candidates in xgb_params, scored by
# 10-fold cross-validated ROC AUC. n_jobs is not passed here, unlike the
# randomized searches elsewhere in this notebook.
grid_search=GridSearchCV(xgb1,cv=10,
                         param_grid=xgb_params,
                         scoring='roc_auc',
                         verbose=20)

# Two known issues: (1) xgboost is currently not running on multiple cores here;
# (2) a Mac-specific issue — no details recorded (TODO: document it).

In [None]:
grid_search.fit(x_train,y_train)

In [None]:
report(grid_search.cv_results_,3)

We got n_estimators=500 as the best value with learning_rate=0.1. Next we'll tune max_depth, gamma and min_child_weight, which control overfitting by limiting the size of individual trees.

In [None]:
# Stage 2: parameters that limit the size of individual trees.
xgb_params = {
    "gamma": [0, 2, 5, 8, 10],
    "max_depth": [2, 3, 4, 5, 6, 7, 8],
    "min_child_weight": [0.5, 1, 2, 5, 10],
}

In [None]:
# Stage 2 base model: learning rate and tree count are fixed; the tree-size
# parameters are left for the search to fill in.
xgb2 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=500,
    subsample=0.8,
    colsample_bylevel=0.8,
    colsample_bytree=0.8,
)

In [None]:
# Randomized search over the tree-size parameters: 20 sampled candidates, each
# scored by 5-fold cross-validated ROC AUC on all cores. random_state fixes which
# candidates are sampled, so a rerun evaluates the same parameter combinations.
random_search = RandomizedSearchCV(
    xgb2,
    param_distributions=xgb_params,
    n_iter=20,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

In [None]:
random_search.fit(x_train,y_train)

In [None]:
report(random_search.cv_results_,3)

we got best values for parameters being tuned as follows : {'min_child_weight': 1, 'gamma': 0, 'max_depth': 3}

Since there is imbalance in the data , we'll look into max_delta_step and scale_pos_weight next

In [None]:
y_train.value_counts()

In [None]:
# Ratio of negative (0) to positive (1) rows, computed from y_train itself instead
# of hand-typed counts, so it always reflects the data actually loaded. It gives a
# rough idea of the scale_pos_weight range worth searching.
class_counts = y_train.value_counts()
class_counts.loc[0] / class_counts.loc[1]

In [None]:
# Stage 3: parameters aimed at class imbalance.
xgb_params = {
    'max_delta_step': [0, 1, 3, 6, 10],
    'scale_pos_weight': [1, 2, 3, 4],
}

In [None]:
# Stage 3 base model: carries the tree-size values chosen in the previous stage
# (min_child_weight=1, gamma=0, max_depth=3).
xgb3 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=500,
    min_child_weight=1,
    gamma=0,
    max_depth=3,
    subsample=0.8,
    colsample_bylevel=0.8,
    colsample_bytree=0.8,
)

In [None]:
# Exhaustive search over the imbalance parameters, scored by 5-fold
# cross-validated ROC AUC on all cores.
grid_search = GridSearchCV(
    estimator=xgb3,
    param_grid=xgb_params,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=10,
)

In [None]:
grid_search.fit(x_train,y_train)

In [None]:
report(grid_search.cv_results_,3)

It turns out that, since the imbalance was not that severe, the defaults come out as the best choices: {'scale_pos_weight': 1, 'max_delta_step': 0}

Next we check the effect of noise in the data and tune subsample, colsample_bytree and colsample_bylevel.

In [None]:
# Stage 4: row and column sampling fractions, 0.5 to 1.0 in steps of 0.1.
xgb_params = {
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bylevel': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
}

In [None]:
# Stage 4 base model: imbalance parameters fixed at the values chosen in the
# previous stage; the sampling fractions are not set here because the search
# supplies them.
xgb4 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=500,
    min_child_weight=1,
    gamma=0,
    max_depth=3,
    scale_pos_weight=1,
    max_delta_step=0,
)

In [None]:
# Randomized search over the sampling fractions: 20 sampled candidates, each
# scored by 5-fold cross-validated ROC AUC on all cores. random_state fixes which
# candidates are sampled, so a rerun evaluates the same parameter combinations.
random_search = RandomizedSearchCV(
    xgb4,
    param_distributions=xgb_params,
    n_iter=20,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=20,
    random_state=42,
)

In [None]:
random_search.fit(x_train,y_train)

In [None]:
report(random_search.cv_results_,3)

The best values that we got for the parameters are as follows: {'colsample_bylevel': 0.5, 'colsample_bytree': 0.6, 'subsample': 1.0}

Lastly, we can work on the L2 and L1 penalties on the leaf node scores to further reduce overfitting, if there is any.

In [None]:
# Stage 5 base model: adds the sampling fractions chosen in the previous stage;
# the regularisation penalties are left for the search.
xgb5 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=500,
    min_child_weight=1,
    gamma=0,
    max_depth=3,
    scale_pos_weight=1,
    max_delta_step=0,
    colsample_bylevel=0.5,
    colsample_bytree=0.6,
    subsample=1.0,
)

In [None]:
# Stage 5: L2 (reg_lambda) and L1 (reg_alpha) penalties, 0.0 to 4.9 in steps of 0.1.
xgb_params = {
    penalty: [step / 10 for step in range(50)]
    for penalty in ('reg_lambda', 'reg_alpha')
}

In [None]:
# Randomized search over the regularisation penalties: 20 sampled candidates,
# each scored by 5-fold cross-validated ROC AUC on all cores. random_state fixes
# which candidates are sampled, so a rerun evaluates the same combinations.
random_search = RandomizedSearchCV(
    xgb5,
    param_distributions=xgb_params,
    n_iter=20,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=10,
    random_state=42,
)

In [None]:
random_search.fit(x_train,y_train)

In [None]:
report(random_search.cv_results_,3)

The best values that we got here are {'reg_lambda': 1.5, 'reg_alpha': 0.0}, but the performance has gone down. Maybe the defaults were doing better and weren't picked as one of the candidates in this random search. We'll go with the default values instead.

In [None]:
# Final model: every value chosen in the earlier stages, with the regularisation
# penalties set to reg_lambda=1 and reg_alpha=0 rather than the searched values.
xgb6 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=500,
    min_child_weight=1,
    gamma=0,
    max_depth=3,
    scale_pos_weight=1,
    max_delta_step=0,
    colsample_bylevel=0.5,
    colsample_bytree=0.6,
    subsample=1.0,
    reg_lambda=1,
    reg_alpha=0,
)

If we simply want the cross-validated performance of a model, without having to select any parameters, we can use the `cross_val_score` function.

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
cross_val_score(xgb6,x_train,y_train,scoring='roc_auc',verbose=10,n_jobs=-1,cv=10)

In [None]:
# Ten scores copied from an earlier iteration (presumably the output of the
# 10-fold cross_val_score call); they need not match your current run.
scores = [
    0.92951477,
    0.92590096,
    0.93070889,
    0.92176974,
    0.92882013,
    0.93128318,
    0.93018259,
    0.93297173,
    0.93256565,
    0.92947388,
]

In [None]:
import numpy as np

In [None]:
np.mean(scores)

In [None]:
np.std(scores)