# XGB TUNING

In [1]:
import pandas as pd
pd.options.display.max_columns = 200
pd.options.display.max_rows = 200

X_train = pd.read_csv('train_features.csv')
X_test = pd.read_csv('test_features.csv')
y_train = pd.read_csv('train_labels.csv')['charged_off']
sample_submission = pd.read_csv('sample_submission.csv')

X_train.shape, X_test.shape, y_train.shape

((37745, 103), (9437, 103), (37745,))

In [2]:
def wrangle(X):
    X = X.copy()
    
    # Drop some columns
    X = X.drop(columns='id')  # id is random
    X = X.drop(columns=['member_id', 'url', 'desc'])  # All null
    X = X.drop(columns='title')  # Duplicative of purpose
    X = X.drop(columns='grade')  # Duplicative of sub_grade
    
    # Transform sub_grade from "A1" - "G5" to 1.1 - 7.5
    def wrangle_sub_grade(x):
        first_digit = ord(x[0]) - 64
        second_digit = int(x[1])
        return first_digit + second_digit/10
    
    X['sub_grade'] = X['sub_grade'].apply(wrangle_sub_grade)

    # Convert percentages from strings to floats
    X['int_rate'] = X['int_rate'].str.strip('%').astype(float)
    X['revol_util'] = X['revol_util'].str.strip('%').astype(float)
        
    # Transform earliest_cr_line to an integer: how many days it's been open
    X['earliest_cr_line'] = pd.to_datetime(X['earliest_cr_line'], infer_datetime_format=True)
    X['earliest_cr_line'] = pd.Timestamp.today() - X['earliest_cr_line']
    X['earliest_cr_line'] = X['earliest_cr_line'].dt.days
    
    # Create features for three employee titles: teacher, manager, owner
    X['emp_title'] = X['emp_title'].str.lower()
    X['emp_title_teacher'] = X['emp_title'].str.contains('teacher', na=False)
    X['emp_title_manager'] = X['emp_title'].str.contains('manager', na=False)
    X['emp_title_owner']   = X['emp_title'].str.contains('owner', na=False)
    
    # Drop categoricals with high cardinality
    X = X.drop(columns=['emp_title', 'zip_code'])
    
    # Transform features with many nulls to binary flags
    many_nulls = ['sec_app_mths_since_last_major_derog',
                  'sec_app_revol_util',
                  'sec_app_earliest_cr_line',
                  'sec_app_mort_acc',
                  'dti_joint',
                  'sec_app_collections_12_mths_ex_med',
                  'sec_app_chargeoff_within_12_mths',
                  'sec_app_num_rev_accts',
                  'sec_app_open_act_il',
                  'sec_app_open_acc',
                  'revol_bal_joint',
                  'annual_inc_joint',
                  'sec_app_inq_last_6mths',
                  'mths_since_last_record',
                  'mths_since_recent_bc_dlq',
                  'mths_since_last_major_derog',
                  'mths_since_recent_revol_delinq',
                  'mths_since_last_delinq',
                  'il_util',
                  'emp_length',
                  'mths_since_recent_inq',
                  'mo_sin_old_il_acct',
                  'mths_since_rcnt_il',
                  'num_tl_120dpd_2m',
                  'bc_util',
                  'percent_bc_gt_75',
                  'bc_open_to_buy',
                  'mths_since_recent_bc']

    for col in many_nulls:
        X[col] = X[col].isnull()
    
    # For features with few nulls, do mean imputation
    for col in X:
        if X[col].isnull().sum() > 0:
            X[col] = X[col].fillna(X[col].mean())
    
    # Return the wrangled dataframe
    return X


X_train = wrangle(X_train)
X_test  = wrangle(X_test)
X_train.shape, X_test.shape

((37745, 98), (9437, 98))

In [3]:
def make_predictions(pipeline, X_train, y_train, X_test, name):
    pipeline.fit(X_train, y_train)
    sample_submission = pd.read_csv('sample_submission.csv') 
    submission = sample_submission.copy() 
    submission['charged_off'] = pipeline.predict_proba(X_test)[:, 1] 
    submission.to_csv( name + '.csv', index=False)

In [4]:
import category_encoders as ce
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score

import xgboost as xgb
from xgboost.sklearn import XGBClassifier

In [34]:
xgb_pipe = make_pipeline(
    ce.BinaryEncoder(), 
    XGBClassifier(
        learning_rate = 0.1, 
        min_child_weight = 5,
        max_depth = 4,
        gamma=0.2,
        max_delta_step = 0,
        subsample = 1,
        colsample_bytree = 0.4,
        colsample_bylevel = 1,
        n_estimators=200,
        #lambda = 1,
        alpha = 0,
        scale_pos_weight = 1,
        eval_metric = "auc",
        xgbclassifier__silent = False,
        n_jobs=-1)
)

cross_val_score(xgb_pipe, X_train, y_train, cv=5, scoring='roc_auc', verbose=10)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] ....................... , score=0.7501976940977215, total=   3.4s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.6s remaining:    0.0s


[CV] ....................... , score=0.7524407147145283, total=   2.9s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    6.5s remaining:    0.0s


[CV] ........................ , score=0.752173120387528, total=   2.7s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    9.2s remaining:    0.0s


[CV] ....................... , score=0.7499339402719396, total=   3.0s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   12.2s remaining:    0.0s


[CV] ....................... , score=0.7536110201534246, total=   2.9s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   15.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   15.1s finished


array([0.75019769, 0.75244071, 0.75217312, 0.74993394, 0.75361102])

In [35]:
make_predictions(xgb_pipe, X_train, y_train, X_test, "xgb2")

In [28]:
from sklearn.model_selection import GridSearchCV
sorted(xgb_pipe.get_params().keys())


['binaryencoder',
 'binaryencoder__cols',
 'binaryencoder__drop_invariant',
 'binaryencoder__handle_unknown',
 'binaryencoder__impute_missing',
 'binaryencoder__return_df',
 'binaryencoder__verbose',
 'memory',
 'steps',
 'xgbclassifier',
 'xgbclassifier__alpha',
 'xgbclassifier__base_score',
 'xgbclassifier__booster',
 'xgbclassifier__colsample_bylevel',
 'xgbclassifier__colsample_bytree',
 'xgbclassifier__eval_metric',
 'xgbclassifier__gamma',
 'xgbclassifier__learning_rate',
 'xgbclassifier__max_delta_step',
 'xgbclassifier__max_depth',
 'xgbclassifier__min_child_weight',
 'xgbclassifier__missing',
 'xgbclassifier__n_estimators',
 'xgbclassifier__n_jobs',
 'xgbclassifier__nthread',
 'xgbclassifier__objective',
 'xgbclassifier__random_state',
 'xgbclassifier__reg_alpha',
 'xgbclassifier__reg_lambda',
 'xgbclassifier__scale_pos_weight',
 'xgbclassifier__seed',
 'xgbclassifier__silent',
 'xgbclassifier__subsample',
 'xgbclassifier__xgbclassifier__silent']

In [29]:
param_grid = {
    "xgbclassifier__max_depth"        : [4,],
    "xgbclassifier__min_child_weight" : [5],
    "xgbclassifier__gamma"            : [0.2,],
    "xgbclassifier__colsample_bytree" : [0.4,],
}

In [30]:
gs = GridSearchCV(
                xgb_pipe,
                param_grid=param_grid, cv=3, 
                scoring='roc_auc',
                verbose= 10)

In [31]:
gs.fit(X_train, y_train)

Fitting 3 folds for each of 240 candidates, totalling 720 fits
[CV] xgbclassifier__colsample_bytree=0.3, xgbclassifier__gamma=0.0, xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=1 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  xgbclassifier__colsample_bytree=0.3, xgbclassifier__gamma=0.0, xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=1, score=0.7483232960038064, total=   1.7s
[CV] xgbclassifier__colsample_bytree=0.3, xgbclassifier__gamma=0.0, xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=1 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.0s remaining:    0.0s


[CV]  xgbclassifier__colsample_bytree=0.3, xgbclassifier__gamma=0.0, xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=1, score=0.7386890187566431, total=   1.8s
[CV] xgbclassifier__colsample_bytree=0.3, xgbclassifier__gamma=0.0, xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=1 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    4.2s remaining:    0.0s


[CV]  xgbclassifier__colsample_bytree=0.3, xgbclassifier__gamma=0.0, xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=1, score=0.7457447499475107, total=   2.3s
[CV] xgbclassifier__colsample_bytree=0.3, xgbclassifier__gamma=0.0, xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=3 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    6.9s remaining:    0.0s


[CV]  xgbclassifier__colsample_bytree=0.3, xgbclassifier__gamma=0.0, xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=3, score=0.7476974472738296, total=   2.1s
[CV] xgbclassifier__colsample_bytree=0.3, xgbclassifier__gamma=0.0, xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=3 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    9.3s remaining:    0.0s


[CV]  xgbclassifier__colsample_bytree=0.3, xgbclassifier__gamma=0.0, xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=3, score=0.7397514151943257, total=   2.3s
[CV] xgbclassifier__colsample_bytree=0.3, xgbclassifier__gamma=0.0, xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=3 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   12.1s remaining:    0.0s


[CV]  xgbclassifier__colsample_bytree=0.3, xgbclassifier__gamma=0.0, xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=3, score=0.7468069198131233, total=   1.9s
[CV] xgbclassifier__colsample_bytree=0.3, xgbclassifier__gamma=0.0, xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=5 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   14.3s remaining:    0.0s


[CV]  xgbclassifier__colsample_bytree=0.3, xgbclassifier__gamma=0.0, xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=5, score=0.7487675741322299, total=   1.9s
[CV] xgbclassifier__colsample_bytree=0.3, xgbclassifier__gamma=0.0, xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=5 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   16.6s remaining:    0.0s


[CV]  xgbclassifier__colsample_bytree=0.3, xgbclassifier__gamma=0.0, xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=5, score=0.7395814106175701, total=   1.8s
[CV] xgbclassifier__colsample_bytree=0.3, xgbclassifier__gamma=0.0, xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=5 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   18.8s remaining:    0.0s


[CV]  xgbclassifier__colsample_bytree=0.3, xgbclassifier__gamma=0.0, xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=5, score=0.7467157623168353, total=   1.9s
[CV] xgbclassifier__colsample_bytree=0.3, xgbclassifier__gamma=0.0, xgbclassifier__max_depth=4, xgbclassifier__min_child_weight=1 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   21.0s remaining:    0.0s


[CV]  xgbclassifier__colsample_bytree=0.3, xgbclassifier__gamma=0.0, xgbclassifier__max_depth=4, xgbclassifier__min_child_weight=1, score=0.7533869194294481, total=   2.2s
[CV] xgbclassifier__colsample_bytree=0.3, xgbclassifier__gamma=0.0, xgbclassifier__max_depth=4, xgbclassifier__min_child_weight=1 
[CV]  xgbclassifier__colsample_bytree=0.3, xgbclassifier__gamma=0.0, xgbclassifier__max_depth=4, xgbclassifier__min_child_weight=1, score=0.740993936427905, total=   2.2s
[CV] xgbclassifier__colsample_bytree=0.3, xgbclassifier__gamma=0.0, xgbclassifier__max_depth=4, xgbclassifier__min_child_weight=1 
[CV]  xgbclassifier__colsample_bytree=0.3, xgbclassifier__gamma=0.0, xgbclassifier__max_depth=4, xgbclassifier__min_child_weight=1, score=0.7464295010933361, total=   2.0s
[CV] xgbclassifier__colsample_bytree=0.3, xgbclassifier__gamma=0.0, xgbclassifier__max_depth=4, xgbclassifier__min_child_weight=3 
[CV]  xgbclassifier__colsample_bytree=0.3, xgbclassifier__gamma=0.0, xgbclassifier__max_dept

[Parallel(n_jobs=1)]: Done 720 out of 720 | elapsed: 50.1min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('binaryencoder', BinaryEncoder(cols=None, drop_invariant=False, handle_unknown='impute',
       impute_missing=True, return_df=True, verbose=0)), ('xgbclassifier', XGBClassifier(alpha=0, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, eval_metric='auc', gamm...da=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1, xgbclassifier__silent=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'xgbclassifier__max_depth': [3, 4, 5, 6, 8], 'xgbclassifier__min_child_weight': [1, 3, 5], 'xgbclassifier__gamma': [0.0, 0.1, 0.2, 10], 'xgbclassifier__colsample_bytree': [0.3, 0.4, 0.5, 0.7]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=10)

In [32]:
validation_score = gs.best_score_
print()
print('Cross-Validation Score:', -validation_score)
print()
print('Best estimator:', gs.best_estimator_)
print()
test_score = gs.score(X_train, y_train)
print('Test Score:', test_score)


Cross-Validation Score: -0.750975132111945

Best estimator: Pipeline(memory=None,
     steps=[('binaryencoder', BinaryEncoder(cols=['term', 'home_ownership', 'purpose', 'addr_state', 'initial_list_status', 'application_type', 'disbursement_method'],
       drop_invariant=False, handle_unknown='impute', impute_missing=True,
       return_df=True, verbose=0)), ('xgbclassifier', XGBClass...da=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1, xgbclassifier__silent=False))])

Test Score: 0.8244888995124718


In [33]:
pd.DataFrame(gs.cv_results_).sort_values(by='rank_test_score').head(50)



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_xgbclassifier__colsample_bytree,param_xgbclassifier__gamma,param_xgbclassifier__max_depth,param_xgbclassifier__min_child_weight,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
95,2.145797,0.011115,0.234217,0.005814,0.4,0.2,4,5,"{'xgbclassifier__colsample_bytree': 0.4, 'xgbc...",0.754876,0.743713,0.754336,0.750975,0.005139,1,0.844912,0.851157,0.841278,0.845782,0.00408
8,2.199348,0.052663,0.232093,0.001122,0.3,0.0,5,5,"{'xgbclassifier__colsample_bytree': 0.3, 'xgbc...",0.754679,0.742674,0.754671,0.750675,0.005658,2,0.883429,0.8908,0.887325,0.887185,0.003011
83,2.508449,0.042052,0.229943,0.000197,0.4,0.1,5,5,"{'xgbclassifier__colsample_bytree': 0.4, 'xgbc...",0.752212,0.744748,0.754684,0.750548,0.004224,3,0.88858,0.89546,0.890444,0.891494,0.002905
38,2.143545,0.059622,0.242335,0.017971,0.3,0.2,5,5,"{'xgbclassifier__colsample_bytree': 0.3, 'xgbc...",0.75526,0.743038,0.753326,0.750541,0.005364,4,0.885629,0.890939,0.884504,0.887024,0.002806
128,4.063176,1.392295,0.239713,0.011024,0.5,0.0,5,5,"{'xgbclassifier__colsample_bytree': 0.5, 'xgbc...",0.757392,0.741651,0.752171,0.750405,0.006546,5,0.895746,0.899036,0.892337,0.895706,0.002735
140,2.89983,0.490635,0.239523,0.006524,0.5,0.1,4,5,"{'xgbclassifier__colsample_bytree': 0.5, 'xgbc...",0.756345,0.745795,0.74854,0.750227,0.004469,6,0.84857,0.851542,0.845244,0.848452,0.002573
68,2.494793,0.040825,0.235597,0.004215,0.4,0.0,5,5,"{'xgbclassifier__colsample_bytree': 0.4, 'xgbc...",0.75298,0.74244,0.7552,0.750207,0.005566,7,0.887192,0.897799,0.890017,0.891669,0.004485
23,2.191853,0.041788,0.231084,0.001114,0.3,0.1,5,5,"{'xgbclassifier__colsample_bytree': 0.3, 'xgbc...",0.75264,0.744277,0.753369,0.750096,0.004125,8,0.884275,0.893402,0.885874,0.887851,0.003979
37,2.17177,0.034581,0.231304,0.001593,0.3,0.2,5,3,"{'xgbclassifier__colsample_bytree': 0.3, 'xgbc...",0.754136,0.744008,0.752122,0.750089,0.004378,9,0.89317,0.898878,0.886813,0.892954,0.004928
125,2.974656,0.575371,0.246422,0.004832,0.5,0.0,4,5,"{'xgbclassifier__colsample_bytree': 0.5, 'xgbc...",0.755218,0.745789,0.748905,0.749971,0.003922,10,0.847857,0.851546,0.846688,0.848697,0.00207
