## Import modules

In [70]:
import time
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import seaborn
import matplotlib.pyplot as plt

import xgboost as xgb
from xgboost import XGBClassifier # this is an sklearn wrapper for XGBoost. This allows us to use sklearn’s Grid Search with parallel processing in the same way we did for GBM
from sklearn import cross_validation, metrics
from sklearn.cross_validation import train_test_split

import lime

%xmode Verbose
%matplotlib inline

Exception reporting mode: Verbose


## Import dataset

In [71]:
filename='./data/20180319_L1 Simple  Dataset_v2.csv'
df=pd.read_csv(train_filename,header=0,delimiter=";",decimal="," ,low_memory=False)
df.head(3)

Unnamed: 0,id_pres,Badness30D,Prior_delinquency,L0_Score,L0_Extensions,Equifax_L1,Equifax_L0,Equifax_Diff,FirstL1_Time_Density,L1_Cais,...,Repayment_to_limit,Credit_Utilization,PriorL1_declines,L0_Interest_Paid,L0_cuota_pres,L0_cuota_pres_original,total_amount,slider_days_modified,slider_amount_modified,credit_limit
0,996867,1,-11,0.321284,0,4,4,0.0,0,0.0,...,0.987133,0.666667,4,1.0,5.0,5.0,296.14,1,1,300
1,1000948,0,-8,0.236901,0,4,4,0.0,14,0.0,...,0.979,0.666667,2,1.0,5.0,5.0,293.7,1,1,300
2,1002036,0,-3,0.33181,0,4,4,0.0,29,0.0,...,1.4317,1.0,1,1.0,5.0,5.0,429.51,1,1,300


## Set target and features

In [72]:
col_id              = 'id_pres'
col_target          = 'Badness30D'
col_features = [x for x in df.columns if x not in [col_id,col_target]]
col_features

['Prior_delinquency',
 'L0_Score',
 'L0_Extensions',
 'Equifax_L1',
 'Equifax_L0',
 'Equifax_Diff',
 'FirstL1_Time_Density',
 'L1_Cais',
 'L0_Cais',
 'Cais_Diff',
 'CurrentL1_Time_Density',
 'Repayment_to_limit',
 'Credit_Utilization',
 'PriorL1_declines',
 'L0_Interest_Paid',
 'L0_cuota_pres',
 'L0_cuota_pres_original',
 'total_amount',
 'slider_days_modified',
 'slider_amount_modified',
 'credit_limit']

## Split target and features

In [73]:
Y = df[col_target]
X = df[col_features]
Y.astype('int32')
print(Y.shape)
print(Y.head(3))
print(X.shape)
print(X.head(3))

(2417,)
0    1
1    0
2    0
Name: Badness30D, dtype: int64
(2417, 21)
   Prior_delinquency  L0_Score  L0_Extensions  Equifax_L1  Equifax_L0  \
0                -11  0.321284              0           4           4   
1                 -8  0.236901              0           4           4   
2                 -3  0.331810              0           4           4   

   Equifax_Diff  FirstL1_Time_Density  L1_Cais  L0_Cais  Cais_Diff  \
0           0.0                     0      0.0      0.0        0.0   
1           0.0                    14      0.0      0.0        0.0   
2           0.0                    29      0.0      0.0        0.0   

       ...       Repayment_to_limit  Credit_Utilization  PriorL1_declines  \
0      ...                 0.987133            0.666667                 4   
1      ...                 0.979000            0.666667                 2   
2      ...                 1.431700            1.000000                 1   

   L0_Interest_Paid  L0_cuota_pres  L0_cuota_p

## Split training set and test set

In [74]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2,shuffle=True)
print(X_train.shape)
print(X_train.head(3))
print(X_test.shape)
print(X_test.head(3))

(1933, 21)
     Prior_delinquency  L0_Score  L0_Extensions  Equifax_L1  Equifax_L0  \
347                  1  0.254186              0           3           3   
882                -13  0.249980              0           3           1   
470                 -3  0.436098              0           6           5   

     Equifax_Diff  FirstL1_Time_Density  L1_Cais  L0_Cais  Cais_Diff  \
347           0.0                     1      0.0      0.0        0.0   
882          -2.0                     7      0.0      0.0        0.0   
470          -1.0                     0      1.0      1.0        0.0   

         ...       Repayment_to_limit  Credit_Utilization  PriorL1_declines  \
347      ...                 1.018000            0.866667                 3   
882      ...                 1.138222            0.833333                 1   
470      ...                 0.767867            0.600000                 4   

     L0_Interest_Paid  L0_cuota_pres  L0_cuota_pres_original  total_amount  \
347 

In [75]:
start_time = time.time()

## Preprocessing features

## XGBOOST

In [76]:
# declare classifier wrapper
clf = XGBClassifier(
    booster = 'gbtree',
    learning_rate =0.01,
    n_estimators=3000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.7,
    colsample_bytree=0.7,
    objective= 'binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=99)

# retrieve params
xgb_param = clf.get_xgb_params()

In [77]:
xgtrain = xgb.DMatrix(X_train.as_matrix(), label=Y_train.as_matrix())

In [78]:
xgtest  = xgb.DMatrix(X_test.as_matrix())

In [79]:
cv_folds = 5
early_stopping_rounds = 50

In [80]:
print ('\nInitializing cross-validation...')
cvresult = xgb.cv(
    xgb_param,
    xgtrain,
    num_boost_round=clf.get_params()['n_estimators'],
    nfold=cv_folds,
    metrics='auc',
    early_stopping_rounds=early_stopping_rounds,
    verbose_eval=1)


Initializing cross-validation...
[0]	train-auc:0.692228+0.00852824	test-auc:0.613404+0.0362906
[1]	train-auc:0.724028+0.0139175	test-auc:0.633972+0.0268988
[2]	train-auc:0.74495+0.0070664	test-auc:0.639744+0.01752
[3]	train-auc:0.76278+0.00449822	test-auc:0.642737+0.0143473
[4]	train-auc:0.777723+0.00349305	test-auc:0.63966+0.0160836
[5]	train-auc:0.78642+0.00427298	test-auc:0.636816+0.0171957
[6]	train-auc:0.791989+0.00898467	test-auc:0.636426+0.0190909
[7]	train-auc:0.796336+0.008977	test-auc:0.640053+0.0187315
[8]	train-auc:0.800661+0.00619043	test-auc:0.636897+0.0187432
[9]	train-auc:0.803684+0.00874536	test-auc:0.636961+0.0159168
[10]	train-auc:0.806148+0.00763781	test-auc:0.634276+0.0171151
[11]	train-auc:0.806446+0.00750605	test-auc:0.638413+0.0193364
[12]	train-auc:0.807502+0.00633099	test-auc:0.637372+0.0202854
[13]	train-auc:0.810356+0.0066155	test-auc:0.6403+0.0209711
[14]	train-auc:0.811318+0.00722182	test-auc:0.639789+0.0191228
[15]	train-auc:0.812652+0.00733155	test-auc:

[131]	train-auc:0.861574+0.0083148	test-auc:0.650164+0.0151317
[132]	train-auc:0.861721+0.00833857	test-auc:0.649871+0.0152257
[133]	train-auc:0.862054+0.00836234	test-auc:0.650127+0.0151429
[134]	train-auc:0.862365+0.00820181	test-auc:0.650121+0.0155416
[135]	train-auc:0.862701+0.008288	test-auc:0.650327+0.015301
[136]	train-auc:0.862954+0.00821589	test-auc:0.650445+0.0152091
[137]	train-auc:0.86327+0.00798675	test-auc:0.650169+0.0151749
[138]	train-auc:0.863425+0.00794154	test-auc:0.650316+0.0156576
[139]	train-auc:0.863735+0.00755809	test-auc:0.65061+0.0160148
[140]	train-auc:0.86409+0.00759184	test-auc:0.650759+0.0158093
[141]	train-auc:0.864306+0.00754062	test-auc:0.651107+0.0158752
[142]	train-auc:0.864499+0.00748211	test-auc:0.651344+0.0154729
[143]	train-auc:0.864746+0.00730386	test-auc:0.651479+0.015375
[144]	train-auc:0.864942+0.00744125	test-auc:0.651305+0.0153107
[145]	train-auc:0.865189+0.0073635	test-auc:0.651117+0.0153146
[146]	train-auc:0.865602+0.00733822	test-auc:0.65

[260]	train-auc:0.893897+0.00604294	test-auc:0.654945+0.0149836
[261]	train-auc:0.894246+0.00605448	test-auc:0.654937+0.0150485
[262]	train-auc:0.894438+0.00597478	test-auc:0.655137+0.0151676
[263]	train-auc:0.894537+0.00586248	test-auc:0.655171+0.0154765
[264]	train-auc:0.894739+0.00580525	test-auc:0.655206+0.0155015
[265]	train-auc:0.894884+0.00573479	test-auc:0.655239+0.0153798
[266]	train-auc:0.89511+0.00572771	test-auc:0.655251+0.0155388
[267]	train-auc:0.89549+0.00567709	test-auc:0.655399+0.0157407
[268]	train-auc:0.895665+0.00568839	test-auc:0.655681+0.015792
[269]	train-auc:0.895946+0.00563986	test-auc:0.655908+0.0158833
[270]	train-auc:0.896168+0.0056952	test-auc:0.656164+0.0157466
[271]	train-auc:0.89644+0.00562532	test-auc:0.656178+0.0155948
[272]	train-auc:0.896753+0.00575589	test-auc:0.655988+0.0156492
[273]	train-auc:0.896877+0.0056791	test-auc:0.656017+0.0157571
[274]	train-auc:0.897091+0.00561226	test-auc:0.655985+0.0156998
[275]	train-auc:0.897415+0.00551024	test-auc:0

[390]	train-auc:0.920663+0.00268175	test-auc:0.658998+0.0144506
[391]	train-auc:0.92079+0.00271752	test-auc:0.658994+0.0144
[392]	train-auc:0.920914+0.00273754	test-auc:0.659056+0.014615
[393]	train-auc:0.921078+0.00263445	test-auc:0.659012+0.0146115
[394]	train-auc:0.921266+0.00265598	test-auc:0.659142+0.0145105
[395]	train-auc:0.921378+0.00259772	test-auc:0.659139+0.0144658
[396]	train-auc:0.921587+0.00260271	test-auc:0.659016+0.014585
[397]	train-auc:0.921749+0.00257668	test-auc:0.659082+0.014644
[398]	train-auc:0.921898+0.00258615	test-auc:0.659123+0.014671
[399]	train-auc:0.922049+0.00260488	test-auc:0.65912+0.01457
[400]	train-auc:0.922235+0.00261521	test-auc:0.659148+0.0144273
[401]	train-auc:0.922304+0.00259594	test-auc:0.659053+0.0143035
[402]	train-auc:0.922487+0.00267409	test-auc:0.659097+0.0144007
[403]	train-auc:0.92265+0.00275627	test-auc:0.658752+0.0147325
[404]	train-auc:0.922848+0.00282896	test-auc:0.658618+0.0146728
[405]	train-auc:0.923097+0.00291426	test-auc:0.65875

## Retrieve parameters

In [81]:
clf.set_params(n_estimators=cvresult.shape[0])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=426,
       n_jobs=1, nthread=4, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=99, silent=True,
       subsample=0.7)

## Fit to training data

In [82]:
print('\nFit algorithm on train data...')
clf.fit(X_train.as_matrix(), Y_train.as_matrix(), eval_metric='auc')


Fit algorithm on train data...


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=426,
       n_jobs=1, nthread=4, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=99, silent=True,
       subsample=0.7)

## Predict training set

In [83]:
print('\nPredicting on training set...')
dtrain_predictions = clf.predict(X_train.as_matrix())
dtrain_predprob = clf.predict_proba(X_train.as_matrix())[:,1]


Predicting on training set...


## Print model report

In [84]:
def ip(y_target, y_pred):
    return 100*(2*(metrics.roc_auc_score(y_target, y_pred))-1)
print('Model Report')
print('Accuracy : %.4g' % metrics.accuracy_score(Y_train.values, dtrain_predictions))
print('AUC Score (Train): %f' % metrics.roc_auc_score(Y_train, dtrain_predprob))
print('IP Score  (Train): %f' % ip(Y_train, dtrain_predprob))
print('Training time: {} minutes'.format(round((time.time() - start_time)/60, 2)))

Model Report
Accuracy : 0.8267
AUC Score (Train): 0.913005
IP Score  (Train): 82.600997
Training time: 0.12 minutes


## Print train predictions 

In [85]:
print('\nPredicting on test set...')
dtest_predictions = clf.predict(X_test.as_matrix())
dtest_predprob = clf.predict_proba(X_test.as_matrix())[:,1]


Predicting on test set...


## Print test predictions

In [66]:
print('Model Report')
print('Accuracy : %.4g' % metrics.accuracy_score(Y_test.values, dtest_predictions))
print('AUC Score (Train): %f' % metrics.roc_auc_score(Y_test, dtest_predprob))

Model Report
Accuracy : 0.7686
AUC Score (Train): 0.740327


## Plot importances


Importances...


NameError: name 'features' is not defined