In [19]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedShuffleSplit
%matplotlib inline  

## Read the training data

In [20]:
# Load the application training table and preview its first five rows.
app_train = pd.read_csv(filepath_or_buffer="data/application_train.csv")
app_train.head(n=5)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Pull the label column out of the frame (pop removes it in place), then
# discard the applicant id so only feature columns remain.
target = app_train.pop('TARGET')
app_train.drop(columns=['SK_ID_CURR'], inplace=True)
app_train.head()

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,Unaccompanied,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,Family,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,Unaccompanied,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,Unaccompanied,...,0,0,0,0,,,,,,
4,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,Unaccompanied,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Every column that is neither int64 nor float64 is treated as categorical.
cat_cols = app_train.select_dtypes(exclude=['int64','float64']).columns.values
# Positional indices of those columns; the model is later fitted on a plain
# array, so categorical features have to be identified by position.
cat_cols_idx = [app_train.columns.get_loc(c) for c in cat_cols]
print(cat_cols)
# Replace missing categories with an explicit "XXX" level. The result is
# assigned back to the frame instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form raises a FutureWarning on
# pandas 2.x and silently leaves the frame unchanged under Copy-on-Write.
app_train[cat_cols] = app_train[cat_cols].fillna("XXX")
app_train[cat_cols].head()


['NAME_CONTRACT_TYPE' 'CODE_GENDER' 'FLAG_OWN_CAR' 'FLAG_OWN_REALTY'
 'NAME_TYPE_SUITE' 'NAME_INCOME_TYPE' 'NAME_EDUCATION_TYPE'
 'NAME_FAMILY_STATUS' 'NAME_HOUSING_TYPE' 'OCCUPATION_TYPE'
 'WEEKDAY_APPR_PROCESS_START' 'ORGANIZATION_TYPE' 'FONDKAPREMONT_MODE'
 'HOUSETYPE_MODE' 'WALLSMATERIAL_MODE' 'EMERGENCYSTATE_MODE']


Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,OCCUPATION_TYPE,WEEKDAY_APPR_PROCESS_START,ORGANIZATION_TYPE,FONDKAPREMONT_MODE,HOUSETYPE_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE
0,Cash loans,M,N,Y,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,Laborers,WEDNESDAY,Business Entity Type 3,reg oper account,block of flats,"Stone, brick",No
1,Cash loans,F,N,N,Family,State servant,Higher education,Married,House / apartment,Core staff,MONDAY,School,reg oper account,block of flats,Block,No
2,Revolving loans,M,Y,Y,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,Laborers,MONDAY,Government,XXX,XXX,XXX,XXX
3,Cash loans,F,N,Y,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,Laborers,WEDNESDAY,Business Entity Type 3,XXX,XXX,XXX,XXX
4,Cash loans,M,N,Y,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,Core staff,THURSDAY,Religion,XXX,XXX,XXX,XXX


## Read the test data

In [23]:
# Load the application test table and preview its first five rows.
app_test = pd.read_csv(filepath_or_buffer="data/application_test.csv")
app_test.head(n=5)

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,100028,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100038,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,...,0,0,0,0,,,,,,


In [24]:
# Keep the applicant ids for the submission file; pop also removes the column
# from the frame so that only feature columns remain.
sk_id_curr = app_test.pop('SK_ID_CURR')
# Same "XXX" missing-category level as used for the training frame. Assigning
# the result back (rather than fillna(inplace=True) on a column selection)
# keeps this working under pandas Copy-on-Write.
app_test[cat_cols] = app_test[cat_cols].fillna("XXX")
app_test[cat_cols].head()

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,OCCUPATION_TYPE,WEEKDAY_APPR_PROCESS_START,ORGANIZATION_TYPE,FONDKAPREMONT_MODE,HOUSETYPE_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE
0,Cash loans,F,N,Y,Unaccompanied,Working,Higher education,Married,House / apartment,XXX,TUESDAY,Kindergarten,XXX,block of flats,"Stone, brick",No
1,Cash loans,M,N,Y,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,Low-skill Laborers,FRIDAY,Self-employed,XXX,XXX,XXX,XXX
2,Cash loans,M,Y,Y,XXX,Working,Higher education,Married,House / apartment,Drivers,MONDAY,Transport: type 3,XXX,XXX,XXX,XXX
3,Cash loans,F,N,Y,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,Sales staff,WEDNESDAY,Business Entity Type 3,reg oper account,block of flats,Panel,No
4,Cash loans,M,Y,N,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,XXX,FRIDAY,Business Entity Type 3,XXX,XXX,XXX,XXX


In [25]:
def gini_normalized(y_actual, y_pred):
    """Normalized Gini coefficient built on Scikit-Learn's roc_auc_score.

    The Gini value is ``2 * AUC - 1``. The value obtained for ``y_pred`` is
    divided by the value obtained when ``y_actual`` is scored against itself.
    """
    def _gini(labels, scores):
        return 2 * roc_auc_score(labels, scores) - 1

    achieved = _gini(y_actual, y_pred)
    reference = _gini(y_actual, y_actual)
    return achieved / reference

In [26]:
seed_val = 2018
num_folds = 5
# Running sum of the test-set probabilities; divided by num_folds after the loop.
pred_val = np.zeros(app_test.shape[0])

# Materialise the feature matrix once. DataFrame.values builds a fresh array
# from the whole frame on every access, so doing it inside the loop copied the
# full training table twice per split.
train_matrix = app_train.values

# NOTE(review): StratifiedShuffleSplit draws independent stratified random
# splits, so validation sets of different splits may overlap and the score
# printed per split is a hold-out score rather than a strict out-of-fold score.
# Switch to StratifiedKFold if disjoint folds are intended.
folds = StratifiedShuffleSplit(n_splits=num_folds, random_state=seed_val)

# NOTE(review): cat_cols_idx are positional indices computed from app_train;
# the predict_proba call on app_test below assumes app_test has the same
# columns in the same order -- confirm against the data files.
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(app_train, target)):
    # The split yields positional indices, so the labels are selected with
    # .iloc. Plain target[idx] is label-based and is only correct while the
    # Series still carries the default 0..n-1 index.
    cat_X_train, cat_y_train = train_matrix[train_idx], target.iloc[train_idx]
    cat_X_valid, cat_y_valid = train_matrix[valid_idx], target.iloc[valid_idx]

    print("Running fold ", n_fold+1)
    model = CatBoostClassifier(iterations=2000,
                               learning_rate=0.02,
                               depth=6,
                               l2_leaf_reg=40,
                               bootstrap_type='Bernoulli',
                               subsample=0.8715623,
                               scale_pos_weight=5,
                               eval_metric='AUC',
                               metric_period=100,
                               od_type='Iter',
                               od_wait=45,
                               random_seed=seed_val,
                               allow_writing_files=False)

    # The validation part of the split is passed as eval_set together with
    # use_best_model=True and the od_* overfitting-detector settings above.
    fit_model = model.fit(cat_X_train, cat_y_train,
                          eval_set=(cat_X_valid, cat_y_valid),
                          cat_features=cat_cols_idx,
                          use_best_model=True,
                          verbose=True, plot=False)

    # Accumulate the positive-class probability for the test set.
    pred_val += fit_model.predict_proba(app_test)[:,1]
    print('fold:', n_fold+1, '/', num_folds,
          ' -> oof gini score:', gini_normalized(cat_y_valid, fit_model.predict_proba(cat_X_valid)[:,1]))

# Average the accumulated probabilities over all splits.
pred_val = pred_val/num_folds

Running fold  1




0:	test: 0.6978750	best: 0.6978750 (0)	total: 1.09s	remaining: 36m 26s
100:	test: 0.7366811	best: 0.7366811 (100)	total: 1m 19s	remaining: 24m 52s
200:	test: 0.7465136	best: 0.7465136 (200)	total: 2m 36s	remaining: 23m 22s
300:	test: 0.7513568	best: 0.7513568 (300)	total: 3m 54s	remaining: 22m 4s
400:	test: 0.7537242	best: 0.7537242 (400)	total: 5m 12s	remaining: 20m 46s
500:	test: 0.7547925	best: 0.7547925 (500)	total: 6m 28s	remaining: 19m 23s
600:	test: 0.7557218	best: 0.7557602 (594)	total: 7m 48s	remaining: 18m 10s
700:	test: 0.7564349	best: 0.7564349 (700)	total: 9m 6s	remaining: 16m 52s
800:	test: 0.7570634	best: 0.7570634 (800)	total: 10m 24s	remaining: 15m 34s
900:	test: 0.7575382	best: 0.7575382 (900)	total: 11m 39s	remaining: 14m 13s
1000:	test: 0.7579244	best: 0.7579375 (999)	total: 12m 53s	remaining: 12m 52s
1100:	test: 0.7582935	best: 0.7583314 (1085)	total: 14m 8s	remaining: 11m 32s
1200:	test: 0.7586144	best: 0.7586280 (1194)	total: 15m 24s	remaining: 10m 15s
1300:	test



0:	test: 0.7121874	best: 0.7121874 (0)	total: 865ms	remaining: 28m 48s
100:	test: 0.7464422	best: 0.7464422 (100)	total: 1m 26s	remaining: 26m 58s
200:	test: 0.7541494	best: 0.7541494 (200)	total: 2m 56s	remaining: 26m 22s
300:	test: 0.7579650	best: 0.7579650 (300)	total: 4m 21s	remaining: 24m 38s
400:	test: 0.7600390	best: 0.7600406 (399)	total: 5m 48s	remaining: 23m 8s
500:	test: 0.7609649	best: 0.7609704 (499)	total: 7m 11s	remaining: 21m 30s
600:	test: 0.7619329	best: 0.7619334 (598)	total: 8m 41s	remaining: 20m 13s
700:	test: 0.7627700	best: 0.7627700 (700)	total: 10m 13s	remaining: 18m 57s
800:	test: 0.7633576	best: 0.7633576 (800)	total: 11m 36s	remaining: 17m 22s
900:	test: 0.7637772	best: 0.7637772 (900)	total: 13m	remaining: 15m 52s
1000:	test: 0.7641516	best: 0.7641516 (1000)	total: 14m 25s	remaining: 14m 23s
1100:	test: 0.7646235	best: 0.7646235 (1100)	total: 15m 51s	remaining: 12m 56s
1200:	test: 0.7648727	best: 0.7648755 (1198)	total: 17m 18s	remaining: 11m 30s
1300:	test



0:	test: 0.6991625	best: 0.6991625 (0)	total: 1s	remaining: 33m 25s
100:	test: 0.7411331	best: 0.7411331 (100)	total: 1m 26s	remaining: 27m 14s
200:	test: 0.7516969	best: 0.7516969 (200)	total: 2m 47s	remaining: 25m 1s
300:	test: 0.7561928	best: 0.7561928 (300)	total: 4m 10s	remaining: 23m 32s
400:	test: 0.7582980	best: 0.7582980 (400)	total: 5m 34s	remaining: 22m 13s
500:	test: 0.7596699	best: 0.7596699 (500)	total: 6m 56s	remaining: 20m 46s
600:	test: 0.7605208	best: 0.7605208 (600)	total: 8m 17s	remaining: 19m 18s
700:	test: 0.7612515	best: 0.7612515 (700)	total: 9m 44s	remaining: 18m 2s
800:	test: 0.7617338	best: 0.7617338 (800)	total: 11m 5s	remaining: 16m 36s
900:	test: 0.7622301	best: 0.7622340 (898)	total: 12m 25s	remaining: 15m 9s
1000:	test: 0.7627023	best: 0.7627217 (999)	total: 13m 48s	remaining: 13m 47s
1100:	test: 0.7630687	best: 0.7630804 (1097)	total: 15m 8s	remaining: 12m 22s
1200:	test: 0.7633010	best: 0.7633010 (1200)	total: 16m 27s	remaining: 10m 56s
1300:	test: 0.7



0:	test: 0.7063902	best: 0.7063902 (0)	total: 976ms	remaining: 32m 31s
100:	test: 0.7424884	best: 0.7424884 (100)	total: 1m 23s	remaining: 26m 17s
200:	test: 0.7529196	best: 0.7529196 (200)	total: 2m 51s	remaining: 25m 31s
300:	test: 0.7572596	best: 0.7572596 (300)	total: 4m 8s	remaining: 23m 24s
400:	test: 0.7595510	best: 0.7595510 (400)	total: 5m 28s	remaining: 21m 48s
500:	test: 0.7607880	best: 0.7607895 (499)	total: 6m 50s	remaining: 20m 28s
600:	test: 0.7615767	best: 0.7615786 (599)	total: 8m 10s	remaining: 19m 2s
700:	test: 0.7623286	best: 0.7623286 (700)	total: 9m 29s	remaining: 17m 35s
800:	test: 0.7629388	best: 0.7629531 (799)	total: 10m 46s	remaining: 16m 8s
900:	test: 0.7634442	best: 0.7634442 (900)	total: 12m 4s	remaining: 14m 43s
1000:	test: 0.7638133	best: 0.7638232 (999)	total: 13m 21s	remaining: 13m 20s
1100:	test: 0.7640384	best: 0.7640384 (1100)	total: 14m 41s	remaining: 11m 59s
1200:	test: 0.7643616	best: 0.7643616 (1200)	total: 15m 59s	remaining: 10m 38s
1300:	test:



0:	test: 0.7026259	best: 0.7026259 (0)	total: 836ms	remaining: 27m 50s
100:	test: 0.7440240	best: 0.7440240 (100)	total: 1m 17s	remaining: 24m 20s
200:	test: 0.7536860	best: 0.7536860 (200)	total: 2m 33s	remaining: 22m 52s
300:	test: 0.7581621	best: 0.7581738 (299)	total: 3m 49s	remaining: 21m 32s
400:	test: 0.7606374	best: 0.7606374 (400)	total: 5m 3s	remaining: 20m 11s
500:	test: 0.7621092	best: 0.7621092 (500)	total: 6m 19s	remaining: 18m 53s
600:	test: 0.7632736	best: 0.7632736 (600)	total: 7m 33s	remaining: 17m 34s
700:	test: 0.7639962	best: 0.7640097 (695)	total: 8m 49s	remaining: 16m 21s
800:	test: 0.7645528	best: 0.7645528 (800)	total: 10m 4s	remaining: 15m 5s
900:	test: 0.7649834	best: 0.7649834 (900)	total: 11m 19s	remaining: 13m 48s
1000:	test: 0.7655863	best: 0.7656016 (994)	total: 12m 34s	remaining: 12m 32s
1100:	test: 0.7660650	best: 0.7660662 (1094)	total: 13m 49s	remaining: 11m 17s
1200:	test: 0.7663471	best: 0.7663471 (1200)	total: 15m 7s	remaining: 10m 3s
1300:	test: 

In [27]:
# Submission table: applicant id paired with the averaged predicted probability.
sub_fm = pd.DataFrame({"SK_ID_CURR":sk_id_curr, "TARGET":pred_val})
# Create the output directory if needed so the write cannot fail on a missing
# folder after the long training run above.
os.makedirs("./submit", exist_ok=True)
sub_fm.to_csv("./submit/catboost.csv",index=False)