# MODELING

In [1]:
import gc
import os

import numpy as np
import pandas as pd

# Kept ahead of the explicit third-party imports (as in the original cell) so the
# explicitly imported names below still win over anything the star import exports.
from feature_engineering import *
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier, Pool, cv
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
# Directory scanned for the raw `*csv.zip` tables; the path is relative, so it is
# resolved against the notebook's current working directory.
DATA_PATH = '../HomeCreditDefaultRisk/Data'

In [3]:
# Load each raw table by name. os.listdir() returns entries in arbitrary order, so
# unpacking its result positionally can silently bind a table to the wrong variable.
# NOTE(review): file stems assume the standard Kaggle Home Credit download names,
# listed in the same order as the variables unpacked below -- confirm against the
# contents of DATA_PATH.
_TABLE_STEMS = [
    'installments_payments',   # ins
    'previous_application',    # prev
    'bureau',                  # bureau
    'bureau_balance',          # bb
    'application_train',       # df
    'credit_card_balance',     # cc
    'application_test',        # test_df
    'POS_CASH_balance',        # pos
]

# Case-insensitive lookup of the archives actually present in DATA_PATH.
_available_tables = {
    name.lower(): name
    for name in os.listdir(DATA_PATH)
    if name.lower().endswith('.csv.zip')
}


def _load_table(stem):
    """Read `<stem>.csv.zip` from DATA_PATH, failing loudly if the archive is missing."""
    key = stem.lower() + '.csv.zip'
    if key not in _available_tables:
        raise FileNotFoundError(
            "Expected '{}.csv.zip' in {}; found: {}".format(
                stem, DATA_PATH, sorted(_available_tables.values())))
    return pd.read_csv(os.path.join(DATA_PATH, _available_tables[key]))


tables = [_load_table(stem) for stem in _TABLE_STEMS]
ins, prev, bureau, bb, df, cc, test_df, pos = tables

In [4]:
# Run every feature-engineering helper on its raw table(s).

limit = None  # forwarded unchanged to each helper below

# All auxiliary tables are processed with the same options; only the application
# table is built with nan_as_category=False.
aux_options = dict(nan_as_category=True, limit=limit)

app_train = application_train_test(df, test_df, nan_as_category=False, limit=limit)
bureau_agg = bureau_and_balance(bureau, bb, **aux_options)
prev_agg = previous_applications(prev, **aux_options)
pos_agg = pos_cash(pos, **aux_options)
ins_agg = installments_payments(ins, **aux_options)
cc_agg = credit_card_balance(cc, **aux_options)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  cc['AMT_DRAWINGS_ATM_CURRENT'][cc['AMT_DRAWINGS_ATM_CURRENT'] < 0] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  cc['AMT_DRAWINGS_CURRENT'][cc['AMT_DRAWINGS_CURRENT'] < 0] = np.nan


In [5]:
# Turn the index of each aggregate into a regular column so it can be joined on
# SK_ID_CURR. The guard makes the cell safe to re-run: a second bare reset_index()
# would add a spurious 'index' column to frames that already expose the key.
# NOTE(review): assumes the aggregates come back indexed by SK_ID_CURR -- confirm
# in feature_engineering.
def _ensure_key_column(agg, key='SK_ID_CURR'):
    """Return `agg` with `key` as a column, resetting the index only when needed."""
    if key in agg.columns:
        return agg
    return agg.reset_index()


pos_agg = _ensure_key_column(pos_agg)
ins_agg = _ensure_key_column(ins_agg)
cc_agg = _ensure_key_column(cc_agg)

In [6]:
# MERGE ALL

def merge(df,bureau_agg,prev_agg,ins_agg,pos_agg,cc_agg):
    """Left-join every aggregate table onto `df` using the `SK_ID_CURR` key.

    The joins are applied in the order bureau, previous applications, POS cash,
    installments, credit card, so the aggregate columns appear in that order
    after the columns of `df`. Every row of `df` is kept (left join); keys
    without a match in an aggregate get NaN in that aggregate's columns.

    Parameters
    ----------
    df : DataFrame to join onto.
    bureau_agg, prev_agg, ins_agg, pos_agg, cc_agg : DataFrames joined on
        `SK_ID_CURR`.

    Returns
    -------
    A new DataFrame; none of the inputs is modified.
    """
    # No `del` on the parameters: that would only unbind the local names, while the
    # caller's references keep the frames alive, so no memory would be released.
    merged = df
    for agg in (bureau_agg, prev_agg, pos_agg, ins_agg, cc_agg):
        merged = merged.merge(agg, how='left', on='SK_ID_CURR')
    return merged

In [7]:
# Join every aggregate onto the application frame.
# NOTE: the result is bound back to `app_train`, so running this cell a second time
# would join the aggregates onto an already-joined frame.
app_train = merge(
    app_train,
    bureau_agg=bureau_agg,
    prev_agg=prev_agg,
    ins_agg=ins_agg,
    pos_agg=pos_agg,
    cc_agg=cc_agg,
)

In [8]:
# Share of missing values per row, computed over the feature columns only.
# TARGET is left out because it is null for every test row (that is how train and
# test rows are told apart later), so counting it would inflate SHARE_NAN for test
# rows only. SHARE_NAN itself is left out so that re-running the cell is idempotent.
_excluded_cols = [col for col in ('TARGET', 'SHARE_NAN') if col in app_train.columns]
_nan_counts = (app_train.isnull().sum(axis=1)
               - app_train[_excluded_cols].isnull().sum(axis=1))
app_train['SHARE_NAN'] = _nan_counts / (app_train.shape[1] - len(_excluded_cols))

In [9]:
# Rows with a known TARGET form the training set; rows whose TARGET is null form
# the test set.

has_target = app_train['TARGET'].notnull()
train_df = app_train[has_target]
test_df = app_train[~has_target]
del app_train, has_target

In [10]:
# Replace +/-inf with NaN, then fill every NaN with the single sentinel -1 so that
# "missing" is visible as one distinct value.

for frame in (train_df, test_df):
    frame.replace([np.inf, -np.inf], np.nan, inplace=True)
    frame.fillna(-1, inplace=True)

# Neither frame may contain a missing value after the fill.
remaining_nans = [frame.isnull().sum().sum() for frame in (train_df, test_df)]
assert remaining_nans[0] == remaining_nans[1] == 0

### Evaluation
#### LIGHTGBM

In [20]:
def _make_folds(num_folds, stratified):
    """Return the CV splitter: StratifiedKFold when `stratified` is truthy, else KFold."""
    splitter_cls = StratifiedKFold if stratified else KFold
    return splitter_cls(n_splits=num_folds, shuffle=True, random_state=47)


def kfold(train_df, test_df, num_folds, stratified = False, submission=True,
          submission_path="../HomeCreditDefaultRisk/Data/submissions.csv"):
    """Cross-validate a LightGBM classifier and optionally write a submission file.

    For every fold a fresh LGBMClassifier is fitted with early stopping on the
    held-out part; its out-of-fold probabilities are stored and its test-set
    probabilities are averaged over the folds. Per-fold AUC and the AUC over all
    out-of-fold predictions are printed.

    Parameters
    ----------
    train_df : DataFrame containing the features and a `TARGET` label column.
    test_df : DataFrame containing the same feature columns and `SK_ID_CURR`.
    num_folds : number of cross-validation splits.
    stratified : if true, folds are built with StratifiedKFold on `TARGET`;
        otherwise with plain KFold. (Previously this flag was ignored.)
    submission : if true, the averaged test predictions are stored in
        `test_df['TARGET']` and `SK_ID_CURR`/`TARGET` are written to
        `submission_path`.
    submission_path : destination CSV; defaults to the path used originally.

    Returns
    -------
    None.

    Side effects
    ------------
    Prints progress; with `submission=True` it overwrites the `TARGET` column of
    the `test_df` object passed in and writes a CSV file.
    """

    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))

    folds = _make_folds(num_folds, stratified)

    # Identifier columns and the label are never used as model inputs.
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]

    # Create arrays to store out-of-fold and averaged test predictions
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        # NOTE(review): `silent` here and the `verbose` / `early_stopping_rounds`
        # keywords passed to fit() below were dropped in newer LightGBM releases --
        # confirm against the installed version before upgrading.
        clf = LGBMClassifier(
            nthread=4,
            #is_unbalance=True,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=32,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.04,
            reg_lambda=0.073,
            min_split_gain=0.0222415,
            min_child_weight=40,
            silent=-1,
            verbose=-1,
            #scale_pos_weight=11
            )

        # n_estimators is only an upper bound: training stops once the validation
        # AUC has not improved for 200 rounds.
        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
            eval_metric= 'auc', verbose= 100, early_stopping_rounds= 200)

        # Predict with the best iteration found by early stopping.
        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        # Each fold contributes an equal share to the averaged test prediction.
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))

        # Release the per-fold copies before the next fold allocates its own.
        del train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    if submission:
        # Kept as an in-place assignment so the caller's test_df still carries the
        # predictions afterwards, as it did before.
        test_df['TARGET'] = sub_preds
        test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_path, index= False)

In [21]:
# 5-fold LightGBM run that also writes the submission file.
kfold(train_df, test_df, num_folds=5, stratified=False, submission=True)

Starting LightGBM. Train shape: (307507, 735), test shape: (48744, 735)
Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.755297	valid_1's auc: 0.747522
[200]	training's auc: 0.779459	valid_1's auc: 0.765235
[300]	training's auc: 0.798199	valid_1's auc: 0.777609
[400]	training's auc: 0.810347	valid_1's auc: 0.783229
[500]	training's auc: 0.820013	valid_1's auc: 0.786683
[600]	training's auc: 0.828232	valid_1's auc: 0.78892
[700]	training's auc: 0.835321	valid_1's auc: 0.790367
[800]	training's auc: 0.841831	valid_1's auc: 0.791452
[900]	training's auc: 0.847985	valid_1's auc: 0.792182
[1000]	training's auc: 0.853746	valid_1's auc: 0.792603
[1100]	training's auc: 0.858945	valid_1's auc: 0.79313
[1200]	training's auc: 0.863943	valid_1's auc: 0.793454
[1300]	training's auc: 0.868796	valid_1's auc: 0.793859
[1400]	training's auc: 0.873431	valid_1's auc: 0.794041
[1500]	training's auc: 0.877453	valid_1's auc: 0.794126
[1600]	training's auc: 0.881445	vali