In [1]:
import gc
import os

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

In [2]:
# Read the four raw CSV tables used by this notebook: the application
# train/test files plus the previous-application and bureau files.
train, test, prev, buro = [
    pd.read_csv(csv_path)
    for csv_path in (
        './input/application_train.csv',
        './input/application_test.csv',
        './input/previous_application.csv',
        './input/bureau.csv',
    )
]

In [3]:
# Names of the columns in `train` that pandas stored with the generic
# `object` dtype; these are treated as categorical features below.
categorical_feats = [
    col_name
    for col_name, col_dtype in train.dtypes.items()
    if col_dtype == 'object'
]

In [4]:
# Integer-encode each categorical column. The code book is learned on the
# training column and then reused for the test column, so a given label maps
# to the same integer in both frames. Labels that are not in the code book
# come back from `get_indexer` as -1.
for cat_col in categorical_feats:
    train_codes, code_book = pd.factorize(train[cat_col])
    train[cat_col] = train_codes
    test[cat_col] = code_book.get_indexer(test[cat_col])

In [5]:
# Make sure automatic garbage collection is switched on before the
# memory-heavy aggregation and training steps that follow.
gc.enable()

In [6]:
# Split the label off the feature frame: `pop` returns the TARGET column and
# removes it from `train` in a single step.
y = train.pop('TARGET')

In [7]:
# Find the object-dtype columns of `prev` and replace each with its integer
# codes from `pd.factorize`, so every column is numeric before aggregation.
prev_cat_features = [
    col_name
    for col_name, col_dtype in prev.dtypes.items()
    if col_dtype == 'object'
]
for col_name in prev_cat_features:
    prev[col_name] = pd.factorize(prev[col_name])[0]

In [8]:
# Collapse `prev` to one row per SK_ID_CURR: the mean of every column, plus
# `nb_app`, the number of non-null SK_ID_PREV values in each group.
prev_by_client = prev.groupby('SK_ID_CURR')
avg_prev = prev_by_client.mean()
cnt_prev = prev_by_client[['SK_ID_PREV']].count()
avg_prev['nb_app'] = cnt_prev['SK_ID_PREV']
# The mean of an identifier column carries no information; drop it.
avg_prev = avg_prev.drop(columns=['SK_ID_PREV'])

In [9]:
# Integer-encode the object-dtype columns of `buro`, then collapse the table
# to one row per SK_ID_CURR.
buro_cat_features = list(buro.columns[(buro.dtypes == 'object').values])
for col_name in buro_cat_features:
    buro[col_name] = pd.factorize(buro[col_name])[0]

# Per-client mean of every column, plus `buro_count`, the number of non-null
# SK_ID_BUREAU values in each group.
buro_by_client = buro.groupby('SK_ID_CURR')
avg_buro = buro_by_client.mean()
avg_buro['buro_count'] = buro_by_client['SK_ID_BUREAU'].count()
# The mean of an identifier column carries no information; drop it.
avg_buro = avg_buro.drop(columns=['SK_ID_BUREAU'])

In [10]:
# Left-join the per-client aggregates onto both application frames, first the
# previous-application means and then the bureau means. A left join keeps
# every application row; clients with no history get NaN in the new columns.
for client_agg in (avg_prev.reset_index(), avg_buro.reset_index()):
    train = train.merge(right=client_agg, how='left', on='SK_ID_CURR')
    test = test.merge(right=client_agg, how='left', on='SK_ID_CURR')


In [11]:
# Every column of `train` except the row identifier is used as a model input.
excluded_feats = ['SK_ID_CURR']
features = train.columns[~train.columns.isin(excluded_feats)].tolist()

In [12]:
# Five shuffled folds with a fixed seed so the splits are reproducible.
folds = KFold(n_splits=5, shuffle=True, random_state=42)
# Accumulators: one out-of-fold prediction per training row and one averaged
# prediction per test row.
oof_preds = np.zeros(len(train))
sub_preds = np.zeros(len(test))

In [13]:
# Cross-validated training: fit one LightGBM classifier per fold, store its
# predictions for the held-out rows in `oof_preds`, and add its share of the
# test-set prediction to `sub_preds`. The averaged test predictions are then
# written out as the submission file.

submission_path = 'submits/first_submission.csv'
# Create the output directory *before* training: if it were missing, the
# final `to_csv` would fail only after every fold had been fitted.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)

# The test feature matrix is the same for every fold, so slice it once.
test_x = test[features]

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train)):
    trn_x, trn_y = train[features].iloc[trn_idx], y.iloc[trn_idx]
    val_x, val_y = train[features].iloc[val_idx], y.iloc[val_idx]

    # NOTE(review): per the LightGBM parameter docs, `subsample` only takes
    # effect when `subsample_freq` > 0 - confirm whether bagging was intended.
    # NOTE(review): `device="gpu"` presumably needs a GPU-enabled LightGBM
    # build - confirm for the target environment.
    clf = LGBMClassifier(
        n_estimators=20000,
        learning_rate=0.005,
        num_leaves=70,
        colsample_bytree=.8,
        subsample=.9,
        max_depth=7,
        reg_alpha=.1,
        reg_lambda=.1,
        min_split_gain=.01,
        min_child_weight=2,
        device="gpu"
    )

    # Track AUC on both the training and the validation split; log every 250
    # rounds and stop once validation AUC has not improved for 150 rounds.
    clf.fit(trn_x, trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            eval_metric='auc', verbose=250, early_stopping_rounds=150
           )

    # Predict with the best iteration found by early stopping.
    oof_preds[val_idx] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)[:, 1]
    # Each fold contributes an equal share to the averaged test prediction.
    sub_preds += clf.predict_proba(test_x, num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
    # Release the per-fold objects before the next fold starts.
    del clf, trn_x, trn_y, val_x, val_y
    gc.collect()

# AUC over all out-of-fold predictions.
print('Full AUC score %.6f' % roc_auc_score(y, oof_preds))

del test_x
test['TARGET'] = sub_preds

test[['SK_ID_CURR', 'TARGET']].to_csv(submission_path, index=False, float_format='%.8f')

Training until validation scores don't improve for 150 rounds.
[250]	training's auc: 0.770254	valid_1's auc: 0.748158
[500]	training's auc: 0.789974	valid_1's auc: 0.757465
[750]	training's auc: 0.806531	valid_1's auc: 0.764054
[1000]	training's auc: 0.819404	valid_1's auc: 0.767972
[1250]	training's auc: 0.829609	valid_1's auc: 0.770193
[1500]	training's auc: 0.838134	valid_1's auc: 0.771626
[1750]	training's auc: 0.84613	valid_1's auc: 0.772677
[2000]	training's auc: 0.853338	valid_1's auc: 0.773411
[2250]	training's auc: 0.860023	valid_1's auc: 0.774008
[2500]	training's auc: 0.866237	valid_1's auc: 0.774371
[2750]	training's auc: 0.871961	valid_1's auc: 0.774473
[3000]	training's auc: 0.877506	valid_1's auc: 0.774687
[3250]	training's auc: 0.882837	valid_1's auc: 0.774824
[3500]	training's auc: 0.887755	valid_1's auc: 0.775035
[3750]	training's auc: 0.892328	valid_1's auc: 0.775107
[4000]	training's auc: 0.896822	valid_1's auc: 0.77508
Early stopping, best iteration is:
[3959]	trai