In [31]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split


from lightgbm import LGBMClassifier
import gc
import os
import re

In [53]:
def get_experiment_id():
    files = [f for f in os.listdir('./submits/experiments')]
    files.sort()
    files.reverse()
    for f in files:
        m = re.match('(\d+).csv', f)
        if m:
            last_id = int(m.group(1)) + 1
            return f"{last_id:03d}"
    return '001'

In [2]:
train = pd.read_csv('./input/application_train.csv')
test = pd.read_csv('./input/application_test.csv')
prev = pd.read_csv('./input/previous_application.csv')
buro = pd.read_csv('./input/bureau.csv')

In [3]:
categorical_feats = [
    f for f in train.columns if train[f].dtype == 'object'
]

In [4]:
for f in categorical_feats:
    train[f], indexer = pd.factorize(train[f])
    test[f] = indexer.get_indexer(test[f])

In [5]:
gc.enable()

In [6]:
y = train['TARGET']
del train['TARGET']

In [7]:
prev_cat_features = [
    f_ for f_ in prev.columns if prev[f_].dtype == 'object'
]
for f_ in prev_cat_features:
    prev[f_], _ = pd.factorize(prev[f_])

In [8]:
avg_prev = prev.groupby('SK_ID_CURR').mean()
cnt_prev = prev[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
avg_prev['nb_app'] = cnt_prev['SK_ID_PREV']
del avg_prev['SK_ID_PREV']

In [9]:
buro_cat_features = [
    f_ for f_ in buro.columns if buro[f_].dtype == 'object'
]
for f_ in buro_cat_features:
    buro[f_], _ = pd.factorize(buro[f_])

avg_buro = buro.groupby('SK_ID_CURR').mean()
avg_buro['buro_count'] = buro[['SK_ID_BUREAU','SK_ID_CURR']].groupby('SK_ID_CURR').count()['SK_ID_BUREAU']
del avg_buro['SK_ID_BUREAU']

In [10]:
train = train.merge(right=avg_prev.reset_index(), how='left', on='SK_ID_CURR')
train = train.merge(right=avg_buro.reset_index(), how='left', on='SK_ID_CURR')

test = test.merge(right=avg_prev.reset_index(), how='left', on='SK_ID_CURR')
test = test.merge(right=avg_buro.reset_index(), how='left', on='SK_ID_CURR')


In [11]:
excluded_feats = ['SK_ID_CURR']
features = [f_ for f_ in train.columns if f_ not in excluded_feats]

In [24]:
trn_x, val_x, trn_y, val_y = train_test_split(train[features], y,  test_size=0.2, random_state=42)

In [64]:
csv = f"submits/experiments/{get_experiment_id()}.csv"

clf = LGBMClassifier(
    n_estimators=20000,
    learning_rate=0.005,
    num_leaves=70,
    colsample_bytree=.8,
    subsample=.9,
    max_depth=7,
    reg_alpha=.1,
    reg_lambda=.1,
    min_split_gain=.01,
    min_child_weight=2,
    device="gpu"
)

clf.fit(trn_x, trn_y, 
        eval_set= [(trn_x, trn_y), (val_x, val_y)], 
        eval_metric='auc', verbose=250, early_stopping_rounds=150
       )

val_preds = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)[:, 1]
test_preds = clf.predict_proba(test[features], num_iteration=clf.best_iteration_)[:, 1]

print('AUC : %.6f' % (roc_auc_score(val_y, val_preds)))
gc.collect()
    
test['TARGET'] = test_preds
test[['SK_ID_CURR', 'TARGET']].to_csv(csv, index=False, float_format='%.8f')
print(f'Save to {csv}')

Training until validation scores don't improve for 150 rounds.
[250]	training's auc: 0.770247	valid_1's auc: 0.747771
[500]	training's auc: 0.789973	valid_1's auc: 0.757256
[750]	training's auc: 0.806526	valid_1's auc: 0.763826
[1000]	training's auc: 0.819141	valid_1's auc: 0.767792
[1250]	training's auc: 0.829232	valid_1's auc: 0.770017
[1500]	training's auc: 0.837954	valid_1's auc: 0.771599
[1750]	training's auc: 0.845771	valid_1's auc: 0.77263
[2000]	training's auc: 0.852802	valid_1's auc: 0.773369
[2250]	training's auc: 0.859448	valid_1's auc: 0.773979
[2500]	training's auc: 0.865678	valid_1's auc: 0.774465
[2750]	training's auc: 0.871486	valid_1's auc: 0.774866
[3000]	training's auc: 0.876922	valid_1's auc: 0.77508
[3250]	training's auc: 0.882308	valid_1's auc: 0.775185
[3500]	training's auc: 0.887319	valid_1's auc: 0.77531
[3750]	training's auc: 0.892219	valid_1's auc: 0.775422
[4000]	training's auc: 0.896777	valid_1's auc: 0.775366
Early stopping, best iteration is:
[3864]	train