In [160]:
import datetime
import gc
import os
import time
import numpy as np
import pandas as pd
import json

from colorama import Fore, Style

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from bayes_opt import BayesianOptimization

from utils import ModelWrapper
from utils.preprocess import load_all
from utils.utils import (
    highlight_print, timer, submit, calculate_feature_importance, load_feats,
)

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [205]:
def exp_feats(df):
    df['BAO_CREDIT__ANNUNITY'] = df['ACTIVE_AMT_CREDIT_SUM_SUM'] / df['AMT_ANNUITY']
    df['BAO_CREDIT__INC'] = df['ACTIVE_AMT_CREDIT_SUM_SUM'] / df['AMT_INCOME_TOTAL']
    df['BAO_CREDIT__CREDIT'] = df['ACTIVE_AMT_CREDIT_SUM_SUM'] / df['AMT_CREDIT']
    df['BAO_CREDIT__GOODS'] = df['ACTIVE_AMT_CREDIT_SUM_SUM'] / df['AMT_GOODS_PRICE']
    
    df['PAO_CREDIT__ANNUNITY'] = df['PREV_AMT_CREDIT_MEAN'] / df['AMT_ANNUITY']
    df['PAO_CREDIT__INC'] = df['PREV_AMT_CREDIT_MEAN'] / df['AMT_INCOME_TOTAL']
    df['PAO_CREDIT__CREDIT'] = df['PREV_AMT_CREDIT_MEAN'] / df['AMT_CREDIT']
    df['PAO_CREDIT__GOODS'] = df['PREV_AMT_CREDIT_MEAN'] / df['AMT_GOODS_PRICE']
    
    df['PAO_ANNUNITY'] = df['PREV_AMT_ANNUITY_MEAN'] / df['AMT_ANNUITY']
    df['PAO_ANNUNITY__INC'] = df['PREV_AMT_ANNUITY_MEAN'] / df['AMT_INCOME_TOTAL']
    df['PAO_ANNUNITY__CREDIT'] = df['PREV_AMT_ANNUITY_MEAN'] / df['AMT_CREDIT']
    
    df["IBO_PAYMENT__ACREDIT"] = df["INSTAL_AMT_PAYMENT_SUM"] / df['ACTIVE_AMT_CREDIT_SUM_SUM']
    df["IBO_PAYMENT__CCREDIT"] = df["INSTAL_AMT_PAYMENT_SUM"] / df['CLOSED_AMT_CREDIT_SUM_SUM']
    df["IAO_PAYMENT__ANNUITY"] = df["INSTAL_AMT_PAYMENT_SUM"] / df['AMT_ANNUITY']
    df["IAO_PAYMENT__INC"] = df["INSTAL_AMT_PAYMENT_SUM"] / df['AMT_INCOME_TOTAL']
    
    # New try, failed
#     df['IAX_DPD__ANNUITY'] = df['INSTAL_DPD_MEAN'] * df['AMT_ANNUITY']
#     df['IAX_DPD__INC'] = df['INSTAL_DPD_MEAN'] * df['AMT_INCOME_TOTAL']
#     df['AIO_ANNUITY__DPD'] = df['AMT_ANNUITY'] / df['INSTAL_DPD_MEAN']
#     df['AIO_INC__DPD'] = df['AMT_INCOME_TOTAL'] / df['INSTAL_DPD_MEAN']
#     df['IBX_DPD_CREDIT_MAX_OVERDUE'] = df['INSTAL_DPD_MEAN'] * df['BURO_AMT_CREDIT_MAX_OVERDUE_MEAN']
#     df['IBM_DPD_CREDIT_MAX_OVERDUE'] = df['INSTAL_DPD_MEAN'] - df['BURO_AMT_CREDIT_MAX_OVERDUE_MEAN']
    # df['BAO_CREDIT_MAX_OVERDUE__ANNUITY'] = df["ACTIVE_AMT_CREDIT_MAX_OVERDUE_MEAN"] / df['AMT_ANNUITY']
    # df['BAO_CREDIT_MAX_OVERDUE__INC'] = df["ACTIVE_AMT_CREDIT_MAX_OVERDUE_MEAN"] / df['AMT_INCOME_TOTAL']
    # df["IAO_PAYMENT__CREDIT"] = df["INSTAL_AMT_PAYMENT_SUM"] / df['AMT_CREDIT']
    # df["IAO_PAYMENT__GOODS"] = df["INSTAL_AMT_PAYMENT_SUM"] / df['AMT_GOODS_PRICE']
    # df['CAO_CREDIT__ANNUITY'] = df["CLOSED_AMT_CREDIT_SUM_MEAN"] / df['AMT_ANNUITY']
    # df['CAO_CREDIT__INC'] = df["CLOSED_AMT_CREDIT_SUM_MEAN"] / df['AMT_INCOME_TOTAL']
    # df['CAO_CREDIT__CREDIT'] = df["CLOSED_AMT_CREDIT_SUM_MEAN"] / df['AMT_CREDIT']
    # df['CAO_CREDIT__GOODS'] = df["CLOSED_AMT_CREDIT_SUM_MEAN"] / df['AMT_GOODS_PRICE']
    
    return df

In [5]:
with timer('Load features'):
    df = load_all('../data/preprocess')
    print("df shape:", df.shape)

df shape: (356251, 877)
[92m[Done] Load features in 1:49 (2018-08-21 21:54:16.385110)[0m


In [206]:
df_exp = df.copy()
df_exp = exp_feats(df)
print("df_exp shape:", df_exp.shape)

df_exp shape: (356251, 906)


In [207]:
df_full = df_exp.copy()

In [208]:
df_full['/:IAO_PAYMENT__ANNUITY:PREV_APPROVED_CNT_PAYMENT_MEAN'] = df_full['IAO_PAYMENT__ANNUITY']/df_full['PREV_APPROVED_CNT_PAYMENT_MEAN']
df_full['/:PREV_APPROVED_CNT_PAYMENT_MEAN:INSTAL_DAYS_ENTRY_PAYMENT_MAX'] = df_full['PREV_APPROVED_CNT_PAYMENT_MEAN']/df_full['INSTAL_DAYS_ENTRY_PAYMENT_MAX']

In [209]:
train_df = df_full[df_full['TARGET'].notnull()]
test_df = df_full[df_full['TARGET'].isnull()]
print("train_df shape:", train_df.shape)
print("test_df shape:", test_df.shape)

train_df shape: (307507, 908)
test_df shape: (48744, 908)


In [219]:
# base = '106_m_lgbm_03'
base = '91_mb_lgbm5_04'
model_folder = os.path.join('..', 'expmodel', base)
feats = load_feats(model_folder, 'feats.json')
# drop_feats = []
if feats is not None:
    print('Load feats from {}'.format(model_folder))
    drop_feats = list(set(df_full.columns) - set(feats))
else:
    skip_feats = ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index'] + drop_feats
    feats = [f for f in df_full.columns if f not in skip_feats]
trying_feats = [
    'IAX_DPD__ANNUITY',
#     'IAX_DPD__INC',
#     'AIO_ANNUITY__DPD',
#     'AIO_INC__DPD',
#     'IBX_DPD_CREDIT_MAX_OVERDUE',
    'IBM_DPD_CREDIT_MAX_OVERDUE',
    # 'BAO_CREDIT_MAX_OVERDUE__ANNUITY', 'BAO_CREDIT_MAX_OVERDUE__INC'
]
feats += trying_feats
print('feats num: {}, drop_feats: {}'.format(len(feats), len(drop_feats)))

Load feats from ../expmodel/91_mb_lgbm5_04
feats num: 653, drop_feats: 257


In [220]:
X_train = train_df[feats]
X_test = test_df[feats]
y_train = train_df['TARGET']
print("X_train df shape:", X_train.shape)

X_train df shape: (307507, 653)


In [9]:
def feature_importance(n_fold, clf, feats):
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = feats
    fold_importance_df["importance"] = clf.feature_importances_
    fold_importance_df["fold"] = n_fold+1
    return fold_importance_df

In [221]:
import importlib
importlib.reload(ModelWrapper)

r = 9
nums_fold = 3
prefix = '106_{}_lgbm{}_{}'.format('m', nums_fold, str(r).zfill(2))
# prefix = '91_mb_lgbm5_04'
model_folder = os.path.join('..', 'expmodel', prefix)

random_state = 1001
params = {}
# params = {
#     'colsample_bytree': 0.7642713022545912,
#     'learning_rate': 0.010414356455472869,
#     'num_leaves': 33.01866087439472,
#     'subsample': 0.9695097184671164,
#     'max_depth': 10.865363258675153,
#     'reg_alpha': 0.046479938861221405,
#     'reg_lambda': 0.06363252146721826,
#     'min_split_gain': 0.021696909262842037,
#     'min_child_weight': 40.0454581921338
# }
params = {
  "colsample_bytree": 0.7613328838991742,
  "learning_rate": 0.010362683637634058,
  "num_leaves": 35,
  "subsample": 0.7911882154304066,
  "max_depth": 10,
  "reg_alpha": 0.04764317714601435,
  "reg_lambda": 0.06573583739775202,
  "min_split_gain": 0.027665712181798835,
  "min_child_weight": 37.78459404044409,
  "n_estimators": 10000,
}
params['n_estimators'] = 10000
params['max_depth'] = int(params['max_depth'])
params['num_leaves'] = int(params['num_leaves'])

fit_params = {
    'eval_metric': 'auc',
    'early_stopping_rounds': 200,
    'verbose': 1000,
}

folds = KFold(n_splits=nums_fold, shuffle=True, random_state=random_state)
model = ModelWrapper.ModelWrapper(
    CLF=LGBMClassifier,
    name="LightGBM",
    model_folder=model_folder,
    feats=feats,
    drop_feats=drop_feats,
    params=params,
    fit_params=fit_params
)

In [222]:
n_fold = 0
feature_importance_df = pd.DataFrame()
for clf, fold_auc in model.folds_train(folds, X_train, y_train, X_test):
    highlight_print(Fore.LIGHTBLUE_EX, '- %.6f (%s)' % (fold_auc, datetime.datetime.now()))
    fold_importance_df = feature_importance(n_fold, clf, feats)
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    n_fold += 1
    del clf, fold_importance_df
    gc.collect()

score = roc_auc_score(y_train, model.oof_preds_df)
highlight_print(Fore.RED, '## %s: %.6f' % (model.name, score))
model.scores.append(score)
model.serialize_scores()

feats num: 653
model folder: ../expmodel/106_m_lgbm3_09
../expmodel/106_m_lgbm3_09/LightGBM_0.pickle not exists, going to train.
Training until validation scores don't improve for 200 rounds.
[1000]	valid_0's auc: 0.78762
[2000]	valid_0's auc: 0.7904
Early stopping, best iteration is:
[2583]	valid_0's auc: 0.7908
[94m- 0.790800 (2018-08-26 09:26:09.663660)[0m
../expmodel/106_m_lgbm3_09/LightGBM_1.pickle not exists, going to train.
Training until validation scores don't improve for 200 rounds.
[1000]	valid_0's auc: 0.787382
[2000]	valid_0's auc: 0.790703
Early stopping, best iteration is:
[2739]	valid_0's auc: 0.791064
[94m- 0.791064 (2018-08-26 09:44:52.153104)[0m
../expmodel/106_m_lgbm3_09/LightGBM_2.pickle not exists, going to train.
Training until validation scores don't improve for 200 rounds.
[1000]	valid_0's auc: 0.789557
[2000]	valid_0's auc: 0.792432
[3000]	valid_0's auc: 0.792775
Early stopping, best iteration is:
[2863]	valid_0's auc: 0.792814
[94m- 0.792814 (2018-08-26 

In [225]:
feature_importance_df = calculate_feature_importance(feature_importance_df, model_folder)

In [None]:
feature_importance_df

In [223]:
b_round = 0
b_feature_importance_df = pd.DataFrame()

n_iter = 5
n_splits = 3
prefix = '106_{}_lgbm{}_{}_{}'.format('b', nums_fold, n_iter, str(r).zfill(2))
model_folder = os.path.join('..', 'expmodel', prefix)

def lgbm_evaluate(**params):
    warnings.simplefilter('ignore')
    global b_feature_importance_df
    global b_round

    params['n_estimators'] = 10000
    params['num_leaves'] = int(params['num_leaves'])
    params['max_depth'] = int(params['max_depth'])
    # params['max_bin'] = 1023
    # params['boosting'] = 'dart'
    # params['device_type'] = 'gpu'
    # params['gpu_use_dp'] = True
    clf = LGBMClassifier(**params)
    model_subfolder = os.path.join(model_folder, str(b_round))
    try:
        os.makedirs(model_folder)
    except:
        pass
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=1001)
    b_model = ModelWrapper.ModelWrapper(
        CLF=LGBMClassifier,
        name="LightGBM",
        model_folder=model_subfolder,
        feats=feats,
        drop_feats=drop_feats,
        params=params,
        fit_params=fit_params
    )
    n_fold = 0
    for clf, fold_auc in b_model.folds_train(folds, X_train, y_train, X_test):
        fold_importance_df = feature_importance(n_fold, clf, feats)
        b_feature_importance_df = pd.concat([b_feature_importance_df, fold_importance_df], axis=0)
        n_fold += 1
        del clf, fold_importance_df
        gc.collect()

    score = roc_auc_score(y_train, b_model.oof_preds_df)
    b_model.scores.append(score)
    b_model.serialize_scores()
    b_round += 1
    return score

with timer("BayesianOptimization:"):
    b_params = {
        'colsample_bytree': (0.75, 1),
        'learning_rate': (.01, .02),
        'num_leaves': (33, 36),
        'subsample': (0.75, 1),
        'max_depth': (6, 11),
        'reg_alpha': (.0275, .05),
        'reg_lambda': (.06, .08),
        'min_split_gain': (.0075, .03),
        'min_child_weight': (37.75, 40.25)
    }
    bo = BayesianOptimization(lgbm_evaluate, b_params)
    bo.maximize(init_points=5, n_iter=4)

[31mInitialization[0m
[94m------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   learning_rate |   max_depth |   min_child_weight |   min_split_gain |   num_leaves |   reg_alpha |   reg_lambda |   subsample | 
feats num: 653
model folder: ../expmodel/106_b_lgbm3_5_09/0
../expmodel/106_b_lgbm3_5_09/0/LightGBM_0.pickle not exists, going to train.
Training until validation scores don't improve for 200 rounds.
[1000]	valid_0's auc: 0.788894
[2000]	valid_0's auc: 0.789764
Early stopping, best iteration is:
[1839]	valid_0's auc: 0.789874
../expmodel/106_b_lgbm3_5_09/0/LightGBM_1.pickle not exists, going to train.
Training until validation scores don't improve for 200 rounds.
[1000]	valid_0's auc: 0.788669
[2000]	valid_0's auc: 0.790069
Early stopping, best iteration is:
[2395]	valid_0's auc: 0.790407
../expmo

feats num: 653
model folder: ../expmodel/106_b_lgbm3_5_09/7
../expmodel/106_b_lgbm3_5_09/7/LightGBM_0.pickle not exists, going to train.
Training until validation scores don't improve for 200 rounds.
[1000]	valid_0's auc: 0.789402
Early stopping, best iteration is:
[1710]	valid_0's auc: 0.790883
../expmodel/106_b_lgbm3_5_09/7/LightGBM_1.pickle not exists, going to train.
Training until validation scores don't improve for 200 rounds.
[1000]	valid_0's auc: 0.789266
[2000]	valid_0's auc: 0.790941
Early stopping, best iteration is:
[2140]	valid_0's auc: 0.791007
../expmodel/106_b_lgbm3_5_09/7/LightGBM_2.pickle not exists, going to train.
Training until validation scores don't improve for 200 rounds.
[1000]	valid_0's auc: 0.791018
[2000]	valid_0's auc: 0.792643
Early stopping, best iteration is:
[2112]	valid_0's auc: 0.792696
    8 | 40m46s | [35m   0.79150[0m | [32m            0.7763[0m | [32m         0.0146[0m | [32m    10.9094[0m | [32m           39.9354[0m | [32m          0.0

In [215]:
cols = b_feature_importance_df[["feature", "importance"]]\
    .groupby("feature")\
    .mean()\
    .sort_values(by="importance", ascending=False)

with open(os.path.join(model_folder, 'b_feats_importance.json'), 'w') as f:
    d = dict((i, r) for i, r in cols['importance'].iteritems())
    json.dump(d, f, indent=2)

with open(os.path.join(model_folder, 'params.json'), 'w') as f:
    json.dump(bo.res['max'], f, indent=2)

bo.res['max'], model_folder

({'max_val': 0.7915209761073426,
  'max_params': {'colsample_bytree': 0.7694740743316772,
   'learning_rate': 0.01310154058346192,
   'num_leaves': 35.80256673316539,
   'subsample': 0.7555353463722276,
   'max_depth': 10.66603361173349,
   'reg_alpha': 0.029991443322690534,
   'reg_lambda': 0.07708551802836272,
   'min_split_gain': 0.011336053625914954,
   'min_child_weight': 40.24016817690307}},
 '../expmodel/106_b_lgbm5_5_08')

In [None]:
"""
## LightGBM: 0.793113, 0.796
- 0.795078 (2018-08-24 21:50:48.978086)
- 0.791264 (2018-08-24 21:50:51.429607)
- 0.791956 (2018-08-24 21:50:53.210522)
- 0.791788 (2018-08-24 21:50:55.030508)
- 0.795513 (2018-08-24 21:50:56.886642)
"""

In [161]:
submit(prefix, test_df, model.test_preds_df)

../data/submission/91_mb_lgbm5_04.csv


In [226]:
for i, r in feature_importance_df['importance'].iteritems():
#     if r == 0:
#        print("'{}',".format(i))
        print("'{}':'{}',".format(i, r))

'AA_SOURCES_MEAN':'1101.3333333333333',
'DAYS_BIRTH':'1038.0',
'/:IAO_PAYMENT__ANNUITY:PREV_APPROVED_CNT_PAYMENT_MEAN':'960.0',
'AAO_ANNUITY__CREDIT':'859.0',
'ACTIVE_DEBT_CREDIT_RATIO':'852.3333333333334',
'AAO_CREDIT__ANNUITY':'845.0',
'EXT_SOURCE_3':'816.0',
'EXT_SOURCE_2':'784.0',
'AAO_CREDIT__GOODS':'755.3333333333334',
'DAYS_ID_PUBLISH':'737.6666666666666',
'/:PREV_APPROVED_CNT_PAYMENT_MEAN:INSTAL_DAYS_ENTRY_PAYMENT_MAX':'697.0',
'EXT_SOURCE_1':'651.6666666666666',
'ACTIVE_DAYS_CREDIT_MAX':'634.6666666666666',
'AAAX_CREDIT_ANNUITY__SOURCE_3':'634.0',
'AMT_ANNUITY':'624.6666666666666',
'AAAX_SOURCES_MEAN__EMPLOYED':'607.0',
'REGION_POPULATION_RELATIVE':'600.3333333333334',
'DAYS_REGISTRATION':'575.6666666666666',
'AAO_ANNUITY__INC':'558.3333333333334',
'ACTIVE_DAYS_CREDIT_ENDDATE_MIN':'550.6666666666666',
'AAAX_SOURCES_MEAN__BIRTH':'545.0',
'PREV_APPROVED_CNT_PAYMENT_MEAN':'536.0',
'IAX_DPD__ANNUITY':'527.6666666666666',
'AMT_GOODS_PRICE':'509.6666666666667',
'CLOSED_DAYS_CREDIT_M