In [1]:
import datetime
import gc
import os
import time
import numpy as np
import pandas as pd
import json

from colorama import Fore, Style

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from bayes_opt import BayesianOptimization

from utils import ModelWrapper
from utils.preprocess import load_all
from utils.utils import (
    highlight_print, timer, submit, calculate_feature_importance, load_feats,
)

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
def exp_feats(df):
    df['BAO_CREDIT__ANNUNITY'] = df['ACTIVE_AMT_CREDIT_SUM_SUM'] / df['AMT_ANNUITY']
    df['BAO_CREDIT__INC'] = df['ACTIVE_AMT_CREDIT_SUM_SUM'] / df['AMT_INCOME_TOTAL']
    df['BAO_CREDIT__CREDIT'] = df['ACTIVE_AMT_CREDIT_SUM_SUM'] / df['AMT_CREDIT']
    df['BAO_CREDIT__GOODS'] = df['ACTIVE_AMT_CREDIT_SUM_SUM'] / df['AMT_GOODS_PRICE']
    
    df['PAO_CREDIT__ANNUNITY'] = df['PREV_AMT_CREDIT_MEAN'] / df['AMT_ANNUITY']
    df['PAO_CREDIT__INC'] = df['PREV_AMT_CREDIT_MEAN'] / df['AMT_INCOME_TOTAL']
    df['PAO_CREDIT__CREDIT'] = df['PREV_AMT_CREDIT_MEAN'] / df['AMT_CREDIT']
    df['PAO_CREDIT__GOODS'] = df['PREV_AMT_CREDIT_MEAN'] / df['AMT_GOODS_PRICE']
    
    df['PAO_ANNUNITY'] = df['PREV_AMT_ANNUITY_MEAN'] / df['AMT_ANNUITY']
    df['PAO_ANNUNITY__INC'] = df['PREV_AMT_ANNUITY_MEAN'] / df['AMT_INCOME_TOTAL']
    df['PAO_ANNUNITY__CREDIT'] = df['PREV_AMT_ANNUITY_MEAN'] / df['AMT_CREDIT']
    
    df["IBO_PAYMENT__ACREDIT"] = df["INSTAL_AMT_PAYMENT_SUM"] / df['ACTIVE_AMT_CREDIT_SUM_SUM']
    df["IBO_PAYMENT__CCREDIT"] = df["INSTAL_AMT_PAYMENT_SUM"] / df['CLOSED_AMT_CREDIT_SUM_SUM']
    df["IAO_PAYMENT__ANNUITY"] = df["INSTAL_AMT_PAYMENT_SUM"] / df['AMT_ANNUITY']
    df["IAO_PAYMENT__INC"] = df["INSTAL_AMT_PAYMENT_SUM"] / df['AMT_INCOME_TOTAL']
    
    return df

In [3]:
with timer('Load features'):
    df = load_all('../data/preprocess')
    print("df shape:", df.shape)

df shape: (356251, 951)
[92m[Done] Load features at 2018-08-27 17:29:58.483796[0m


In [4]:
df_exp = df.copy()
df_exp = exp_feats(df)
print("df_exp shape:", df_exp.shape)

df_exp shape: (356251, 966)


In [5]:
df_full = df_exp.copy()

In [6]:
base = '91_m_lgbm5_best'
model_folder = os.path.join('..', 'expmodel', base)
feats = load_feats(model_folder, 'feats.json')
if feats is not None:
    print('Load feats from: {}'.format(base))
    drop_feats = list(set(df_full.columns) - set(feats))
else:
    skip_feats = ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index'] + drop_feats
    feats = [f for f in df_full.columns if f not in skip_feats]
print('feats num: {}, drop_feats: {}'.format(len(feats), len(drop_feats)))

Load feats from: 91_m_lgbm5_best
feats num: 696, drop_feats: 317


In [9]:
df_full, selected_generated_feats = process_generated_feats(df_full, feats)
df_full[feats]
print('df_full shape: {}, selected_generated_feats: {}'.format(
    df_full.shape, len(selected_generated_feats)))
df_gen = df_full.copy()
print('df_gen shape: {}, feats num: {}'.format(df_gen.shape, len(feats)))



df_full shape: (356251, 1013), selected_generated_feats: 47


In [11]:
train_df = df_gen[df_gen['TARGET'].notnull()]
test_df = df_gen[df_gen['TARGET'].isnull()]
print("train_df shape:", train_df.shape)
print("test_df shape:", test_df.shape)

X_train = train_df[feats]
X_test = test_df[feats]
y_train = train_df['TARGET']
print("X_train df shape:", X_train.shape)

train_df shape: (307507, 1013)
test_df shape: (48744, 1013)
X_train df shape: (307507, 696)


In [12]:
def feature_importance(n_fold, clf, feats):
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = feats
    fold_importance_df["importance"] = clf.feature_importances_
    fold_importance_df["fold"] = n_fold+1
    return fold_importance_df

In [42]:
import importlib
importlib.reload(ModelWrapper)

r = 1
nums_fold = 5
random_state = 1001
prefix = '106_{}_cat{}_{}'.format('m', nums_fold, str(r).zfill(2))
model_folder = os.path.join('..', 'expmodel', '{}'.format(prefix))
params = {}
params = {
    'learning_rate': 0.5,
    'depth': 3,
    'l2_leaf_reg': 40,
    'bootstrap_type': 'Bernoulli',
    'subsample': 0.7,
    'scale_pos_weight': 5,
    'eval_metric': 'AUC',
    'od_type': 'Iter',
    'allow_writing_files': False,
    'verbose': True
}
params['iterations'] = 10000
params['depth'] = int(params['depth'])
params['verbose'] = True

fit_params = {}
fit_params = {
    'early_stopping_rounds': 200,
    'verbose': 1000,
}
model = ModelWrapper.ModelWrapper(
    CLF=CatBoostClassifier,
    name="CatBoost",
    model_folder=model_folder,
    feats=feats,
    drop_feats=drop_feats,
    params=params,
    fit_params=fit_params
)

In [34]:
n_fold = 0
random_state = 1001
feature_importance_df = pd.DataFrame()
folds = KFold(n_splits=nums_fold, shuffle=True, random_state=random_state)
for clf, fold_auc in model.folds_train(folds, X_train, y_train, X_test):
    highlight_print(Fore.LIGHTBLUE_EX, '- %.6f (%s)' % (fold_auc, datetime.datetime.now()))
    # fold_importance_df = feature_importance(n_fold, clf, feats)
    # feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    n_fold += 1
    del clf #, fold_importance_df
    gc.collect()
        
score = roc_auc_score(y_train, model.oof_preds_df)
highlight_print(Fore.RED, '## CatBoost: %.6f' % score)
model.scores.append(score)
model.serialize_scores()

feats num: 696
model folder: ../expmodel/106_m_cat5_01
[94m- 0.785062 (2018-08-27 20:46:17.714025)[0m
[94m- 0.782466 (2018-08-27 20:46:19.929661)[0m
[94m- 0.785878 (2018-08-27 20:46:21.704855)[0m
[94m- 0.784271 (2018-08-27 20:46:23.840220)[0m
[94m- 0.785791 (2018-08-27 20:46:25.808593)[0m
[31m## CatBoost: 0.784571[0m


In [None]:
"""
- 0.785062 (2018-08-27 20:46:17.714025)
- 0.782466 (2018-08-27 20:46:19.929661)
- 0.785878 (2018-08-27 20:46:21.704855)
- 0.784271 (2018-08-27 20:46:23.840220)
- 0.785791 (2018-08-27 20:46:25.808593)
## CatBoost: 0.784571
"""

In [44]:
n_iter = 5
n_splits = 5
prefix = '106_{}_cat{}_{}_{}'.format('bx', n_splits, n_iter, str(r).zfill(2))
model_folder = os.path.join('..', 'expmodel', prefix)

b_round = 0
b_feature_importance_df = pd.DataFrame()

def cat_evaluate(**params):
    warnings.simplefilter('ignore')

    global b_feature_importance_df
    global b_round

    params['n_estimators'] = 10000
    params['depth'] = int(params['depth'])
    params['bootstrap_type'] = 'Bernoulli'
    params['eval_metric'] = 'AUC'
    params['od_type'] = 'Iter'
    params['allow_writing_files'] = False
    params['verbose'] = True

    submodel_folder = os.path.join(model_folder, str(b_round))
    try:
        os.makedirs(submodel_folder)
    except:
        pass
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=1001)
    # print(fit_params)
    b_model = ModelWrapper.ModelWrapper(
        CLF=CatBoostClassifier,
        name="CatBoost",
        model_folder=submodel_folder,
        feats=feats,
        drop_feats=drop_feats,
        params=params,
        fit_params=fit_params
    )
    n_fold = 0
    for clf, fold_auc in b_model.folds_train(folds, X_train, y_train, X_test):
        # fold_importance_df = feature_importance(n_fold, clf, feats)
        # b_feature_importance_df = pd.concat([b_feature_importance_df, fold_importance_df], axis=0)
        n_fold += 1
        del clf# , fold_importance_df
        gc.collect()

    score = roc_auc_score(y_train, b_model.oof_preds_df)
    b_model.scores.append(score)
    b_model.serialize_scores()
    b_round += 1
    return score

with timer("BayesianOptimization:"):
    b_params = {
        'learning_rate': (0.1, 0.5),
        'depth': (3, 6),
        'l2_leaf_reg': (38, 42),
        'subsample': (0.7, 1),
        'scale_pos_weight': (4.5, 5.5),
    }
    bo = BayesianOptimization(cat_evaluate, b_params)
    bo.maximize(init_points=5, n_iter=n_iter)

[31mInitialization[0m
[94m--------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |     depth |   l2_leaf_reg |   learning_rate |   scale_pos_weight |   subsample | 
feats num: 696
model folder: ../expmodel/106_bx_cat5_5_01/0
../expmodel/106_bx_cat5_5_01/0/CatBoost_0.pickle not exists, going to train.
0:	test: 0.7046939	best: 0.7046939 (0)	total: 773ms	remaining: 2h 8m 51s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.789184678
bestIteration = 248

Shrink model to first 249 iterations.
../expmodel/106_bx_cat5_5_01/0/CatBoost_1.pickle not exists, going to train.
0:	test: 0.6979556	best: 0.6979556 (0)	total: 841ms	remaining: 2h 20m 5s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.784194672
bestIteration = 331

Shrink model to first 332 iterations.
../expmodel/106_bx_cat5_5_01/0/CatBoost_2.pickle not exists, going to train.
0:	test: 0.7005832	best: 0.7005

    5 | 73m55s | [35m   0.78906[0m | [32m   5.0938[0m | [32m      38.7235[0m | [32m         0.1589[0m | [32m            4.7709[0m | [32m     0.9732[0m | 
[31mBayesian Optimization[0m
[94m--------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |     depth |   l2_leaf_reg |   learning_rate |   scale_pos_weight |   subsample | 
feats num: 696
model folder: ../expmodel/106_bx_cat5_5_01/5
../expmodel/106_bx_cat5_5_01/5/CatBoost_0.pickle not exists, going to train.
0:	test: 0.7058863	best: 0.7058863 (0)	total: 536ms	remaining: 1h 29m 23s
1000:	test: 0.7909952	best: 0.7911148 (990)	total: 9m 20s	remaining: 1h 23m 57s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.7912441057
bestIteration = 1090

Shrink model to first 1091 iterations.
../expmodel/106_bx_cat5_5_01/5/CatBoost_1.pickle not exists, going to train.
0:	test: 0.7041555	best: 0.7041555 (0)	total: 566ms	remaining:

    9 | 122m21s | [35m   0.79067[0m | [32m   3.0602[0m | [32m      38.1033[0m | [32m         0.1109[0m | [32m            5.2010[0m | [32m     0.9770[0m | 
feats num: 696
model folder: ../expmodel/106_bx_cat5_5_01/9
../expmodel/106_bx_cat5_5_01/9/CatBoost_0.pickle not exists, going to train.
0:	test: 0.7045031	best: 0.7045031 (0)	total: 1.26s	remaining: 3h 30m 7s
1000:	test: 0.7918278	best: 0.7918423 (993)	total: 16m 26s	remaining: 2h 27m 48s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.7923972496
bestIteration = 1581

Shrink model to first 1582 iterations.
../expmodel/106_bx_cat5_5_01/9/CatBoost_1.pickle not exists, going to train.
0:	test: 0.6979556	best: 0.6979556 (0)	total: 1.36s	remaining: 3h 46m 14s
1000:	test: 0.7868569	best: 0.7868583 (998)	total: 17m 24s	remaining: 2h 36m 27s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.7880187591
bestIteration = 1477

Shrink model to first 1478 iterations.
../expmodel/106_bx_cat5_5_01

In [45]:
bo.res['max']

{'max_val': 0.7909412030768752,
 'max_params': {'learning_rate': 0.10506322757096559,
  'depth': 3.1053539767594875,
  'l2_leaf_reg': 38.12140743351985,
  'subsample': 0.9375666413595538,
  'scale_pos_weight': 5.4535391282021015}}

In [32]:
"""
Shrink model to first 709 iterations.
    1 | 83m43s |    0.78989 |    3.0757 |       38.2374 |          0.1991 |             5.1549 |      0.7042 | 
feats num: 696
"""
feats[106]
col = 'AAO_PHONE__BIRTH'
X_train[[col]] = X_train[[col]].fillna(X_train[col].mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


In [8]:
def process_generated_feats(df, feats):
    sg = {
        '+': [], '-': [], '*': [], '/': [], '^':[]
    }
    x_dup_combos = [
        ('AAAO_SOURCES_MEAN__EMPLOYED_BIRTH', 'AAO_EMPLOYED__BIRTH'),
    ]
    o_dup_combos = [
        ('AAAX_CREDIT_ANNUITY__EMPLOYED', 'DAYS_EMPLOYED'),
        ('AAO_EMPLOYED__BIRTH', 'DAYS_EMPLOYED'),
        ('AAO_ANNUITY__INC', 'AAO_CREDIT__INC'),
    ]
    for feat in feats:
        if feat.startswith('*:') or \
            feat.startswith('/:') or \
            feat.startswith('+:') or \
            feat.startswith('-:') or \
            feat.startswith('^:'):
            operator = feat[0]
            f1, f2 = feat[2:].split('-')
            if operator == '*' and (f1, f2) in x_dup_combos:
                continue
            if operator == '/' and (f1, f2) in o_dup_combos:
                continue
            sg[operator].append((f1, f2))
    operators = {
         '*': np.multiply,
         '/': np.divide,
         '+': np.add,
         '-': np.subtract,
         '^': np.power
    }
    generated_feats = []
    for k, v in sg.items():
        for (f1, f2) in v:
            name = '{}:{}-{}'.format(k, f1, f2)
            generated_feats.append(name)
            df[name] = operators[k](df[f1], df[f2])
    return df, generated_feats

In [None]:
# -- lightgbm_10xx --
"""
# KFold 5

** Local: 0.792067, Submit: 0.796
LightGBM: 0.793998
LightGBM: 0.790757
LightGBM: 0.790989
LightGBM: 0.790886
LightGBM: 0.793755
- model/lgbm_10xx_5_1

* Local: 0.792063
LightGBM: 0.794158
LightGBM: 0.790051
LightGBM: 0.791403
LightGBM: 0.791301
LightGBM: 0.793603
- model/lgbm_10xpb_5_3


* Local: 0.791866
LightGBM: 0.794408
LightGBM: 0.790046
LightGBM: 0.791016
LightGBM: 0.790964
LightGBM: 0.792973
model/lgbm_10xpb_5_2
"""

# -- lightgbm_102 --
"""
## 0.791673(local), 0.796(submit) **
- lgmb_102_5_3
- KFold 5
"""

In [15]:
for i, r in cols['importance'].iteritems():
    if r == 0:
        print("'{}',".format(i))
        # print("'{}':'{}',".format(i, r))

'WALLSMATERIAL_MODE_Block',
'FLAG_OWN_CAR',
'CC_AMT_TOTAL_RECEIVABLE_MIN',
'ORGANIZATION_TYPE_Business Entity Type 1',
'FONDKAPREMONT_MODE_reg oper account',
'WALLSMATERIAL_MODE_Panel',
'BASEMENTAREA_MODE',
'WALLSMATERIAL_MODE_Wooden',
'CC_NAME_CONTRACT_STATUS_Active_MIN',
'FLAG_EMAIL',
'BASEMENTAREA_MEDI',
'REG_CITY_NOT_WORK_CITY',
'ORGANIZATION_TYPE_Trade: type 7',
'WEEKDAY_APPR_PROCESS_START_THURSDAY',
'ACTIVE_MONTHS_BALANCE_MAX_MAX',
'ORGANIZATION_TYPE_Government',
'FONDKAPREMONT_MODE_org spec account',
'CC_NAME_CONTRACT_STATUS_Active_MEAN',
'ELEVATORS_MODE',
'FLOORSMAX_MEDI',
'POS_COUNT',
'OCCUPATION_TYPE_Security staff',
'HOUSETYPE_MODE_block of flats',
'PREV_NAME_GOODS_CATEGORY_Medical Supplies_MEAN',
'PREV_CODE_REJECT_REASON_CLIENT_MEAN',
'PREV_NAME_GOODS_CATEGORY_Office Appliances_MEAN',
'LIVINGAPARTMENTS_MODE',
'NAME_CONTRACT_TYPE_Cash loans',
'NAME_CONTRACT_TYPE_Revolving loans',
'CLOSED_MONTHS_BALANCE_MAX_MAX',
'NAME_FAMILY_STATUS_Civil marriage',
'NAME_HOUSING_TYPE_With pa