In [6]:
import datetime
import gc
import os
import time
import numpy as np
import pandas as pd
import pickle
import json

from colorama import Fore, Style

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier, plot_importance
from bayes_opt import BayesianOptimization

from utils import ModelWrapper, XGBWrapper
from utils.utils import (
    highlight_print, timer, submit, calculate_feature_importance, load_feats,
)

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [7]:
def preprocess():
    _train = pd.read_csv('../data/raw/application_train.csv')
    _test = pd.read_csv('../data/raw/application_test.csv')
    print("Train samples: {}, test samples: {}".format(len(_train), len(_test)))
    df = _train.append(_test, sort=True).reset_index()

    # Optional: Remove 4 applications with XNA CODE_GENDER (train set)
    df = df[df['CODE_GENDER'] != 'XNA']
    
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    return train_df, test_df

In [8]:
def predict_base_learners(pred_base_learners, x_shape, preds_key, val, y=None, n_fold=-1):
    P = np.zeros((x_shape[0], len(pred_base_learners)))
    for i, model in enumerate(pred_base_learners):
        if preds_key == 'oof_preds':
            preds_filename = os.path.join(model['model_folder'], 'oof_preds_{}.npy'.format(n_fold))
        else:
            preds_filename = os.path.join(model['model_folder'], 'test_preds_{}.npy'.format(n_fold))
        p = np.load(preds_filename)
        P[:, i] = p
        if preds_key == 'oof_preds':
            model[preds_key][val] = p
        else:
            model[preds_key] += p/val

#         if y is not None:
#             highlight_print(Fore.LIGHTBLUE_EX, '%s: %.6f' % (model['name'], roc_auc_score(y, p)))
    return P

In [9]:
def ensemble_predict(base_learners, meta_learner, X):
    """Generate predictions from the ensemble."""
    P_pred = predict_base_learners(base_learners, X.shape)
    return P_pred, meta_learner.predict_proba(P_pred)[:, 1]

In [10]:
# https://www.dataquest.io/blog/introduction-to-ensembles/
from sklearn.base import clone
def stacking_cv(base_learners, folds, X_train, y_train, X_test):

    for model in base_learners:
        model['oof_preds'] = np.zeros(X_train.shape[0])
        model['test_preds'] = np.zeros(X_test.shape[0])

    X_cv, y_cv = [], []
    highlight_print(Fore.LIGHTBLUE_EX, "Blending {} models:".format(len(base_learners)))
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_train)):
        fold_x_train, fold_y_train = X_train.iloc[train_idx], y_train.iloc[train_idx]
        fold_x_valid, fold_y_valid = X_train.iloc[valid_idx], y_train.iloc[valid_idx]

        # Predict for fold auc score and meta_learner's input
        fold_P_base = predict_base_learners(
            base_learners, fold_x_valid.shape, 'oof_preds', valid_idx, fold_y_valid, n_fold=n_fold
        )
        X_cv.append(fold_P_base)
        y_cv.append(fold_y_valid)

        # Predict for final result
        predict_base_learners(base_learners, X_test.shape, 'test_preds', folds.n_splits, n_fold=n_fold)

    for model in base_learners:
        score = roc_auc_score(y_train, model['oof_preds'])
        highlight_print(Fore.RED, '- %s: %.6f' % (model['name'], score))
    
    X_cv = np.vstack(X_cv)
    y_cv = np.hstack(y_cv)
    base_test_preds = np.concatenate(
        tuple(model['test_preds'].reshape(-1, 1) for model in base_learners), axis=1)

    return X_cv, y_cv, base_test_preds

In [11]:
train_df, test_df = preprocess()
print("train_df shape:", train_df.shape)
print("test_df shape:", test_df.shape)

Train samples: 307511, test samples: 48744
train_df shape: (307507, 123)
test_df shape: (48744, 123)


In [12]:
models = [
    {
        'name': 'LightGBM',
        'model_folder': '../expmodel/91_m_lgbm5_best',
    },
    {
        'name': 'XGBoost',
        'model_folder': '../expmodel/xgb_10x_3',
    },
    {
        'name': 'CatBoost',
        'model_folder': '../expmodel/91_m_cat5_01',
    },
    {
        'name': 'RandomForest',
        'model_folder': '../expmodel/91_m_rf5_02',
    },
    {
        'name': 'LogisticRegression',
        'model_folder': '../expmodel/91_m_lr5_best',
    },
#     {
#         'name': 'NN',
#     }
]

num_folds = 5
folds = KFold(n_splits=num_folds, shuffle=True, random_state=1001)
X_cv, y_cv, base_test_preds = stacking_cv(models, folds, train_df, train_df['TARGET'], test_df)

[94mBlending 5 models:[0m
[31m- LightGBM: 0.793697[0m
[31m- XGBoost: 0.793660[0m
[31m- CatBoost: 0.791038[0m
[31m- RandomForest: 0.758536[0m
[31m- LogisticRegression: 0.768613[0m


In [13]:
X_df = pd.DataFrame(X_cv, index=range(X_cv.shape[0]))
y_df = pd.Series(y_cv)
base_test_df = pd.DataFrame(base_test_preds)

In [15]:
z_oof_preds = np.zeros(X_df.shape[0])
z_test_preds = np.zeros(base_test_df.shape[0])
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_df)):
    fold_x_train, fold_y_train = X_df.iloc[train_idx], y_df.iloc[train_idx]
    fold_x_valid, fold_y_valid = X_df.iloc[valid_idx], y_df.iloc[valid_idx]
    clf = XGBClassifier(**params)
    clf.fit(
        fold_x_train, fold_y_train.ravel(),
        eval_set=[(fold_x_valid, fold_y_valid)],
        eval_metric='auc', 
        verbose=False,
        early_stopping_rounds=200
    )
    # validation
    p = clf.predict_proba(fold_x_valid, ntree_limit=clf.best_ntree_limit)[:, 1]
    z_oof_preds[valid_idx] = clf.predict_proba(fold_x_valid)[:, 1]
    highlight_print(
        Fore.LIGHTBLUE_EX,
        '- #%s: %.6f' % (n_fold, roc_auc_score(fold_y_valid, p))
    )
    # prediction
    z_test_preds += clf.predict_proba(base_test_df)[:, 1]/folds.n_splits
    del clf
    gc.collect()

highlight_print(
    Fore.RED, '## Blender: %.6f' % (roc_auc_score(y_cv, z_oof_preds))
)

[94m- #0: 0.796664[0m
[94m- #1: 0.795836[0m
[94m- #2: 0.795861[0m
[94m- #3: 0.790509[0m
[94m- #4: 0.797654[0m
[31m## Blender: 0.794760[0m


In [None]:
r = 1
b_round = 0
n_splits = 3
prefix = '106_b_xgb_blender{}_r'.format(n_splits, str(r).zfill(2))
folder = os.path.join('..', 'expmodel', '{}'.format(prefix))
if not os.path.exists(folder):
    os.makedirs(folder)
        
def xgb_evaluate(**params):
    warnings.simplefilter('ignore')
    
    global b_feature_importance_df
    global b_round
    
    params['n_estimators'] = 10000
    params['max_depth'] = int(params['max_depth'])
    params['n_jobs'] = -1
    clf = XGBClassifier(**params)
    folds = KFold(n_splits=3, shuffle=True, random_state=1001)
    test_pred_proba = np.zeros(train_df.shape[0])
    oof_preds = np.zeros(X_cv.shape[0])
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_df)):
        fold_x_train, fold_y_train = X_df.iloc[train_idx], y_df.iloc[train_idx]
        fold_x_valid, fold_y_valid = X_df.iloc[valid_idx], y_df.iloc[valid_idx]
        clf = XGBClassifier(**params)
        clf.fit(
            fold_x_train, fold_y_train.ravel(),
            eval_set=[(fold_x_valid, fold_y_valid)],
            eval_metric='auc', 
            verbose=False,
            early_stopping_rounds=200
        )
        # validation
        p = clf.predict_proba(fold_x_valid, ntree_limit=clf.best_ntree_limit)[:, 1]
        oof_preds[valid_idx] = p
        b_round += 1

        del fold_x_train, fold_y_train, fold_x_valid, fold_y_valid
        gc.collect()

    return roc_auc_score(y_cv, oof_preds)


with timer("BayesianOptimization:"):
    b_params = {
        'colsample_bytree': (0.8, 1),
        'learning_rate': (.0, .15), 
        'num_leaves': (33, 35), #
        'subsample': (0.75, 0.85), 
        'max_depth': (2, 4), 
        'reg_alpha': (.0, .05), 
        'reg_lambda': (.8, 1), 
        'min_split_gain': (.01, .03), #
        'min_child_weight': (37.5, 38.5) #
    }
    bo = BayesianOptimization(xgb_evaluate, b_params)
    bo.maximize(init_points=5, n_iter=5)

[31mInitialization[0m
[94m------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   learning_rate |   max_depth |   min_child_weight |   min_split_gain |   num_leaves |   reg_alpha |   reg_lambda |   subsample | 


In [45]:
import importlib
importlib.reload(XGBWrapper)
importlib.reload(ModelWrapper)

nums_fold = 5
random_state = 1001

params = {}
params['n_jobs'] = -1
params['n_estimators'] = 10000

fit_params = {}
fit_params = {
    'eval_metric': 'auc', 
    'verbose': 1000,
    'early_stopping_rounds': 200
}

r = 1
model_folder = os.path.join('..', 'expmodel', 'm_stacking_xgb_{}'.format(str(r).zfill(2)))
folds = KFold(n_splits=nums_fold, shuffle=True, random_state=random_state)
model = XGBWrapper.XGBWrapper(
    CLF=XGBClassifier,
    name="XGBoostBlender",
    model_folder=model_folder,
    feats=[],
    drop_feats=[],
    params=params,
    fit_params=fit_params
)

n_fold = 0
feature_importance_df = pd.DataFrame()
for clf, fold_auc in model.folds_train(folds, X_df, y_df, base_test_df):
    highlight_print(Fore.LIGHTBLUE_EX, '- %.6f (%s)' % (fold_auc, datetime.datetime.now()))
    n_fold += 1
    del clf
    gc.collect()

score = roc_auc_score(y_cv, model.oof_preds_df)
highlight_print(Fore.RED, '## %s: %.6f' % (model.name, score))
model.scores.append(score)
model.serialize_scores()

feats num: 0
model folder: ../expmodel/m_stacking_xgb_01
[94m- 0.795689 (2018-08-28 09:35:32.326754)[0m
[94m- 0.793602 (2018-08-28 09:35:32.454463)[0m
[94m- 0.793733 (2018-08-28 09:35:32.594618)[0m
[94m- 0.788014 (2018-08-28 09:35:32.720201)[0m
[94m- 0.797053 (2018-08-28 09:35:32.858999)[0m
[31m## XGBoostBlender: 0.792592[0m


[31mInitialization[0m
[94m------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   learning_rate |   max_depth |   min_child_weight |   min_split_gain |   num_leaves |   reg_alpha |   reg_lambda |   subsample | 
[0]	validation_0-auc:0.785511
Will train until validation_0-auc hasn't improved in 200 rounds.
Stopping. Best iteration:
[38]	validation_0-auc:0.794875

[0]	validation_0-auc:0.78452
Will train until validation_0-auc hasn't improved in 200 rounds.
Stopping. Best iteration:
[64]	validation_0-auc:0.794045

    1 | 00m53s | [35m   0.79346[0m | [32m            0.8725[0m | [32m         0.1022[0m | [32m     3.1642[0m | [32m           37.8453[0m | [32m          0.0298[0m | [32m     33.3997[0m | [32m     0.0148[0m | [32m      0.8236[0m | [32m     0.7723[0m | 
[0]	validation_0-auc:0.7847

In [36]:
bo.res['max']

{'max_val': 0.7943772744528415,
 'max_params': {'colsample_bytree': 0.802213237633085,
  'learning_rate': 0.0067975582229254814,
  'num_leaves': 34.75291320121073,
  'subsample': 0.7761709648365778,
  'max_depth': 3.539833520345912,
  'reg_alpha': 0.048048882197007356,
  'reg_lambda': 0.9811300424628693,
  'min_split_gain': 0.018731692024530226,
  'min_child_weight': 37.70474827137932}}

In [None]:
"""
{'max_val': 0.7942660355460254,
 'max_params': {'colsample_bytree': 0.8352291937136631,
  'learning_rate': 0.010079811976522284,
  'num_leaves': 34.16670197670736,
  'subsample': 0.7893716094349154,
  'max_depth': 2.2226346417231815,
  'reg_alpha': 0.03551566886707049,
  'reg_lambda': 0.8585271482835403,
  'min_split_gain': 0.0239970903959675,
  'min_child_weight': 37.769941807264956}}

{'max_val': 0.7942386776844524,
 'max_params': {'colsample_bytree': 0.8172942528381818,
  'learning_rate': 0.010749105645269874,
  'subsample': 0.8016649063877558,
  'max_depth': 2.110984180833438,
  'reg_alpha': 0.005151941589768794,
  'reg_lambda': 0.8107495224202135}}
"""

In [37]:
params = bo.res['max']['max_params']
params['n_jobs'] = -1
params['n_estimators'] = 10000
params['max_depth'] = int(params['max_depth'])
clf = XGBClassifier(**params)

oof_preds = np.zeros(X_cv.shape[0])
test_preds = np.zeros(test_df.shape[0])
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_cv)):
    with timer('XGBoost meta training {}'.format(n_fold)):
        fold_x_train, fold_y_train = X_cv[train_idx.tolist(), :], y_cv.reshape(-1, 1)[train_idx.tolist(), :]
        fold_x_valid, fold_y_valid = X_cv[valid_idx.tolist(), :], y_cv.reshape(-1, 1)[valid_idx.tolist(), :]
        clf.fit(
            fold_x_train, fold_y_train.ravel(),
            eval_set=[(fold_x_valid, fold_y_valid.ravel())],
            eval_metric='auc', 
            verbose=200,
            early_stopping_rounds=200
        )
        # validation
        oof_preds[valid_idx] = clf.predict_proba(fold_x_valid)[:, 1]
        highlight_print(
            Fore.LIGHTBLUE_EX,
            'Meta learner %s: %.6f' % (n_fold, roc_auc_score(fold_y_valid, oof_preds[valid_idx.tolist()]))
        )
        # prediction
        test_preds += clf.predict_proba(base_test_preds)[:, 1]/folds.n_splits

highlight_print(
    Fore.RED, '- Meta learner : %.6f' % (roc_auc_score(y_cv, oof_preds))
)

[0]	validation_0-auc:0.784833
Will train until validation_0-auc hasn't improved in 200 rounds.
[200]	validation_0-auc:0.795574
[400]	validation_0-auc:0.795889
[600]	validation_0-auc:0.796108
[800]	validation_0-auc:0.796177
[1000]	validation_0-auc:0.796182
[1200]	validation_0-auc:0.796191
Stopping. Best iteration:
[1120]	validation_0-auc:0.796201

[94mMeta learner 0: 0.796180[0m
[92m[Done] XGBoost meta training 0 in 2:59 (2018-08-16 23:38:10.944664)[0m
[0]	validation_0-auc:0.783181
Will train until validation_0-auc hasn't improved in 200 rounds.
[200]	validation_0-auc:0.794581
[400]	validation_0-auc:0.794868
[600]	validation_0-auc:0.794963
[800]	validation_0-auc:0.794984
Stopping. Best iteration:
[694]	validation_0-auc:0.794997

[94mMeta learner 1: 0.794992[0m
[92m[Done] XGBoost meta training 1 in 2:4 (2018-08-16 23:40:15.481199)[0m
[0]	validation_0-auc:0.773693
Will train until validation_0-auc hasn't improved in 200 rounds.
[200]	validation_0-auc:0.794289
[400]	validation_0-au

In [None]:
"""
Meta learner 0: 0.796252
Meta learner 1: 0.795085
Meta learner 2: 0.794752
Meta learner 3: 0.789691
Meta learner 4: 0.796570
- Meta learner : 0.794348

Meta learner 0: 0.796164
Meta learner 1: 0.795039
Meta learner 2: 0.794793
Meta learner 3: 0.789682
Meta learner 4: 0.796596
- Meta learner : 0.794330

Meta learner 0: 0.796083
Meta learner 1: 0.794803
Meta learner 2: 0.794691
Meta learner 3: 0.789316
Meta learner 4: 0.796277
- Meta learner : 0.794102
"""

In [None]:
# https://stackoverflow.com/questions/41032551/how-to-compute-receiving-operating-characteristic-roc-and-auc-in-keras
import numpy as np
import tensorflow as tf
from sklearn.metrics import roc_auc_score
from sklearn.datasets import make_classification
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import np_utils
from keras.callbacks import Callback, EarlyStopping

def auc(y_true, y_pred):
    return tf.py_func(roc_auc_score, (y_true, y_pred), tf.double)

In [None]:
import keras.layers

oof_preds = np.zeros(X_cv.shape[0])
test_preds = np.zeros(test_df.shape[0])
prefix = 'nn_meta_101'
name = '{}-{date:%Y_%m_%d_%H_%M_%S}'.format(prefix, date=datetime.datetime.now())
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_cv)):
    with timer('NN meta training {}'.format(n_fold)):
        
        fold_x_train, fold_y_train = X_cv[train_idx.tolist(), :], y_cv.reshape(-1, 1)[train_idx.tolist(), :]
        fold_x_valid, fold_y_valid = X_cv[valid_idx.tolist(), :], y_cv.reshape(-1, 1)[valid_idx.tolist(), :]


        model = Sequential()
        model.add(Dense(20, input_dim=fold_x_train.shape[1], activation='relu'))
        model.add(Dense(20, activation='relu'))
        model.add(Dense(15, activation='relu'))
#         model.add(Dense(10, activation='relu'))
#         model.add(Dense(10, activation='relu'))
#         model.add(Dense(10, activation='relu'))
#         model.add(Dense(10, activation='relu'))
#         model.add(Dense(5, activation='relu'))
        model.add(Dense(1, activation="sigmoid", input_shape=(fold_x_train.shape[1],)))

        model.compile(
            loss='binary_crossentropy',
            optimizer='adam',
            # optimizer=keras.optimizers.SGD(lr=0.01, nesterov=True),
            metrics=['accuracy']
        )

        # callbacks = [EarlyStopping(monitor='val_loss', patience=300, verbose=1, mode='max')]
        callbacks = [EarlyStopping(monitor='val_loss', patience=200)]
        model.fit(
            fold_x_train, fold_y_train,
            validation_data=(fold_x_valid, fold_y_valid),
            callbacks=callbacks,
            shuffle=True,
            batch_size=4096*2,
            epochs=400,
            verbose=0,
        )

        # validation
        oof_preds[valid_idx] = model.predict(fold_x_valid)[:, 0]
        highlight_print(
            Fore.LIGHTBLUE_EX,
            'NN blender %s: %.6f' % (n_fold, roc_auc_score(fold_y_valid, oof_preds[valid_idx.tolist()]))
        )

        # prediction
        test_preds += model.predict(base_test_preds)[:, 0]/folds.n_splits
    
highlight_print(
    Fore.RED, '- NN blender : %.6f' % (roc_auc_score(y_cv, oof_preds))
)

In [None]:
"""
# 20*2, 15
- NN blender : 0.794022

# 20*2, 5
- NN blender : 0.794209

# 20*3
- NN blender : 0.794303

# 20*2
- Submit: 0.796
- NN blender : 0.794344

# 10*5, 5
- NN blender : 0.794008

# 20, 10*3, 1
- NN blender : 0.794273

# 4(20*3, 1) layers: add NN and replace with better xgb
NN Meta learner 0: 0.796051
NN Meta learner 1: 0.795375
NN Meta learner 2: 0.794692
NN Meta learner 3: 0.790037
NN Meta learner 4: 0.796729
- NN Meta learner : 0.794231

# 5(20*4, 1) layers; add 1 more layer of 20
NN Meta learner 0: 0.796164
NN Meta learner 1: 0.794285
NN Meta learner 2: 0.793681
NN Meta learner 3: 0.789356
NN Meta learner 4: 0.797046
- NN Meta learner : 0.793909

# 4(20*3, 1) layers; increase epochs from 100 to 400
NN Meta learner 0: 0.795979
NN Meta learner 1: 0.794495
NN Meta learner 2: 0.794147
NN Meta learner 3: 0.789693
NN Meta learner 4: 0.796946
- NN Meta learner : 0.794036
"""

In [None]:
print(name)
test_df['TARGET'] = test_preds
test_df[['SK_ID_CURR', 'TARGET']].to_csv('../submission/{}.csv'.format(name), index= False)

In [None]:
clf=XGBClassifier(**params)
oof_preds = np.zeros(X_cv.shape[0])
test_preds = np.zeros(test_df.shape[0])
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_cv)):
    with timer('XGBoost meta training {}'.format(n_fold)):
        fold_x_train, fold_y_train = X_cv[train_idx.tolist(), :], y_cv.reshape(-1, 1)[train_idx.tolist(), :]
        fold_x_valid, fold_y_valid = X_cv[valid_idx.tolist(), :], y_cv.reshape(-1, 1)[valid_idx.tolist(), :]
        clf.fit(
            fold_x_train, fold_y_train.ravel(),
            eval_set=[(fold_x_train, fold_y_train.ravel()), (fold_x_valid, fold_y_valid.ravel())],
            eval_metric='auc', 
            verbose=200,
            early_stopping_rounds=200
        )
        # validation
        p = clf.predict_proba(fold_x_valid, ntree_limit=clf.best_ntree_limit)[:, 1]
        print(clf.best_iteration, roc_auc_score(fold_y_valid.ravel(), p))
#         oof_preds[valid_idx] = clf.predict_proba(fold_x_valid)[:, 1]
#         highlight_print(
#             Fore.LIGHTBLUE_EX,
#             'Meta learner %s: %.6f' % (n_fold, roc_auc_score(fold_y_valid, oof_preds[valid_idx.tolist()]))
#         )
        # prediction
        test_preds += clf.predict_proba(base_test_preds)[:, 1]/folds.n_splits

# highlight_print(
#     Fore.RED, '- Meta learner : %.6f' % (roc_auc_score(y_cv, oof_preds))
# )
clf.best_iteration