In [4]:
import datetime
import gc
import os
import time
import numpy as np
import pandas as pd
import json

from colorama import Fore, Style

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from bayes_opt import BayesianOptimization

from utils import ModelWrapper, XGBWrapper
from utils.preprocess import load_all
from utils.utils import (
    highlight_print, timer, submit, calculate_feature_importance, load_feats,
)

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [5]:
def exp_feats(df):
    df['BAO_CREDIT__ANNUNITY'] = df['ACTIVE_AMT_CREDIT_SUM_SUM'] / df['AMT_ANNUITY']
    df['BAO_CREDIT__INC'] = df['ACTIVE_AMT_CREDIT_SUM_SUM'] / df['AMT_INCOME_TOTAL']
    df['BAO_CREDIT__CREDIT'] = df['ACTIVE_AMT_CREDIT_SUM_SUM'] / df['AMT_CREDIT']
    df['BAO_CREDIT__GOODS'] = df['ACTIVE_AMT_CREDIT_SUM_SUM'] / df['AMT_GOODS_PRICE']
    
    df['PAO_CREDIT__ANNUNITY'] = df['PREV_AMT_CREDIT_MEAN'] / df['AMT_ANNUITY']
    df['PAO_CREDIT__INC'] = df['PREV_AMT_CREDIT_MEAN'] / df['AMT_INCOME_TOTAL']
    df['PAO_CREDIT__CREDIT'] = df['PREV_AMT_CREDIT_MEAN'] / df['AMT_CREDIT']
    df['PAO_CREDIT__GOODS'] = df['PREV_AMT_CREDIT_MEAN'] / df['AMT_GOODS_PRICE']
    
    df['PAO_ANNUNITY'] = df['PREV_AMT_ANNUITY_MEAN'] / df['AMT_ANNUITY']
    df['PAO_ANNUNITY__INC'] = df['PREV_AMT_ANNUITY_MEAN'] / df['AMT_INCOME_TOTAL']
    df['PAO_ANNUNITY__CREDIT'] = df['PREV_AMT_ANNUITY_MEAN'] / df['AMT_CREDIT']
    
    df["IBO_PAYMENT__ACREDIT"] = df["INSTAL_AMT_PAYMENT_SUM"] / df['ACTIVE_AMT_CREDIT_SUM_SUM']
    df["IBO_PAYMENT__CCREDIT"] = df["INSTAL_AMT_PAYMENT_SUM"] / df['CLOSED_AMT_CREDIT_SUM_SUM']
    df["IAO_PAYMENT__ANNUITY"] = df["INSTAL_AMT_PAYMENT_SUM"] / df['AMT_ANNUITY']
    df["IAO_PAYMENT__INC"] = df["INSTAL_AMT_PAYMENT_SUM"] / df['AMT_INCOME_TOTAL']
    
    return df

In [6]:
with timer('Load features'):
    df = load_all('../data/preprocess')
    print("df shape:", df.shape)

df shape: (356251, 998)
[92m[Done] Load features at 2018-08-28 11:45:53.451369[0m


In [7]:
df_exp = df.copy()
df_exp = exp_feats(df)
print("df_exp shape:", df_exp.shape)

df_exp shape: (356251, 1013)


In [8]:
df_full = df_exp.copy()

In [9]:
base = '91_m_lgbm5_best'
model_folder = os.path.join('..', 'expmodel', base)
feats = load_feats(model_folder, 'feats.json')

drop_feats = []
if feats is not None:
    print('Load feats from: {}'.format(base))
    drop_feats = list(set(df_full.columns) - set(feats))
else:
    skip_feats = ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index'] + drop_feats
    feats = [f for f in df_full.columns if f not in skip_feats]
print('feats num: {}, drop_feats: {}'.format(len(feats), len(drop_feats)))

Load feats from: 91_m_lgbm5_best
feats num: 709, drop_feats: 364


In [10]:
df_full, selected_generated_feats = process_generated_feats(df_full, feats)
print('df_full shape: {}, selected_generated_feats: {}'.format(
    df_full.shape, len(selected_generated_feats)))
df_gen = df_full.copy()
print('df_gen shape: {}, feats num: {}'.format(df_gen.shape, len(feats)))



df_full shape: (356251, 1073), selected_generated_feats: 60
df_gen shape: (356251, 1073), feats num: 709


In [None]:
# https://stackoverflow.com/a/38134049
df_rf = df_gen.copy()
df_rf[feats] = df_rf[feats].fillna(df_rf.median()).replace([np.inf, -np.inf], df_rf.median())

In [None]:
train_df = df_rf[df_rf['TARGET'].notnull()]
test_df = df_rf[df_rf['TARGET'].isnull()]
print("train_df shape:", train_df.shape)
print("test_df shape:", test_df.shape)

X_train = train_df[feats]
X_test = test_df[feats]
y_train = train_df['TARGET']
print("X_train df shape:", X_train.shape)

In [12]:
def feature_importance(n_fold, clf, feats):
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = feats
    fold_importance_df["importance"] = clf.feature_importances_
    fold_importance_df["fold"] = n_fold+1
    return fold_importance_df

In [13]:
import importlib
importlib.reload(ModelWrapper)

r = 1
nums_fold = 5
random_state = 1001
prefix = '106_{}_rf{}_{}'.format('m', nums_fold, str(r).zfill(2))
# prefix = 'xgb_10x_3'
model_folder = os.path.join('..', 'expmodel', '{}'.format(prefix))
params = {}
params = {
    'n_estimators': 56,
    'max_depth': 8,
    'class_weight': 'balanced',
    'min_samples_leaf': 4
}
"""
#             n_estimators=55,
#             max_depth=9,
#             class_weight='balanced',
#             random_state=42,
#             min_samples_leaf=5,

"""
params['n_jobs'] = -1
params['verbose'] = True

fit_params = {}
model = ModelWrapper.ModelWrapper(
    CLF=RandomForestClassifier,
    name="RandomForest",
    model_folder=model_folder,
    feats=feats,
    drop_feats=drop_feats,
    params=params,
    fit_params=fit_params
)

In [14]:
n_fold = 0
random_state = 1001
feature_importance_df = pd.DataFrame()
folds = KFold(n_splits=nums_fold, shuffle=True, random_state=random_state)
for clf, fold_auc in model.folds_train(folds, X_train, y_train, X_test):
    highlight_print(Fore.LIGHTBLUE_EX, '- %.6f (%s)' % (fold_auc, datetime.datetime.now()))
    fold_importance_df = feature_importance(n_fold, clf, feats)
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    n_fold += 1
    del clf , fold_importance_df
    gc.collect()
        
score = roc_auc_score(y_train, model.oof_preds_df)
highlight_print(Fore.RED, '## RF: %.6f' % score)
model.scores.append(score)
model.serialize_scores()

feats num: 709
model folder: ../expmodel/106_m_rf5_01
../expmodel/106_m_rf5_01/RandomForest_0.pickle not exists, going to train.


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [35]:
n_iter = 5
n_splits = 5
prefix = '106_{}_xgb{}_{}_{}'.format('b', n_splits, n_iter, str(r).zfill(2))
model_folder = os.path.join('..', 'expmodel', prefix)

b_round = 0
b_feature_importance_df = pd.DataFrame()

def xgb_evaluate(**params):
    warnings.simplefilter('ignore')

    global b_feature_importance_df
    global b_round

    params['n_estimators'] = 10000
    params['max_depth'] = int(params['max_depth'])
    params['num_leaves'] = int(params['num_leaves'])
    params['verbose'] = True

    submodel_folder = os.path.join(model_folder, str(b_round))
    try:
        os.makedirs(submodel_folder)
    except:
        pass
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=1001)
    b_model = ModelWrapper.ModelWrapper(
        CLF=XGBClassifier,
        name="XGBBoost",
        model_folder=submodel_folder,
        feats=feats,
        drop_feats=drop_feats,
        params=params,
        fit_params=fit_params
    )
    n_fold = 0
    for clf, fold_auc in b_model.folds_train(folds, X_train, y_train, X_test):
        fold_importance_df = feature_importance(n_fold, clf, feats)
        b_feature_importance_df = pd.concat([b_feature_importance_df, fold_importance_df], axis=0)
        n_fold += 1
        del clf, fold_importance_df
        gc.collect()

    score = roc_auc_score(y_train, b_model.oof_preds_df)
    b_model.scores.append(score)
    b_model.serialize_scores()
    b_round += 1
    return score

with timer("BayesianOptimization:"):
    b_params = {'colsample_bytree': (0.8, 1),
              'learning_rate': (.01, .02), 
              'num_leaves': (33, 35), 
              'subsample': (0.8, 1), 
              'max_depth': (7, 9), 
              'reg_alpha': (.03, .05), 
              'reg_lambda': (.06, .08), 
              'min_split_gain': (.01, .03),
              'min_child_weight': (38, 40)}
    bo = BayesianOptimization(xgb_evaluate, b_params)
    bo.maximize(init_points=5, n_iter=5)

submission/lgbm_10xx_5-2018_08_02_13_20_17.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [3]:
def process_generated_feats(df, feats):
    sg = {
        '+': [], '-': [], '*': [], '/': [], '^':[]
    }
    x_dup_combos = [
        ('AAAO_SOURCES_MEAN__EMPLOYED_BIRTH', 'AAO_EMPLOYED__BIRTH'),
    ]
    o_dup_combos = [
        ('AAAX_CREDIT_ANNUITY__EMPLOYED', 'DAYS_EMPLOYED'),
        ('AAO_EMPLOYED__BIRTH', 'DAYS_EMPLOYED'),
        ('AAO_ANNUITY__INC', 'AAO_CREDIT__INC'),
    ]
    for feat in feats:
        if feat.startswith('*:') or \
            feat.startswith('/:') or \
            feat.startswith('+:') or \
            feat.startswith('-:') or \
            feat.startswith('^:'):
            operator = feat[0]
            f1, f2 = feat[2:].split('-')
            if operator == '*' and (f1, f2) in x_dup_combos:
                continue
            if operator == '/' and (f1, f2) in o_dup_combos:
                continue
            sg[operator].append((f1, f2))
    operators = {
         '*': np.multiply,
         '/': np.divide,
         '+': np.add,
         '-': np.subtract,
         '^': np.power
    }
    generated_feats = []
    for k, v in sg.items():
        for (f1, f2) in v:
            name = '{}:{}-{}'.format(k, f1, f2)
            generated_feats.append(name)
            df[name] = operators[k](df[f1], df[f2])
    return df, generated_feats