In [2]:
import os
import math
import time

from contextlib import contextmanager
import numpy as np
import pandas as pd
from IPython.display import display
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))

In [4]:
def one_hot_encoder(df, nan_as_category = True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns= categorical_columns, dummy_na= nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

In [5]:
def application_train_and_test():
    
    application_train = pd.read_csv('data/application_train.csv')
    application_test = pd.read_csv('data/application_test.csv')
    
    application_train = application_train.sort_values(by = 'SK_ID_CURR')
    application_test = application_test.sort_values(by = 'SK_ID_CURR')
    df = application_train.append(application_test).reset_index()

    df['DAYS_EMPLOYED_MISS'] = df['DAYS_EMPLOYED'] == 365243
    df['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace = True)

    binary_features = ['FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'FLAG_MOBIL', 'FLAG_EMP_PHONE',
                       'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'REG_REGION_NOT_LIVE_REGION',
                       'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 
                       'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 
                       'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7',
                       'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 
                       'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15',
                       'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19',
                       'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21', 'DAYS_EMPLOYED_MISS']
    for feature in binary_features:
        df[feature], uniques = pd.factorize(df[feature])

    df, df_cat_col = one_hot_encoder(df)
    df = df.drop('index', axis=1)
    
    del application_train
    del application_test
    
    return df

In [6]:
def bureau_and_balance(df):
    
    bureau = pd.read_csv('data/bureau.csv')
    bureau_balance = pd.read_csv('data/bureau_balance.csv')
    
    previous_loans = bureau.groupby('SK_ID_CURR', as_index=False)['SK_ID_BUREAU'].count().rename(columns = {'SK_ID_BUREAU': 'previous_loans'})
    df = df.merge(previous_loans, on = 'SK_ID_CURR', how = 'left')
    df['previous_loans'] = df['previous_loans'].fillna(0)
    
    closed_loans = bureau[bureau['CREDIT_ACTIVE'] == 'Closed']
    closed_loans = closed_loans.groupby('SK_ID_CURR', as_index=False)['CREDIT_ACTIVE'].count().rename(columns = {'CREDIT_ACTIVE': 'closed_loans'})
    df = df.merge(closed_loans, on = 'SK_ID_CURR', how = 'left')
    df['closed_loans'] = df['closed_loans'].fillna(0)
    active_loans = bureau[bureau['CREDIT_ACTIVE'] == 'Active']
    active_loans = active_loans.groupby('SK_ID_CURR', as_index=False)['CREDIT_ACTIVE'].count().rename(columns = {'CREDIT_ACTIVE': 'active_loans'})
    df = df.merge(active_loans, on = 'SK_ID_CURR', how = 'left')
    df['active_loans'] = df['active_loans'].fillna(0)
    
    bureau, bureau_cat_cols = one_hot_encoder(bureau)
    bureau_balance, bureau_balance_cat_cols = one_hot_encoder(bureau_balance)

    bureau_agg = bureau.groupby('SK_ID_CURR').agg({'DAYS_CREDIT':['min', 'max', 'mean'],
                                                   'CREDIT_DAY_OVERDUE':['max', 'mean'],
                                                   'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
                                                   'DAYS_ENDDATE_FACT': ['mean'],
                                                   'AMT_CREDIT_MAX_OVERDUE': ['mean'],
                                                   'CNT_CREDIT_PROLONG': ['count'],
                                                   'AMT_CREDIT_SUM': ['min', 'max', 'mean'],
                                                   'AMT_CREDIT_SUM_DEBT': ['min', 'max', 'mean'],
                                                   'AMT_CREDIT_SUM_LIMIT': ['sum', 'mean'],
                                                   'AMT_CREDIT_SUM_OVERDUE': ['mean'],
                                                   'DAYS_CREDIT_UPDATE': ['min', 'max', 'mean'],
                                                   'AMT_ANNUITY': ['max', 'mean']})
    bureau_agg.columns = pd.Index(['BURO_' + e[0] + "_" + e[1].upper() for e in bureau_agg.columns.tolist()])
    df = df.merge(bureau_agg, on = 'SK_ID_CURR', how = 'left')
    
    bureau_balance_and_bureau = bureau.merge(bureau_balance, on = 'SK_ID_BUREAU', how = 'left')
    bureau_balance_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size']}
    for col in bureau_balance_cat_cols:
        bureau_balance_aggregations[col] = ['mean']
    bureau_balance_agg = bureau_balance_and_bureau.groupby('SK_ID_CURR').agg(bureau_balance_aggregations)
    bureau_balance_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in bureau_balance_agg.columns.tolist()])
    for row in range(len(bureau_balance_agg)):
        if np.isnan(bureau_balance_agg.iloc[row, 0]) == True and np.isnan(bureau_balance_agg.iloc[row, 1]) == True:
            bureau_balance_agg.iloc[row, 2] = np.nan

    df = df.merge(bureau_balance_agg, on = 'SK_ID_CURR', how = 'left')
    
    del bureau
    del bureau_balance
    del bureau_agg
    del previous_loans
    del closed_loans
    del active_loans
    del bureau_balance_and_bureau
    
    return df

In [7]:
def cc_balance(df):
    
    credit_card_balance = pd.read_csv('data/credit_card_balance.csv')
    
    prev_credit_months = credit_card_balance.groupby(['SK_ID_CURR', 'SK_ID_PREV'], as_index=False)['MONTHS_BALANCE'].count().rename(columns = {'MONTHS_BALANCE': 'prev_credit_months'})
    prev_credit_months['prev_credit_months'] = -prev_credit_months['prev_credit_months']
    previous_credit_loans = pd.DataFrame()
    previous_credit_loans['SK_ID_CURR'] = prev_credit_months['SK_ID_CURR'].value_counts().index.values
    previous_credit_loans['previous_credit_loans'] = prev_credit_months['SK_ID_CURR'].value_counts().values
    df = df.merge(previous_credit_loans, on = 'SK_ID_CURR', how = 'left')
    df['previous_credit_loans'] = df['previous_credit_loans'].fillna(0)
    
    current_loan_status = credit_card_balance[['SK_ID_CURR', 'SK_ID_PREV', 'NAME_CONTRACT_STATUS']].sort_values(by = ['SK_ID_CURR', 'NAME_CONTRACT_STATUS']).drop_duplicates()
    current_loan_status = current_loan_status[current_loan_status['NAME_CONTRACT_STATUS'].isin(['Active', 'Completed'])]
    
    prev_credit_completed = current_loan_status[current_loan_status['NAME_CONTRACT_STATUS'] == 'Completed']
    prev_credit_completed = pd.get_dummies(prev_credit_completed)
    del prev_credit_completed['SK_ID_PREV']
    prev_credit_completed = prev_credit_completed.groupby('SK_ID_CURR', as_index=False)['NAME_CONTRACT_STATUS_Completed'].count()
    df = df.merge(prev_credit_completed, on = 'SK_ID_CURR', how = 'left')
    df['NAME_CONTRACT_STATUS_Completed'] = df['NAME_CONTRACT_STATUS_Completed'].fillna(0)
    
    prev_credit_active = current_loan_status.drop_duplicates(subset = ['SK_ID_CURR', 'SK_ID_PREV'], keep = False)
    prev_credit_active = prev_credit_active[prev_credit_active['NAME_CONTRACT_STATUS'] == 'Active']
    prev_credit_active = pd.get_dummies(prev_credit_active)
    del prev_credit_active['SK_ID_PREV']
    prev_credit_active = prev_credit_active.groupby('SK_ID_CURR', as_index=False)['NAME_CONTRACT_STATUS_Active'].count()
    df = df.merge(prev_credit_active, on = 'SK_ID_CURR', how = 'left')
    df['NAME_CONTRACT_STATUS_Active'] = df['NAME_CONTRACT_STATUS_Active'].fillna(0)
        
    credit_card_balance.drop(columns = ['NAME_CONTRACT_STATUS', 'SK_ID_PREV'], inplace = True)
    credit_card_balance, credit_card_balance_cat_cols = one_hot_encoder(credit_card_balance)
    
    cc_agg = credit_card_balance.groupby('SK_ID_CURR').agg(['min', 'max', 'mean', 'sum', 'var'])
    cc_agg.columns = pd.Index(['CC_' + e[0] + "_" + e[1].upper() for e in cc_agg.columns.tolist()])
    
    df = df.merge(cc_agg, on = 'SK_ID_CURR', how = 'left')
    
    del credit_card_balance
    del prev_credit_months
    del previous_credit_loans
    del current_loan_status
    del prev_credit_completed
    del prev_credit_active
    del cc_agg
    
    return df

In [8]:
def installments(df):
    
    installments_payments = pd.read_csv('data/installments_payments.csv')

    installments_day_diff = installments_payments.sort_values(by = ['SK_ID_CURR', 'SK_ID_PREV'])[['SK_ID_CURR', 'SK_ID_PREV', 'DAYS_INSTALMENT', 'DAYS_ENTRY_PAYMENT']]
    installments_day_diff['installments_day_diff'] = installments_day_diff['DAYS_INSTALMENT'] - installments_day_diff['DAYS_ENTRY_PAYMENT']
    installments_day_diff_agg = installments_day_diff.groupby(['SK_ID_CURR', 'SK_ID_PREV'], as_index=False)['installments_day_diff'].mean()
    del installments_day_diff_agg['SK_ID_PREV']
    
    installments_day_diff_agg = installments_day_diff_agg.groupby('SK_ID_CURR', as_index=False)['installments_day_diff'].mean()
    df = df.merge(installments_day_diff_agg, on = 'SK_ID_CURR', how = 'left')
    del installments_payments['DAYS_INSTALMENT']
    del installments_payments['DAYS_ENTRY_PAYMENT']

    installments_diff = installments_payments.sort_values(by = ['SK_ID_CURR', 'SK_ID_PREV'])[['SK_ID_CURR', 'SK_ID_PREV', 'AMT_INSTALMENT', 'AMT_PAYMENT']]
    installments_diff['installments_diff'] = installments_diff['AMT_INSTALMENT'] - installments_diff['AMT_PAYMENT']
    installments_diff_agg = installments_diff.groupby(['SK_ID_CURR', 'SK_ID_PREV'], as_index=False)['installments_diff'].mean()
    installments_diff_agg.head()
    del installments_diff_agg['SK_ID_PREV']

    installments_diff_agg = installments_diff_agg.groupby('SK_ID_CURR', as_index=False)['installments_diff'].mean()
    df = df.merge(installments_diff_agg, on = 'SK_ID_CURR', how = 'left')
    
    del installments_payments
    del installments_day_diff
    del installments_day_diff_agg
    
    return df

In [9]:
def pos_cash(df):
    
    POS_CASH_balance = pd.read_csv('data/POS_CASH_balance.csv')

    prev_POS_months = POS_CASH_balance.groupby(['SK_ID_CURR', 'SK_ID_PREV'], as_index=False)['MONTHS_BALANCE'].count().rename(columns = {'MONTHS_BALANCE': 'prev_POS_months'})
    prev_POS_months['prev_POS_months'] = -prev_POS_months['prev_POS_months']
    previous_POS_loans = pd.DataFrame()
    previous_POS_loans['SK_ID_CURR'] = prev_POS_months['SK_ID_CURR'].value_counts().index.values
    previous_POS_loans['previous_POS_loans'] = prev_POS_months['SK_ID_CURR'].value_counts().values
    df = df.merge(previous_POS_loans, on = 'SK_ID_CURR', how = 'left')
    df['previous_POS_loans'] = df['previous_POS_loans'].fillna(0)

    current_POS_status = POS_CASH_balance[['SK_ID_CURR', 'SK_ID_PREV', 'NAME_CONTRACT_STATUS']].sort_values(by = ['SK_ID_CURR', 'NAME_CONTRACT_STATUS']).drop_duplicates()
    current_POS_status = current_POS_status[current_POS_status['NAME_CONTRACT_STATUS'].isin(['Active', 'Completed'])]

    prev_POS_completed = current_POS_status[current_POS_status['NAME_CONTRACT_STATUS'] == 'Completed']
    prev_POS_completed = pd.get_dummies(prev_POS_completed)
    del prev_POS_completed['SK_ID_PREV']
    prev_POS_completed = prev_POS_completed.rename(columns = {'NAME_CONTRACT_STATUS_Completed': 'NAME_CONTRACT_STATUS_Completed_POS'})
    prev_POS_completed = prev_POS_completed.groupby('SK_ID_CURR', as_index=False)['NAME_CONTRACT_STATUS_Completed_POS'].count()
    df = df.merge(prev_POS_completed, on = 'SK_ID_CURR', how = 'left')
    df['NAME_CONTRACT_STATUS_Completed_POS'] = df['NAME_CONTRACT_STATUS_Completed_POS'].fillna(0)

    prev_POS_active = current_POS_status.drop_duplicates(subset = ['SK_ID_CURR', 'SK_ID_PREV'], keep = False)
    prev_POS_active = prev_POS_active[prev_POS_active['NAME_CONTRACT_STATUS'] == 'Active']
    prev_POS_active = pd.get_dummies(prev_POS_active)
    del prev_POS_active['SK_ID_PREV']
    prev_POS_active = prev_POS_active.rename(columns = {'NAME_CONTRACT_STATUS_Active': 'NAME_CONTRACT_STATUS_Active_POS'})
    prev_POS_active = prev_POS_active.groupby('SK_ID_CURR', as_index=False)['NAME_CONTRACT_STATUS_Active_POS'].count()
    df = df.merge(prev_POS_active, on = 'SK_ID_CURR', how = 'left')
    df['NAME_CONTRACT_STATUS_Active_POS'] = df['NAME_CONTRACT_STATUS_Active_POS'].fillna(0)

    POS_CASH_balance.drop(columns = ['NAME_CONTRACT_STATUS', 'SK_ID_PREV'], inplace = True) 
    POS_CASH_balance_agg = POS_CASH_balance.groupby('SK_ID_CURR').agg(['min', 'max', 'mean'])
    POS_CASH_balance_agg.columns = pd.Index(['POS_' + e[0] + "_" + e[1].upper() for e in POS_CASH_balance_agg.columns.tolist()])
    
    df = df.merge(POS_CASH_balance_agg, on = 'SK_ID_CURR', how = 'left')
    
    del prev_POS_months
    del previous_POS_loans
    del current_POS_status
    del prev_POS_completed
    del prev_POS_active
    del POS_CASH_balance
    del POS_CASH_balance_agg
    
    return df

In [10]:
def prev_app(df):

    previous_application = pd.read_csv('data/previous_application.csv')

    previous_application['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace= True)
    previous_application['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace= True)
    previous_application['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace= True)
    previous_application['DAYS_LAST_DUE'].replace(365243, np.nan, inplace= True)
    previous_application['DAYS_TERMINATION'].replace(365243, np.nan, inplace= True)

    previous_application, cat_cols = one_hot_encoder(previous_application)
    
    num_aggregations = {'AMT_ANNUITY': ['min', 'max', 'mean'],
                        'AMT_APPLICATION': ['min', 'max', 'mean'],
                        'AMT_CREDIT': ['min', 'max', 'mean'],
                        'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'],
                        'AMT_GOODS_PRICE': ['min', 'max', 'mean'],
                        'HOUR_APPR_PROCESS_START': ['mean', 'median'],
                        'NFLAG_LAST_APPL_IN_DAY': ['mean', 'size'],
                        'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
                        'RATE_INTEREST_PRIMARY': ['min', 'max', 'mean'],
                        'RATE_INTEREST_PRIVILEGED': ['min', 'max', 'mean'],
                        'DAYS_DECISION': ['min', 'max', 'mean'],
                        'CNT_PAYMENT': ['sum', 'mean']}

    cat_aggregations = {}
    for cat in cat_cols:
        cat_aggregations[cat] = ['mean']

    prev_agg = previous_application.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    prev_agg.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()])

    approved = previous_application[previous_application['NAME_CONTRACT_STATUS_Approved'] == 1]
    approved_agg = approved.groupby('SK_ID_CURR').agg(num_aggregations)
    approved_agg.columns = pd.Index(['APR_' + e[0] + "_" + e[1].upper() for e in approved_agg.columns.tolist()])
    prev_agg = prev_agg.join(approved_agg, how='left', on='SK_ID_CURR')

    refused = previous_application[previous_application['NAME_CONTRACT_STATUS_Refused'] == 1]
    refused_agg = refused.groupby('SK_ID_CURR').agg(num_aggregations)
    refused_agg.columns = pd.Index(['REF_' + e[0] + "_" + e[1].upper() for e in refused_agg.columns.tolist()])
    prev_agg = prev_agg.join(refused_agg, how='left', on='SK_ID_CURR')

    df = df.merge(prev_agg, on = 'SK_ID_CURR', how = 'left')
    
    del previous_application
    del prev_agg
    del approved
    del approved_agg
    del refused
    del refused_agg
    
    return df

In [11]:
def display_importances(feature_importance_df_):
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances-01.png')

In [12]:
# LightGBM GBDT with KFold or Stratified KFold
# Parameters from Tilii kernel: https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code
def kfold_lightgbm(df, num_folds, stratified = False):
    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1001)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=1001)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    # This is not handling multi-level indexing correctly
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV']]
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            nthread=4,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1, )

        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
            eval_metric= 'auc', verbose= 100, early_stopping_rounds= 100)

        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    # Write submission file and plot feature importance
    test_df['TARGET'] = sub_preds
    test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index= False)
    display_importances(feature_importance_df)
    return feature_importance_df

In [14]:
def main():
    df = application_train_and_test()
    with timer("Process bureau and bureau_balance"):
        df = bureau_and_balance(df)
        print("df shape:", df.shape)
    with timer("Process credit card balance"):
        df = cc_balance(df)
        print("df shape:", df.shape)
    with timer("Process installments payments"):
        df = installments(df)
        print("df shape:", df.shape)
    with timer("Process POS_CASH_balance"):
        df = pos_cash(df)
        print("df shape:", df.shape)
    with timer("Process previous_applications"):
        df = prev_app(df)
        print("df shape:", df.shape)

    with timer("Run LightGBM with kfold"):
        feat_importance = kfold_lightgbm(df, num_folds= 5, stratified = True)
        
    return df, feat_importance

if __name__ == "__main__":
    submission_file_name = "predictions/lightgbm_pred.csv"
    with timer("Full model run"):
        df, feat_importance = main()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


df shape: (356255, 299)
Process bureau and bureau_balance - done in 98s
df shape: (356255, 402)
Process credit card balance - done in 28s
df shape: (356255, 404)
Process installments payments - done in 33s
df shape: (356255, 422)
Process POS_CASH_balance - done in 27s
df shape: (356255, 680)
Process previous_applications - done in 45s
Starting LightGBM. Train shape: (307511, 680), test shape: (48744, 680)
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.772002	valid_1's auc: 0.755761
[200]	training's auc: 0.794438	valid_1's auc: 0.770764
[300]	training's auc: 0.807579	valid_1's auc: 0.777477
[400]	training's auc: 0.817184	valid_1's auc: 0.780922
[500]	training's auc: 0.82528	valid_1's auc: 0.782994
[600]	training's auc: 0.832569	valid_1's auc: 0.78421
[700]	training's auc: 0.838993	valid_1's auc: 0.784874


KeyboardInterrupt: 

Use feature importances graph to focus data preprocessing