In [1]:
import os
import math
import time

from contextlib import contextmanager
import numpy as np
import pandas as pd
from IPython.display import display
import lightgbm
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
@contextmanager
def timer(title):
    """Context manager that prints how long the wrapped block took.

    Parameters
    ----------
    title : str
        Label placed in front of the timing message.

    Yields
    ------
    None

    Notes
    -----
    The message is printed only when the wrapped block finishes without
    raising; an exception propagates before the print is reached.
    """
    # perf_counter is a monotonic clock, so the measured duration cannot be
    # skewed (or go negative) if the system wall clock is adjusted mid-run.
    started = time.perf_counter()
    yield
    print("{} - done in {:.0f}s".format(title, time.perf_counter() - started))

In [3]:
def display_importances(feature_importance_df_, top_n=40,
                        output_path='../output/lgbm_importances_bayesian.png'):
    """Plot the features with the highest mean importance and save the figure.

    Parameters
    ----------
    feature_importance_df_ : pandas.DataFrame
        Must contain a "feature" and an "importance" column; several rows per
        feature are expected (one per fold) and are averaged for the ranking.
    top_n : int, default 40
        Number of top-ranked features to draw.
    output_path : str, default '../output/lgbm_importances_bayesian.png'
        File the figure is written to. The parent directory is created when
        it does not exist yet.

    Returns
    -------
    None
        The figure is left open so an inline notebook backend can render it.
    """
    # Rank features by their mean importance over all rows (folds).
    mean_importance = (
        feature_importance_df_[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)
    )
    top_features = mean_importance.index[:top_n]
    best_features = feature_importance_df_.loc[
        feature_importance_df_["feature"].isin(top_features)
    ]

    # Make sure the destination exists before doing the (slow) rendering.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    plt.figure(figsize=(12, 10))
    # `order` pins the bar order to the mean ranking computed above, so the
    # plot agrees with its "avg over folds" title instead of being ordered by
    # whichever single-fold value happens to be the largest.
    sns.barplot(x="importance", y="feature", data=best_features,
                order=list(top_features))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig(output_path)

In [4]:
def kfold_lightgbm(df, num_folds, stratified = False, submission_file_name = None):
    """Train LightGBM with K-fold cross validation and write test predictions.

    Rows of ``df`` whose ``TARGET`` is not null form the training set; rows
    whose ``TARGET`` is null form the test set. One model is trained per fold;
    out-of-fold predictions are used for the reported AUC and the test
    predictions are averaged over the folds.

    Parameters
    ----------
    df : pandas.DataFrame
        Combined train/test frame. Must contain ``TARGET`` and ``SK_ID_CURR``.
    num_folds : int
        Number of cross-validation splits.
    stratified : bool, default False
        Use ``StratifiedKFold`` instead of ``KFold``.
    submission_file_name : str or path-like, optional
        Destination of the submission CSV. When omitted, the module-level
        variable ``submission_file_name`` is used (the previous behaviour).

    Returns
    -------
    pandas.DataFrame
        One row per (feature, fold) with columns ``feature``, ``importance``
        (gain) and ``fold``.

    Raises
    ------
    ValueError
        If no submission path is passed and no module-level
        ``submission_file_name`` is defined. Raised before any training.

    Side effects
    ------------
    Prints progress, writes the submission CSV and calls
    ``display_importances`` on the collected importances.
    """
    # Resolve the output path first: previously a missing global only surfaced
    # as a NameError after every fold had already been trained.
    if submission_file_name is None:
        submission_file_name = globals().get('submission_file_name')
    if submission_file_name is None:
        raise ValueError(
            "submission_file_name was not passed and no module-level "
            "'submission_file_name' is defined"
        )
    if isinstance(submission_file_name, (str, os.PathLike)):
        submission_dir = os.path.dirname(submission_file_name)
        if submission_dir:
            os.makedirs(submission_dir, exist_ok=True)

    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]

    cat_cols = ['FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'FLAG_MOBIL', 'FLAG_EMP_PHONE',
                'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL',
                'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION',
                'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY',
                'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6',
                'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_11',
                'FLAG_DOCUMENT_18', 'CODE_GENDER', 'NAME_CONTRACT_TYPE',
                'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'EMERGENCYSTATE_MODE',
                'HOUSETYPE_MODE', 'FONDKAPREMONT_MODE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS',
                'NAME_HOUSING_TYPE', 'NAME_TYPE_SUITE', 'WALLSMATERIAL_MODE','WEEKDAY_APPR_PROCESS_START',
                'NAME_INCOME_TYPE', 'OCCUPATION_TYPE', 'ORGANIZATION_TYPE']

    # Only the categorical columns actually present in this dataset.
    present_columns = set(train_df.columns)
    included_cat_cols = [col for col in cat_cols if col in present_columns]

    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1001)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=1001)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    non_feature_cols = ('TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index')
    feats = [f for f in train_df.columns if f not in non_feature_cols]

    # Loop-invariant slices, built once instead of once per fold.
    train_x = train_df[feats]
    train_y = train_df['TARGET']
    test_x = test_df[feats]

    # LightGBM parameters found by Bayesian optimization
    params = {
        'objective': 'binary',
        'boosting_type': 'gbdt', # 'goss'
        'nthread': 4,
        'learning_rate': 0.02,  # 02,
        'num_leaves': 20,
        'colsample_bytree': 0.9497036,
        'subsample': 0.8715623,
        'subsample_freq': 1,
        'max_depth': 8,
        'reg_alpha': 0.041545473,
        'reg_lambda': 0.0735294,
        'min_split_gain': 0.0222415,
        'min_child_weight': 39.3259775, # 60
        'seed': 0,
        'verbose': -1,
        'metric': 'auc',
    }

    fold_importances = []
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_x, train_y)):
        valid_x = train_x.iloc[valid_idx]
        valid_y = train_y.iloc[valid_idx]

        # NOTE(review): `silent`, `early_stopping_rounds` and `verbose_eval`
        # are legacy keyword arguments; LightGBM 4.x appears to have replaced
        # them with callbacks - confirm before upgrading the library.
        dtrain = lightgbm.Dataset(data=train_x.iloc[train_idx],
                             label=train_y.iloc[train_idx],
                             free_raw_data=False, silent=True,
                             categorical_feature=included_cat_cols)
        dvalid = lightgbm.Dataset(data=valid_x,
                             label=valid_y,
                             free_raw_data=False, silent=True,
                             categorical_feature=included_cat_cols)

        clf = lightgbm.train(
            params=dict(params),  # fresh copy so every fold sees identical settings
            train_set=dtrain,
            num_boost_round=10000,
            valid_sets=[dtrain, dvalid],
            early_stopping_rounds= 200,
            verbose_eval=100
        )

        oof_preds[valid_idx] = clf.predict(valid_x)
        # Each fold contributes an equal share to the averaged test prediction.
        sub_preds += clf.predict(test_x) / folds.n_splits

        fold_importances.append(pd.DataFrame({
            "feature": feats,
            "importance": clf.feature_importance(importance_type='gain'),
            "fold": n_fold + 1,
        }))
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, dtrain, dvalid

    # Single concat instead of growing a DataFrame inside the loop.
    feature_importance_df = pd.concat(fold_importances, axis=0)

    print('Full AUC score %.6f' % roc_auc_score(train_y, oof_preds))
    # Write submission file and plot feature importance
    sub_df = test_df[['SK_ID_CURR']].copy()
    sub_df['TARGET'] = sub_preds
    sub_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index= False)
    display_importances(feature_importance_df)
    return feature_importance_df

In [None]:
def main():
    """Read the processed dataset and run the 5-fold LightGBM pipeline.

    The 'Unnamed: 0' column is dropped right after loading (presumably the
    index column written when the CSV was saved - verify against the export
    step). Returns whatever ``kfold_lightgbm`` returns, i.e. the per-fold
    feature importance frame.
    """
    # Read and drop in one expression so only the trimmed frame stays referenced.
    dataset = pd.read_csv('../data/processed_data_2.5.csv').drop(columns='Unnamed: 0')

    with timer("Run LightGBM with kfold"):
        importances = kfold_lightgbm(dataset, num_folds=5, stratified=False)

    return importances


if __name__ == "__main__":
    # Destination of the test-set predictions; kfold_lightgbm picks this
    # module-level name up when it writes the submission CSV.
    submission_file_name = "../predictions/lightgbm_pred_bayesian.csv"

    with timer("Full model run"):
        feat_importance = main()

        # Collapse the per-fold rows into one mean importance per feature,
        # discard the (now meaningless) averaged fold number, and order the
        # result from least to most important.
        importance_df = (
            feat_importance
            .groupby('feature')
            .mean()
            .drop(columns='fold')
            .sort_values('importance')
        )
        importance_df.to_csv('../output/importance.csv')

Starting LightGBM. Train shape: (307511, 821), test shape: (48744, 821)




Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.766728	valid_1's auc: 0.757309
[200]	training's auc: 0.78787	valid_1's auc: 0.773757
[300]	training's auc: 0.799993	valid_1's auc: 0.781098
[400]	training's auc: 0.808872	valid_1's auc: 0.785421
[500]	training's auc: 0.816299	valid_1's auc: 0.788178
[600]	training's auc: 0.822765	valid_1's auc: 0.789981
[700]	training's auc: 0.828782	valid_1's auc: 0.791092
[800]	training's auc: 0.834299	valid_1's auc: 0.791774
[900]	training's auc: 0.839302	valid_1's auc: 0.792357
[1000]	training's auc: 0.844084	valid_1's auc: 0.792801
[1100]	training's auc: 0.848575	valid_1's auc: 0.79303
[1200]	training's auc: 0.852885	valid_1's auc: 0.793346
[1300]	training's auc: 0.857021	valid_1's auc: 0.793601
[1400]	training's auc: 0.860971	valid_1's auc: 0.793618
[1500]	training's auc: 0.86483	valid_1's auc: 0.793725
[1600]	training's auc: 0.868429	valid_1's auc: 0.793831
[1700]	training's auc: 0.871913	valid_1's auc: 0.7938



Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.766316	valid_1's auc: 0.755496
[200]	training's auc: 0.7876	valid_1's auc: 0.773108
[300]	training's auc: 0.799861	valid_1's auc: 0.780727
[400]	training's auc: 0.808658	valid_1's auc: 0.784931
[500]	training's auc: 0.816305	valid_1's auc: 0.787414
[600]	training's auc: 0.823067	valid_1's auc: 0.788998
[700]	training's auc: 0.829086	valid_1's auc: 0.790116
[800]	training's auc: 0.834441	valid_1's auc: 0.79096
[900]	training's auc: 0.839536	valid_1's auc: 0.79148
[1000]	training's auc: 0.844372	valid_1's auc: 0.791874
[1100]	training's auc: 0.84898	valid_1's auc: 0.792183
[1200]	training's auc: 0.853261	valid_1's auc: 0.792507
[1300]	training's auc: 0.857333	valid_1's auc: 0.792647
[1400]	training's auc: 0.861174	valid_1's auc: 0.792754
[1500]	training's auc: 0.864876	valid_1's auc: 0.792909
[1600]	training's auc: 0.868664	valid_1's auc: 0.793101
[1700]	training's auc: 0.872313	valid_1's auc: 0.793328



Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.767505	valid_1's auc: 0.751078
[200]	training's auc: 0.789019	valid_1's auc: 0.76731
[300]	training's auc: 0.801143	valid_1's auc: 0.774596
[400]	training's auc: 0.810104	valid_1's auc: 0.778641
[500]	training's auc: 0.817434	valid_1's auc: 0.781102
[600]	training's auc: 0.824233	valid_1's auc: 0.782769
[700]	training's auc: 0.830115	valid_1's auc: 0.783805
[800]	training's auc: 0.835539	valid_1's auc: 0.784791
[900]	training's auc: 0.840447	valid_1's auc: 0.785347
[1000]	training's auc: 0.845096	valid_1's auc: 0.785972
[1100]	training's auc: 0.849547	valid_1's auc: 0.786487
[1200]	training's auc: 0.853834	valid_1's auc: 0.78673
[1300]	training's auc: 0.857829	valid_1's auc: 0.787129
[1400]	training's auc: 0.861824	valid_1's auc: 0.787227
[1500]	training's auc: 0.865698	valid_1's auc: 0.787373
[1600]	training's auc: 0.869477	valid_1's auc: 0.787583
[1700]	training's auc: 0.872992	valid_1's auc: 0.787



Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.767069	valid_1's auc: 0.756492
[200]	training's auc: 0.78885	valid_1's auc: 0.771613
[300]	training's auc: 0.8011	valid_1's auc: 0.778269
[400]	training's auc: 0.809985	valid_1's auc: 0.781967
[500]	training's auc: 0.817507	valid_1's auc: 0.784521
[600]	training's auc: 0.824069	valid_1's auc: 0.786139
[700]	training's auc: 0.829959	valid_1's auc: 0.787184
[800]	training's auc: 0.835436	valid_1's auc: 0.788008
[900]	training's auc: 0.840607	valid_1's auc: 0.788669
[1000]	training's auc: 0.845275	valid_1's auc: 0.789206
[1100]	training's auc: 0.849707	valid_1's auc: 0.789535
