In [1]:
import os
import math
import time

from contextlib import contextmanager
import numpy as np
import pandas as pd
from IPython.display import display
import lightgbm
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
@contextmanager
def timer(title):
    """Context manager that reports how long the wrapped block took.

    On normal exit of the ``with`` body it prints ``"<title> - done in <N>s"``,
    where ``N`` is the elapsed wall-clock time rounded to whole seconds.
    Nothing is printed if the body raises.
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    print("{} - done in {:.0f}s".format(title, elapsed))

In [3]:
def display_importances(feature_importance_df_):
    """Plot per-fold importances of the top features and save the figure.

    The 40 features with the highest mean ``importance`` (averaged over all
    rows sharing a ``feature`` value) are selected; every row belonging to one
    of those features is then drawn as a horizontal bar plot, and the figure
    is written to ``../output/lgbm_importances_bayesian.png``.

    Parameters
    ----------
    feature_importance_df_ : pandas.DataFrame
        Must contain the columns ``feature`` and ``importance``.
    """
    mean_importance = (
        feature_importance_df_[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)
    )
    top_features = mean_importance[:40].index

    # Keep every row of the selected features so seaborn can aggregate them.
    is_top_feature = feature_importance_df_.feature.isin(top_features)
    best_features = feature_importance_df_.loc[is_top_feature]
    ordered_rows = best_features.sort_values(by="importance", ascending=False)

    plt.figure(figsize=(12, 10))
    sns.barplot(x="importance", y="feature", data=ordered_rows)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('../output/lgbm_importances_bayesian.png')

In [4]:
def kfold_lightgbm(num_folds, stratified = False, corr_save = False, importance_save = False):
    """Train LightGBM on K-fold splits and score the fold-averaged predictions.

    Reads ``../data/train_cv.csv`` and ``../data/test_cv.csv``, fits one booster
    per fold of the training frame (early stopping is driven by the validation
    sets passed to ``lightgbm.train``), averages the boosters' predictions for
    the rows of ``test_cv.csv`` and prints the ROC AUC of that average against
    the labels loaded from ``../data/target_cv.csv``.

    Parameters
    ----------
    num_folds : int
        Number of splits handed to ``KFold`` / ``StratifiedKFold``.
    stratified : bool, default False
        Use ``StratifiedKFold`` (stratified on ``TARGET``) instead of ``KFold``.
    corr_save : bool, default False
        When true, write each column's correlation with ``TARGET`` to
        ``../output/correlation_val.csv``.
    importance_save : bool, default False
        When true, write the per-feature importance averaged over the folds to
        ``../output/importance_val.csv``.

    Returns
    -------
    pandas.DataFrame
        One row per (feature, fold) with the columns ``feature``,
        ``importance`` and ``fold``. ``display_importances`` reads the
        ``feature`` and ``importance`` columns of such a frame.
    """
    train_df = pd.read_csv('../data/train_cv.csv')
    test_df = pd.read_csv('../data/test_cv.csv')

    # Optionally dump every column's correlation with TARGET, sorted ascending.
    if corr_save:
        target_corr = train_df.corr()['TARGET'].sort_values()
        corr_df = pd.DataFrame()
        corr_df['feature'] = target_corr.index
        corr_df['corr'] = target_corr.values
        # NOTE(review): this filter only removes a row whose name is literally
        # 'feature'. If the intent was to drop the TARGET self-correlation row,
        # it should compare against 'TARGET' -- confirm before changing the CSV.
        corr_df = corr_df[corr_df['feature'] != 'feature']
        corr_df.to_csv('../output/correlation_val.csv')
        del target_corr, corr_df

    # Candidate categorical columns; only those present in the training frame
    # are passed to LightGBM. TARGET is the label and is therefore not listed.
    cat_cols = ['FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'FLAG_MOBIL', 'FLAG_EMP_PHONE',
                'FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL',
                'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION',
                'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY',
                'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6',
                'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_11',
                'FLAG_DOCUMENT_18', 'CODE_GENDER', 'NAME_CONTRACT_TYPE',
                'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'EMERGENCYSTATE_MODE',
                'HOUSETYPE_MODE', 'FONDKAPREMONT_MODE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS',
                'NAME_HOUSING_TYPE', 'NAME_TYPE_SUITE', 'WALLSMATERIAL_MODE','WEEKDAY_APPR_PROCESS_START',
                'NAME_INCOME_TYPE', 'OCCUPATION_TYPE', 'ORGANIZATION_TYPE', 'FLAG_MOBIL_EMP_WORK_PHONE',
                'FLAG_CAR_AND_REALTY', 'FLAG_GENDER_AND_CAR', 'FLAG_GENDER_AND_REALTY',
                'FLAG_GENDER_AND_PHONE', 'FLAG_GENDER_AND_WORK_PHONE', 'FLAG_GENDER_AND_EMAIL',
                'REGION_RATING_W_CAR', 'REGION_RATING_W_REALTY',
                'REGION_RATING_W_EMP_PHONE', 'REGION_RATING_W_WORK_PHONE', 'REGION_RATING_W_PHONE',
                'REGION_RATING_CITY_W_CAR', 'REGION_RATING_CITY_W_REALTY',
                'REGION_RATING_CITY_W_EMP_PHONE', 'REGION_RATING_CITY_W_WORK_PHONE',
                'REGION_RATING_CITY_W_PHONE', 'REGION_RATING_CITY_W_EMAIL', 'REGION_RATING_W_CITY_PROD',
                'HOUR_APPR_PROCESS_START', 'FLAG_REG_CITY_NOT_LIVE_WORK']
    train_columns = set(train_df.columns)  # set lookup instead of rebuilding a list per candidate
    included_cat_cols = [col for col in cat_cols if col in train_columns]

    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))

    # Cross validation splitter
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1001)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=1001)

    # Model inputs: every training column except the label and identifier columns.
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]

    # Slice the frames once instead of once per fold, then release the full
    # frames, which are not needed past this point.
    train_features = train_df[feats]
    train_target = train_df['TARGET']
    test_features = test_df[feats]
    sub_preds = np.zeros(test_features.shape[0])
    del train_df, test_df, cat_cols, train_columns

    # LightGBM parameters found by Bayesian optimization (identical for every fold)
    params = {
        'objective': 'binary',
        'boosting_type': 'gbdt', # 'goss'
        'nthread': 4,
        'learning_rate': 0.02,  # 02,
        'num_leaves': 20,
        'colsample_bytree': 0.9497036,
        'subsample': 0.8715623,
        'subsample_freq': 1,
        'max_depth': 8,
        'reg_alpha': 0.041545473,
        'reg_lambda': 0.0735294,
        'min_split_gain': 0.0222415,
        'min_child_weight': 39.3259775, # 60
        'seed': 0,
        'verbose': -1,
        'metric': 'auc',
    }

    fold_importances = []  # one small frame per fold, concatenated once after the loop

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_features, train_target)):
        # NOTE(review): `silent`, `early_stopping_rounds` and `verbose_eval` were
        # removed in LightGBM 4.0; switch to callbacks when upgrading the library.
        dtrain = lightgbm.Dataset(data=train_features.iloc[train_idx],
                             label=train_target.iloc[train_idx],
                             free_raw_data=False, silent=True,
                             categorical_feature=included_cat_cols)
        dvalid = lightgbm.Dataset(data=train_features.iloc[valid_idx],
                             label=train_target.iloc[valid_idx],
                             free_raw_data=False, silent=True,
                             categorical_feature=included_cat_cols)

        clf = lightgbm.train(
            params=dict(params),  # per-fold copy so the shared dict is never mutated
            train_set=dtrain,
            num_boost_round=10000,
            valid_sets=[dtrain, dvalid],
            early_stopping_rounds= 200,
            verbose_eval=100
        )

        # Each fold contributes an equal share of the averaged prediction.
        sub_preds += clf.predict(test_features) / folds.n_splits

        # Record this fold's importances (LightGBM's default importance type)
        # so they can be averaged, saved and returned.
        fold_importances.append(pd.DataFrame({
            'feature': feats,
            'importance': clf.feature_importance(),
            'fold': n_fold + 1,
        }))

        del clf, dtrain, dvalid

    feature_importance_df = pd.concat(fold_importances, axis=0, ignore_index=True)

    target = np.genfromtxt('../data/target_cv.csv', delimiter=',')
    cv_auc = roc_auc_score(target, sub_preds)
    print('AUC on validation set: {}'.format(cv_auc))

    # Save the fold-averaged importances as csv; the per-fold frame is left
    # untouched so it can be returned to the caller.
    if importance_save:
        mean_importance_df = (feature_importance_df.groupby('feature').agg('mean')
                              .drop('fold', axis = 1).sort_values('importance'))
        mean_importance_df.to_csv('../output/importance_val.csv')

    return feature_importance_df

In [5]:
def main():
    """Run the 5-fold, non-stratified LightGBM pipeline inside a timer.

    Neither the correlation CSV nor the importance CSV is requested.
    """
    run_options = {
        "num_folds": 5,
        "stratified": False,
        "corr_save": False,
        "importance_save": False,
    }
    with timer("Run LightGBM with kfold"):
        # The return value is captured but not used further here.
        feature_importance_df = kfold_lightgbm(**run_options)

# Entry point: time the complete run; main() reports its own inner timing as well.
if __name__ == "__main__":
    with timer("Full model run"):
        main()

Starting LightGBM. Train shape: (246008, 968), test shape: (61503, 968)




Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.771664	valid_1's auc: 0.75885
[200]	training's auc: 0.793536	valid_1's auc: 0.77372
[300]	training's auc: 0.806165	valid_1's auc: 0.780194
[400]	training's auc: 0.815912	valid_1's auc: 0.783759
[500]	training's auc: 0.824394	valid_1's auc: 0.78609
[600]	training's auc: 0.831846	valid_1's auc: 0.787767
[700]	training's auc: 0.83855	valid_1's auc: 0.788722
[800]	training's auc: 0.844615	valid_1's auc: 0.789556
[900]	training's auc: 0.850296	valid_1's auc: 0.789829
[1000]	training's auc: 0.855847	valid_1's auc: 0.790267
[1100]	training's auc: 0.861026	valid_1's auc: 0.790787
[1200]	training's auc: 0.865993	valid_1's auc: 0.791014
[1300]	training's auc: 0.870832	valid_1's auc: 0.791211
[1400]	training's auc: 0.875321	valid_1's auc: 0.791369
[1500]	training's auc: 0.879684	valid_1's auc: 0.79152
[1600]	training's auc: 0.883784	valid_1's auc: 0.791652
[1700]	training's auc: 0.887801	valid_1's auc: 0.791795



Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.771157	valid_1's auc: 0.759719
[200]	training's auc: 0.79318	valid_1's auc: 0.775017
[300]	training's auc: 0.805996	valid_1's auc: 0.781578
[400]	training's auc: 0.815974	valid_1's auc: 0.784948
[500]	training's auc: 0.824446	valid_1's auc: 0.786899
[600]	training's auc: 0.832052	valid_1's auc: 0.787999
[700]	training's auc: 0.838904	valid_1's auc: 0.788967
[800]	training's auc: 0.845134	valid_1's auc: 0.789589
[900]	training's auc: 0.850773	valid_1's auc: 0.789946
[1000]	training's auc: 0.85619	valid_1's auc: 0.790331
[1100]	training's auc: 0.861537	valid_1's auc: 0.790659
[1200]	training's auc: 0.866369	valid_1's auc: 0.790602
[1300]	training's auc: 0.871103	valid_1's auc: 0.790698
[1400]	training's auc: 0.875623	valid_1's auc: 0.790708
[1500]	training's auc: 0.879967	valid_1's auc: 0.790783
[1600]	training's auc: 0.884055	valid_1's auc: 0.790821
[1700]	training's auc: 0.888041	valid_1's auc: 0.790



Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.77171	valid_1's auc: 0.759328
[200]	training's auc: 0.793245	valid_1's auc: 0.773459
[300]	training's auc: 0.806241	valid_1's auc: 0.779807
[400]	training's auc: 0.816284	valid_1's auc: 0.782935
[500]	training's auc: 0.824765	valid_1's auc: 0.784971
[600]	training's auc: 0.832381	valid_1's auc: 0.786175
[700]	training's auc: 0.839253	valid_1's auc: 0.787174
[800]	training's auc: 0.845634	valid_1's auc: 0.78786
[900]	training's auc: 0.851408	valid_1's auc: 0.788436
[1000]	training's auc: 0.856798	valid_1's auc: 0.788683
[1100]	training's auc: 0.86215	valid_1's auc: 0.788904
[1200]	training's auc: 0.867049	valid_1's auc: 0.788983
[1300]	training's auc: 0.87188	valid_1's auc: 0.78928
[1400]	training's auc: 0.876461	valid_1's auc: 0.78934
[1500]	training's auc: 0.88071	valid_1's auc: 0.789427
[1600]	training's auc: 0.884811	valid_1's auc: 0.789117
[1700]	training's auc: 0.888792	valid_1's auc: 0.789067
E



Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.771199	valid_1's auc: 0.760083
[200]	training's auc: 0.792698	valid_1's auc: 0.775226
[300]	training's auc: 0.805616	valid_1's auc: 0.781909
[400]	training's auc: 0.815576	valid_1's auc: 0.785777
[500]	training's auc: 0.824181	valid_1's auc: 0.787972
[600]	training's auc: 0.831771	valid_1's auc: 0.789273
[700]	training's auc: 0.83846	valid_1's auc: 0.790191
[800]	training's auc: 0.844435	valid_1's auc: 0.790922
[900]	training's auc: 0.850517	valid_1's auc: 0.791334
[1000]	training's auc: 0.856105	valid_1's auc: 0.791652
[1100]	training's auc: 0.861364	valid_1's auc: 0.792021
[1200]	training's auc: 0.866327	valid_1's auc: 0.792242
[1300]	training's auc: 0.87106	valid_1's auc: 0.792506
[1400]	training's auc: 0.875452	valid_1's auc: 0.79261
[1500]	training's auc: 0.87977	valid_1's auc: 0.792825
[1600]	training's auc: 0.884	valid_1's auc: 0.792832
[1700]	training's auc: 0.888166	valid_1's auc: 0.792779
[



Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.77203	valid_1's auc: 0.754441
[200]	training's auc: 0.793917	valid_1's auc: 0.770324
[300]	training's auc: 0.806708	valid_1's auc: 0.776604
[400]	training's auc: 0.816707	valid_1's auc: 0.780463
[500]	training's auc: 0.825315	valid_1's auc: 0.782628
[600]	training's auc: 0.832643	valid_1's auc: 0.783747
[700]	training's auc: 0.83941	valid_1's auc: 0.784675
[800]	training's auc: 0.845351	valid_1's auc: 0.785296
[900]	training's auc: 0.851165	valid_1's auc: 0.785529
[1000]	training's auc: 0.856535	valid_1's auc: 0.785736
[1100]	training's auc: 0.861698	valid_1's auc: 0.786135
[1200]	training's auc: 0.866701	valid_1's auc: 0.786558
[1300]	training's auc: 0.871286	valid_1's auc: 0.786807
[1400]	training's auc: 0.875792	valid_1's auc: 0.7868
[1500]	training's auc: 0.880224	valid_1's auc: 0.786805
[1600]	training's auc: 0.884207	valid_1's auc: 0.786977
[1700]	training's auc: 0.888274	valid_1's auc: 0.78692