In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import sys
sys.path.append('../../')
%matplotlib inline

In [None]:
# Restore the pickled k-fold model object saved next to this notebook.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so only open artifacts that come from a trusted training run.
with open('./kfold_model.pkl', 'rb') as model_file:
    kfold_lgb = pickle.load(model_file)

In [None]:
# Per-fold training summary, followed by the overall score of the k-fold model.
kfold_result = kfold_lgb.kfold_result_
for fold_name, fold_res in kfold_result.items():
    print(fold_name)
    fold_scores = fold_res['best_score']
    # (format, label, value) triples, printed in this order for every fold.
    summary_rows = (
        ("%-30s %d", 'BEST_ITER:', fold_res['best_iteration']),
        ("%-30s %.6f", 'BEST_SCORE_TRAIN:', fold_scores['training']),
        ("%-30s %.6f", 'BEST_SCORE_VALID:', fold_scores['valid']),
        ("%-30s %d", 'TRAINING_USING_TIME:', fold_res['using_time']),
    )
    for template, label, value in summary_rows:
        print(template % (label, value))
    print("=" * 50)
print("%-30s %.6f" % ('FULL AUC:', kfold_lgb.score_))

<font size=5> EVAL RESULT </font>

In [None]:
# Draw one figure per fold from that fold's recorded evaluation history.
fold_nums = len(kfold_result)
for fold_name, fold_res in kfold_result.items():
    eval_history = pd.DataFrame(fold_res['evals_result'])
    # DataFrame.plot opens a new figure when no `ax` is given, so every fold
    # gets its own figure; the returned Axes is used for the title and layout.
    fold_ax = eval_history.plot(kind='line', figsize=(8, 6), fontsize=12)
    fold_ax.set_title('eval result in fold %s' % fold_name)
    fold_ax.get_figure().tight_layout()

<font size=5> Feature Importance </font>

In [None]:
# Build a frame with one importance column per fold, indexed by feature name,
# then append cross-fold summary columns ('mean', 'std', 'rank').
feature_importance_df = pd.DataFrame()
for n_fold, fold_result in kfold_result.items():
    feature_importance_df[n_fold] = fold_result['feature_importance']
feature_importance_df.index = kfold_lgb.features_

# Snapshot the per-fold columns BEFORE adding any summary column. Computing
# `std` on the frame after 'mean' has been inserted would include the mean
# itself as an extra observation and understate the spread across folds.
fold_columns = list(feature_importance_df.columns)
fold_values = feature_importance_df[fold_columns]

# The sample standard deviation (ddof=1) is undefined for a single fold; fall
# back to ddof=0 there so 'std' is 0 instead of NaN (NaN would break the error
# bars drawn from this column).
std_ddof = 1 if len(fold_columns) > 1 else 0

feature_importance_df['mean'] = fold_values.mean(axis=1)
feature_importance_df['std'] = fold_values.std(axis=1, ddof=std_ddof)
# Rank 1 = highest mean importance; ties receive the average of their ranks
# (pandas' default ranking method).
feature_importance_df['rank'] = feature_importance_df['mean'].rank(ascending=False)

In [None]:
# Show the 100 rows with the highest fold-averaged importance.
feature_importance_df.sort_values(by='mean', ascending=False).head(100)

In [None]:
def display_importances(feature_importance_df_, top_n=100, figsize=(12, 28)):
    """Draw a horizontal bar chart of the highest mean feature importances.

    Parameters
    ----------
    feature_importance_df_ : pandas.DataFrame
        Frame with a 'mean' column (bar length) and a 'std' column (error bar),
        indexed by the labels to show on the y axis.
    top_n : int, optional
        Number of rows, taken after sorting by 'mean' descending, to plot.
        Defaults to 100.
    figsize : tuple, optional
        Figure size in inches passed to matplotlib. Defaults to (12, 28).

    Returns
    -------
    None
        The figure is created as a side effect.
    """
    top_features = (
        feature_importance_df_
        .sort_values(by='mean', ascending=False)
        .loc[:, ['mean', 'std']]
        .head(top_n)
    )

    # Draw on an explicit Axes rather than relying on Series.plot picking up
    # whatever figure happens to be current in pyplot's global state.
    fig, ax = plt.subplots(figsize=figsize)
    top_features['mean'].plot(kind='barh', xerr=top_features['std'], ax=ax)
    ax.set_title('LightGBM Features (avg over folds)')
    fig.tight_layout()


display_importances(feature_importance_df)

<font size=5> DETAILED Feature Importance </font>

In [None]:
def query(name, importance_df=None):
    """Print rank and mean importance of every feature whose name contains ``name``.

    Parameters
    ----------
    name : str
        Substring to look for in each feature name (case-sensitive ``in`` test).
    importance_df : pandas.DataFrame, optional
        Frame indexed by feature name with 'rank' and 'mean' columns. When
        omitted, the notebook-level ``feature_importance_df`` is used, which
        keeps existing ``query(name)`` calls working unchanged.

    Returns
    -------
    None
        A header, a separator line and one line per matching feature are
        printed, in the frame's index order.
    """
    if importance_df is None:
        # Looked up at call time so the function always sees the current frame.
        importance_df = feature_importance_df

    feats_len = len(importance_df)
    matched = [col for col in importance_df.index if name in col]

    print('    %s     %-90s %s' % ('RANK', 'FEATURE NAME', 'IMPORTANCE'))
    print('=' * 115)
    for feat in matched:
        # '%4d' truncates a fractional (tied) rank to its integer part.
        rank = importance_df.loc[feat, 'rank']
        mean_importance = importance_df.loc[feat, 'mean']
        print('%4d/%s    %-90s %.2f' % (rank, feats_len, feat, mean_importance))

In [None]:
# Print every feature with its position in the mean-importance ordering,
# highest mean importance first.
df = feature_importance_df.sort_values(by='mean', ascending=False)
feats_len = len(df)
print('    %s     %-90s %s' % ('RANK', 'FEATURE NAME', 'IMPORTANCE'))
print('=' * 115)
for position, (feat, mean_importance) in enumerate(df['mean'].items(), start=1):
    print('%4d/%s    %-90s %.2f' % (position, feats_len, feat, mean_importance))

In [None]:
# List the features whose fold-averaged importance is at most 1.
low_importance_mask = feature_importance_df['mean'] <= 1
for col in feature_importance_df.loc[low_importance_mask].index:
    print(col)