In [172]:
import sys
# ====================================================
# Library
# ====================================================
import os
import gc
import warnings
warnings.filterwarnings('ignore')
import random
import scipy as sp
import numpy as np
import pandas as pd
from glob import glob
from pathlib import Path
import joblib
import pickle
import itertools
from tqdm.auto import tqdm
import torch
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GroupKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss, roc_auc_score, matthews_corrcoef, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce
import lightgbm as lgb
#import optuna.integration.lightgbm as lgb
import xgboost as xgb
from catboost import Pool, CatBoostRegressor, CatBoostClassifier

pd.set_option('display.max_columns',1000)
pd.set_option('display.max_rows',300)

# ====================================================
# Configurations
# ====================================================
class CFG:
    VER = 6.1
    AUTHOR = 'naokisusami'
    COMPETITION = 'FDUA2'
    DATA_PATH = Path('/data')
    OOF_DATA_PATH = Path('/oof')
    MODEL_DATA_PATH = Path('/models')
    SUB_DATA_PATH = Path('/submission')
    METHOD_LIST = ['lightgbm', 'xgboost', 'catboost']
    seed = 42
    n_folds = 7
    target_col = 'MIS_Status'
    metric = 'f1_score'
    metric_maximize_flag = True
    num_boost_round = 500
    early_stopping_round = 200
    verbose = 25
    classification_lgb_params = {
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.05,
        'seed': seed,
    }
    classification_xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'learning_rate': 0.05,
        'random_state': seed,
    }
    classification_cat_params = {
        'learning_rate': 0.05,
        'iterations': num_boost_round,
        'random_seed': seed,
    }
    model_weight_dict = {'lightgbm': 0.50, 'xgboost': 0.10, 'catboost': 0.40}
# ====================================================
# Seed everything
# ====================================================
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

seed_everything(CFG.seed)


# ====================================================
# Metric
# ====================================================
# f1_score

# ====================================================
# LightGBM Metric
# ====================================================
def lgb_metric(y_pred, y_true):
    y_true = y_true.get_label()
    return 'f1score', f1_score(y_true, np.where(y_pred >= 0.5, 1, 0), average='macro'), CFG.metric_maximize_flag

# ====================================================
# XGBoost Metric
# ====================================================
def xgb_metric(y_pred, y_true):
    y_true = y_true.get_label()
    return 'f1score', f1_score(y_true, np.where(y_pred >= 0.5, 1, 0), average='macro')


In [173]:
#データの読み込み
train_df = pd.read_csv('train.csv', index_col=0)
test_df = pd.read_csv('test.csv', index_col=0)
default_numerical_features = ['Term', 'NoEmp', 'CreateJob', 'RetainedJob', 'DisbursementGross', 'GrAppv', 'SBA_Appv', 'ApprovalFY']
default_categorical_features = ['RevLineCr', 'LowDoc', 'UrbanRural', 'State', 'Sector','FranchiseCode','City','ApprovalDate',
                                'DisbursementDate','BankState']
new_categorical_features = []

In [174]:
#前処理メソッドの定義
def Preprocessing(input_df: pd.DataFrame()) -> pd.DataFrame():
    #欠損値に対する処理
    def deal_missing(input_df: pd.DataFrame()) -> pd.DataFrame():
        df = input_df.copy()
        for col in ['RevLineCr', 'LowDoc', 'BankState']:
            df[col] = input_df[col].fillna('UNK')
        for col in ['DisbursementDate','ApprovalDate']:
            df[col] = input_df[col].fillna('50-NaN-50')
        
        return df
    #金額に対する前処理
    def clean_money(input_df: pd.DataFrame()) -> pd.DataFrame():
        df = input_df.copy()
        for col in ['DisbursementGross', 'GrAppv', 'SBA_Appv']:
            df[col] = input_df[col].str[1:].str.replace(',', '').str.replace(' ', '').astype(float)
        return df
    df = deal_missing(input_df)
    df = clean_money(df)
    df['NewExist'] = np.where(input_df['NewExist'] == 1, 1, 0)
    #特徴量作成
    def make_features(input_df: pd.DataFrame()) -> pd.DataFrame():
        df = input_df.copy()
        #日付関係の特徴量作成
        df[['DisbursementDay','DisbursementMonth','DisbursementYear']] = df['DisbursementDate'].str.split('-',expand=True)
        df[['ApprovalDay','ApprovalMonth','ApprovalYear']] = df['ApprovalDate'].str.split('-',expand=True)
        df['DisbursementDay'] = df['DisbursementDay'].astype(int)
        df['DisbursementYear'] = df['DisbursementYear'].astype(int)
        df['ApprovalDay'] = df['ApprovalDay'].astype(int)
        df['ApprovalYear'] = df['ApprovalYear'].astype(int)
        Month_dict = {'Jan':1,'Feb':2,'Mar':3,'Apr':4,'May':5,'Jun':6,'Jul':7,'Aug':8,'Sep':9,'Oct':10,'Nov':11,'Dec':12,'NaN':50}
        df['DisbursementMonth'] = df['DisbursementMonth'].map(Month_dict)
        df['ApprovalMonth'] = df['ApprovalMonth'].map(Month_dict)
        df['DisbursementYear'] = df['DisbursementYear'].apply(lambda x:x - 100 if x >50 else x)
        df['ApprovalYear'] = df['ApprovalYear'].apply(lambda x:x - 100 if x >50 else x)
        
        #Bankraptcydataの74~80は生成したもので実際の数値ではない。(失業率から換算して生成)
        Bankraptcydata={-26:32700,-25:52200,-24:46200,-23:42300,-22:36300,-21:34200,-20:46200,-19:44000,-18:48500,-17:69800,-16:62500,
                      -15:64500,-14:72000,-13:81500,-12:83000,-11:64500,-10:65000,-9:67000,-8:71000,-7:67000,-6:58000,-5:51000,
                        -4:52500,-3:54000,-2:51000,-1:41000,0:37500,1:35992,2:39845,3:37548,4:36785,5:31952,6:35292,7:21960,8:30741,
                        9:49091,10:61148,11:54212,12:46393,13:37552,14:31671,15:26130,16:24797,17:23591,18:23106,19:22157}

        #年ごとのデータを、1-5年後の平均に変換
        datalist = [Bankraptcydata]
        for k in datalist:
            for i in range(len(k)-5):
                k[-27+i] = 0
                for j in range(5):
                    k[-27+i] += k[-26+i+j]
                k[-27+i] = k[-27+i]/5
            k[50] = k[-26]*2
        
        df['Bankraptcy_By_Year'] = df['DisbursementYear'].map(Bankraptcydata)


        #組み合わせ特徴量
        
        CList = ['Term','NoEmp','CreateJob','RetainedJob','FranchiseCode','RevLineCr','LowDoc','DisbursementDate','ApprovalDate','City',
                'State','BankState','DisbursementGross','GrAppv','SBA_Appv']
        for i in range(len(CList)-2):
            for j in range(i+1,len(CList)):
                a,b = CList[i],CList[j]
                df[f'{a}_{b}'] = df[a].astype(str) + '_' + df[b].astype(str)
            
                


        
        return df
    df = make_features(df)
    return df

In [175]:
#前処理の実行
train_df = Preprocessing(train_df)
test_df = Preprocessing(test_df)

In [176]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42307 entries, 0 to 42306
Columns: 131 entries, Term to DisbursementGross_SBA_Appv
dtypes: float64(4), int32(3), int64(13), object(111)
memory usage: 42.1+ MB


（以下はPreprocessingに本来組み込むべきだが，コードが煩雑になるので，いったん切り出している．）

In [177]:
'''
#カウントエンコーディング
for col in categorical_features:
    count_dict = dict(train_df[col].value_counts())
    train_df[f'{col}_count_encoding'] = train_df[col].map(count_dict).astype(int)
    test_df[f'{col}_count_encoding'] = test_df[col].map(count_dict).fillna(1).astype(int)
'''
#ラベルエンコーディング
categorical_features = default_categorical_features + new_categorical_features
for col in categorical_features:
    encoder = LabelEncoder()
    combined = pd.concat([train_df[col], test_df[col]], axis=0)
    encoder.fit(combined)
    train_df[col] = encoder.transform(train_df[col])
    test_df[col] = encoder.transform(test_df[col])

'''
#ラベルエンコーディング
for col in categorical_features :
    le = LabelEncoder()
    le.fit(train_df[col])
    train_df[col] = le.transform(train_df[col])
    test_df[col] = le.transform(test_df[col])
    
categorical_features_unlabelable = ['City','ApprovalDate','DisbursementDate','State_Sector']
for col in categorical_features_unlabelable:
    le = LabelEncoder()   
    le.fit(train_df[col])
    train_df[col] = le.transform(train_df[col])
    test_df[col] = test_df[col].apply(lambda x: le.transform([x])[0] if x in le.classes_ else len(le.classes_))
'''


"\n#ラベルエンコーディング\nfor col in categorical_features :\n    le = LabelEncoder()\n    le.fit(train_df[col])\n    train_df[col] = le.transform(train_df[col])\n    test_df[col] = le.transform(test_df[col])\n    \ncategorical_features_unlabelable = ['City','ApprovalDate','DisbursementDate','State_Sector']\nfor col in categorical_features_unlabelable:\n    le = LabelEncoder()   \n    le.fit(train_df[col])\n    train_df[col] = le.transform(train_df[col])\n    test_df[col] = test_df[col].apply(lambda x: le.transform([x])[0] if x in le.classes_ else len(le.classes_))\n"

In [178]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42307 entries, 0 to 42306
Columns: 131 entries, Term to DisbursementGross_SBA_Appv
dtypes: float64(4), int32(10), int64(117)
memory usage: 41.0 MB


In [179]:
'''
#OneHotEncoding
train_df2 = train_df.drop(['MIS_Status'],axis=1)
OneHotList = ['RevLineCr', 'LowDoc']
ohe = ce.OneHotEncoder(cols=OneHotList,use_cat_names=True)
train_df2 = ohe.fit_transform(train_df2)
test_df = ohe.transform(test_df)
train_df = pd.concat([train_df2,train_df['MIS_Status']],axis=1)
'''

#featuresの作成
add_features = ['Term_NoEmp', 'FranchiseCode_ApprovalDate', 'City_BankState', 'NoEmp_DisbursementDate', 'Term_BankState',
            'Term_State', 'NoEmp_SBA_Appv', 'NoEmp_State', 'NoEmp_City', 'DisbursementGross_SBA_Appv',
            'RevLineCr_City', 'FranchiseCode_City', 'RevLineCr_DisbursementDate', 'FranchiseCode_DisbursementDate',
            'FranchiseCode_GrAppv', 'RevLineCr_LowDoc', 'State_SBA_Appv', 'NoEmp_LowDoc', 'Term_RetainedJob',
            'RevLineCr_ApprovalDate', 'State_GrAppv', 'Term_RevLineCr', 'NoEmp_BankState', 'BankState_SBA_Appv',
            'Term_CreateJob', 'RevLineCr_State', 'LowDoc_SBA_Appv', 'DisbursementDate_BankState', 'Term_LowDoc', 'Term_GrAppv',
            'BankState_GrAppv', 'NoEmp_RetainedJob', 'RetainedJob_BankState', 'CreateJob_RetainedJob', 'LowDoc_ApprovalDate',
            'DisbursementGross_GrAppv', 'LowDoc_City', 'CreateJob_DisbursementDate', 'FranchiseCode_SBA_Appv', 'NoEmp_GrAppv',
            'LowDoc_DisbursementDate', 'BankState_DisbursementGross', 'RevLineCr_GrAppv', 'DisbursementDate_State', 
            'Term_FranchiseCode', 'RetainedJob_State', 'CreateJob_City', 'CreateJob_State', 'NoEmp_ApprovalDate',
            'FranchiseCode_DisbursementGross', 'CreateJob_SBA_Appv', 'ApprovalDate_BankState', 'State_DisbursementGross',
            'LowDoc_State', 'RetainedJob_ApprovalDate', 'NoEmp_CreateJob', 'Term_City', 'SBA_Appv', 'Term_SBA_Appv', 'City',
            'CreateJob_LowDoc', 'LowDoc_GrAppv', 'RevLineCr_DisbursementGross', 'RevLineCr_SBA_Appv', 'RetainedJob_City',
            'LowDoc_DisbursementGross', 'RetainedJob_DisbursementDate', 'NoEmp_RevLineCr', 'ApprovalDate_State', 'City_State', 
            'City_DisbursementGross', 'City_GrAppv', 'State_BankState', 'CreateJob', 'Sector', 'CreateJob_BankState',
            'CreateJob_GrAppv', 'Term_DisbursementGross', 'RevLineCr_BankState', 'CreateJob_DisbursementGross', 'Term_ApprovalDate',
            'NoEmp_DisbursementGross','CreateJob_RevLineCr', 'DisbursementDate_GrAppv', 'FranchiseCode_RevLineCr',
             'RetainedJob_GrAppv', 'City_SBA_Appv', 'FranchiseCode_LowDoc', 'RetainedJob_SBA_Appv', 
            'CreateJob_ApprovalDate', 'RetainedJob_DisbursementGross', 'ApprovalDate_GrAppv', 'RetainedJob_RevLineCr', 
            'RetainedJob_LowDoc', 'Term_DisbursementDate', 'ApprovalDate_SBA_Appv', 'DisbursementDate_SBA_Appv',
            'ApprovalDate_DisbursementGross', 'RetainedJob_FranchiseCode', 'ApprovalDate_City', 
            'DisbursementDate_DisbursementGross', 'NoEmp_FranchiseCode', 'DisbursementDate_City',
            'DisbursementDate_ApprovalDate', 'FranchiseCode_State', 'FranchiseCode_BankState',
            'CreateJob_FranchiseCode', 'LowDoc_BankState']
add_features = add_features[0:5]
categorical_features = default_categorical_features + add_features
numerical_features = default_numerical_features + ['Bankraptcy_By_Year']
features = numerical_features + categorical_features
'''
RemoveList=['MIS_Status','ApprovalDay','ApprovalMonth','ApprovalFY','ApprovalYear','DisbursementDay','DisbursementMonth',
           'DisbursementYear']
features = train_df.columns.tolist()
for i in RemoveList:
    print(i)
    features.remove(i)
'''

"\nRemoveList=['MIS_Status','ApprovalDay','ApprovalMonth','ApprovalFY','ApprovalYear','DisbursementDay','DisbursementMonth',\n           'DisbursementYear']\nfeatures = train_df.columns.tolist()\nfor i in RemoveList:\n    print(i)\n    features.remove(i)\n"

In [180]:
print(train_df)
print(features)

       Term  NoEmp  NewExist  CreateJob  RetainedJob  FranchiseCode  \
0       163     21         1          0            0              1   
1        84      6         1          4            0              0   
2       242     45         1          4           90              0   
3       237      4         1          0            0              0   
4       184      0         1          0            0              0   
...     ...    ...       ...        ...          ...            ...   
42302   283     14         1          0            0              1   
42303    53      2         1          0            0              0   
42304    59      6         0          0            0              1   
42305   295     18         1          0            8              0   
42306    84      4         1          0            8              0   

       RevLineCr  LowDoc  DisbursementDate  MIS_Status  Sector  ApprovalDate  \
0              1       3               730           1       0     

In [181]:
#lightgbmでの学習メソッドの定義
def lightgbm_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame, features: list, categorical_features: list):
    lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=categorical_features)
    lgb_valid = lgb.Dataset(x_valid, y_valid, categorical_feature=categorical_features)
    model = lgb.train(
                params = CFG.classification_lgb_params,
                train_set = lgb_train,
                num_boost_round = CFG.num_boost_round,
                valid_sets = [lgb_train, lgb_valid],
                feval = lgb_metric,
                callbacks=[lgb.early_stopping(stopping_rounds=CFG.early_stopping_round,
                                              verbose=CFG.verbose)]
            )
    # Predict validation
    valid_pred = model.predict(x_valid)
    return model, valid_pred

#xgboostでの学習メソッドの定義
def xgboost_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame, features: list, categorical_features: list):
    xgb_train = xgb.DMatrix(data=x_train, label=y_train)
    xgb_valid = xgb.DMatrix(data=x_valid, label=y_valid)
    model = xgb.train(
                CFG.classification_xgb_params,
                dtrain = xgb_train,
                num_boost_round = CFG.num_boost_round,
                evals = [(xgb_train, 'train'), (xgb_valid, 'eval')],
                early_stopping_rounds = CFG.early_stopping_round,
                verbose_eval = CFG.verbose,
                feval = xgb_metric,
                maximize = CFG.metric_maximize_flag,
            )
    # Predict validation
    valid_pred = model.predict(xgb.DMatrix(x_valid))
    return model, valid_pred

#catboostでの学習メソッドの定義
def catboost_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame, features: list, categorical_features: list):
    cat_train = Pool(data=x_train, label=y_train, cat_features=categorical_features)
    cat_valid = Pool(data=x_valid, label=y_valid, cat_features=categorical_features)
    model = CatBoostClassifier(**CFG.classification_cat_params)
    model.fit(cat_train,
              eval_set = [cat_valid],
              early_stopping_rounds = CFG.early_stopping_round,
              verbose = CFG.verbose,
              use_best_model = True)
    # Predict validation
    valid_pred = model.predict_proba(x_valid)[:, 1]
    return model, valid_pred
#任意のモデルでのクロスバリデーション学習メソッドの定義
def gradient_boosting_model_cv_training(method: str, train_df: pd.DataFrame, features: list, categorical_features: list):
    # Create a numpy array to store out of folds predictions
    oof_predictions = np.zeros(len(train_df))
    oof_fold = np.zeros(len(train_df))
    kfold = KFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)
    for fold, (train_index, valid_index) in enumerate(kfold.split(train_df)):
        print('-'*50)
        print(f'{method} training fold {fold+1}')

        x_train = train_df[features].iloc[train_index]
        y_train = train_df[CFG.target_col].iloc[train_index]
        x_valid = train_df[features].iloc[valid_index]
        y_valid = train_df[CFG.target_col].iloc[valid_index]
        if method == 'lightgbm':
            model, valid_pred = lightgbm_training(x_train, y_train, x_valid, y_valid, features, categorical_features)
        if method == 'xgboost':
            model, valid_pred = xgboost_training(x_train, y_train, x_valid, y_valid, features, categorical_features)
        if method == 'catboost':
            model, valid_pred = catboost_training(x_train, y_train, x_valid, y_valid, features, categorical_features)

        # Save best model
        pickle.dump(model, open(f'{method}_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl', 'wb'))
        # Add to out of folds array
        oof_predictions[valid_index] = valid_pred
        oof_fold[valid_index] = fold + 1
        del x_train, x_valid, y_train, y_valid, model, valid_pred
        gc.collect()

    # Compute out of folds metric
    score = f1_score(train_df[CFG.target_col], oof_predictions >= 0.5, average='macro')
    print(f'{method} our out of folds CV f1score is {score}')
    # Create a dataframe to store out of folds predictions
    oof_df = pd.DataFrame({CFG.target_col: train_df[CFG.target_col], f'{method}_prediction': oof_predictions, 'fold': oof_fold})
    oof_df.to_csv(f'oof_{method}_seed{CFG.seed}_ver{CFG.VER}.csv', index = False)
#学習メソッドの定義
def Learning(input_df: pd.DataFrame, features: list, categorical_features: list):
    for method in CFG.METHOD_LIST:
        gradient_boosting_model_cv_training(method, input_df, features, categorical_features)

In [182]:
Learning(train_df, features, categorical_features)

--------------------------------------------------
lightgbm training fold 1
[LightGBM] [Info] Number of positive: 32336, number of negative: 3927
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006233 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17781
[LightGBM] [Info] Number of data points in the train set: 36263, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.891708 -> initscore=2.108305
[LightGBM] [Info] Start training from score 2.108305
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[17]	training's auc: 0.877079	training's f1score: 0.471377	valid_1's auc: 0.75175	valid_1's f1score: 0.47329
--------------------------------------------------
lightgbm training fold 2
[LightGBM] [Info] Number of positive: 32389, number of negative: 3874
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testin

[300]	train-logloss:0.21830	train-f1score:0.72821	eval-logloss:0.27750	eval-f1score:0.65377
[325]	train-logloss:0.21471	train-f1score:0.73396	eval-logloss:0.27776	eval-f1score:0.65066
[350]	train-logloss:0.21208	train-f1score:0.73781	eval-logloss:0.27812	eval-f1score:0.65232
[375]	train-logloss:0.20899	train-f1score:0.74362	eval-logloss:0.27852	eval-f1score:0.65303
[400]	train-logloss:0.20614	train-f1score:0.75167	eval-logloss:0.27886	eval-f1score:0.65422
[425]	train-logloss:0.20366	train-f1score:0.75708	eval-logloss:0.27922	eval-f1score:0.65232
[450]	train-logloss:0.20051	train-f1score:0.76255	eval-logloss:0.27931	eval-f1score:0.65349
[475]	train-logloss:0.19779	train-f1score:0.76829	eval-logloss:0.27980	eval-f1score:0.65181
[499]	train-logloss:0.19518	train-f1score:0.77332	eval-logloss:0.28021	eval-f1score:0.65086
--------------------------------------------------
xgboost training fold 2
[0]	train-logloss:0.65992	train-f1score:0.09652	eval-logloss:0.66040	eval-f1score:0.09925
[25]	tr

[375]	train-logloss:0.20874	train-f1score:0.74634	eval-logloss:0.27431	eval-f1score:0.67383
[400]	train-logloss:0.20599	train-f1score:0.75042	eval-logloss:0.27441	eval-f1score:0.67409
[425]	train-logloss:0.20314	train-f1score:0.75541	eval-logloss:0.27465	eval-f1score:0.67616
[450]	train-logloss:0.20044	train-f1score:0.76005	eval-logloss:0.27492	eval-f1score:0.67785
[475]	train-logloss:0.19750	train-f1score:0.76618	eval-logloss:0.27535	eval-f1score:0.67642
[499]	train-logloss:0.19428	train-f1score:0.77115	eval-logloss:0.27567	eval-f1score:0.67642
--------------------------------------------------
xgboost training fold 6
[0]	train-logloss:0.65993	train-f1score:0.09575	eval-logloss:0.66045	eval-f1score:0.10380
[25]	train-logloss:0.33461	train-f1score:0.65760	eval-logloss:0.34560	eval-f1score:0.67139
[50]	train-logloss:0.27823	train-f1score:0.65831	eval-logloss:0.29859	eval-f1score:0.66623
[75]	train-logloss:0.26035	train-f1score:0.66924	eval-logloss:0.28997	eval-f1score:0.66500
[100]	trai

300:	learn: 0.2609396	test: 0.2758581	best: 0.2758176 (285)	total: 51.1s	remaining: 33.8s
325:	learn: 0.2598976	test: 0.2758254	best: 0.2757978 (307)	total: 55.5s	remaining: 29.6s
350:	learn: 0.2588078	test: 0.2755731	best: 0.2755549 (344)	total: 59.3s	remaining: 25.2s
375:	learn: 0.2577035	test: 0.2755412	best: 0.2754674 (361)	total: 1m 3s	remaining: 20.9s
400:	learn: 0.2566166	test: 0.2755649	best: 0.2754674 (361)	total: 1m 7s	remaining: 16.6s
425:	learn: 0.2553612	test: 0.2754788	best: 0.2753920 (416)	total: 1m 11s	remaining: 12.3s
450:	learn: 0.2542523	test: 0.2754843	best: 0.2753920 (416)	total: 1m 14s	remaining: 8.08s
475:	learn: 0.2531712	test: 0.2756253	best: 0.2753920 (416)	total: 1m 18s	remaining: 3.93s
499:	learn: 0.2521735	test: 0.2756954	best: 0.2753920 (416)	total: 1m 21s	remaining: 0us

bestTest = 0.2753919932
bestIteration = 416

Shrink model to first 417 iterations.
--------------------------------------------------
catboost training fold 4
0:	learn: 0.6444497	test: 0.

325:	learn: 0.2606703	test: 0.2777167	best: 0.2777132 (313)	total: 44.7s	remaining: 23.9s
350:	learn: 0.2594832	test: 0.2777758	best: 0.2776512 (338)	total: 48.3s	remaining: 20.5s
375:	learn: 0.2581863	test: 0.2776723	best: 0.2776442 (363)	total: 51.8s	remaining: 17.1s
400:	learn: 0.2570088	test: 0.2776607	best: 0.2775683 (382)	total: 55.3s	remaining: 13.7s
425:	learn: 0.2559004	test: 0.2775127	best: 0.2775127 (425)	total: 58.9s	remaining: 10.2s
450:	learn: 0.2548765	test: 0.2773170	best: 0.2773010 (446)	total: 1m 2s	remaining: 6.78s
475:	learn: 0.2538842	test: 0.2773151	best: 0.2772904 (458)	total: 1m 5s	remaining: 3.33s
499:	learn: 0.2529498	test: 0.2772857	best: 0.2772630 (496)	total: 1m 9s	remaining: 0us

bestTest = 0.2772629859
bestIteration = 496

Shrink model to first 497 iterations.
catboost our out of folds CV f1score is 0.6491279898713637


In [183]:
def lightgbm_inference(x_test: pd.DataFrame):
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model = pickle.load(open(f'lightgbm_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl', 'rb'))
        # Predict
        pred = model.predict(x_test)
        test_pred += pred
    return test_pred / CFG.n_folds
def xgboost_inference(x_test: pd.DataFrame):
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model = pickle.load(open(f'xgboost_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl', 'rb'))
        # Predict
        pred = model.predict(xgb.DMatrix(x_test))
        test_pred += pred
    return test_pred / CFG.n_folds

def catboost_inference(x_test: pd.DataFrame):
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model = pickle.load(open(f'catboost_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl', 'rb'))
        # Predict
        pred = model.predict_proba(x_test)[:, 1]
        test_pred += pred
    return test_pred / CFG.n_folds

def gradient_boosting_model_inference(method: str, test_df: pd.DataFrame, features: list, categorical_features: list):
    x_test = test_df[features]
    if method == 'lightgbm':
        test_pred = lightgbm_inference(x_test)
    if method == 'xgboost':
        test_pred = xgboost_inference(x_test)
    if method == 'catboost':
        test_pred = catboost_inference(x_test)
    return test_pred

def Predicting(input_df: pd.DataFrame, features: list, categorical_features: list):
    output_df = input_df.copy()
    output_df['pred_prob'] = 0
    for method in CFG.METHOD_LIST:
        output_df[f'{method}_pred_prob'] = gradient_boosting_model_inference(method, input_df, features, categorical_features)
        output_df['pred_prob'] += CFG.model_weight_dict[method] * output_df[f'{method}_pred_prob']
    return output_df

In [184]:
test_df = Predicting(test_df, features, categorical_features)

In [185]:

#後処理の定義
def Postprocessing(train_df: pd.DataFrame(), test_df: pd.DataFrame()) -> (pd.DataFrame(), pd.DataFrame()):
    train_df['pred_prob'] = 0
    for method in CFG.METHOD_LIST:
        oof_df = pd.read_csv(f'oof_{method}_seed{CFG.seed}_ver{CFG.VER}.csv')
        train_df['pred_prob'] += CFG.model_weight_dict[method] * oof_df[f'{method}_prediction']
    best_score = 0
    best_v = 0
    for v in tqdm(np.arange(1000) / 1000):
        score = f1_score(oof_df[CFG.target_col], train_df[f'pred_prob'] >= v, average='macro')
        if score > best_score:
            best_score = score
            best_v = v
    print(best_score, best_v)
    test_df['target'] = np.where(test_df['pred_prob'] >= best_v, 1, 0)
    return train_df, test_df

In [186]:
'''
#後処理の定義、調和平均版 
def Postprocessing(train_df: pd.DataFrame(), test_df: pd.DataFrame()) -> (pd.DataFrame(), pd.DataFrame()): 
    train_df['pred_prob'] = 0 
    weight_sum = 0 
    for method in CFG.METHOD_LIST: 
        oof_df = pd.read_csv(f'oof_{method}_seed{CFG.seed}_ver{CFG.VER}.csv') 
        train_df['pred_prob'] += CFG.model_weight_dict[method] / oof_df[f'{method}_prediction'] 
        weight_sum += CFG.model_weight_dict[method] 
    train_df['pred_prob'] = weight_sum / train_df['pred_prob'] 
    best_score = 0 
    best_v = 0 
    for v in tqdm(np.arange(1000) / 1000):
        score = f1_score(oof_df[CFG.target_col], train_df[f'pred_prob'] >= v, average='macro') 
        if score > best_score: 
            best_score = score 
            best_v = v 
    print(best_score, best_v) 
    test_df['target'] = np.where(test_df['pred_prob'] >= best_v, 1, 0)
    return train_df, test_df
'''

"\n#後処理の定義、調和平均版 \ndef Postprocessing(train_df: pd.DataFrame(), test_df: pd.DataFrame()) -> (pd.DataFrame(), pd.DataFrame()): \n    train_df['pred_prob'] = 0 \n    weight_sum = 0 \n    for method in CFG.METHOD_LIST: \n        oof_df = pd.read_csv(f'oof_{method}_seed{CFG.seed}_ver{CFG.VER}.csv') \n        train_df['pred_prob'] += CFG.model_weight_dict[method] / oof_df[f'{method}_prediction'] \n        weight_sum += CFG.model_weight_dict[method] \n    train_df['pred_prob'] = weight_sum / train_df['pred_prob'] \n    best_score = 0 \n    best_v = 0 \n    for v in tqdm(np.arange(1000) / 1000):\n        score = f1_score(oof_df[CFG.target_col], train_df[f'pred_prob'] >= v, average='macro') \n        if score > best_score: \n            best_score = score \n            best_v = v \n    print(best_score, best_v) \n    test_df['target'] = np.where(test_df['pred_prob'] >= best_v, 1, 0)\n    return train_df, test_df\n"

In [187]:
#後処理
train_df, test_df = Postprocessing(train_df, test_df)

  0%|          | 0/1000 [00:00<?, ?it/s]

0.6874220849277614 0.776


In [188]:
test_df[['target']].to_csv(f'seed{CFG.seed}_ver{CFG.VER}_{CFG.AUTHOR}_submission.csv', header=False)

特徴量の重要度を確認する方法

In [189]:
model = pickle.load(open(f'lightgbm_fold1_seed42_ver{CFG.VER}.pkl', 'rb'))
importance_df = pd.DataFrame(model.feature_importance(), index=features, columns=['importance'])
importance_df['importance'] = importance_df['importance'] / np.sum(importance_df['importance'])
importance_df.sort_values('importance', ascending=False)

Unnamed: 0,importance
City_BankState,0.147059
DisbursementDate,0.101961
Term_NoEmp,0.086275
FranchiseCode_ApprovalDate,0.084314
Term_BankState,0.080392
ApprovalDate,0.080392
NoEmp_DisbursementDate,0.070588
UrbanRural,0.062745
NoEmp,0.05098
Term,0.033333


In [190]:
importance_df = importance_df.sort_values('importance', ascending=False)
index_list = importance_df.index.tolist()
print(index_list)

['City_BankState', 'DisbursementDate', 'Term_NoEmp', 'FranchiseCode_ApprovalDate', 'Term_BankState', 'ApprovalDate', 'NoEmp_DisbursementDate', 'UrbanRural', 'NoEmp', 'Term', 'LowDoc', 'City', 'RetainedJob', 'CreateJob', 'ApprovalFY', 'SBA_Appv', 'DisbursementGross', 'Bankraptcy_By_Year', 'RevLineCr', 'Sector', 'GrAppv', 'State', 'BankState', 'FranchiseCode']
