In [270]:
import sys
# ====================================================
# Library
# ====================================================
import os
import gc
import warnings
warnings.filterwarnings('ignore')
import random
import scipy as sp
import numpy as np
import pandas as pd
from glob import glob
from pathlib import Path
import joblib
import pickle
import itertools
from tqdm.auto import tqdm

import torch
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GroupKFold
from sklearn.metrics import log_loss, roc_auc_score, matthews_corrcoef, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce
import lightgbm as lgb
import xgboost as xgb
from catboost import Pool, CatBoostRegressor, CatBoostClassifier
pd.set_option('display.max_columns',100)

In [271]:
# ====================================================
# Configurations
# ====================================================
class CFG:
    VER = 3
    AUTHOR = 'naokisusami'
    COMPETITION = 'FDUA2'
    DATA_PATH = Path('/data')
    OOF_DATA_PATH = Path('/oof')
    MODEL_DATA_PATH = Path('/models')
    SUB_DATA_PATH = Path('/submission')
    METHOD_LIST = ['lightgbm', 'xgboost', 'catboost']
    seed = 42
    n_folds = 7
    target_col = 'MIS_Status'
    metric = 'f1_score'
    metric_maximize_flag = True
    num_boost_round = 500
    early_stopping_round = 200
    verbose = 25
    classification_lgb_params = {
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.05,
        'seed': seed,
    }
    classification_xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'learning_rate': 0.05,
        'random_state': seed,
    }

    classification_cat_params = {
        'learning_rate': 0.05,
        'iterations': num_boost_round,
        'random_seed': seed,
    }
    model_weight_dict = {'lightgbm': 0.50, 'xgboost': 0.10, 'catboost': 0.40}


In [272]:
# ====================================================
# Seed everything
# ====================================================
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

seed_everything(CFG.seed)

In [273]:
# ====================================================
# Metric
# ====================================================
# f1_score

# ====================================================
# LightGBM Metric
# ====================================================
def lgb_metric(y_pred, y_true):
    y_true = y_true.get_label()
    return 'f1score', f1_score(y_true, np.where(y_pred >= 0.5, 1, 0), average='macro'), CFG.metric_maximize_flag

# ====================================================
# XGBoost Metric
# ====================================================
def xgb_metric(y_pred, y_true):
    y_true = y_true.get_label()
    return 'f1score', f1_score(y_true, np.where(y_pred >= 0.5, 1, 0), average='macro')

In [274]:
#データの読み込み
train_df = pd.read_csv('train.csv', index_col=0)
test_df = pd.read_csv('test.csv', index_col=0)

In [275]:
#特徴量分類
default_numerical_features = ['Term', 'NoEmp', 'CreateJob', 'RetainedJob', 'DisbursementGross', 'GrAppv', 'SBA_Appv', 'ApprovalFY']
new_numerical_features = ['Unemployment_By_State','GDP_By_State','DisbursementMonth','DisbursementYear','ApprovalMonth','ApprovalYear',
                         'CompanyLong','BCI','ApprovalTerm','DisbursementTerm','DisbursementGrossPerMonth','SBA_Appv-DisbursementGross',
                          'AveSalary_By_State','EconomyGrowth_By_Year','Bankraptcy_By_Year','Unemploymentrate_By_Year']
default_categorical_features = ['NewExist', 'FranchiseCode', 'RevLineCr', 'LowDoc', 'UrbanRural', 'State', 'BankState', 'City', 'Sector']
new_categorical_features = ['Cor_State','Area','State-City']
add_numerical_features = ['FranchiseCode_count_encoding', 'RevLineCr_count_encoding', 'LowDoc_count_encoding',
                          'UrbanRural_count_encoding', 'State_count_encoding', 'BankState_count_encoding',
                          'City_count_encoding', 'Sector_count_encoding','Area_count_encoding','State-City_count_encoding']
numerical_features = add_numerical_features + default_numerical_features + new_numerical_features
categorical_features = ['RevLineCr', 'LowDoc', 'UrbanRural', 'State', 'Sector'] + ['NewExist','Cor_State','Area']
features = numerical_features + categorical_features

In [276]:
#前処理メソッドの定義
def Preprocessing(input_df: pd.DataFrame()) -> pd.DataFrame():
    #欠損値に対する処理
    def deal_missing(input_df: pd.DataFrame()) -> pd.DataFrame():
        output_df = input_df.copy()
        for col in ['RevLineCr', 'LowDoc', 'BankState']:
            output_df[col] = input_df[col].fillna('UNK')
        for col in ['DisbursementDate','ApprovalDate']:
            output_df[col] = input_df[col].fillna('50-NaN-50')
        return output_df
    #金額に対する前処理
    def clean_money(input_df: pd.DataFrame()) -> pd.DataFrame():
        output_df = input_df.copy()
        for col in ['DisbursementGross', 'GrAppv', 'SBA_Appv']:
            output_df[col] = input_df[col].str[1:].str.replace(',', '').str.replace(' ', '').astype(float)
        return output_df
    output_df = deal_missing(input_df)
    output_df = clean_money(output_df)
    output_df['NewExist'] = np.where(input_df['NewExist'] == 1, 1, 0)
    #特徴量作成
    def make_features(input_df: pd.DataFrame()) -> pd.DataFrame():
        df = input_df.copy()
        #日付関係の特徴量作成
        df['IfDisbursementDateNaN'] = df['DisbursementDate'].apply(lambda x:1 if x == '50-NaN-50' else 0)
        df[['DisbursementDay','DisbursementMonth','DisbursementYear']] = df['DisbursementDate'].str.split('-',expand=True)
        df[['ApprovalDay','ApprovalMonth','ApprovalYear']] = df['ApprovalDate'].str.split('-',expand=True)
        df['DisbursementDay'] = df['DisbursementDay'].astype(int)
        df['DisbursementYear'] = df['DisbursementYear'].astype(int)
        df['DisbursementYear'] = df['DisbursementYear'].apply(lambda x:x - 100 if x >50 else x)
        df['ApprovalDay'] = df['ApprovalDay'].astype(int)
        df['ApprovalYear'] = df['ApprovalYear'].astype(int)
        df['ApprovalYear'] = df['ApprovalYear'].apply(lambda x:x - 100 if x >50 else x)
        Month_dict = {'Jan':1,'Feb':2,'Mar':3,'Apr':4,'May':5,'Jun':6,'Jul':7,'Aug':8,'Sep':9,'Oct':10,'Nov':11,'Dec':12,'NaN':50}
        df['DisbursementMonth'] = df['DisbursementMonth'].map(Month_dict)
        df['ApprovalMonth'] = df['ApprovalMonth'].map(Month_dict)
        df['DisbursementDate'] = df['DisbursementYear'].astype(str)+df['DisbursementMonth'].astype(str)+df['DisbursementDay'].astype(str)
        df['DisbursementDate'] = df['DisbursementDate'].astype(int)
        df['CompanyLong'] = df['DisbursementYear'] - df['ApprovalYear']
        df['ApprovalTerm'] = 15 - df['ApprovalFY']
        df['DisbursementTerm'] = 15 - df['DisbursementYear']

        #経済成長率
        EconomyGrowthdata={-26:-0.6,-25:-0.4,-24:5.6,-23:4.6,-22:5.5,-21:3.2,-20:-0.26,-19:2.54,-18:-1.8,-17:4.58,-16:7.24,-15:4.17,
                           -14:3.46,-13:3.46,-12:4.18,-11:3.67,-10:1.89,-9:-0.11,-8:3.52,-7:2.75,-6:4.03,-5:2.68,-4:3.77,-3:4.45,
                           -2:4.18,-1:4.8,0:4.08,1:0.95,2:1.7,3:2.8,4:3.85,5:3.48,6:2.78,7: 2.01,8:0.12,9:-2.6,10:2.71,11:1.55,12:2.28,
                           13:1.84,14:2.29,15:2.71,16:1.67,17:2.24,18:2.95,19:2.30}
        #Bankraptdataの74~80は生成したもので実際の数値ではない。(失業率から換算して生成)
        Bankraptcydata={-26:32700,-25:52200,-24:46200,-23:42300,-22:36300,-21:34200,-20:46200,-19:44000,-18:48500,-17:69800,-16:62500,
                      -15:64500,-14:72000,-13:81500,-12:83000,-11:64500,-10:65000,-9:67000,-8:71000,-7:67000,-6:58000,-5:51000,
                        -4:52500,-3:54000,-2:51000,-1:41000,0:37500,1:35992,2:39845,3:37548,4:36785,5:31952,6:35292,7:21960,8:30741,
                        9:49091,10:61148,11:54212,12:46393,13:37552,14:31671,15:26130,16:24797,17:23591,18:23106,19:22157}
        
        #失業率
        Unemploymentratedata={-26:5.45,-25:8.7,-24:7.7,-23:7.05,-22:6.05,-21:5.7,-20:7.7,-19:7.35,-18:9.7,-17:9.75,-16:7.35,-15:7.4,
                              -14:7.1,-13:6.15,-12:5.4,-11:5.25,-10:5.35,-9:6.85,-8:7.75,-7:6.95,-6:6.1,-5:5.65,-4:5.4,-3:4.95,-2:4.5,
                              -1:4.3,0:4.0,1:4.55,2:5.8,3:6.25,4:5.55,5:5.0,6:4.65,7:4.65,8:5.7,9:9.5,10:9.4,11:9.05,12:8.2,13:7.4,
                              14:6.15,15:5.25,16:4.9,17:4.3,18:3.9,19:3.6}
        #年ごとのデータを、1-5年後の平均に変換
        datalist = [EconomyGrowthdata,Bankraptcydata,Unemploymentratedata]
        for k in datalist:
            for i in range(len(k)-5):
                k[-27+i] = 0
                for j in range(5):
                    k[-27+i] += k[-26+i+j]
                k[-27+i] = k[-27+i]/5
        
        df['EconomyGrowth_By_Year'] = df['DisbursementYear'].map(EconomyGrowthdata)
        df['Bankraptcy_By_Year'] = df['DisbursementYear'].map(Bankraptcydata)
        df['Unemploymentrate_By_Year'] = df['DisbursementYear'].map(Unemploymentratedata)
        
        
        
        
        
        
        
        
        #State関係の特徴量作成
        df['Cor_State'] = (df['State']==df['BankState']).astype(int)
        StateList = ['AL','AK','AZ','AR','CA','CO','CT','DE','DC','FL','GA','HI','ID','IL','IN','IA','KS','KY','LA','ME','MD','MA',
                      'MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','OH','OK','OR','PA','RI','SC','SD','TN','TX',
                      'UT','VT','VA','WA','WV','WI','WY']
        
        UnemploymentList = [2.6,3.7,4.0,3.4,4.1,2.8,4.0,4.6,4.2,2.7,3.1,3.7,2.8,4.6,3.1,3.0,2.9,3.9,3.5,3.1,3.0,3.7,4.3,2.9,4.0,
                          2.7,2.6,2.7,5.5,2.9,3.3,3.5,4.1,3.8,2.1,4.1,3.2,4.8,4.3,3.2,3.3,2.2,3.5,3.8,2.4,3.0,3.1
                            ,4.5,4.1,3.0,3.9]
        AreaList = ['SE','FW','SW','SE','FW','RM','NE','ME','ME','SE','SE','FW','RM','GL','GL','PL','PL','SE','SE','NE','ME',
                     'NE','GL','PL','SE','PL','RM','PL','FW','NE','ME','SW','ME','SE','PL','GL','SW','FW','ME','NE','SE','PL','SE',
                     'SW','RM','NE','SE','FW','SE','GL','RM']
        GDPList = [29603,44807,33655,27781,42376,40805,51911,56496,126421,33417,35265,38850,29843,39568,32724,35814,34770,30364,35181,
                   30282,39596,47351,32846,41353,24477,32590,28201,37075,40210,37375,45052,30943,49038,37053,34694,34040,29470,
                   38339,35153,36543,28894,35596,33742,37793,32774,34197,41617,40361,24929,34890,40303]
        
        GDPperPersonList = [37282,71008,48148,35674,53525,54943,63504,76720,164002,45958,48434,50788,39529,49083,40529,44091,43633,
                       38148,48366,37734,50729,55364,38433,51829,31127,41012,37966,46803,63662,46400,55320,41878,58126,49625,43172,
                       41073,40376,46248,43246,44738,38093,44955,42865,54766,47313,40312,54102,52810,31914,43309,63822]
        
        AveSalaryList = [40.46,50.81,45.40,37.79,56.10,49.79,60.14,49.66,79.85,43.66,46.17,44.09,36.45,51.71,40.97,38.39,40.96,
                        39.54,43.15,39.06,54.28,58.62,45.19,46.99,35.95,42.58,35.81,39.87,44.38,46.38,56.72,40.91,61.04,43.11,41.12,
                        43.45,40.75,43.46,46.10,46.38,39.63,35.00,41.88,48.35,41.11,39.54,52.07,51.04,38.48,41.46,44.03]
        
        Unemploymentdict = dict(zip(StateList,UnemploymentList))
        Areadict = dict(zip(StateList,AreaList))
        GDPdict = dict(zip(StateList,GDPList))
        GDPperPersondict = dict(zip(StateList,GDPperPersonList))
        AveSalarydict = dict(zip(StateList,AveSalaryList))
        
        df['Unemployment_By_State'] = df['State'].map(Unemploymentdict)
        df['Area'] = df['State'].map(Areadict)
        df['GDP_By_State'] = df['State'].map(GDPdict)
        df['GDPperPerson_By_State'] = df['State'].map(GDPperPersondict)
        df['AveSalary_By_State'] = df['State'].map(AveSalarydict)
        
        df['State-City'] = df['State']+df['City']
        
        
        
        
        

        
        
        #現状グループ分けされない特徴量の作成
        #企業の安定さ、デカさ
        df['BCI'] = df['CompanyLong']*(df['NoEmp'])*(df['NewExist']+1)
        df['BCI'] = df['BCI'].fillna(df['BCI'].mean)
        #一か月あたりの返済必要量
        df['DisbursementGrossPerMonth'] = df['DisbursementGross']/(df['Term']+1)
        #SBA承認より減らした額
        df['SBA_Appv-DisbursementGross'] = df['SBA_Appv']-df['DisbursementGross']
        
        

        
        
        return df
    output_df = make_features(output_df)
    return output_df

In [277]:
#前処理の実行
train_df = Preprocessing(train_df)
test_df = Preprocessing(test_df)
print(train_df)

       Term  NoEmp  NewExist  CreateJob  RetainedJob  FranchiseCode RevLineCr  \
0       163     21         1          0            0              1         N   
1        84      6         1          4            0              0         0   
2       242     45         1          4           90              0         N   
3       237      4         1          0            0              0         N   
4       184      0         1          0            0              0         N   
...     ...    ...       ...        ...          ...            ...       ...   
42302   283     14         1          0            0              1         N   
42303    53      2         1          0            0              0         Y   
42304    59      6         0          0            0              1         N   
42305   295     18         1          0            8              0         N   
42306    84      4         1          0            8              0         N   

      LowDoc  DisbursementD

（以下はPreprocessingに本来組み込むべきだが，コードが煩雑になるので，いったん切り出している．）

In [278]:
#カウントエンコーディング
for col in ['FranchiseCode', 'RevLineCr', 'LowDoc', 'UrbanRural', 'State', 'BankState', 'City', 'Sector','Area','State-City']:
    count_dict = dict(train_df[col].value_counts())
    train_df[f'{col}_count_encoding'] = train_df[col].map(count_dict)
    test_df[f'{col}_count_encoding'] = test_df[col].map(count_dict).fillna(1).astype(int)
#ラベルエンコーディング
for col in categorical_features:
    encoder = LabelEncoder()
    encoder.fit(train_df[col])
    train_df[col] = encoder.transform(train_df[col])
    test_df[col] = encoder.transform(test_df[col])

In [279]:
#OneHotEncoding
train_df2 = train_df.drop(['MIS_Status'],axis=1)
OneHotList = ['RevLineCr', 'LowDoc']
ohe = ce.OneHotEncoder(cols=OneHotList,use_cat_names=True)
train_df2 = ohe.fit_transform(train_df2)
test_df = ohe.transform(test_df)
train_df = pd.concat([train_df2,train_df['MIS_Status']],axis=1)

#featuresの作成
categorical_features = ['RevLineCr_0.0','RevLineCr_1.0','RevLineCr_2.0','RevLineCr_3.0','RevLineCr_4.0', 'LowDoc_0.0','LowDoc_1.0',
                        'LowDoc_2.0','LowDoc_3.0','LowDoc_4.0','LowDoc_5.0','LowDoc_6.0','UrbanRural','IfDisbursementDateNaN',
                        'State', 'Sector']
RemoveList=['DisbursementDate','City','ApprovalDate','ApprovalFY','BankState','FranchiseCode','State-City']
features = train_df.columns.tolist()
for i in RemoveList:
    features.remove(i)
features.remove('MIS_Status')

for i in RemoveList:
    train_df.drop([i],axis=1)
    test_df.drop([i],axis=1)
print(train_df)
print(features)

       Term  NoEmp  NewExist  CreateJob  RetainedJob  FranchiseCode  \
0       163     21         1          0            0              1   
1        84      6         1          4            0              0   
2       242     45         1          4           90              0   
3       237      4         1          0            0              0   
4       184      0         1          0            0              0   
...     ...    ...       ...        ...          ...            ...   
42302   283     14         1          0            0              1   
42303    53      2         1          0            0              0   
42304    59      6         0          0            0              1   
42305   295     18         1          0            8              0   
42306    84      4         1          0            8              0   

       RevLineCr_1.0  RevLineCr_0.0  RevLineCr_4.0  RevLineCr_3.0  \
0                  1              0              0              0   
1        

In [280]:
#lightgbmでの学習メソッドの定義
def lightgbm_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame, features: list, categorical_features: list):
    lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=categorical_features)
    lgb_valid = lgb.Dataset(x_valid, y_valid, categorical_feature=categorical_features)
    model = lgb.train(
                params = CFG.classification_lgb_params,
                train_set = lgb_train,
                num_boost_round = CFG.num_boost_round,
                valid_sets = [lgb_train, lgb_valid],
                feval = lgb_metric,
                callbacks=[lgb.early_stopping(stopping_rounds=CFG.early_stopping_round,
                                              verbose=CFG.verbose)]
            )
    # Predict validation
    valid_pred = model.predict(x_valid)
    return model, valid_pred

#xgboostでの学習メソッドの定義
def xgboost_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame, features: list, categorical_features: list):
    xgb_train = xgb.DMatrix(data=x_train, label=y_train)
    xgb_valid = xgb.DMatrix(data=x_valid, label=y_valid)
    model = xgb.train(
                CFG.classification_xgb_params,
                dtrain = xgb_train,
                num_boost_round = CFG.num_boost_round,
                evals = [(xgb_train, 'train'), (xgb_valid, 'eval')],
                early_stopping_rounds = CFG.early_stopping_round,
                verbose_eval = CFG.verbose,
                feval = xgb_metric,
                maximize = CFG.metric_maximize_flag,
            )
    # Predict validation
    valid_pred = model.predict(xgb.DMatrix(x_valid))
    return model, valid_pred

#catboostでの学習メソッドの定義
def catboost_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame, features: list, categorical_features: list):
    cat_train = Pool(data=x_train, label=y_train, cat_features=categorical_features)
    cat_valid = Pool(data=x_valid, label=y_valid, cat_features=categorical_features)
    model = CatBoostClassifier(**CFG.classification_cat_params)
    model.fit(cat_train,
              eval_set = [cat_valid],
              early_stopping_rounds = CFG.early_stopping_round,
              verbose = CFG.verbose,
              use_best_model = True)
    # Predict validation
    valid_pred = model.predict_proba(x_valid)[:, 1]
    return model, valid_pred
#任意のモデルでのクロスバリデーション学習メソッドの定義
def gradient_boosting_model_cv_training(method: str, train_df: pd.DataFrame, features: list, categorical_features: list):
    # Create a numpy array to store out of folds predictions
    oof_predictions = np.zeros(len(train_df))
    oof_fold = np.zeros(len(train_df))
    kfold = KFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)
    for fold, (train_index, valid_index) in enumerate(kfold.split(train_df)):
        print('-'*50)
        print(f'{method} training fold {fold+1}')

        x_train = train_df[features].iloc[train_index]
        y_train = train_df[CFG.target_col].iloc[train_index]
        x_valid = train_df[features].iloc[valid_index]
        y_valid = train_df[CFG.target_col].iloc[valid_index]
        if method == 'lightgbm':
            model, valid_pred = lightgbm_training(x_train, y_train, x_valid, y_valid, features, categorical_features)
        if method == 'xgboost':
            model, valid_pred = xgboost_training(x_train, y_train, x_valid, y_valid, features, categorical_features)
        if method == 'catboost':
            model, valid_pred = catboost_training(x_train, y_train, x_valid, y_valid, features, categorical_features)

        # Save best model
        pickle.dump(model, open(f'{method}_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl', 'wb'))
        # Add to out of folds array
        oof_predictions[valid_index] = valid_pred
        oof_fold[valid_index] = fold + 1
        del x_train, x_valid, y_train, y_valid, model, valid_pred
        gc.collect()

    # Compute out of folds metric
    score = f1_score(train_df[CFG.target_col], oof_predictions >= 0.5, average='macro')
    print(f'{method} our out of folds CV f1score is {score}')
    # Create a dataframe to store out of folds predictions
    oof_df = pd.DataFrame({CFG.target_col: train_df[CFG.target_col], f'{method}_prediction': oof_predictions, 'fold': oof_fold})
    oof_df.to_csv(f'oof_{method}_seed{CFG.seed}_ver{CFG.VER}.csv', index = False)
#学習メソッドの定義
def Learning(input_df: pd.DataFrame, features: list, categorical_features: list):
    for method in CFG.METHOD_LIST:
        gradient_boosting_model_cv_training(method, input_df, features, categorical_features)

In [281]:
Learning(train_df, features, categorical_features)

--------------------------------------------------
lightgbm training fold 1
[LightGBM] [Info] Number of positive: 32336, number of negative: 3927
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010076 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3264
[LightGBM] [Info] Number of data points in the train set: 36263, number of used features: 55
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.891708 -> initscore=2.108305
[LightGBM] [Info] Start training from score 2.108305
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[241]	training's auc: 0.934551	training's f1score: 0.702359	valid_1's auc: 0.759196	valid_1's f1score: 0.633928
--------------------------------------------------
lightgbm training fold 2
[LightGBM] [Info] Number of positive: 32389, number of negative: 3874
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of test

[175]	train-logloss:0.22960	train-f1score:0.70936	eval-logloss:0.28018	eval-f1score:0.66719
[200]	train-logloss:0.22486	train-f1score:0.71491	eval-logloss:0.28055	eval-f1score:0.66846
[225]	train-logloss:0.22115	train-f1score:0.72064	eval-logloss:0.28105	eval-f1score:0.66934
[250]	train-logloss:0.21763	train-f1score:0.72631	eval-logloss:0.28147	eval-f1score:0.66796
[275]	train-logloss:0.21261	train-f1score:0.73207	eval-logloss:0.28213	eval-f1score:0.66834
[300]	train-logloss:0.20908	train-f1score:0.73997	eval-logloss:0.28261	eval-f1score:0.66697
[325]	train-logloss:0.20477	train-f1score:0.74804	eval-logloss:0.28290	eval-f1score:0.66722
[350]	train-logloss:0.20067	train-f1score:0.75599	eval-logloss:0.28327	eval-f1score:0.66634
[375]	train-logloss:0.19567	train-f1score:0.76574	eval-logloss:0.28350	eval-f1score:0.66834
[400]	train-logloss:0.19170	train-f1score:0.77532	eval-logloss:0.28399	eval-f1score:0.66897
[425]	train-logloss:0.18860	train-f1score:0.78118	eval-logloss:0.28441	eval-f1sc

[275]	train-logloss:0.21178	train-f1score:0.73535	eval-logloss:0.28044	eval-f1score:0.66216
[300]	train-logloss:0.20825	train-f1score:0.73983	eval-logloss:0.28065	eval-f1score:0.66182
[325]	train-logloss:0.20546	train-f1score:0.74594	eval-logloss:0.28080	eval-f1score:0.66094
[350]	train-logloss:0.20230	train-f1score:0.75145	eval-logloss:0.28116	eval-f1score:0.66021
[375]	train-logloss:0.19900	train-f1score:0.75798	eval-logloss:0.28169	eval-f1score:0.66021
[400]	train-logloss:0.19588	train-f1score:0.76303	eval-logloss:0.28179	eval-f1score:0.65957
[425]	train-logloss:0.19307	train-f1score:0.76856	eval-logloss:0.28180	eval-f1score:0.65957
[450]	train-logloss:0.18969	train-f1score:0.77682	eval-logloss:0.28212	eval-f1score:0.65868
[475]	train-logloss:0.18633	train-f1score:0.78476	eval-logloss:0.28251	eval-f1score:0.65909
[482]	train-logloss:0.18523	train-f1score:0.78860	eval-logloss:0.28265	eval-f1score:0.66085
xgboost our out of folds CV f1score is 0.6417437771980364
----------------------

300:	learn: 0.2567019	test: 0.2691333	best: 0.2691314 (294)	total: 21.2s	remaining: 14s
325:	learn: 0.2549830	test: 0.2690228	best: 0.2690228 (325)	total: 23s	remaining: 12.3s
350:	learn: 0.2532408	test: 0.2688211	best: 0.2688211 (350)	total: 24.7s	remaining: 10.5s
375:	learn: 0.2514437	test: 0.2687913	best: 0.2687586 (367)	total: 26.5s	remaining: 8.73s
400:	learn: 0.2498442	test: 0.2689218	best: 0.2687586 (367)	total: 28.2s	remaining: 6.96s
425:	learn: 0.2480275	test: 0.2687905	best: 0.2687496 (418)	total: 30s	remaining: 5.21s
450:	learn: 0.2461752	test: 0.2689252	best: 0.2687496 (418)	total: 32s	remaining: 3.48s
475:	learn: 0.2446963	test: 0.2688803	best: 0.2687496 (418)	total: 34.1s	remaining: 1.72s
499:	learn: 0.2428746	test: 0.2688689	best: 0.2687496 (418)	total: 36s	remaining: 0us

bestTest = 0.2687495992
bestIteration = 418

Shrink model to first 419 iterations.
--------------------------------------------------
catboost training fold 5
0:	learn: 0.6466032	test: 0.6463005	best: 

In [282]:
def lightgbm_inference(x_test: pd.DataFrame):
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model = pickle.load(open(f'lightgbm_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl', 'rb'))
        # Predict
        pred = model.predict(x_test)
        test_pred += pred
    return test_pred / CFG.n_folds
def xgboost_inference(x_test: pd.DataFrame):
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model = pickle.load(open(f'xgboost_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl', 'rb'))
        # Predict
        pred = model.predict(xgb.DMatrix(x_test))
        test_pred += pred
    return test_pred / CFG.n_folds

def catboost_inference(x_test: pd.DataFrame):
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model = pickle.load(open(f'catboost_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl', 'rb'))
        # Predict
        pred = model.predict_proba(x_test)[:, 1]
        test_pred += pred
    return test_pred / CFG.n_folds

def gradient_boosting_model_inference(method: str, test_df: pd.DataFrame, features: list, categorical_features: list):
    x_test = test_df[features]
    if method == 'lightgbm':
        test_pred = lightgbm_inference(x_test)
    if method == 'xgboost':
        test_pred = xgboost_inference(x_test)
    if method == 'catboost':
        test_pred = catboost_inference(x_test)
    return test_pred

def Predicting(input_df: pd.DataFrame, features: list, categorical_features: list):
    output_df = input_df.copy()
    output_df['pred_prob'] = 0
    for method in CFG.METHOD_LIST:
        output_df[f'{method}_pred_prob'] = gradient_boosting_model_inference(method, input_df, features, categorical_features)
        output_df['pred_prob'] += CFG.model_weight_dict[method] * output_df[f'{method}_pred_prob']
    return output_df

In [283]:
test_df = Predicting(test_df, features, categorical_features)

In [284]:
#後処理の定義
def Postprocessing(train_df: pd.DataFrame(), test_df: pd.DataFrame()) -> (pd.DataFrame(), pd.DataFrame()):
    train_df['pred_prob'] = 0
    for method in CFG.METHOD_LIST:
        oof_df = pd.read_csv(f'oof_{method}_seed{CFG.seed}_ver{CFG.VER}.csv')
        train_df['pred_prob'] += CFG.model_weight_dict[method] * oof_df[f'{method}_prediction']
    best_score = 0
    best_v = 0
    for v in tqdm(np.arange(1000) / 1000):
        score = f1_score(oof_df[CFG.target_col], train_df[f'pred_prob'] >= v, average='macro')
        if score > best_score:
            best_score = score
            best_v = v
    print(best_score, best_v)
    test_df['target'] = np.where(test_df['pred_prob'] >= best_v, 1, 0)
    return train_df, test_df

In [285]:
#後処理
train_df, test_df = Postprocessing(train_df, test_df)

  0%|          | 0/1000 [00:00<?, ?it/s]

0.6834636412074487 0.738


In [286]:
test_df[['target']].to_csv(f'seed{CFG.seed}_ver{CFG.VER}_{CFG.AUTHOR}_submission.csv', header=False)

特徴量の重要度を確認する方法

In [287]:
model = pickle.load(open(f'lightgbm_fold1_seed42_ver{CFG.VER}.pkl', 'rb'))
importance_df = pd.DataFrame(model.feature_importance(), index=features, columns=['importance'])
importance_df['importance'] = importance_df['importance'] / np.sum(importance_df['importance'])
importance_df.sort_values('importance', ascending=False)

Unnamed: 0,importance
State,0.097234
Term,0.054219
SBA_Appv-DisbursementGross,0.044952
DisbursementGrossPerMonth,0.044952
NoEmp,0.043983
BCI,0.0426
SBA_Appv,0.039557
ApprovalDay,0.038728
Sector,0.035685
ApprovalMonth,0.029322
