In [15]:
import sys
# ====================================================
# Library
# ====================================================
import os
import gc
import warnings
warnings.filterwarnings('ignore')
import random
import scipy as sp
import numpy as np
import pandas as pd
from glob import glob
from pathlib import Path
import joblib
import pickle
import itertools
from tqdm.auto import tqdm
import torch
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GroupKFold
from sklearn.metrics import log_loss, roc_auc_score, matthews_corrcoef, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
#import optuna.integration.lightgbm as lgb
import xgboost as xgb
from catboost import Pool, CatBoostRegressor, CatBoostClassifier
# from keras.layers import Dense, Dropout
# from keras.models import Sequential
# from keras.callbacks import EarlyStopping
# from keras.optimizers import Adam
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score
import torch.optim as optim
from sklearn.svm import SVC

pd.set_option('display.max_columns',1000)
pd.set_option('display.max_rows',100)

# ====================================================
# Configurations
# ====================================================
class CFG:
    VER = 11
    AUTHOR = 'Yuta.K'
    COMPETITION = 'FDUA2'
    DATA_PATH = Path('/data')
    OOF_DATA_PATH = Path('/oof')
    MODEL_DATA_PATH = Path('/models')
    SUB_DATA_PATH = Path('/submission')
    METHOD_LIST = [ 'svm', 'adaboost','lightgbm', 'xgboost', 'catboost']
    seed = 42
    n_folds = 7
    target_col = 'MIS_Status'
    metric = 'f1_score'
    metric_maximize_flag = True
    num_boost_round = 500
    early_stopping_round = 200
    verbose = 25
    classification_lgb_params = {
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.05,
        'seed': seed,
    }
    classification_xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'learning_rate': 0.05,
        'random_state': seed,
    }

    classification_cat_params = {
        'learning_rate': 0.05,
        'iterations': num_boost_round,
        'random_seed': seed,
    }
    classification_adaboost_params = {
        'n_estimators': 100,
        'learning_rate': 1.0,
        'random_state': 42,
    }
    classification_svm_params = {
        'C': 1.0,  # 正則化パラメータ
        'kernel': 'rbf',  # カーネルの種類
        'gamma': 'scale',  # カーネル係数
        'class_weight': 'balanced',  # クラスの不均衡に対応
        'probability': True,  # 確率推定を有効化
        'random_state': seed,  # 乱数シード
    }
    model_weight_dict = {'lightgbm': 0.35, 'xgboost': 0.10, 'catboost': 0.35, 'adaboost': 0.10, 'svm': 0.10}
    
# ====================================================
# Seed everything
# ====================================================
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

seed_everything(CFG.seed)


# ====================================================
# Metric
# ====================================================
# f1_score

# ====================================================
# LightGBM Metric
# ====================================================
def lgb_metric(y_pred, y_true):
    y_true = y_true.get_label()
    return 'f1score', f1_score(y_true, np.where(y_pred >= 0.5, 1, 0), average='macro'), CFG.metric_maximize_flag

# ====================================================
# XGBoost Metric
# ====================================================
def xgb_metric(y_pred, y_true):
    y_true = y_true.get_label()
    return 'f1score', f1_score(y_true, np.where(y_pred >= 0.5, 1, 0), average='macro')

In [2]:
import sklearn
print(sklearn.__version__)

1.4.0


In [3]:
#データの読み込み
train_df = pd.read_csv('train.csv', index_col=0)
test_df = pd.read_csv('test.csv', index_col=0)
categorical_features = ['RevLineCr', 'LowDoc', 'UrbanRural', 'State', 'Sector']

In [4]:
#前処理メソッドの定義
def Preprocessing(input_df: pd.DataFrame()) -> pd.DataFrame():
    #欠損値に対する処理
    def deal_missing(input_df: pd.DataFrame()) -> pd.DataFrame():
        df = input_df.copy()
        for col in ['RevLineCr', 'LowDoc', 'BankState']:
            df[col] = input_df[col].fillna('UNK')
        for col in ['DisbursementDate','ApprovalDate']:
            df[col] = input_df[col].fillna('50-NaN-50')
        
        return df
    #金額に対する前処理
    def clean_money(input_df: pd.DataFrame()) -> pd.DataFrame():
        df = input_df.copy()
        for col in ['DisbursementGross', 'GrAppv', 'SBA_Appv']:
            df[col] = input_df[col].str[1:].str.replace(',', '').str.replace(' ', '').astype(float)
        return df
    df = deal_missing(input_df)
    df = clean_money(df)
    df['NewExist'] = np.where(input_df['NewExist'] == 1, 1, 0)
    #特徴量作成
    def make_features(input_df: pd.DataFrame()) -> pd.DataFrame():
        df = input_df.copy()
        #日付関係の特徴量作成
        df[['DisbursementDay','DisbursementMonth','DisbursementYear']] = df['DisbursementDate'].str.split('-',expand=True)
        df[['ApprovalDay','ApprovalMonth','ApprovalYear']] = df['ApprovalDate'].str.split('-',expand=True)
        df['DisbursementDay'] = df['DisbursementDay'].astype(int)
        df['DisbursementYear'] = df['DisbursementYear'].astype(int)
        df['ApprovalDay'] = df['ApprovalDay'].astype(int)
        df['ApprovalYear'] = df['ApprovalYear'].astype(int)
        Month_dict = {'Jan':1,'Feb':2,'Mar':3,'Apr':4,'May':5,'Jun':6,'Jul':7,'Aug':8,'Sep':9,'Oct':10,'Nov':11,'Dec':12,'NaN':50}
        df['DisbursementMonth'] = df['DisbursementMonth'].map(Month_dict)
        df['ApprovalMonth'] = df['ApprovalMonth'].map(Month_dict)
        df['DisbursementDate'] = df['DisbursementYear'].astype(str)+df['DisbursementMonth'].astype(str)+df['DisbursementDay'].astype(str)
        df['DisbursementYear'] = df['DisbursementYear'].apply(lambda x:x - 100 if x >50 else x)
        df['ApprovalYear'] = df['ApprovalYear'].apply(lambda x:x - 100 if x >50 else x)
        df['CompanyLong'] = df['DisbursementYear'] - df['ApprovalYear']

        #Bankraptcydataの74~80は生成したもので実際の数値ではない。(失業率から換算して生成)
        Bankraptcydata={-26:32700,-25:52200,-24:46200,-23:42300,-22:36300,-21:34200,-20:46200,-19:44000,-18:48500,-17:69800,-16:62500,
                      -15:64500,-14:72000,-13:81500,-12:83000,-11:64500,-10:65000,-9:67000,-8:71000,-7:67000,-6:58000,-5:51000,
                        -4:52500,-3:54000,-2:51000,-1:41000,0:37500,1:35992,2:39845,3:37548,4:36785,5:31952,6:35292,7:21960,8:30741,
                        9:49091,10:61148,11:54212,12:46393,13:37552,14:31671,15:26130,16:24797,17:23591,18:23106,19:22157}

        #年ごとのデータを、1-5年後の平均に変換
        datalist = [Bankraptcydata]
        for k in datalist:
            for i in range(len(k)-5):
                k[-27+i] = 0
                for j in range(5):
                    k[-27+i] += k[-26+i+j]
                k[-27+i] = k[-27+i]/5
            k[50] = k[-26]*2
        
        df['Bankraptcy_By_Year'] = df['DisbursementYear'].map(Bankraptcydata)

        #組み合わせ特徴量
        df['State_Sector'] = df['State'].astype(str) + '_' + df['Sector'].astype(str)
         # 地理的特徴の組み合わせ
        df['City_State'] = df['City'] + '_' + df['State']
        # 時間的特徴の組み合わせ
        df['ApprovalFY_Term'] = df['ApprovalFY'].astype(str) + '_' + df['Term'].astype(str)
        
        df['FranchiseCode_ApprovalDate'] = df['FranchiseCode'].astype(str) + '_' + df['ApprovalDate'].astype(str)
        
        df['Term_NoEmp'] = df['Term'].astype(str) + '_' + df['NoEmp'].astype(str)
        
        df['City_BankState'] = df['City'].astype(str) + '_' + df['BankState'].astype(str)
        
        df['NoEmp_SBA_Appv'] = df['NoEmp'].astype(str) + '_' + df['SBA_Appv'].astype(str)
        
        #特徴量の加工
        #lowdoc ['LowDoc_Y', 'LowDoc_S', 'LowDoc_N', 'LowDoc_A', 'LowDoc_C', 'LowDoc_0', 'LowDoc_UNK']
        df['LowDoc_Y'] = (df['LowDoc'] == 'Y').astype(int)
        df['LowDoc_S'] = (df['LowDoc'] == 'S').astype(int)
        df['LowDoc_N'] = (df['LowDoc'] == 'N').astype(int)
        df['LowDoc_C'] = (df['LowDoc'] == 'C').astype(int)
        df['LowDoc_A'] = (df['LowDoc'] == 'A').astype(int)
        df['LowDoc_0'] = (df['LowDoc'] == '0').astype(int)
        df['LowDoc_UNK'] = (df['LowDoc'] == 'UNK').astype(int)

        #RevLineCr ['RevLineCr_Y', 'RevLineCr_T', 'RevLineCr_N', 'RevLineCr_0', 'RevLineCr_UNK']
        df['RevLineCr_Y'] = (df['RevLineCr'] == 'Y').astype(int)
        df['RevLineCr_T'] = (df['RevLineCr'] == 'T').astype(int)
        df['RevLineCr_N'] = (df['RevLineCr'] == 'N').astype(int)
        df['RevLineCr_0'] = (df['RevLineCr'] == '0').astype(int)
        df['RevLineCr_UNK'] = (df['RevLineCr'] == 'UNK').astype(int)


        
        return df
    df = make_features(df)
    return df

In [5]:
#前処理の実行
train_df = Preprocessing(train_df)
test_df = Preprocessing(test_df)

In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42307 entries, 0 to 42306
Data columns (total 47 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Term                        42307 non-null  int64  
 1   NoEmp                       42307 non-null  int64  
 2   NewExist                    42307 non-null  int32  
 3   CreateJob                   42307 non-null  int64  
 4   RetainedJob                 42307 non-null  int64  
 5   FranchiseCode               42307 non-null  int64  
 6   RevLineCr                   42307 non-null  object 
 7   LowDoc                      42307 non-null  object 
 8   DisbursementDate            42307 non-null  object 
 9   MIS_Status                  42307 non-null  int64  
 10  Sector                      42307 non-null  int64  
 11  ApprovalDate                42307 non-null  object 
 12  ApprovalFY                  42307 non-null  int64  
 13  City                        423

（以下はPreprocessingに本来組み込むべきだが，コードが煩雑になるので，いったん切り出している．）

In [7]:
#ラベルエンコーディング
for col in categorical_features :
    le = LabelEncoder()
    le.fit(train_df[col])
    train_df[col] = le.transform(train_df[col])
    test_df[col] = le.transform(test_df[col])
    
categorical_features_unlabelable = ['ApprovalFY_Term','City_State','City','ApprovalDate','BankState','DisbursementDate','State_Sector',
                                   'FranchiseCode_ApprovalDate','Term_NoEmp','City_BankState','NoEmp_SBA_Appv']
'''
for col in categorical_features_unlabelable:
    le = LabelEncoder()   
    le.fit(train_df[col])
    train_df[col] = le.transform(train_df[col])
    test_df[col] = test_df[col].apply(lambda x: le.transform([x])[0] if x in le.classes_ else len(le.classes_))
'''
for col in categorical_features_unlabelable:
    encoder = LabelEncoder()
    combined = pd.concat([train_df[col], test_df[col]], axis=0)
    encoder.fit(combined)
    train_df[col] = encoder.transform(train_df[col])
    test_df[col] = encoder.transform(test_df[col])

In [8]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42307 entries, 0 to 42306
Data columns (total 47 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Term                        42307 non-null  int64  
 1   NoEmp                       42307 non-null  int64  
 2   NewExist                    42307 non-null  int32  
 3   CreateJob                   42307 non-null  int64  
 4   RetainedJob                 42307 non-null  int64  
 5   FranchiseCode               42307 non-null  int64  
 6   RevLineCr                   42307 non-null  int32  
 7   LowDoc                      42307 non-null  int32  
 8   DisbursementDate            42307 non-null  int32  
 9   MIS_Status                  42307 non-null  int64  
 10  Sector                      42307 non-null  int64  
 11  ApprovalDate                42307 non-null  int32  
 12  ApprovalFY                  42307 non-null  int64  
 13  City                        423

In [9]:
#featuresの作成
categorical_features = ['State', 'Sector','RevLineCr_Y', 'RevLineCr_T', 'RevLineCr_N', 'RevLineCr_0', 'RevLineCr_UNK',
                       'LowDoc_Y', 'LowDoc_S', 'LowDoc_N', 'LowDoc_A', 'LowDoc_C', 'LowDoc_0', 'LowDoc_UNK',
                       'ApprovalFY_Term','City_State','City','ApprovalDate','BankState','State_Sector','UrbanRural',
                        'FranchiseCode_ApprovalDate','Term_NoEmp','City_BankState','NoEmp_SBA_Appv']


RemoveList=['MIS_Status']
features = train_df.columns.tolist()
for i in RemoveList:
    print(i)
    features.remove(i)

MIS_Status


In [10]:
print(train_df)
print(features)

       Term  NoEmp  NewExist  CreateJob  RetainedJob  FranchiseCode  \
0       163     21         1          0            0              1   
1        84      6         1          4            0              0   
2       242     45         1          4           90              0   
3       237      4         1          0            0              0   
4       184      0         1          0            0              0   
...     ...    ...       ...        ...          ...            ...   
42302   283     14         1          0            0              1   
42303    53      2         1          0            0              0   
42304    59      6         0          0            0              1   
42305   295     18         1          0            8              0   
42306    84      4         1          0            8              0   

       RevLineCr  LowDoc  DisbursementDate  MIS_Status  Sector  ApprovalDate  \
0              1       3               847           1       0     

In [11]:
# AdaBoost training
def adaboost_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame, features: list, categorical_features: list):
    model = AdaBoostClassifier(**CFG.classification_adaboost_params)
    model.fit(x_train, y_train)
    valid_pred = model.predict_proba(x_valid)[:, 1]
    return model, valid_pred

#svm training
def svm_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame, features: list, categorical_features: list):
    model = SVC(**CFG.classification_svm_params, probability=True)  # SVMモデルの初期化
    model.fit(x_train[features], y_train)  # モデルのトレーニング
    valid_pred = model.predict_proba(x_valid[features])[:, 1]  # 予測確率の取得
    return model, valid_pred
    
#lightgbmでの学習メソッドの定義
def lightgbm_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame, features: list, categorical_features: list):
    lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=categorical_features)
    lgb_valid = lgb.Dataset(x_valid, y_valid, categorical_feature=categorical_features)
    model = lgb.train(
                params = CFG.classification_lgb_params,
                train_set = lgb_train,
                num_boost_round = CFG.num_boost_round,
                valid_sets = [lgb_train, lgb_valid],
                feval = lgb_metric,
                callbacks=[lgb.early_stopping(stopping_rounds=CFG.early_stopping_round,
                                              verbose=CFG.verbose)]
            )
    # Predict validation
    valid_pred = model.predict(x_valid)
    return model, valid_pred

#xgboostでの学習メソッドの定義
def xgboost_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame, features: list, categorical_features: list):
    xgb_train = xgb.DMatrix(data=x_train, label=y_train)
    xgb_valid = xgb.DMatrix(data=x_valid, label=y_valid)
    model = xgb.train(
                CFG.classification_xgb_params,
                dtrain = xgb_train,
                num_boost_round = CFG.num_boost_round,
                evals = [(xgb_train, 'train'), (xgb_valid, 'eval')],
                early_stopping_rounds = CFG.early_stopping_round,
                verbose_eval = CFG.verbose,
                feval = xgb_metric,
                maximize = CFG.metric_maximize_flag,
            )
    # Predict validation
    valid_pred = model.predict(xgb.DMatrix(x_valid))
    return model, valid_pred

#catboostでの学習メソッドの定義
def catboost_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame, features: list, categorical_features: list):
    cat_train = Pool(data=x_train, label=y_train, cat_features=categorical_features)
    cat_valid = Pool(data=x_valid, label=y_valid, cat_features=categorical_features)
    model = CatBoostClassifier(**CFG.classification_cat_params)
    model.fit(cat_train,
              eval_set = [cat_valid],
              early_stopping_rounds = CFG.early_stopping_round,
              verbose = CFG.verbose,
              use_best_model = True)
    # Predict validation
    valid_pred = model.predict_proba(x_valid)[:, 1]
    return model, valid_pred

    
                    
#任意のモデルでのクロスバリデーション学習メソッドの定義
def gradient_boosting_model_cv_training(method: str, train_df: pd.DataFrame, features: list, categorical_features: list):
    # Create a numpy array to store out of folds predictions
    oof_predictions = np.zeros(len(train_df))
    oof_fold = np.zeros(len(train_df))
    kfold = KFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)
    for fold, (train_index, valid_index) in enumerate(kfold.split(train_df)):
        print('-'*50)
        print(f'{method} training fold {fold+1}')

        x_train = train_df[features].iloc[train_index]
        y_train = train_df[CFG.target_col].iloc[train_index]
        x_valid = train_df[features].iloc[valid_index]
        y_valid = train_df[CFG.target_col].iloc[valid_index]
        
        model = None  # モデル変数を初期化する
        valid_pred = None
        
        if method == 'adaboost':
            model, valid_pred = adaboost_training(x_train, y_train, x_valid, y_valid, features, categorical_features)
        if method == 'lightgbm':
            model, valid_pred = lightgbm_training(x_train, y_train, x_valid, y_valid, features, categorical_features)
        if method == 'xgboost':
            model, valid_pred = xgboost_training(x_train, y_train, x_valid, y_valid, features, categorical_features)
        if method == 'catboost':
            model, valid_pred = catboost_training(x_train, y_train, x_valid, y_valid, features, categorical_features)
        if method == 'svm':
            model, valid_pred = catboost_training(x_train, y_train, x_valid, y_valid, features, categorical_features)        
        # Save best model
        pickle.dump(model, open(f'{method}_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl', 'wb'))
        # Add to out of folds array
        oof_predictions[valid_index] = valid_pred
        oof_fold[valid_index] = fold + 1
        del x_train, x_valid, y_train, y_valid, model, valid_pred
        gc.collect()

    # Compute out of folds metric
    score = f1_score(train_df[CFG.target_col], oof_predictions >= 0.5, average='macro')
    print(f'{method} our out of folds CV f1score is {score}')
    # Create a dataframe to store out of folds predictions
    oof_df = pd.DataFrame({CFG.target_col: train_df[CFG.target_col], f'{method}_prediction': oof_predictions, 'fold': oof_fold})
    oof_df.to_csv(f'oof_{method}_seed{CFG.seed}_ver{CFG.VER}.csv', index = False)
#学習メソッドの定義
def Learning(input_df: pd.DataFrame, features: list, categorical_features: list):
    for method in CFG.METHOD_LIST:
        gradient_boosting_model_cv_training(method, input_df, features, categorical_features)

In [12]:
Learning(train_df, features, categorical_features)

--------------------------------------------------
svm training fold 1
0:	learn: 0.6456761	test: 0.6453134	best: 0.6453134 (0)	total: 388ms	remaining: 3m 13s
25:	learn: 0.3095616	test: 0.3061577	best: 0.3061577 (25)	total: 5.74s	remaining: 1m 44s
50:	learn: 0.2814975	test: 0.2788637	best: 0.2788637 (50)	total: 11.4s	remaining: 1m 40s
75:	learn: 0.2753197	test: 0.2744277	best: 0.2744277 (75)	total: 17.2s	remaining: 1m 35s
100:	learn: 0.2721891	test: 0.2736425	best: 0.2736312 (98)	total: 23.1s	remaining: 1m 31s
125:	learn: 0.2700161	test: 0.2732864	best: 0.2732772 (124)	total: 29s	remaining: 1m 26s
150:	learn: 0.2681926	test: 0.2732907	best: 0.2731864 (134)	total: 35s	remaining: 1m 20s
175:	learn: 0.2665974	test: 0.2730666	best: 0.2730666 (175)	total: 40.9s	remaining: 1m 15s
200:	learn: 0.2650296	test: 0.2728972	best: 0.2728565 (192)	total: 46.7s	remaining: 1m 9s
225:	learn: 0.2634675	test: 0.2729650	best: 0.2728565 (192)	total: 52.8s	remaining: 1m 3s
250:	learn: 0.2620841	test: 0.272925

0:	learn: 0.6456577	test: 0.6456816	best: 0.6456816 (0)	total: 200ms	remaining: 1m 39s
25:	learn: 0.3095380	test: 0.3067426	best: 0.3067426 (25)	total: 5.83s	remaining: 1m 46s
50:	learn: 0.2816177	test: 0.2783000	best: 0.2783000 (50)	total: 11.7s	remaining: 1m 42s
75:	learn: 0.2760130	test: 0.2733381	best: 0.2733381 (75)	total: 17.6s	remaining: 1m 38s
100:	learn: 0.2730668	test: 0.2717910	best: 0.2717910 (100)	total: 24.3s	remaining: 1m 36s
125:	learn: 0.2708704	test: 0.2709260	best: 0.2709260 (125)	total: 30.8s	remaining: 1m 31s
150:	learn: 0.2691899	test: 0.2704958	best: 0.2704888 (144)	total: 37.6s	remaining: 1m 26s
175:	learn: 0.2675075	test: 0.2699249	best: 0.2699249 (175)	total: 43.9s	remaining: 1m 20s
200:	learn: 0.2660038	test: 0.2697769	best: 0.2697749 (198)	total: 50.6s	remaining: 1m 15s
225:	learn: 0.2646190	test: 0.2696320	best: 0.2696128 (222)	total: 56.8s	remaining: 1m 8s
250:	learn: 0.2631188	test: 0.2692966	best: 0.2692849 (249)	total: 1m 3s	remaining: 1m 3s
275:	learn:

--------------------------------------------------
lightgbm training fold 3
[LightGBM] [Info] Number of positive: 32375, number of negative: 3888
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19152
[LightGBM] [Info] Number of data points in the train set: 36263, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.892783 -> initscore=2.119492
[LightGBM] [Info] Start training from score 2.119492
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[28]	training's auc: 0.894136	training's f1score: 0.594607	valid_1's auc: 0.768109	valid_1's f1score: 0.56359
--------------------------------------------------
lightgbm training fold 4
[LightGBM] [Info] Number of positive: 32353, number of negative: 3910
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 19120
[LightGBM] [Info] Number o

[25]	train-logloss:0.28467	train-f1score:0.64549	eval-logloss:0.29346	eval-f1score:0.62679
[50]	train-logloss:0.26409	train-f1score:0.66882	eval-logloss:0.28289	eval-f1score:0.64633
[75]	train-logloss:0.25214	train-f1score:0.68283	eval-logloss:0.27990	eval-f1score:0.65807
[100]	train-logloss:0.24467	train-f1score:0.69359	eval-logloss:0.27927	eval-f1score:0.65911
[125]	train-logloss:0.23883	train-f1score:0.70022	eval-logloss:0.27914	eval-f1score:0.66267
[150]	train-logloss:0.23312	train-f1score:0.70522	eval-logloss:0.27908	eval-f1score:0.66438
[175]	train-logloss:0.22733	train-f1score:0.71245	eval-logloss:0.27919	eval-f1score:0.66517
[200]	train-logloss:0.22264	train-f1score:0.71676	eval-logloss:0.27966	eval-f1score:0.66428
[225]	train-logloss:0.21858	train-f1score:0.72326	eval-logloss:0.28029	eval-f1score:0.66176
[250]	train-logloss:0.21450	train-f1score:0.72956	eval-logloss:0.28104	eval-f1score:0.66103
[275]	train-logloss:0.20951	train-f1score:0.73862	eval-logloss:0.28146	eval-f1score

[400]	train-logloss:0.19083	train-f1score:0.77349	eval-logloss:0.28332	eval-f1score:0.66699
[425]	train-logloss:0.18762	train-f1score:0.77718	eval-logloss:0.28369	eval-f1score:0.66699
[450]	train-logloss:0.18385	train-f1score:0.78486	eval-logloss:0.28427	eval-f1score:0.66724
[466]	train-logloss:0.18161	train-f1score:0.79040	eval-logloss:0.28451	eval-f1score:0.66651
xgboost our out of folds CV f1score is 0.6390550622619808
--------------------------------------------------
catboost training fold 1
0:	learn: 0.6456761	test: 0.6453134	best: 0.6453134 (0)	total: 187ms	remaining: 1m 33s
25:	learn: 0.3095616	test: 0.3061577	best: 0.3061577 (25)	total: 5.45s	remaining: 1m 39s
50:	learn: 0.2814975	test: 0.2788637	best: 0.2788637 (50)	total: 12s	remaining: 1m 45s
75:	learn: 0.2753197	test: 0.2744277	best: 0.2744277 (75)	total: 18.2s	remaining: 1m 41s
100:	learn: 0.2721891	test: 0.2736425	best: 0.2736312 (98)	total: 26.6s	remaining: 1m 44s
125:	learn: 0.2700161	test: 0.2732864	best: 0.2732772 (1

450:	learn: 0.2525924	test: 0.2668196	best: 0.2665974 (410)	total: 1m 50s	remaining: 12s
475:	learn: 0.2512358	test: 0.2667908	best: 0.2665974 (410)	total: 1m 56s	remaining: 5.86s
499:	learn: 0.2500888	test: 0.2667204	best: 0.2665974 (410)	total: 2m 2s	remaining: 0us

bestTest = 0.2665974308
bestIteration = 410

Shrink model to first 411 iterations.
--------------------------------------------------
catboost training fold 5
0:	learn: 0.6456577	test: 0.6456816	best: 0.6456816 (0)	total: 176ms	remaining: 1m 27s
25:	learn: 0.3095380	test: 0.3067426	best: 0.3067426 (25)	total: 5.57s	remaining: 1m 41s
50:	learn: 0.2816177	test: 0.2783000	best: 0.2783000 (50)	total: 11.2s	remaining: 1m 38s
75:	learn: 0.2760130	test: 0.2733381	best: 0.2733381 (75)	total: 17.1s	remaining: 1m 35s
100:	learn: 0.2730668	test: 0.2717910	best: 0.2717910 (100)	total: 23.2s	remaining: 1m 31s
125:	learn: 0.2708704	test: 0.2709260	best: 0.2709260 (125)	total: 28.3s	remaining: 1m 24s
150:	learn: 0.2691899	test: 0.270495

In [13]:
def adaboost_inference(x_test):
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model = pickle.load(open(f'adaboost_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl', 'rb'))
        # Predict
        pred = model.predict_proba(x_test)[:, 1]
        test_pred += pred
    return test_pred / CFG.n_folds

def svm_inference(x_test: pd.DataFrame):
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model = pickle.load(open(f'svm_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl', 'rb'))
        # Predict
        pred = model.predict_proba(x_test)[:, 1]
        test_pred += pred
    return test_pred / CFG.n_folds

def lightgbm_inference(x_test: pd.DataFrame):
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model = pickle.load(open(f'lightgbm_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl', 'rb'))
        # Predict
        pred = model.predict(x_test)
        test_pred += pred
    return test_pred / CFG.n_folds

def xgboost_inference(x_test: pd.DataFrame):
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model = pickle.load(open(f'xgboost_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl', 'rb'))
        # Predict
        pred = model.predict(xgb.DMatrix(x_test))
        test_pred += pred
    return test_pred / CFG.n_folds

def catboost_inference(x_test: pd.DataFrame):
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model = pickle.load(open(f'catboost_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl', 'rb'))
        # Predict
        pred = model.predict_proba(x_test)[:, 1]
        test_pred += pred
    return test_pred / CFG.n_folds

def gradient_boosting_model_inference(method: str, test_df: pd.DataFrame, features: list, categorical_features: list):
    x_test = test_df[features]
    if method == 'adaboost':
        test_pred = adaboost_inference(x_test)
    if method == 'svm':
        test_pred = svm_inference(x_test)
    if method == 'lightgbm':
        test_pred = lightgbm_inference(x_test)
    if method == 'xgboost':
        test_pred = xgboost_inference(x_test)
    if method == 'catboost':
        test_pred = catboost_inference(x_test)
    return test_pred

def Predicting(input_df: pd.DataFrame, features: list, categorical_features: list):
    output_df = input_df.copy()
    output_df['pred_prob'] = 0
    for method in CFG.METHOD_LIST:
        output_df[f'{method}_pred_prob'] = gradient_boosting_model_inference(method, input_df, features, categorical_features)
        output_df['pred_prob'] += CFG.model_weight_dict[method] * output_df[f'{method}_pred_prob']
    return output_df

In [16]:
test_df = Predicting(test_df, features, categorical_features)

In [17]:
#後処理の定義
def Postprocessing(train_df: pd.DataFrame(), test_df: pd.DataFrame()) -> (pd.DataFrame(), pd.DataFrame()):
    train_df['pred_prob'] = 0
    for method in CFG.METHOD_LIST:
        oof_df = pd.read_csv(f'oof_{method}_seed{CFG.seed}_ver{CFG.VER}.csv')
        train_df['pred_prob'] += CFG.model_weight_dict[method] * oof_df[f'{method}_prediction']
    best_score = 0
    best_v = 0
    for v in tqdm(np.arange(1000) / 1000):
        score = f1_score(oof_df[CFG.target_col], train_df[f'pred_prob'] >= v, average='macro')
        if score > best_score:
            best_score = score
            best_v = v
    print(best_score, best_v)
    test_df['target'] = np.where(test_df['pred_prob'] >= best_v, 1, 0)
    return train_df, test_df

In [18]:
'''
#後処理の定義、調和平均版 
def Postprocessing(train_df: pd.DataFrame(), test_df: pd.DataFrame()) -> (pd.DataFrame(), pd.DataFrame()): 
    train_df['pred_prob'] = 0 
    weight_sum = 0 
    for method in CFG.METHOD_LIST: 
        oof_df = pd.read_csv(f'oof_{method}_seed{CFG.seed}_ver{CFG.VER}.csv') 
        train_df['pred_prob'] += CFG.model_weight_dict[method] / oof_df[f'{method}_prediction'] 
        weight_sum += CFG.model_weight_dict[method] 
    train_df['pred_prob'] = weight_sum / train_df['pred_prob'] 
    best_score = 0 
    best_v = 0 
    for v in tqdm(np.arange(1000) / 1000):
        score = f1_score(oof_df[CFG.target_col], train_df[f'pred_prob'] >= v, average='macro') 
        if score > best_score: 
            best_score = score 
            best_v = v 
    print(best_score, best_v) 
    test_df['target'] = np.where(test_df['pred_prob'] >= best_v, 1, 0)
    return train_df, test_df
'''
pass

In [19]:
#後処理
train_df, test_df = Postprocessing(train_df, test_df)

  0%|          | 0/1000 [00:00<?, ?it/s]

0.6880063673842517 0.737


In [20]:
test_df[['target']].to_csv(f'seed{CFG.seed}_ver{CFG.VER}_{CFG.AUTHOR}_submission.csv', header=False)

特徴量の重要度を確認する方法

In [21]:
model = pickle.load(open(f'lightgbm_fold1_seed42_ver{CFG.VER}.pkl', 'rb'))
importance_df = pd.DataFrame(model.feature_importance(), index=features, columns=['importance'])
importance_df['importance'] = importance_df['importance'] / np.sum(importance_df['importance'])
importance_df.sort_values('importance', ascending=False)

Unnamed: 0,importance
City_BankState,0.139286
State_Sector,0.105952
ApprovalFY_Term,0.095238
Term_NoEmp,0.095238
NoEmp_SBA_Appv,0.086905
ApprovalDate,0.079762
FranchiseCode_ApprovalDate,0.07619
UrbanRural,0.044048
NoEmp,0.035714
LowDoc_S,0.025


In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd

# OOF予測を基に新たな特徴量を作成
oof_features = np.zeros((train_df.shape[0], len(CFG.METHOD_LIST)))
for i, method in enumerate(CFG.METHOD_LIST):
    oof_df = pd.read_csv(f'oof_{method}_seed{CFG.seed}_ver{CFG.VER}.csv')
    oof_features[:, i] = oof_df[f'{method}_prediction']

# テストデータの予測を基に特徴量を作成
test_features = np.zeros((test_df.shape[0], len(CFG.METHOD_LIST)))
for i, method in enumerate(CFG.METHOD_LIST):
    test_features[:, i] = test_df[f'{method}_pred_prob']

# 特徴量の標準化
scaler = StandardScaler()
oof_features_scaled = scaler.fit_transform(oof_features)
test_features_scaled = scaler.transform(test_features)

# ロジスティック回帰モデルを学習
lr = LogisticRegression()
lr.fit(oof_features_scaled, train_df[CFG.target_col])

# ランダムフォレストモデルを学習
rf = RandomForestClassifier()
rf.fit(oof_features_scaled, train_df[CFG.target_col])

# XGBoostモデルを学習
xgb = XGBClassifier()
xgb.fit(oof_features_scaled, train_df[CFG.target_col])

# LightGBMモデルを学習
lgbm = LGBMClassifier()
lgbm.fit(oof_features_scaled, train_df[CFG.target_col])

# 最適な閾値とその時のF1スコアを探索する関数
def find_best_threshold_and_score(y_true, y_pred_proba):
    best_threshold = 0
    best_score = 0
    for threshold in np.linspace(0, 1, 1001):  # 0.001刻みで閾値を変更
        score = f1_score(y_true, y_pred_proba >= threshold, average='macro')
        if score > best_score:
            best_score = score
            best_threshold = threshold
    return best_threshold, best_score

# ロジスティック回帰モデルの学習データに対する予測確率
train_pred_proba_lr = lr.predict_proba(oof_features_scaled)[:, 1]

# ランダムフォレストモデルの学習データに対する予測確率
train_pred_proba_rf = rf.predict_proba(oof_features_scaled)[:, 1]

# XGBoostモデルの学習データに対する予測確率
train_pred_proba_xgb = xgb.predict_proba(oof_features_scaled)[:, 1]

# LightGBMモデルの学習データに対する予測確率
train_pred_proba_lgbm = lgbm.predict_proba(oof_features_scaled)[:, 1]

# 最適な閾値とスコアを求める
best_threshold_lr, best_score_lr = find_best_threshold_and_score(train_df[CFG.target_col], train_pred_proba_lr)
best_threshold_rf, best_score_rf = find_best_threshold_and_score(train_df[CFG.target_col], train_pred_proba_rf)
best_threshold_xgb, best_score_xgb = find_best_threshold_and_score(train_df[CFG.target_col], train_pred_proba_xgb)
best_threshold_lgbm, best_score_lgbm = find_best_threshold_and_score(train_df[CFG.target_col], train_pred_proba_lgbm)

print(f'LR Best Threshold: {best_threshold_lr}, Best F1 Score: {best_score_lr}')
print(f'RF Best Threshold: {best_threshold_rf}, Best F1 Score: {best_score_rf}')
print(f'XGB Best Threshold: {best_threshold_xgb}, Best F1 Score: {best_score_xgb}')
print(f'LGBM Best Threshold: {best_threshold_lgbm}, Best F1 Score: {best_score_lgbm}')

# テストデータに対する最終予測
test_pred_proba_lr = lr.predict_proba(test_features_scaled)[:, 1]
test_pred_proba_rf = rf.predict_proba(test_features_scaled)[:, 1]
test_pred_proba_xgb = xgb.predict_proba(test_features_scaled)[:, 1]
test_pred_proba_lgbm = lgbm.predict_proba(test_features_scaled)[:, 1]

test_final_predictions_lr = (test_pred_proba_lr >= best_threshold_lr).astype(int)
test_final_predictions_rf = (test_pred_proba_rf >= best_threshold_rf).astype(int)
test_final_predictions_xgb = (test_pred_proba_xgb >= best_threshold_xgb).astype(int)
test_final_predictions_lgbm = (test_pred_proba_lgbm >= best_threshold_lgbm).astype(int)

# 最終予測結果をコンペ提出用のフォーマットでCSVファイルに出力
submission_df_lr = pd.DataFrame({'Id': test_df.index, 'target': test_final_predictions_lr}).reset_index(drop=True)
submission_df_rf = pd.DataFrame({'Id': test_df.index, 'target': test_final_predictions_rf}).reset_index(drop=True)
submission_df_xgb = pd.DataFrame({'Id': test_df.index, 'target': test_final_predictions_xgb}).reset_index(drop=True)
submission_df_lgbm = pd.DataFrame({'Id': test_df.index, 'target': test_final_predictions_lgbm}).reset_index(drop=True)

submission_df_lr['Id'] = submission_df_lr.index + 42307
submission_df_rf['Id'] = submission_df_rf.index + 42307
submission_df_xgb['Id'] = submission_df_xgb.index + 42307
submission_df_lgbm['Id'] = submission_df_lgbm.index + 42307

submission_df_lr.to_csv(f'stacking_lr_submission_best_score{best_score_lr:.4f}_seed{CFG.seed}_ver{CFG.VER}_{CFG.AUTHOR}.csv', header=False, index=False)
submission_df_rf.to_csv(f'stacking_rf_submission_best_score{best_score_rf:.4f}_seed{CFG.seed}_ver{CFG.VER}_{CFG.AUTHOR}.csv', header=False, index=False)
submission_df_xgb.to_csv(f'stacking_xgb_submission_best_score{best_score_xgb:.4f}_seed{CFG.seed}_ver{CFG.VER}_{CFG.AUTHOR}.csv', header=False, index=False)
submission_df_lgbm.to_csv(f'stacking_lgbm_submission_best_score{best_score_lgbm:.4f}_seed{CFG.seed}_ver{CFG.VER}_{CFG.AUTHOR}.csv', header=False, index=False)

LR Best Threshold: 0.809, Best F1 Score: 0.6883853294761878
RF Best Threshold: 0.531, Best F1 Score: 1.0
XGB Best Threshold: 0.781, Best F1 Score: 0.7560417531491034
LGBM Best Threshold: 0.811, Best F1 Score: 0.7193869591341475
