In [3]:
# ====================================================
# Library
# ====================================================
import gc
import itertools
import os
import pickle
import random
import sys
import warnings
from glob import glob
from pathlib import Path

# Silence warnings before the third-party imports below so their import-time warnings are hidden too.
warnings.filterwarnings('ignore')

# Third-party imports keep their original relative order (native-extension load order is left untouched).
import scipy as sp
import numpy as np
import pandas as pd
import joblib
from tqdm.auto import tqdm
import category_encoders as ce
import torch
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GroupKFold
from sklearn.metrics import log_loss, roc_auc_score, matthews_corrcoef, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
import xgboost as xgb
from catboost import Pool, CatBoostRegressor, CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from keras.layers import BatchNormalization
from keras.layers import Activation
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.optimizers import Adam
from keras.models import load_model
from keras.callbacks import Callback
from keras.models import clone_model


# Display limits for rich DataFrame output: show up to 1000 columns and 100 rows.
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 100

# ====================================================
# Configurations
# ====================================================
class CFG:
    # Central configuration read by the training, stacking and post-processing cells.

    # --- Run identification (embedded in model / OOF / submission file names) ---
    VER = 14
    AUTHOR = 'Yuta.K'
    COMPETITION = 'FDUA2'

    # --- Locations ---
    # NOTE(review): these absolute paths are not referenced by the visible cells, which
    # read and write relative to the working directory — confirm before relying on them.
    DATA_PATH = Path('/data')
    OOF_DATA_PATH = Path('/oof')
    MODEL_DATA_PATH = Path('/models')
    SUB_DATA_PATH = Path('/submission')

    # --- Experiment set-up ---
    # Base models trained in stage 1; the order also fixes the stacking feature column order.
    METHOD_LIST = [ 'adaboost','lightgbm', 'xgboost', 'catboost']
    seed = 42
    n_folds = 7
    target_col = 'MIS_Status'
    metric = 'f1_score'
    # True because a larger F1 is better; passed to the custom LightGBM / XGBoost metrics.
    metric_maximize_flag = True

    # --- Boosting control shared by LightGBM / XGBoost / CatBoost ---
    num_boost_round = 500
    early_stopping_round = 200
    verbose = 25

    # --- Per-model hyper-parameters (every model takes its seed from `seed`) ---
    classification_lgb_params = {
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.05,
        'seed': seed,
    }
    classification_xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'learning_rate': 0.05,
        'random_state': seed,
    }
    classification_cat_params = {
        'learning_rate': 0.05,
        'iterations': num_boost_round,
        'random_seed': seed,
    }
    classification_adaboost_params = {
        'n_estimators': 100,
        'learning_rate': 1.0,
        # Was a literal 42; tied to `seed` so changing the seed reaches AdaBoost as well.
        'random_state': seed,
    }

    # NOTE(review): these weights sum to 0.70 and are not used by the visible cells — confirm intent.
    model_weight_dict = {'adaboost': 0.10,'lightgbm': 0.25, 'xgboost': 0.10, 'catboost': 0.25}
    
# ====================================================
# Seed everything
# ====================================================
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch, and
    exports ``PYTHONHASHSEED``. TensorFlow is seeded as well, but only when it
    has already been imported, so calling this never triggers a heavy import.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Only affects hash randomisation of Python processes started after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    # The stacking network is a Keras model; without this its weight init and dropout stay unseeded.
    tf_module = sys.modules.get('tensorflow')
    if tf_module is not None:
        tf_module.random.set_seed(seed)

# Apply the configured seed (CFG.seed) before any data is loaded or any model is trained.
seed_everything(CFG.seed)

# ====================================================
# Metric
# ====================================================
# f1_score

# ====================================================
# LightGBM Metric
# ====================================================
def lgb_metric(y_pred, y_true):
    """Custom LightGBM evaluation function: macro F1 at a fixed 0.5 cut-off.

    Args:
        y_pred: predicted scores; values >= 0.5 are counted as class 1.
        y_true: object exposing ``get_label()`` (presumably the ``lgb.Dataset``
            being evaluated — confirm against the LightGBM ``feval`` contract).

    Returns:
        ``(metric_name, value, is_higher_better)``.
    """
    labels = y_true.get_label()
    hard_pred = np.where(y_pred >= 0.5, 1, 0)
    macro_f1 = f1_score(labels, hard_pred, average='macro')
    return 'f1score', macro_f1, CFG.metric_maximize_flag

# ====================================================
# XGBoost Metric
# ====================================================
def xgb_metric(y_pred, y_true):
    """Custom XGBoost evaluation function: macro F1 at a fixed 0.5 cut-off.

    Args:
        y_pred: predicted scores; values >= 0.5 are counted as class 1.
        y_true: object exposing ``get_label()`` (presumably the ``xgb.DMatrix``
            being evaluated — confirm against the XGBoost ``feval`` contract).

    Returns:
        ``(metric_name, value)``.
    """
    labels = y_true.get_label()
    hard_pred = np.where(y_pred >= 0.5, 1, 0)
    macro_f1 = f1_score(labels, hard_pred, average='macro')
    return 'f1score', macro_f1

RuntimeError: module compiled against API version 0x10 but this version of numpy is 0xf

SystemError: initialization of _pywrap_checkpoint_reader raised unreported exception

In [2]:
!pip install --upgrade tensorflow

Collecting tensorflow
  Downloading tensorflow-2.16.1-cp39-cp39-win_amd64.whl (2.1 kB)
Collecting tensorflow-intel==2.16.1
  Downloading tensorflow_intel-2.16.1-cp39-cp39-win_amd64.whl (376.9 MB)
     -------------------------------------- 376.9/376.9 MB 1.8 MB/s eta 0:00:00
Collecting keras>=3.0.0
  Downloading keras-3.2.1-py3-none-any.whl (1.1 MB)
     ---------------------------------------- 1.1/1.1 MB 2.7 MB/s eta 0:00:00
Collecting ml-dtypes~=0.3.1
  Downloading ml_dtypes-0.3.2-cp39-cp39-win_amd64.whl (127 kB)
     -------------------------------------- 127.7/127.7 kB 3.8 MB/s eta 0:00:00
Collecting tensorboard<2.17,>=2.16
  Downloading tensorboard-2.16.2-py3-none-any.whl (5.5 MB)
     ---------------------------------------- 5.5/5.5 MB 4.1 MB/s eta 0:00:00
Collecting h5py>=3.10.0
  Downloading h5py-3.11.0-cp39-cp39-win_amd64.whl (3.0 MB)
     ---------------------------------------- 3.0/3.0 MB 4.4 MB/s eta 0:00:00
Collecting numpy<2.0.0,>=1.23.5
  Using cached numpy-1.26.4-cp39-c

ERROR: Could not install packages due to an OSError: [WinError 5] アクセスが拒否されました。: 'C:\\Users\\yutak\\anaconda3\\Lib\\site-packages\\~=mpy\\.libs\\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll'
Consider using the `--user` option or check the permissions.


[notice] A new release of pip available: 22.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


Installing collected packages: typing-extensions, numpy, optree, ml-dtypes, h5py, tensorboard, keras, tensorflow-intel, tensorflow
  Attempting uninstall: typing-extensions
    Found existing installation: typing-extensions 3.10.0.2
    Uninstalling typing-extensions-3.10.0.2:
      Successfully uninstalled typing-extensions-3.10.0.2
  Attempting uninstall: numpy
    Found existing installation: numpy 1.22.4
    Uninstalling numpy-1.22.4:
      Successfully uninstalled numpy-1.22.4


In [None]:
# Load the raw train / test CSVs from the working directory; the first column becomes the index.
train_df, test_df = (
    pd.read_csv(csv_name, index_col=0) for csv_name in ('train.csv', 'test.csv')
)

In [None]:
def Preprocessing(train_df, test_df):
    """Clean the raw frames and derive the engineered features.

    The same three steps are applied to both frames independently: fill missing
    values, convert the money columns to floats, then build the date, external
    data and combination features. The inputs are not modified.

    Args:
        train_df: raw training frame.
        test_df: raw test frame with the same feature columns.

    Returns:
        ``(train_df, test_df)`` as new, processed frames.
    """

    # Missing-value handling
    def deal_missing(input_df):
        df = input_df.copy()
        for col in ['RevLineCr', 'LowDoc', 'BankState']:
            df[col] = input_df[col].fillna('UNK')
        # Sentinel in the same 'day-month-year' layout as real dates so the split below still works.
        for col in ['DisbursementDate','ApprovalDate']:
            df[col] = input_df[col].fillna('50-NaN-50')
        return df

    # Money columns: drop the first character and the separators, then parse as float
    def clean_money(input_df):
        df = input_df.copy()
        for col in ['DisbursementGross', 'GrAppv', 'SBA_Appv']:
            # assumes every value starts with a one-character currency symbol — TODO confirm
            df[col] = input_df[col].str[1:].str.replace(',', '').str.replace(' ', '').astype(float)
        return df

    # Feature engineering
    def make_features(input_df):
        df = input_df.copy()
        df['NewExist'] = np.where(input_df['NewExist'] == 1, 1, 0)
        # Date features
        df[['DisbursementDay','DisbursementMonth','DisbursementYear']] = df['DisbursementDate'].str.split('-',expand=True)
        df[['ApprovalDay','ApprovalMonth','ApprovalYear']] = df['ApprovalDate'].str.split('-',expand=True)
        df['DisbursementDay'] = df['DisbursementDay'].astype(int)
        df['DisbursementYear'] = df['DisbursementYear'].astype(int)
        df['ApprovalDay'] = df['ApprovalDay'].astype(int)
        df['ApprovalYear'] = df['ApprovalYear'].astype(int)
        Month_dict = {'Jan':1,'Feb':2,'Mar':3,'Apr':4,'May':5,'Jun':6,'Jul':7,'Aug':8,'Sep':9,'Oct':10,'Nov':11,'Dec':12,'NaN':50}
        df['DisbursementMonth'] = df['DisbursementMonth'].map(Month_dict)
        df['ApprovalMonth'] = df['ApprovalMonth'].map(Month_dict)
        # Rebuild the date as a fixed-width 'YYMMDD' key. Without zero-padding distinct dates
        # collided (year 94, month 1, day 12 and year 94, month 11, day 2 both gave '94112').
        df['DisbursementDate'] = (
            df['DisbursementYear'].astype(str).str.zfill(2)
            + df['DisbursementMonth'].astype(str).str.zfill(2)
            + df['DisbursementDay'].astype(str).str.zfill(2)
        )
        # Two-digit years above 50 are shifted by -100 so they sort before the years 0..19;
        # the missing-date sentinel 50 is left as is.
        df['DisbursementYear'] = df['DisbursementYear'].apply(lambda x:x - 100 if x >50 else x)
        df['ApprovalYear'] = df['ApprovalYear'].apply(lambda x:x - 100 if x >50 else x)
        df['CompanyLong'] = df['DisbursementYear'] - df['ApprovalYear']
        # External data: number of US business bankruptcies per year, keyed by the shifted year.
        # The entries for years 74-80 (keys -26..-20) are synthetic, derived from the unemployment rate.
        Bankraptcydata={-26:32700,-25:52200,-24:46200,-23:42300,-22:36300,-21:34200,-20:46200,-19:44000,-18:48500,-17:69800,-16:62500,
                        -15:64500,-14:72000,-13:81500,-12:83000,-11:64500,-10:65000,-9:67000,-8:71000,-7:67000,-6:58000,-5:51000,
                        -4:52500,-3:54000,-2:51000,-1:41000,0:37500,1:35992,2:39845,3:37548,4:36785,5:31952,6:35292,7:21960,8:30741,
                        9:49091,10:61148,11:54212,12:46393,13:37552,14:31671,15:26130,16:24797,17:23591,18:23106,19:22157}
        # Replace each yearly value by the mean of the following five years (year+1 .. year+5).
        datalist = [Bankraptcydata]  # add further per-year external tables here
        for k in datalist:
            # range() is evaluated once, before keys are added; every read hits a key that has
            # not been overwritten yet, so the averages are taken over the raw values.
            for i in range(len(k)-5):
                k[-27+i] = 0
                for j in range(5):
                    k[-27+i] += k[-26+i+j]
                k[-27+i] = k[-27+i]/5
            # NOTE(review): keys 14..19 have no five following years and keep their raw yearly counts.
            # Value used for the missing-date sentinel year 50: twice the averaged value of key -26.
            k[50] = k[-26]*2

        df['Bankraptcy_By_Year'] = df['DisbursementYear'].map(Bankraptcydata)

        # Combination features
        df['State_Sector'] = df['State'].astype(str) + '_' + df['Sector'].astype(str)
        df['City_State'] = df['City'] + '_' + df['State']
        df['ApprovalFY_Term'] = df['ApprovalFY'].astype(str) + '_' + df['Term'].astype(str)
        df['FranchiseCode_ApprovalDate'] = df['FranchiseCode'].astype(str) + '_' + df['ApprovalDate'].astype(str)
        df['Term_NoEmp'] = df['Term'].astype(str) + '_' + df['NoEmp'].astype(str)
        df['City_BankState'] = df['City'].astype(str) + '_' + df['BankState'].astype(str)
        df['NoEmp_SBA_Appv'] = df['NoEmp'].astype(str) + '_' + df['SBA_Appv'].astype(str)
        return df


    train_df = deal_missing(train_df)
    test_df = deal_missing(test_df)
    train_df = clean_money(train_df)
    test_df = clean_money(test_df)
    train_df = make_features(train_df)
    test_df = make_features(test_df)
    return train_df,test_df

In [None]:
# Run the preprocessing.
# The result is bound to the same names as the raw frames, so re-run the loading cell
# before re-running this one.
train_df, test_df = Preprocessing(train_df,test_df)

（以下はPreprocessingに本来組み込むべきだが，コードが煩雑になるので，いったん切り出している．）

In [None]:
# Label encoding: encoder fitted on the training column only
categorical_features = ['UrbanRural', 'State', 'Sector']
for col in categorical_features:
    le = LabelEncoder().fit(train_df[col])
    train_df[col] = le.transform(train_df[col])
    test_df[col] = le.transform(test_df[col])

# Columns with categories that occur only in the test data: fit the encoder on train + test together
categorical_features_unlabelable = [
    'ApprovalFY_Term', 'City_State', 'City', 'ApprovalDate', 'BankState', 'DisbursementDate',
    'State_Sector', 'FranchiseCode_ApprovalDate', 'Term_NoEmp', 'City_BankState', 'NoEmp_SBA_Appv',
]
for col in categorical_features_unlabelable:
    combined = pd.concat([train_df[col], test_df[col]], axis=0)
    encoder = LabelEncoder().fit(combined)
    train_df[col] = encoder.transform(train_df[col])
    test_df[col] = encoder.transform(test_df[col])

# One-hot encoding: fitted on the training features (target removed), then applied to the test frame
OneHotList = ['RevLineCr', 'LowDoc']
ohe = ce.OneHotEncoder(cols=OneHotList, use_cat_names=True)
train_df2 = ohe.fit_transform(train_df.drop(columns=['MIS_Status']))
test_df = ohe.transform(test_df)
# Re-attach the target as the last column
train_df = pd.concat([train_df2, train_df['MIS_Status']], axis=1)

In [None]:
# Columns excluded from training
RemoveList = ['MIS_Status', 'ApprovalYear']

# Training features: every column of train_df except the ones listed above
features = train_df.columns.tolist()
for i in RemoveList:
    features.remove(i)

# Columns declared as categorical to the models
categorical_features = [
    'State', 'Sector',
    'RevLineCr_Y', 'RevLineCr_T', 'RevLineCr_N', 'RevLineCr_0', 'RevLineCr_UNK',
    'LowDoc_Y', 'LowDoc_S', 'LowDoc_N', 'LowDoc_A', 'LowDoc_C', 'LowDoc_0', 'LowDoc_UNK',
    'ApprovalFY_Term', 'City_State', 'City', 'ApprovalDate', 'BankState', 'State_Sector', 'UrbanRural',
    'FranchiseCode_ApprovalDate', 'Term_NoEmp', 'City_BankState', 'NoEmp_SBA_Appv',
]

print(f'features for training:{features}')

In [None]:
#Learning & Predicting

#1段階目の学習
def Pre_Learning(train_df, test_df, features, categorical_features):
    """Stage 1: cross-validated training of every base model plus test inference.

    For each method in ``CFG.METHOD_LIST`` this runs ``CFG.n_folds``-fold CV,
    pickles one model per fold, writes the out-of-fold predictions to
    ``oof_{method}_seed{seed}_ver{VER}.csv`` and stores the fold-averaged test
    prediction in ``test_df[f'{method}_pred_prob']``.

    Args:
        train_df: training frame containing ``features`` and ``CFG.target_col``.
        test_df: test frame containing ``features``; prediction columns are added in place.
        features: column names fed to the models.
        categorical_features: column names declared categorical to LightGBM and CatBoost.

    Raises:
        ValueError: if ``CFG.METHOD_LIST`` contains a method without a trainer.
    """

    def model_file(method, fold):
        """File name of the pickled model of a 0-based fold (names are 1-based)."""
        return f'{method}_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl'

    # AdaBoost: plain fit, no early stopping
    def adaboost_training(x_train, y_train, x_valid, y_valid, features, categorical_features):
        model = AdaBoostClassifier(**CFG.classification_adaboost_params)
        model.fit(x_train, y_train)
        # Predict validation
        valid_pred = model.predict_proba(x_valid)[:, 1]
        return model, valid_pred

    # LightGBM: native API with early stopping and the custom F1 metric
    def lightgbm_training(x_train, y_train, x_valid, y_valid, features, categorical_features):
        lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=categorical_features)
        lgb_valid = lgb.Dataset(x_valid, y_valid, categorical_feature=categorical_features)
        model = lgb.train(
                    params = CFG.classification_lgb_params,
                    train_set = lgb_train,
                    num_boost_round = CFG.num_boost_round,
                    valid_sets = [lgb_train, lgb_valid],
                    feval = lgb_metric,
                    callbacks=[lgb.early_stopping(stopping_rounds=CFG.early_stopping_round,
                                                  verbose=CFG.verbose)]
                )
        # Predict validation
        valid_pred = model.predict(x_valid)
        return model, valid_pred

    # XGBoost: native API with early stopping and the custom F1 metric
    def xgboost_training(x_train, y_train, x_valid, y_valid, features, categorical_features):
        xgb_train = xgb.DMatrix(data=x_train, label=y_train)
        xgb_valid = xgb.DMatrix(data=x_valid, label=y_valid)
        model = xgb.train(
                    CFG.classification_xgb_params,
                    dtrain = xgb_train,
                    num_boost_round = CFG.num_boost_round,
                    evals = [(xgb_train, 'train'), (xgb_valid, 'eval')],
                    early_stopping_rounds = CFG.early_stopping_round,
                    verbose_eval = CFG.verbose,
                    feval = xgb_metric,
                    maximize = CFG.metric_maximize_flag,
                )
        # Predict validation
        valid_pred = model.predict(xgb.DMatrix(x_valid))
        return model, valid_pred

    # CatBoost: early stopping on the validation pool, best iteration kept
    def catboost_training(x_train, y_train, x_valid, y_valid, features, categorical_features):
        cat_train = Pool(data=x_train, label=y_train, cat_features=categorical_features)
        cat_valid = Pool(data=x_valid, label=y_valid, cat_features=categorical_features)
        model = CatBoostClassifier(**CFG.classification_cat_params)
        model.fit(cat_train,
                  eval_set = [cat_valid],
                  early_stopping_rounds = CFG.early_stopping_round,
                  verbose = CFG.verbose,
                  use_best_model = True)
        # Predict validation
        valid_pred = model.predict_proba(x_valid)[:, 1]
        return model, valid_pred

    trainers = {
        'adaboost': adaboost_training,
        'lightgbm': lightgbm_training,
        'xgboost': xgboost_training,
        'catboost': catboost_training,
    }

    # Fail before any training starts instead of pickling ``None`` for a misspelled method.
    unknown_methods = [m for m in CFG.METHOD_LIST if m not in trainers]
    if unknown_methods:
        raise ValueError(f'unsupported method(s) in CFG.METHOD_LIST: {unknown_methods}')

    # K-fold training for one method: saves fold models and the OOF predictions
    def gradient_boosting_model_cv_training(method, train_df, features, categorical_features):
        if method not in trainers:
            raise ValueError(f'unsupported method: {method!r}')
        train_fn = trainers[method]
        # Create a numpy array to store out of folds predictions
        oof_predictions = np.zeros(len(train_df))
        oof_fold = np.zeros(len(train_df))
        kfold = KFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)
        for fold, (train_index, valid_index) in enumerate(kfold.split(train_df)):
            print('-'*50)
            print(f'{method} training fold {fold+1}')

            x_train = train_df[features].iloc[train_index]
            y_train = train_df[CFG.target_col].iloc[train_index]
            x_valid = train_df[features].iloc[valid_index]
            y_valid = train_df[CFG.target_col].iloc[valid_index]

            model, valid_pred = train_fn(x_train, y_train, x_valid, y_valid, features, categorical_features)
            # Save the fold model; the context manager closes the file handle
            with open(model_file(method, fold), 'wb') as model_fh:
                pickle.dump(model, model_fh)
            # Add to out of folds array
            oof_predictions[valid_index] = valid_pred
            oof_fold[valid_index] = fold + 1
            del x_train, x_valid, y_train, y_valid, model, valid_pred
            gc.collect()

        # Compute out of folds metric
        score = f1_score(train_df[CFG.target_col], oof_predictions >= 0.5, average='macro')
        print(f'{method} our out of folds CV f1score is {score}')
        # Create a dataframe to store out of folds predictions
        oof_df = pd.DataFrame({CFG.target_col: train_df[CFG.target_col], f'{method}_prediction': oof_predictions, 'fold': oof_fold})
        oof_df.to_csv(f'oof_{method}_seed{CFG.seed}_ver{CFG.VER}.csv', index = False)

    def average_over_folds(method, n_rows, predict_one):
        """Mean of ``predict_one(model)`` over the pickled fold models of ``method``."""
        test_pred = np.zeros(n_rows)
        for fold in range(CFG.n_folds):
            # pickle can execute code on load: only read files written by this notebook.
            with open(model_file(method, fold), 'rb') as model_fh:
                model = pickle.load(model_fh)
            test_pred += predict_one(model)
        return test_pred / CFG.n_folds

    # Fold-averaged AdaBoost prediction
    def adaboost_inference(x_test):
        return average_over_folds('adaboost', len(x_test), lambda m: m.predict_proba(x_test)[:, 1])

    # Fold-averaged LightGBM prediction
    def lightgbm_inference(x_test):
        return average_over_folds('lightgbm', len(x_test), lambda m: m.predict(x_test))

    # Fold-averaged XGBoost prediction
    def xgboost_inference(x_test):
        # The DMatrix does not depend on the fold, so it is built once.
        xgb_test = xgb.DMatrix(x_test)
        return average_over_folds('xgboost', len(x_test), lambda m: m.predict(xgb_test))

    # Fold-averaged CatBoost prediction
    def catboost_inference(x_test):
        return average_over_folds('catboost', len(x_test), lambda m: m.predict_proba(x_test)[:, 1])

    predictors = {
        'adaboost': adaboost_inference,
        'lightgbm': lightgbm_inference,
        'xgboost': xgboost_inference,
        'catboost': catboost_inference,
    }

    # Test prediction for an arbitrary method
    def gradient_boosting_model_inference(method, test_df, features, categorical_features):
        if method not in predictors:
            raise ValueError(f'unsupported method: {method!r}')
        # Select by ``features`` so prediction columns added for earlier methods are never fed back in.
        x_test = test_df[features]
        return predictors[method](x_test)

    for method in CFG.METHOD_LIST:
        gradient_boosting_model_cv_training(method, train_df, features, categorical_features)
        test_df[f'{method}_pred_prob'] = gradient_boosting_model_inference(method, test_df, features, categorical_features)

#2段階目の学習　ニューラルネットワークによるスタッキング
def Post_Learning(train_df,test_df):
    """Stage 2: fit the stacking models on the stage-1 predictions.

    Builds one column per method in ``CFG.METHOD_LIST`` from the OOF CSV files
    (training side) and from ``test_df[f'{method}_pred_prob']`` (test side),
    appends all pairwise products, standardises the result and fits two
    meta-models on the training side: a small Keras network and a logistic
    regression. The network is saved to
    ``nn_stacking_model_seed{seed}_ver{VER}.h5`` and the regression to
    ``lr_stacking_model_seed{seed}_ver{VER}.pkl``.

    Args:
        train_df: training frame; supplies the row count and ``CFG.target_col``.
        test_df: test frame holding the ``{method}_pred_prob`` columns.
    """
    # Builds the stacking network
    def create_nn_model(input_shape):
        model = Sequential([
            Dense(64, input_shape=(input_shape,)),
            BatchNormalization(),
            Activation('relu'),
            Dropout(0.5),

            Dense(32),
            BatchNormalization(),
            Activation('relu'),
            Dropout(0.5),

            Dense(1, activation='sigmoid')
        ])
        # `learning_rate` replaces the removed `lr` alias, which recent Keras releases reject.
        optimizer = Adam(learning_rate=0.001)
        model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
        return model

    # Learning-rate schedule: constant for the first 10 epochs, then multiplied by exp(-0.1) per epoch
    def scheduler(epoch, lr):
        if epoch < 10:
            return lr
        return float(lr * np.exp(-0.1))

    # Pairwise products of the feature columns
    def create_interaction_features(features):
        n_features = features.shape[1]
        interaction_features = []
        for i in range(n_features):
            for j in range(i + 1, n_features):
                interaction_features.append(features[:, i] * features[:, j])
        if not interaction_features:
            # Fewer than two base models: no pairs; return a zero-width block so hstack still works.
            return np.empty((features.shape[0], 0))
        return np.column_stack(interaction_features)

    # Training-side features from the OOF predictions
    oof_features = np.zeros((train_df.shape[0], len(CFG.METHOD_LIST)))
    for i, method in enumerate(CFG.METHOD_LIST):
        oof_df = pd.read_csv(f'oof_{method}_seed{CFG.seed}_ver{CFG.VER}.csv')
        oof_features[:, i] = oof_df[f'{method}_prediction']

    # Test-side features from the fold-averaged test predictions
    test_features = np.zeros((test_df.shape[0], len(CFG.METHOD_LIST)))
    for i, method in enumerate(CFG.METHOD_LIST):
        test_features[:, i] = test_df[f'{method}_pred_prob']

    # Add the pairwise products
    oof_interaction_features = create_interaction_features(oof_features)
    test_interaction_features = create_interaction_features(test_features)

    # Original columns followed by the interaction columns
    oof_combined_features = np.hstack([oof_features, oof_interaction_features])
    test_combined_features = np.hstack([test_features, test_interaction_features])

    # Standardise; the scaler is fitted on the training side only
    scaler = StandardScaler()
    oof_combined_features_scaled = scaler.fit_transform(oof_combined_features)
    # NOTE(review): the scaled test features are not used or returned here; anything that
    # predicts on the test set later has to rebuild them in exactly the same way.
    test_combined_features_scaled = scaler.transform(test_combined_features)

    # Plain NumPy targets keep Keras' validation_split independent of pandas support.
    target = train_df[CFG.target_col].to_numpy()

    # Train the neural network
    nn_model = create_nn_model(oof_combined_features_scaled.shape[1])
    callbacks_list = [LearningRateScheduler(scheduler)]
    nn_model.fit(oof_combined_features_scaled, target,
                 validation_split=0.2, epochs=50, batch_size=32, callbacks=callbacks_list, verbose=1)
    nn_model.save(f'nn_stacking_model_seed{CFG.seed}_ver{CFG.VER}.h5')

    # Train the logistic regression
    lr_model = LogisticRegression()
    lr_model.fit(oof_combined_features_scaled, target)
    with open(f'lr_stacking_model_seed{CFG.seed}_ver{CFG.VER}.pkl', 'wb') as model_fh:
        pickle.dump(lr_model, model_fh)
    

def Learning_and_Predicting(train_df, test_df, features, categorical_features):
    """Run both training stages in order.

    Stage 1 (``Pre_Learning``) trains the base models and adds their test
    predictions to ``test_df`` in place; stage 2 (``Post_Learning``) reads those
    columns to fit the stacking models.

    Args:
        train_df: processed training frame.
        test_df: processed test frame; modified in place by stage 1.
        features: column names fed to the base models.
        categorical_features: column names treated as categorical.
    """
    # `test_df` was missing from this call, which made it fail with a TypeError.
    Pre_Learning(train_df, test_df, features, categorical_features)
    Post_Learning(train_df,test_df)

In [None]:
# Run training: stage-1 base models, then stage-2 stacking
Learning_and_Predicting(train_df, test_df, features, categorical_features)

In [None]:
#Postprocessing

def Postprocessing(test_df=None, features=None, categorical_features=None):
    """Pick a decision threshold per stacking model and write the submission files.

    Rebuilds the stacking features exactly as ``Post_Learning`` does (OOF CSVs
    and ``{method}_pred_prob`` columns, pairwise products, a ``StandardScaler``
    fitted on the training side), loads the saved network and logistic
    regression, searches the threshold that maximises macro F1 on the training
    side for each model, and writes one submission CSV per model.

    Reads the notebook-level ``train_df`` for the targets.

    Args:
        test_df: test frame holding the ``{method}_pred_prob`` columns. Defaults
            to the notebook-level ``test_df``, as in the zero-argument form.
        features: unused; accepted so existing three-argument calls keep working.
        categorical_features: unused; accepted for the same reason.
    """
    if test_df is None:
        # Zero-argument form: fall back to the notebook-level frame.
        test_df = globals()['test_df']

    # Threshold search over 0.000, 0.001, ..., 1.000 maximising macro F1
    def find_best_threshold_and_score(y_true, y_pred_proba):
        best_threshold = 0
        best_score = 0
        for threshold in np.linspace(0, 1, 1001):
            score = f1_score(y_true, y_pred_proba >= threshold, average='macro')
            if score > best_score:
                best_score = score
                best_threshold = threshold
        return best_threshold, best_score

    # Base columns followed by all pairwise products (same layout as in Post_Learning)
    def with_pairwise_products(base):
        n_cols = base.shape[1]
        products = [base[:, i] * base[:, j] for i in range(n_cols) for j in range(i + 1, n_cols)]
        if not products:
            return base
        return np.hstack([base, np.column_stack(products)])

    # Thresholded test prediction written in the competition format (no header, two columns)
    def write_submission(tag, test_pred_proba, threshold, score):
        final_predictions = (test_pred_proba >= threshold).astype(int)
        submission_df = pd.DataFrame({'Id': test_df.index, 'target': final_predictions}).reset_index(drop=True)
        # NOTE(review): the ids read from test.csv are replaced by 4230, 4231, ... — confirm that
        # this offset matches the ids the competition expects.
        submission_df['Id'] = submission_df.index + 4230
        submission_df.to_csv(f'stacking_{tag}_submission_best_score{score:.4f}_seed{CFG.seed}_ver{CFG.VER}_{CFG.AUTHOR}.csv', header=False, index=False)

    y_true = train_df[CFG.target_col]

    # Rebuild the stacking inputs; they were locals of Post_Learning and are not available here.
    oof_features = np.column_stack([
        pd.read_csv(f'oof_{method}_seed{CFG.seed}_ver{CFG.VER}.csv')[f'{method}_prediction'].to_numpy()
        for method in CFG.METHOD_LIST
    ])
    test_features = np.column_stack([
        test_df[f'{method}_pred_prob'].to_numpy() for method in CFG.METHOD_LIST
    ])
    # Re-fitting on the same OOF matrix reproduces the scaling used during training.
    scaler = StandardScaler()
    oof_combined_features_scaled = scaler.fit_transform(with_pairwise_products(oof_features))
    test_combined_features_scaled = scaler.transform(with_pairwise_products(test_features))

    # Neural network: probabilities on the training side
    nn_model = load_model(f'nn_stacking_model_seed{CFG.seed}_ver{CFG.VER}.h5')
    train_pred_proba_nn = nn_model.predict(oof_combined_features_scaled).flatten()

    # Best threshold and score
    best_threshold_nn, best_score_nn = find_best_threshold_and_score(y_true, train_pred_proba_nn)
    print(f'NN Best Threshold: {best_threshold_nn}, Best F1 Score: {best_score_nn}')

    # Final test prediction and submission file
    test_pred_proba_nn = nn_model.predict(test_combined_features_scaled).flatten()
    write_submission('nn', test_pred_proba_nn, best_threshold_nn, best_score_nn)

    # Logistic regression: probabilities on the training side
    # pickle can execute code on load: only read files written by this notebook.
    with open(f'lr_stacking_model_seed{CFG.seed}_ver{CFG.VER}.pkl', 'rb') as model_fh:
        lr_model = pickle.load(model_fh)
    train_pred_proba_lr = lr_model.predict_proba(oof_combined_features_scaled)[:, 1]

    # Best threshold and score, searched on the LR probabilities (previously the NN ones were reused)
    best_threshold_lr, best_score_lr = find_best_threshold_and_score(y_true, train_pred_proba_lr)
    print(f'LR Best Threshold: {best_threshold_lr}, Best F1 Score: {best_score_lr}')

    # Final test prediction and submission file
    test_pred_proba_lr = lr_model.predict_proba(test_combined_features_scaled)[:, 1]
    write_submission('lr', test_pred_proba_lr, best_threshold_lr, best_score_lr)

In [None]:
# Run post-processing (threshold search and submission files).
# Postprocessing is defined without required arguments and reads the notebook-level
# train_df / test_df, so it is called without arguments.
Postprocessing()