In [175]:
import sys
# ====================================================
# Library
# ====================================================
import os
import gc
import warnings
warnings.filterwarnings('ignore')
import random
import scipy as sp
import numpy as np
import pandas as pd
from glob import glob
from pathlib import Path
import joblib
import pickle
import itertools
from tqdm.auto import tqdm
import torch
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GroupKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss, roc_auc_score, matthews_corrcoef, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import category_encoders as ce
import lightgbm as lgb
#import optuna.integration.lightgbm as lgb
import xgboost as xgb
from catboost import Pool, CatBoostRegressor, CatBoostClassifier
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from keras import backend as K
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score
import torch.optim as optim
import tensorflow as tf
from keras.models import load_model
from keras.callbacks import Callback
from keras.models import clone_model

pd.set_option('display.max_columns',1000)
pd.set_option('display.max_rows',100)

# ====================================================
# Configurations
# ====================================================
class CFG:
    """Central configuration shared by every stage of the pipeline."""

    # --- Run identification (VER / AUTHOR end up in artifact file names) ---
    VER = 7.1
    AUTHOR = 'naokisusami'
    COMPETITION = 'FDUA2'

    # --- Directory layout ---
    DATA_PATH = Path('/data')
    OOF_DATA_PATH = Path('/oof')
    MODEL_DATA_PATH = Path('/models')
    SUB_DATA_PATH = Path('/submission')

    # --- Which models to train / blend ---
    METHOD_LIST = ['neuralnetwork']

    # --- Cross-validation and target ---
    seed = 42
    n_folds = 7
    target_col = 'MIS_Status'
    metric = 'f1_score'
    metric_maximize_flag = True

    # --- Boosting loop controls ---
    num_boost_round = 500
    early_stopping_round = 200
    verbose = 25

    # --- Per-library hyper-parameters ---
    classification_lgb_params = dict(
        objective='binary',
        metric='auc',
        learning_rate=0.05,
        seed=seed,
    )
    classification_xgb_params = dict(
        objective='binary:logistic',
        eval_metric='logloss',
        learning_rate=0.05,
        random_state=seed,
    )
    classification_cat_params = dict(
        learning_rate=0.05,
        iterations=num_boost_round,
        random_seed=seed,
    )
    classification_adaboost_params = dict(
        n_estimators=100,
        learning_rate=1.0,
        random_state=seed,
    )

    # Blend weight applied to each model's predicted probability.
    model_weight_dict = dict(
        lightgbm=0.50,
        xgboost=0.10,
        catboost=0.40,
        adaboost=0.10,
        neuralnetwork=1.00,
    )
# ====================================================
# Seed everything
# ====================================================
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Covers Python's ``random``, NumPy, the hash seed environment variable,
    PyTorch and (when it can be imported) TensorFlow, which backs the Keras
    model trained below. Without the framework seeds, weight initialisation
    and dropout differ between runs even though ``seed`` is fixed.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    try:
        import tensorflow as _tf
    except ImportError:
        # TensorFlow is optional for this helper; nothing to seed without it.
        return
    _tf.random.set_seed(seed)

# Fix the RNG seeds once, using the seed from the configuration.
seed_everything(CFG.seed)


# ====================================================
# Metric
# ====================================================
# f1_score

# ====================================================
# LightGBM Metric
# ====================================================
def lgb_metric(y_pred, y_true):
    """Macro F1 at a fixed 0.5 cut-off, shaped as a three-element eval result.

    Args:
        y_pred: Array of predicted scores.
        y_true: Object exposing ``get_label()`` that yields the true labels.

    Returns:
        Tuple ``('f1score', value, CFG.metric_maximize_flag)``.
    """
    labels = y_true.get_label()
    hard_labels = np.where(y_pred >= 0.5, 1, 0)
    macro_f1 = f1_score(labels, hard_labels, average='macro')
    return 'f1score', macro_f1, CFG.metric_maximize_flag

# ====================================================
# XGBoost Metric
# ====================================================
def xgb_metric(y_pred, y_true):
    """Macro F1 at a fixed 0.5 cut-off, shaped as a two-element eval result.

    Args:
        y_pred: Array of predicted scores.
        y_true: Object exposing ``get_label()`` that yields the true labels.

    Returns:
        Tuple ``('f1score', value)``.
    """
    labels = y_true.get_label()
    hard_labels = np.where(y_pred >= 0.5, 1, 0)
    macro_f1 = f1_score(labels, hard_labels, average='macro')
    return 'f1score', macro_f1

class MacroF1ScoreCallback(Callback):
    """Callback that prints the macro F1 on held-out data after each epoch."""

    def __init__(self, validation_data):
        """Store the validation pair.

        Args:
            validation_data: Indexable whose item 0 is the model input and
                item 1 the true labels.
        """
        super().__init__()
        self.validation_data = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation inputs, round, and print the macro F1."""
        raw_output = self.model.predict(self.validation_data[0])
        rounded = np.asarray(raw_output).round()
        targets = self.validation_data[1]
        score = f1_score(targets, rounded, average='macro')
        print(f'Epoch {epoch+1}: val_macro_f1: {score}')
    
def f1_score_nn(y_true, y_pred):
    """F1 of the positive class computed with backend tensor ops.

    Both inputs are flattened; predictions are rounded before counting.
    ``K.epsilon()`` is added to every denominator to avoid division by zero.

    Args:
        y_true: Tensor of true labels.
        y_pred: Tensor of predicted scores.

    Returns:
        Scalar tensor holding the F1 value.
    """
    truth = K.flatten(y_true)
    guess = K.round(K.flatten(y_pred))

    # Confusion-matrix counts for the positive class.
    true_pos = K.sum(truth * guess)
    false_pos = K.sum((1 - truth) * guess)
    false_neg = K.sum(truth * (1 - guess))

    precision = true_pos / (true_pos + false_pos + K.epsilon())
    recall = true_pos / (true_pos + false_neg + K.epsilon())
    return 2 * precision * recall / (precision + recall + K.epsilon())

def macro_f1_score_nn(y_true, y_pred):
    """Macro-averaged F1 for binary labels, computed with backend tensor ops.

    The score is the unweighted mean of the F1 of the positive class and the
    F1 of the negative class (labels and rounded predictions inverted).

    The previous implementation flattened the tensors and then indexed them
    as 2-D, iterated over a tensor with ``range`` and never returned a value.

    Args:
        y_true: Tensor of true labels (0/1).
        y_pred: Tensor of predicted scores; rounded before counting.

    Returns:
        Scalar tensor holding the macro F1.
    """
    truth = K.cast(K.flatten(y_true), 'float32')
    guess = K.round(K.cast(K.flatten(y_pred), 'float32'))

    def _one_class_f1(class_truth, class_guess):
        # F1 of the class encoded as 1 in both arguments.
        true_pos = K.sum(class_truth * class_guess)
        false_pos = K.sum((1 - class_truth) * class_guess)
        false_neg = K.sum(class_truth * (1 - class_guess))
        precision = true_pos / (true_pos + false_pos + K.epsilon())
        recall = true_pos / (true_pos + false_neg + K.epsilon())
        return 2 * precision * recall / (precision + recall + K.epsilon())

    f1_positive = _one_class_f1(truth, guess)
    f1_negative = _one_class_f1(1 - truth, 1 - guess)
    return (f1_positive + f1_negative) / 2
        
class CustomEarlyStoppingAndRestoreBestWeights(Callback):
    """Early stopping driven by validation macro F1, restoring the best weights.

    After each epoch the model is scored on the validation pair. Training is
    stopped once ``patience`` consecutive epochs fail to improve the best
    score, and the weights of the best epoch are put back on the model.
    """

    def __init__(self, validation_data, patience=10):
        """Store the validation pair and reset the tracking state.

        Args:
            validation_data: Indexable whose item 0 is the model input and
                item 1 the true labels.
            patience: Number of non-improving epochs tolerated before stopping.
        """
        super().__init__()
        self.validation_data = validation_data
        self.patience = patience
        self.best_weights = None
        self.best_epoch = 0
        self.best_f1 = 0.0
        self.wait = 0

    def on_epoch_end(self, epoch, logs=None):
        """Score the epoch, snapshot on improvement, stop when patience runs out."""
        val_predict = np.asarray(self.model.predict(self.validation_data[0])).round()
        val_targ = self.validation_data[1]
        _val_f1 = f1_score(val_targ, val_predict, average='macro')
        print(f'Epoch {epoch+1}: val_macro_f1: {_val_f1}')

        # Always snapshot the first epoch so there is something to restore even
        # when the score never rises above the initial 0.0.
        if self.best_weights is None or _val_f1 > self.best_f1:
            self.best_f1 = _val_f1
            self.best_epoch = epoch
            self.wait = 0
            # Take the weights from the trained model itself: clone_model()
            # builds a new model with newly initialised weights, so cloning
            # here would store weights unrelated to the training so far.
            self.best_weights = self.model.get_weights()
            return

        self.wait += 1
        if self.wait >= self.patience:
            self.model.stop_training = True
            print(f"Restoring model weights from the end of the best epoch: {self.best_epoch+1}.")
            self.model.set_weights(self.best_weights)
        

    
    


In [178]:
# Load the data (the first CSV column is used as the index)
train_df = pd.read_csv('train.csv', index_col=0)
test_df = pd.read_csv('test.csv', index_col=0)
categorical_features = ['RevLineCr', 'LowDoc', 'UrbanRural', 'State', 'Sector']
# Definition of the preprocessing function
def Preprocessing(input_df: pd.DataFrame) -> pd.DataFrame:
    """Clean the raw loan table and derive the engineered features.

    Steps, in order: fill missing categorical/date values, parse the money
    columns to floats, binarise ``NewExist`` (1 stays 1, everything else
    becomes 0), then build date, macro-economic, per-state and ratio features.

    Args:
        input_df: Raw frame containing at least the columns referenced below
            (``RevLineCr``, ``LowDoc``, ``BankState``, ``DisbursementDate``,
            ``ApprovalDate``, ``DisbursementGross``, ``GrAppv``, ``SBA_Appv``,
            ``NewExist``, ``ApprovalFY``, ``State``, ``NoEmp``, ``Term``,
            ``CreateJob``). It is not modified.

    Returns:
        A new frame with the cleaned and added columns.
    """
    # Handling of missing values
    def deal_missing(input_df: pd.DataFrame) -> pd.DataFrame:
        df = input_df.copy()
        for col in ['RevLineCr', 'LowDoc', 'BankState']:
            df[col] = input_df[col].fillna('UNK')
        # The placeholder keeps the day-month-year shape so the split below
        # still works; 50 then acts as the "missing" code for day/month/year.
        for col in ['DisbursementDate', 'ApprovalDate']:
            df[col] = input_df[col].fillna('50-NaN-50')
        return df

    # Parsing of the money columns: drop the first character, commas, spaces
    def clean_money(input_df: pd.DataFrame) -> pd.DataFrame:
        df = input_df.copy()
        for col in ['DisbursementGross', 'GrAppv', 'SBA_Appv']:
            df[col] = input_df[col].str[1:].str.replace(',', '').str.replace(' ', '').astype(float)
        return df

    # Feature creation
    def make_features(input_df: pd.DataFrame) -> pd.DataFrame:
        df = input_df.copy()
        # Date-related features
        df[['DisbursementDay', 'DisbursementMonth', 'DisbursementYear']] = df['DisbursementDate'].str.split('-', expand=True)
        df[['ApprovalDay', 'ApprovalMonth', 'ApprovalYear']] = df['ApprovalDate'].str.split('-', expand=True)
        df['DisbursementDay'] = df['DisbursementDay'].astype(int)
        df['DisbursementYear'] = df['DisbursementYear'].astype(int)
        df['ApprovalDay'] = df['ApprovalDay'].astype(int)
        df['ApprovalYear'] = df['ApprovalYear'].astype(int)
        Month_dict = {'Jan':1,'Feb':2,'Mar':3,'Apr':4,'May':5,'Jun':6,'Jul':7,'Aug':8,'Sep':9,'Oct':10,'Nov':11,'Dec':12,'NaN':50}
        df['DisbursementMonth'] = df['DisbursementMonth'].map(Month_dict)
        df['ApprovalMonth'] = df['ApprovalMonth'].map(Month_dict)
        df['DisbursementDate'] = df['DisbursementYear'].astype(str)+df['DisbursementMonth'].astype(str)+df['DisbursementDay'].astype(str)
        # Two-digit years above 50 are shifted by -100 (e.g. 98 -> -2); the
        # placeholder 50 itself is left unchanged.
        df['DisbursementYear'] = df['DisbursementYear'].apply(lambda x: x - 100 if x > 50 else x)
        df['ApprovalYear'] = df['ApprovalYear'].apply(lambda x: x - 100 if x > 50 else x)
        df['CompanyLong'] = df['DisbursementYear'] - df['ApprovalYear']
        df['ApprovalTerm'] = 15 - df['ApprovalFY']
        df['DisbursementTerm'] = 15 - df['DisbursementYear']

        # Economic growth rate, keyed by the shifted year
        EconomyGrowthdata={-26:-0.6,-25:-0.4,-24:5.6,-23:4.6,-22:5.5,-21:3.2,-20:-0.26,-19:2.54,-18:-1.8,-17:4.58,-16:7.24,-15:4.17,
                           -14:3.46,-13:3.46,-12:4.18,-11:3.67,-10:1.89,-9:-0.11,-8:3.52,-7:2.75,-6:4.03,-5:2.68,-4:3.77,-3:4.45,
                           -2:4.18,-1:4.8,0:4.08,1:0.95,2:1.7,3:2.8,4:3.85,5:3.48,6:2.78,7: 2.01,8:0.12,9:-2.6,10:2.71,11:1.55,12:2.28,
                           13:1.84,14:2.29,15:2.71,16:1.67,17:2.24,18:2.95,19:2.30}
        # Per the original author's note: the 74-80 entries of the bankruptcy
        # data are generated (converted from the unemployment rate), not real.
        Bankraptcydata={-26:32700,-25:52200,-24:46200,-23:42300,-22:36300,-21:34200,-20:46200,-19:44000,-18:48500,-17:69800,-16:62500,
                      -15:64500,-14:72000,-13:81500,-12:83000,-11:64500,-10:65000,-9:67000,-8:71000,-7:67000,-6:58000,-5:51000,
                        -4:52500,-3:54000,-2:51000,-1:41000,0:37500,1:35992,2:39845,3:37548,4:36785,5:31952,6:35292,7:21960,8:30741,
                        9:49091,10:61148,11:54212,12:46393,13:37552,14:31671,15:26130,16:24797,17:23591,18:23106,19:22157}
        # Unemployment rate
        Unemploymentratedata={-26:5.45,-25:8.7,-24:7.7,-23:7.05,-22:6.05,-21:5.7,-20:7.7,-19:7.35,-18:9.7,-17:9.75,-16:7.35,-15:7.4,
                              -14:7.1,-13:6.15,-12:5.4,-11:5.25,-10:5.35,-9:6.85,-8:7.75,-7:6.95,-6:6.1,-5:5.65,-4:5.4,-3:4.95,-2:4.5,
                              -1:4.3,0:4.0,1:4.55,2:5.8,3:6.25,4:5.55,5:5.0,6:4.65,7:4.65,8:5.7,9:9.5,10:9.4,11:9.05,12:8.2,13:7.4,
                              14:6.15,15:5.25,16:4.9,17:4.3,18:3.9,19:3.6}
        # Interest rate
        Interestratedata={-26:10,-25:7,-24:5,-23:7,-22:8.5,-21:12,-20:15,-19:15,-18:14,-17:9,-16:10,-15:8.5,
                              -14:7,-13:6.5,-12:8,-11:9,-10:7,-9:5.5,-8:3.5,-7:3,-6:4,-5:6,-4:5.5,-3:5.7,-2:5.3,
                              -1:5,0:6.1,1:4.3,2:1.8,3:1.1,4:1.8,5:3.5,6:5.2,7:5,8:2,9:0.25,10:0.25,11:0.25,12:0.25,13:0.25,
                              14:0.25,15:0.25,16:0.5,17:1,18:2,19:2.2}
        # Inflation rate
        Inflationratedata={-26:11,-25:9,-24:5,-23:7,-22:8,-21:12,-20:13.5,-19:10.38,-18:6.16,-17:3.16,-16:4.37,-15:3.16,
                           -14:1.94,-13:3.58,-12:4.1,-11:4.79,-10:5.42,-9:4.22,-8:3.04,-7:2.97,-6:2.6,-5:2.81,-4:2.94,-3:2.34,
                           -2:1.55,-1:2.19,0:3.37,1:2.82,2:1.6,3:2.3,4:2.67,5:3.37,6:3.22,7:2.87,8:3.82,9:-0.32,10:1.64,11:3.14,12:2.07,
                           13:1.47,14:1.62,15:0.12,16:1.27,17:2.13,18:2.44,19:1.81}
        # Replace each yearly value by the average of the following five years.
        # The update runs in place from the earliest year forward and only
        # reads later keys, so every average is built from raw values.
        datalist = [EconomyGrowthdata, Bankraptcydata, Unemploymentratedata, Interestratedata, Inflationratedata]
        for table in datalist:
            for i in range(len(table) - 5):
                total = 0
                for j in range(5):
                    total += table[-26 + i + j]
                table[-27 + i] = total / 5
            # Year code 50 (missing date) gets the mean of all table values.
            table[50] = sum(table.values()) / len(table)

        df['EconomyGrowth_By_Year'] = df['DisbursementYear'].map(EconomyGrowthdata)
        df['Bankraptcy_By_Year'] = df['DisbursementYear'].map(Bankraptcydata)
        df['Unemploymentrate_By_Year'] = df['DisbursementYear'].map(Unemploymentratedata)
        df['Interestrate_By_Year'] = df['DisbursementYear'].map(Interestratedata)
        df['Inflationrate_By_Year'] = df['DisbursementYear'].map(Inflationratedata)

        # State-related features (the lists below are positionally aligned with StateList)
        StateList = ['AL','AK','AZ','AR','CA','CO','CT','DE','DC','FL','GA','HI','ID','IL','IN','IA','KS','KY','LA','ME','MD','MA',
                      'MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','OH','OK','OR','PA','RI','SC','SD','TN','TX',
                      'UT','VT','VA','WA','WV','WI','WY']

        UnemploymentList = [2.6,3.7,4.0,3.4,4.1,2.8,4.0,4.6,4.2,2.7,3.1,3.7,2.8,4.6,3.1,3.0,2.9,3.9,3.5,3.1,3.0,3.7,4.3,2.9,4.0,
                          2.7,2.6,2.7,5.5,2.9,3.3,3.5,4.1,3.8,2.1,4.1,3.2,4.8,4.3,3.2,3.3,2.2,3.5,3.8,2.4,3.0,3.1
                            ,4.5,4.1,3.0,3.9]

        GDPList = [29603,44807,33655,27781,42376,40805,51911,56496,126421,33417,35265,38850,29843,39568,32724,35814,34770,30364,35181,
                   30282,39596,47351,32846,41353,24477,32590,28201,37075,40210,37375,45052,30943,49038,37053,34694,34040,29470,
                   38339,35153,36543,28894,35596,33742,37793,32774,34197,41617,40361,24929,34890,40303]

        GDPperPersonList = [37282,71008,48148,35674,53525,54943,63504,76720,164002,45958,48434,50788,39529,49083,40529,44091,43633,
                       38148,48366,37734,50729,55364,38433,51829,31127,41012,37966,46803,63662,46400,55320,41878,58126,49625,43172,
                       41073,40376,46248,43246,44738,38093,44955,42865,54766,47313,40312,54102,52810,31914,43309,63822]

        AveSalaryList = [40.46,50.81,45.40,37.79,56.10,49.79,60.14,49.66,79.85,43.66,46.17,44.09,36.45,51.71,40.97,38.39,40.96,
                        39.54,43.15,39.06,54.28,58.62,45.19,46.99,35.95,42.58,35.81,39.87,44.38,46.38,56.72,40.91,61.04,43.11,41.12,
                        43.45,40.75,43.46,46.10,46.38,39.63,35.00,41.88,48.35,41.11,39.54,52.07,51.04,38.48,41.46,44.03]

        Unemploymentdict = dict(zip(StateList, UnemploymentList))
        GDPdict = dict(zip(StateList, GDPList))
        GDPperPersondict = dict(zip(StateList, GDPperPersonList))
        AveSalarydict = dict(zip(StateList, AveSalaryList))

        df['Unemployment_By_State'] = df['State'].map(Unemploymentdict)
        df['GDP_By_State'] = df['State'].map(GDPdict)
        df['GDPperPerson_By_State'] = df['State'].map(GDPperPersondict)
        df['AveSalary_By_State'] = df['State'].map(AveSalarydict)

        # Features that are not currently grouped
        # Stability / size of the company
        df['BCI'] = df['CompanyLong']*(df['NoEmp'])*(df['NewExist']+1)
        # Call mean(): passing the bound method itself would fill missing
        # cells with a function object instead of a number.
        df['BCI'] = df['BCI'].fillna(df['BCI'].mean())
        # Required repayment per month
        df['DisbursementGrossPerMonth'] = df['DisbursementGross']/(df['Term']+1)
        # SBA-approved amount minus the disbursed amount
        df['SBA_Appv-DisbursementGross'] = df['SBA_Appv']-df['DisbursementGross']
        # Required repayment per original employee
        df['DisbursementGrossPerNoEmp'] = df['DisbursementGross']/(df['NoEmp']+1)
        # Required repayment per employee after job creation
        df['DisbursementGrossPerEmp'] = df['DisbursementGross']/(df['NoEmp']+df['CreateJob']+1)
        # "Toughness" index
        df['TI'] = (df['DisbursementGross']/(df['NoEmp']+df['CreateJob']+1))/(df['Term']+1)
        # "Toughness" index 2
        df['TI2'] = (df['SBA_Appv']/(df['NoEmp']+df['CreateJob']+1))/(df['Term']+1)

        return df

    df = deal_missing(input_df)
    df = clean_money(df)
    df['NewExist'] = np.where(input_df['NewExist'] == 1, 1, 0)
    df = make_features(df)
    return df

# Run the preprocessing
train_df = Preprocessing(train_df)
test_df = Preprocessing(test_df)

# Label encoding (encoders are fitted on the training data only)
for col in categorical_features:
    le = LabelEncoder()
    le.fit(train_df[col])
    train_df[col] = le.transform(train_df[col])
    test_df[col] = le.transform(test_df[col])

# For these columns the encoder is fitted on train and test together, so
# values that occur only in the test set still receive a code.
categorical_features_unlabelable = ['City','ApprovalDate','BankState','DisbursementDate']
for col in categorical_features_unlabelable:
    encoder = LabelEncoder()
    combined = pd.concat([train_df[col], test_df[col]], axis=0)
    encoder.fit(combined)
    train_df[col] = encoder.transform(train_df[col])
    test_df[col] = encoder.transform(test_df[col])

# One-hot encoding (the target is set aside while the encoder is fitted)
train_df2 = train_df.drop(['MIS_Status'],axis=1)
OneHotList = ['RevLineCr', 'LowDoc','State','Sector']
ohe = ce.OneHotEncoder(cols=OneHotList,use_cat_names=True)
train_df2 = ohe.fit_transform(train_df2)
test_df = ohe.transform(test_df)
train_df = pd.concat([train_df2,train_df['MIS_Status']],axis=1)

# Names of the one-hot columns, which must be excluded from scaling
OneHotedList = ['RevLineCr_0.0','RevLineCr_1.0','RevLineCr_2.0','RevLineCr_3.0','RevLineCr_4.0','LowDoc_0.0','LowDoc_1.0','LowDoc_2.0','LowDoc_3.0','LowDoc_4.0','LowDoc_5.0','LowDoc_6.0']

for i in range(51):
    OneHotedList.append(f'State_{i}.0')

for i in range(24):
    OneHotedList.append(f'Sector_{i}.0')

# Columns that are never fed to the models
RemoveList=['MIS_Status','City','ApprovalDate','BankState','DisbursementDate','ApprovalDay','ApprovalMonth','ApprovalFY','ApprovalYear',
           'DisbursementDay','DisbursementMonth']

features = train_df.columns.tolist()
for i in RemoveList:
    print(i)
    features.remove(i)

# Work on a copy: removing the one-hot names from an alias of `features`
# would also strip them from the model feature list.
scalelist = features.copy()
for i in OneHotedList:
    print(i)
    scalelist.remove(i)

stdscl = StandardScaler()
train_df[scalelist] = stdscl.fit_transform(train_df[scalelist])
# Reuse the statistics learned on the training set; re-fitting on the test
# set would put train and test features on different scales.
test_df[scalelist] = stdscl.transform(test_df[scalelist])

print(train_df)
train_df.info()
print(features)



MIS_Status
City
ApprovalDate
BankState
DisbursementDate
ApprovalDay
ApprovalMonth
ApprovalFY
ApprovalYear
DisbursementDay
DisbursementMonth
RevLineCr_0.0
RevLineCr_1.0
RevLineCr_2.0
RevLineCr_3.0
RevLineCr_4.0
LowDoc_0.0
LowDoc_1.0
LowDoc_2.0
LowDoc_3.0
LowDoc_4.0
LowDoc_5.0
LowDoc_6.0
State_0.0
State_1.0
State_2.0
State_3.0
State_4.0
State_5.0
State_6.0
State_7.0
State_8.0
State_9.0
State_10.0
State_11.0
State_12.0
State_13.0
State_14.0
State_15.0
State_16.0
State_17.0
State_18.0
State_19.0
State_20.0
State_21.0
State_22.0
State_23.0
State_24.0
State_25.0
State_26.0
State_27.0
State_28.0
State_29.0
State_30.0
State_31.0
State_32.0
State_33.0
State_34.0
State_35.0
State_36.0
State_37.0
State_38.0
State_39.0
State_40.0
State_41.0
State_42.0
State_43.0
State_44.0
State_45.0
State_46.0
State_47.0
State_48.0
State_49.0
State_50.0
Sector_0.0
Sector_1.0
Sector_2.0
Sector_3.0
Sector_4.0
Sector_5.0
Sector_6.0
Sector_7.0
Sector_8.0
Sector_9.0
Sector_10.0
Sector_11.0
Sector_12.0
Sector_13.0
Sect

In [None]:
def build_model(input_shape):
    """Build and compile the binary classifier network.

    Architecture: Dense(64, relu) -> Dropout(0.5) -> Dense(64, relu) ->
    Dropout(0.5) -> Dense(1, sigmoid), compiled with Adam,
    binary cross-entropy loss and the accuracy metric.

    Args:
        input_shape: Number of input features.

    Returns:
        The compiled model.
    """
    stack = [
        Dense(64, activation='relu', input_shape=(input_shape,)),
        Dropout(0.5),  # dropout after the first hidden layer
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential()
    for layer in stack:
        network.add(layer)

    network.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    return network

# Train the model and predict on the validation fold
def nn_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame, features: list, categorical_features: list):
    """Fit one network on a fold and return it with its validation predictions.

    Training runs for up to 100 epochs (batch size 32) with early stopping on
    ``val_loss`` (patience 10, best weights restored); the macro F1 on the
    validation fold is printed after each epoch.

    Args:
        x_train, y_train: Training features and labels.
        x_valid, y_valid: Validation features and labels.
        features: Not used inside this function.
        categorical_features: Not used inside this function.

    Returns:
        Tuple ``(model, valid_pred)`` where ``valid_pred`` is a flattened
        1-D array of predictions for ``x_valid``.
    """
    network = build_model(x_train.shape[1])
    stopper = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    f1_reporter = MacroF1ScoreCallback(validation_data=(x_valid, y_valid))
    network.fit(
        x_train,
        y_train,
        epochs=100,
        batch_size=32,
        verbose=1,
        validation_data=(x_valid, y_valid),
        callbacks=[f1_reporter, stopper],
    )
    # Flatten the (n, 1) output into a 1-D array.
    fold_pred = network.predict(x_valid).flatten()
    return network, fold_pred


# Cross-validated training for the selected model
def gradient_boosting_model_cv_training(method: str, train_df: pd.DataFrame, features: list, categorical_features: list):
    """Train ``method`` with K-fold CV and write out-of-fold predictions.

    For every fold the model is trained, saved to
    ``{method}_fold{k}_seed{seed}_ver{VER}.h5`` and its validation predictions
    are stored. Afterwards the macro F1 at a 0.5 cut-off is printed and the
    out-of-fold table is written to ``oof_{method}_seed{seed}_ver{VER}.csv``.

    Args:
        method: Model identifier; only ``'neuralnetwork'`` is implemented.
        train_df: Training frame containing ``features`` and the target column.
        features: Names of the feature columns.
        categorical_features: Passed through to the training function.

    Raises:
        ValueError: If ``method`` is not supported. Previously an unknown
            method fell through with ``valid_pred = None`` and failed later
            with an unrelated error.
    """
    if method != 'neuralnetwork':
        raise ValueError(f"Unsupported method: {method!r}; only 'neuralnetwork' is implemented")

    # Arrays holding the out-of-fold predictions and the fold of each row
    oof_predictions = np.zeros(len(train_df))
    oof_fold = np.zeros(len(train_df))
    kfold = KFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)
    for fold, (train_index, valid_index) in enumerate(kfold.split(train_df)):
        print('-'*50)
        print(f'{method} training fold {fold+1}')

        x_train = train_df[features].iloc[train_index]
        y_train = train_df[CFG.target_col].iloc[train_index]
        x_valid = train_df[features].iloc[valid_index]
        y_valid = train_df[CFG.target_col].iloc[valid_index]

        model, valid_pred = nn_training(x_train, y_train, x_valid, y_valid, features, categorical_features)
        model.save(f'{method}_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.h5')

        # Add to the out-of-fold arrays
        oof_predictions[valid_index] = valid_pred
        oof_fold[valid_index] = fold + 1
        del x_train, x_valid, y_train, y_valid, model, valid_pred
        gc.collect()

    # Compute the out-of-fold metric
    score = f1_score(train_df[CFG.target_col], oof_predictions >= 0.5, average='macro')
    print(f'{method} our out of folds CV f1score is {score}')
    # Store the out-of-fold predictions
    oof_df = pd.DataFrame({CFG.target_col: train_df[CFG.target_col], f'{method}_prediction': oof_predictions, 'fold': oof_fold})
    oof_df.to_csv(f'oof_{method}_seed{CFG.seed}_ver{CFG.VER}.csv', index = False)
# Definition of the training entry point
def Learning(input_df: pd.DataFrame, features: list, categorical_features: list):
    """Run cross-validated training for every method in ``CFG.METHOD_LIST``.

    Args:
        input_df: Training frame.
        features: Names of the feature columns.
        categorical_features: Names of the categorical columns.
    """
    for model_name in CFG.METHOD_LIST:
        gradient_boosting_model_cv_training(model_name, input_df, features, categorical_features)


Learning(train_df, features, categorical_features)

def nn_inference( x_test: pd.DataFrame):
    """Average the predictions of all saved fold models on ``x_test``.

    Loads ``neuralnetwork_fold{k}_seed{seed}_ver{VER}.h5`` for each of the
    ``CFG.n_folds`` folds, predicts, and returns the mean prediction.

    Args:
        x_test: Feature frame to predict on.

    Returns:
        1-D array with one averaged prediction per row of ``x_test``.
    """
    running_total = np.zeros((x_test.shape[0],))
    for fold in range(CFG.n_folds):
        fold_model = load_model(f'neuralnetwork_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.h5')
        # Flatten the model output into a 1-D array before accumulating.
        running_total += fold_model.predict(x_test).flatten()
    return running_total / CFG.n_folds

def gradient_boosting_model_inference(method: str, test_df: pd.DataFrame, features: list, categorical_features: list):
    """Predict on the test set with the saved models of ``method``.

    Args:
        method: Model identifier; only ``'neuralnetwork'`` is implemented.
        test_df: Test frame containing ``features``.
        features: Names of the feature columns.
        categorical_features: Not used inside this function.

    Returns:
        The array returned by ``nn_inference`` for the selected columns.

    Raises:
        ValueError: If ``method`` is not supported. Previously an unknown
            method reached ``return test_pred`` with the name unbound.
    """
    if method != 'neuralnetwork':
        raise ValueError(f"Unsupported method: {method!r}; only 'neuralnetwork' is implemented")
    x_test = test_df[features]
    return nn_inference(x_test)

def Predicting(input_df: pd.DataFrame, features: list, categorical_features: list):
    """Add per-model and blended predicted probabilities to a copy of the frame.

    For each method in ``CFG.METHOD_LIST`` a ``{method}_pred_prob`` column is
    added, and ``pred_prob`` accumulates the predictions weighted by
    ``CFG.model_weight_dict``.

    Args:
        input_df: Test frame; it is copied, not modified.
        features: Names of the feature columns.
        categorical_features: Names of the categorical columns.

    Returns:
        The copy with the prediction columns added.
    """
    output_df = input_df.copy()
    output_df['pred_prob'] = 0
    for method in CFG.METHOD_LIST:
        per_model_col = f'{method}_pred_prob'
        blend_weight = CFG.model_weight_dict[method]
        output_df[per_model_col] = gradient_boosting_model_inference(method, input_df, features, categorical_features)
        output_df['pred_prob'] += blend_weight * output_df[per_model_col]
    return output_df


test_df = Predicting(test_df, features, categorical_features)

--------------------------------------------------
neuralnetwork training fold 1
Epoch 1/100
Epoch 1: val_macro_f1: 0.47652193742885135
Epoch 2/100
Epoch 2: val_macro_f1: 0.5851648628554725
Epoch 3/100
Epoch 3: val_macro_f1: 0.5928345966709316
Epoch 4/100

In [None]:
# Definition of the post-processing step
def Postprocessing(train_df: pd.DataFrame, test_df: pd.DataFrame) -> (pd.DataFrame, pd.DataFrame):
    """Blend the OOF predictions, tune the decision threshold, label the test set.

    The weighted out-of-fold predictions are written to
    ``train_df['pred_prob']``; thresholds 0.000-0.999 are scanned for the best
    macro F1, and the best one is applied to ``test_df['pred_prob']`` to create
    ``test_df['target']``. Both frames are modified in place and returned.

    Args:
        train_df: Training frame containing the target column.
        test_df: Test frame containing a ``pred_prob`` column.

    Returns:
        Tuple ``(train_df, test_df)``.
    """
    train_df['pred_prob'] = 0
    for method in CFG.METHOD_LIST:
        oof_df = pd.read_csv(f'oof_{method}_seed{CFG.seed}_ver{CFG.VER}.csv')
        # The OOF file is saved without its index, in train_df row order, so
        # add the values by position rather than by index label.
        train_df['pred_prob'] += CFG.model_weight_dict[method] * oof_df[f'{method}_prediction'].to_numpy()

    # Use the labels of the frame being scored instead of relying on the
    # loop variable `oof_df` still being defined after the loop.
    y_true = train_df[CFG.target_col]
    best_score = 0
    best_v = 0
    for v in tqdm(np.arange(1000) / 1000):
        score = f1_score(y_true, train_df['pred_prob'] >= v, average='macro')
        if score > best_score:
            best_score = score
            best_v = v
    print(best_score, best_v)
    test_df['target'] = np.where(test_df['pred_prob'] >= best_v, 1, 0)
    return train_df, test_df

# Run the post-processing
train_df, test_df = Postprocessing(train_df, test_df)

# Write the submission: the frame index and the 'target' column, without a header row.
test_df[['target']].to_csv(f'seed{CFG.seed}_ver{CFG.VER}_{CFG.AUTHOR}_submission.csv', header=False)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd

# Build the stacking features from the out-of-fold predictions (one column per method)
oof_features = np.zeros((train_df.shape[0], len(CFG.METHOD_LIST)))
for i, method in enumerate(CFG.METHOD_LIST):
    oof_df = pd.read_csv(f'oof_{method}_seed{CFG.seed}_ver{CFG.VER}.csv')
    oof_features[:, i] = oof_df[f'{method}_prediction']

# Build the matching features from the test-set predictions
test_features = np.zeros((test_df.shape[0], len(CFG.METHOD_LIST)))
for i, method in enumerate(CFG.METHOD_LIST):
    test_features[:, i] = test_df[f'{method}_pred_prob']

# Standardise the features (fitted on the OOF features, applied to the test features)
scaler = StandardScaler()
oof_features_scaled = scaler.fit_transform(oof_features)
test_features_scaled = scaler.transform(test_features)

# Tune / train the logistic regression model
# (the grid search below is disabled: it sits inside a string literal)
'''
logistic = LogisticRegression()
param_grid = {'C': [1]}
grid_search = GridSearchCV(estimator=logistic, param_grid=param_grid, cv=5)
grid_search.fit(oof_features_scaled, train_df[CFG.target_col])
print('Best Parameter:',grid_search.best_params_)
print('Best Score:',grid_search.best_score_)
lr = LogisticRegression(C=grid_search.best_params_['C'])
'''
lr = LogisticRegression()
lr.fit(oof_features_scaled, train_df[CFG.target_col])

# Search for the threshold with the best macro F1 and return both
def find_best_threshold_and_score(y_true, y_pred_proba):
    """Scan thresholds 0.000-1.000 in 0.001 steps for the best macro F1.

    The first threshold reaching the highest score wins; if no threshold
    scores above 0, ``(0, 0)`` is returned.

    Args:
        y_true: True labels.
        y_pred_proba: Predicted probabilities, compared with ``>=``.

    Returns:
        Tuple ``(best_threshold, best_score)``.
    """
    top_threshold, top_score = 0, 0
    for candidate in np.linspace(0, 1, 1001):
        candidate_score = f1_score(y_true, y_pred_proba >= candidate, average='macro')
        if not candidate_score > top_score:
            continue
        top_threshold, top_score = candidate, candidate_score
    return top_threshold, top_score

# Predicted probabilities on the training data (same rows the model was fitted on)
train_pred_proba = lr.predict_proba(oof_features_scaled)[:, 1]

# Find the best threshold and its score
best_threshold, best_score = find_best_threshold_and_score(train_df[CFG.target_col], train_pred_proba)
print(f'Best Threshold: {best_threshold}, Best F1 Score: {best_score}')

# Final predictions on the test data
test_pred_proba = lr.predict_proba(test_features_scaled)[:, 1]
test_final_predictions = (test_pred_proba >= best_threshold).astype(int)

# Write the final predictions to a CSV file in the competition submission format
submission_df = pd.DataFrame({'Id': test_df.index, 'target': test_final_predictions}).reset_index(drop=True)
# Per the original author's note, the ids start at 42307, so the Id column is
# overwritten with position + 42307.
# NOTE(review): this replaces the Id taken from test_df.index above with a
# hard-coded offset — confirm both agree for the actual test file.
submission_df['Id'] = submission_df.index + 42307

submission_df.to_csv(f'stacking_lr_submission_best_score{best_score:.4f}_seed{CFG.seed}_ver{CFG.VER}_{CFG.AUTHOR}.csv', header=False, index=False)
