In [120]:
import sys
# ====================================================
# Library
# ====================================================
import os
import gc
import warnings
warnings.filterwarnings('ignore')
import random
import scipy as sp
import numpy as np
import pandas as pd
from glob import glob
from pathlib import Path
import joblib
import pickle
import itertools
from tqdm.auto import tqdm
import torch
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GroupKFold
from sklearn.metrics import log_loss, roc_auc_score, matthews_corrcoef, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import category_encoders as ce
import lightgbm as lgb
#import optuna.integration.lightgbm as lgb
import xgboost as xgb
from catboost import Pool, CatBoostRegressor, CatBoostClassifier
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score
import torch.optim as optim

pd.set_option('display.max_columns',1000)
pd.set_option('display.max_rows',100)

# ====================================================
# Configurations
# ====================================================
class CFG:
    VER = 8.1
    AUTHOR = 'naokisusami'
    COMPETITION = 'FDUA2'
    DATA_PATH = Path('/data')
    OOF_DATA_PATH = Path('/oof')
    MODEL_DATA_PATH = Path('/models')
    SUB_DATA_PATH = Path('/submission')
    METHOD_LIST = [ 'neuralnetwork', 'adaboost','lightgbm', 'xgboost', 'catboost']
    seed = 42
    n_folds = 7
    target_col = 'MIS_Status'
    metric = 'f1_score'
    metric_maximize_flag = True
    num_boost_round = 500
    early_stopping_round = 200
    verbose = 25
    classification_lgb_params = {
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.05,
        'seed': seed,
    }
    classification_xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'learning_rate': 0.05,
        'random_state': seed,
    }

    classification_cat_params = {
        'learning_rate': 0.05,
        'iterations': num_boost_round,
        'random_seed': seed,
    }
    classification_adaboost_params = {
        'n_estimators': 100,
        'learning_rate': 1.0,
        'random_state': seed,
    }
    nn_params = {
        'input_size': 44,  # 特徴量の数に応じて変更してください
        'hidden_size': [64, 32],  # 隠れ層のユニット数
        'output_size': 1,  # 出力層のユニット数
        'dropout_rate': 0.1,
        'learning_rate': 0.001,
        'batch_size': 64,
        'epochs': 10,
    }

    model_weight_dict = {'lightgbm': 0.30, 'xgboost': 0.10, 'catboost': 0.30, 'adaboost': 0.15, 'neuralnetwork': 0.15}
# ====================================================
# Seed everything
# ====================================================
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

seed_everything(CFG.seed)


# ====================================================
# Metric
# ====================================================
# f1_score

# ====================================================
# LightGBM Metric
# ====================================================
def lgb_metric(y_pred, y_true):
    y_true = y_true.get_label()
    return 'f1score', f1_score(y_true, np.where(y_pred >= 0.5, 1, 0), average='macro'), CFG.metric_maximize_flag

# ====================================================
# XGBoost Metric
# ====================================================
def xgb_metric(y_pred, y_true):
    y_true = y_true.get_label()
    return 'f1score', f1_score(y_true, np.where(y_pred >= 0.5, 1, 0), average='macro')

class SimpleNN(nn.Module):
    def __init__(self, cfg):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(cfg['input_size'], cfg['hidden_size'][0])
        self.dropout = nn.Dropout(cfg['dropout_rate'])
        self.fc2 = nn.Linear(cfg['hidden_size'][0], cfg['hidden_size'][1])
        self.fc3 = nn.Linear(cfg['hidden_size'][1], cfg['output_size'])
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return x


In [121]:
#データの読み込み
train_df = pd.read_csv('train.csv', index_col=0)
test_df = pd.read_csv('test.csv', index_col=0)
categorical_features = ['RevLineCr', 'LowDoc', 'UrbanRural', 'State', 'Sector']

In [122]:
#前処理メソッドの定義
def Preprocessing(input_df: pd.DataFrame()) -> pd.DataFrame():
    #欠損値に対する処理
    def deal_missing(input_df: pd.DataFrame()) -> pd.DataFrame():
        df = input_df.copy()
        for col in ['RevLineCr', 'LowDoc', 'BankState']:
            df[col] = input_df[col].fillna('UNK')
        for col in ['DisbursementDate','ApprovalDate']:
            df[col] = input_df[col].fillna('50-NaN-50')
        
        return df
    #金額に対する前処理
    def clean_money(input_df: pd.DataFrame()) -> pd.DataFrame():
        df = input_df.copy()
        for col in ['DisbursementGross', 'GrAppv', 'SBA_Appv']:
            df[col] = input_df[col].str[1:].str.replace(',', '').str.replace(' ', '').astype(float)
        return df
    df = deal_missing(input_df)
    df = clean_money(df)
    df['NewExist'] = np.where(input_df['NewExist'] == 1, 1, 0)
    #特徴量作成
    def make_features(input_df: pd.DataFrame()) -> pd.DataFrame():
        df = input_df.copy()
        #日付関係の特徴量作成
        df[['DisbursementDay','DisbursementMonth','DisbursementYear']] = df['DisbursementDate'].str.split('-',expand=True)
        df[['ApprovalDay','ApprovalMonth','ApprovalYear']] = df['ApprovalDate'].str.split('-',expand=True)
        df['DisbursementDay'] = df['DisbursementDay'].astype(int)
        df['DisbursementYear'] = df['DisbursementYear'].astype(int)
        df['ApprovalDay'] = df['ApprovalDay'].astype(int)
        df['ApprovalYear'] = df['ApprovalYear'].astype(int)
        Month_dict = {'Jan':1,'Feb':2,'Mar':3,'Apr':4,'May':5,'Jun':6,'Jul':7,'Aug':8,'Sep':9,'Oct':10,'Nov':11,'Dec':12,'NaN':50}
        df['DisbursementMonth'] = df['DisbursementMonth'].map(Month_dict)
        df['ApprovalMonth'] = df['ApprovalMonth'].map(Month_dict)
        df['DisbursementDate'] = df['DisbursementYear'].astype(str)+df['DisbursementMonth'].astype(str)+df['DisbursementDay'].astype(str)
        df['DisbursementYear'] = df['DisbursementYear'].apply(lambda x:x - 100 if x >50 else x)
        df['ApprovalYear'] = df['ApprovalYear'].apply(lambda x:x - 100 if x >50 else x)
        df['CompanyLong'] = df['DisbursementYear'] - df['ApprovalYear']

        #Bankraptcydataの74~80は生成したもので実際の数値ではない。(失業率から換算して生成)
        Bankraptcydata={-26:32700,-25:52200,-24:46200,-23:42300,-22:36300,-21:34200,-20:46200,-19:44000,-18:48500,-17:69800,-16:62500,
                      -15:64500,-14:72000,-13:81500,-12:83000,-11:64500,-10:65000,-9:67000,-8:71000,-7:67000,-6:58000,-5:51000,
                        -4:52500,-3:54000,-2:51000,-1:41000,0:37500,1:35992,2:39845,3:37548,4:36785,5:31952,6:35292,7:21960,8:30741,
                        9:49091,10:61148,11:54212,12:46393,13:37552,14:31671,15:26130,16:24797,17:23591,18:23106,19:22157}

        #年ごとのデータを、1-5年後の平均に変換
        datalist = [Bankraptcydata]
        for k in datalist:
            for i in range(len(k)-5):
                k[-27+i] = 0
                for j in range(5):
                    k[-27+i] += k[-26+i+j]
                k[-27+i] = k[-27+i]/5
            k[50] = k[-26]*2
        
        df['Bankraptcy_By_Year'] = df['DisbursementYear'].map(Bankraptcydata)

        #組み合わせ特徴量
        df['State_Sector'] = df['State'].astype(str) + '_' + df['Sector'].astype(str)
         # 地理的特徴の組み合わせ
        df['City_State'] = df['City'] + '_' + df['State']
        # 時間的特徴の組み合わせ
        df['ApprovalFY_Term'] = df['ApprovalFY'].astype(str) + '_' + df['Term'].astype(str)
        
        df['FranchiseCode_ApprovalDate'] = df['FranchiseCode'].astype(str) + '_' + df['ApprovalDate'].astype(str)
        
        df['Term_NoEmp'] = df['Term'].astype(str) + '_' + df['NoEmp'].astype(str)
        
        df['City_BankState'] = df['City'].astype(str) + '_' + df['BankState'].astype(str)
        
        df['NoEmp_SBA_Appv'] = df['NoEmp'].astype(str) + '_' + df['SBA_Appv'].astype(str)
        


        
        return df
    df = make_features(df)
    return df

In [123]:
#前処理の実行
train_df = Preprocessing(train_df)
test_df = Preprocessing(test_df)

In [124]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42307 entries, 0 to 42306
Data columns (total 35 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Term                        42307 non-null  int64  
 1   NoEmp                       42307 non-null  int64  
 2   NewExist                    42307 non-null  int32  
 3   CreateJob                   42307 non-null  int64  
 4   RetainedJob                 42307 non-null  int64  
 5   FranchiseCode               42307 non-null  int64  
 6   RevLineCr                   42307 non-null  object 
 7   LowDoc                      42307 non-null  object 
 8   DisbursementDate            42307 non-null  object 
 9   MIS_Status                  42307 non-null  int64  
 10  Sector                      42307 non-null  int64  
 11  ApprovalDate                42307 non-null  object 
 12  ApprovalFY                  42307 non-null  int64  
 13  City                        423

（以下はPreprocessingに本来組み込むべきだが，コードが煩雑になるので，いったん切り出している．）

In [125]:
'''
#カウントエンコーディング
for col in categorical_features:
    count_dict = dict(train_df[col].value_counts())
    train_df[f'{col}_count_encoding'] = train_df[col].map(count_dict).astype(int)
    test_df[f'{col}_count_encoding'] = test_df[col].map(count_dict).fillna(1).astype(int)
'''

#ラベルエンコーディング
for col in categorical_features :
    le = LabelEncoder()
    le.fit(train_df[col])
    train_df[col] = le.transform(train_df[col])
    test_df[col] = le.transform(test_df[col])
    
categorical_features_unlabelable = ['ApprovalFY_Term','City_State','City','ApprovalDate','BankState','DisbursementDate','State_Sector',
                                   'FranchiseCode_ApprovalDate','Term_NoEmp','City_BankState','NoEmp_SBA_Appv']
'''
for col in categorical_features_unlabelable:
    le = LabelEncoder()   
    le.fit(train_df[col])
    train_df[col] = le.transform(train_df[col])
    test_df[col] = test_df[col].apply(lambda x: le.transform([x])[0] if x in le.classes_ else len(le.classes_))
'''
for col in categorical_features_unlabelable:
    encoder = LabelEncoder()
    combined = pd.concat([train_df[col], test_df[col]], axis=0)
    encoder.fit(combined)
    train_df[col] = encoder.transform(train_df[col])
    test_df[col] = encoder.transform(test_df[col])

In [126]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42307 entries, 0 to 42306
Data columns (total 35 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Term                        42307 non-null  int64  
 1   NoEmp                       42307 non-null  int64  
 2   NewExist                    42307 non-null  int32  
 3   CreateJob                   42307 non-null  int64  
 4   RetainedJob                 42307 non-null  int64  
 5   FranchiseCode               42307 non-null  int64  
 6   RevLineCr                   42307 non-null  int32  
 7   LowDoc                      42307 non-null  int32  
 8   DisbursementDate            42307 non-null  int32  
 9   MIS_Status                  42307 non-null  int64  
 10  Sector                      42307 non-null  int64  
 11  ApprovalDate                42307 non-null  int32  
 12  ApprovalFY                  42307 non-null  int64  
 13  City                        423

In [127]:
#OneHotEncoding
train_df2 = train_df.drop(['MIS_Status'],axis=1)
OneHotList = ['RevLineCr', 'LowDoc']
ohe = ce.OneHotEncoder(cols=OneHotList,use_cat_names=True)
train_df2 = ohe.fit_transform(train_df2)
test_df = ohe.transform(test_df)
train_df = pd.concat([train_df2,train_df['MIS_Status']],axis=1)

#featuresの作成
categorical_features = ['State', 'Sector','RevLineCr_0.0','RevLineCr_1.0','RevLineCr_2.0','RevLineCr_3.0','RevLineCr_4.0',
                       'LowDoc_0.0','LowDoc_1.0','LowDoc_2.0','LowDoc_3.0','LowDoc_4.0','LowDoc_5.0','LowDoc_6.0',
                       'ApprovalFY_Term','City_State','City','ApprovalDate','BankState','State_Sector','UrbanRural',
                        'FranchiseCode_ApprovalDate','Term_NoEmp','City_BankState','NoEmp_SBA_Appv']


RemoveList=['MIS_Status']
features = train_df.columns.tolist()
for i in RemoveList:
    print(i)
    features.remove(i)

MIS_Status


In [128]:
print(train_df)
print(features)

       Term  NoEmp  NewExist  CreateJob  RetainedJob  FranchiseCode  \
0       163     21         1          0            0              1   
1        84      6         1          4            0              0   
2       242     45         1          4           90              0   
3       237      4         1          0            0              0   
4       184      0         1          0            0              0   
...     ...    ...       ...        ...          ...            ...   
42302   283     14         1          0            0              1   
42303    53      2         1          0            0              0   
42304    59      6         0          0            0              1   
42305   295     18         1          0            8              0   
42306    84      4         1          0            8              0   

       RevLineCr_1.0  RevLineCr_0.0  RevLineCr_4.0  RevLineCr_3.0  \
0                  1              0              0              0   
1        

In [129]:
# AdaBoost training
def adaboost_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame, features: list, categorical_features: list):
    model = AdaBoostClassifier(**CFG.classification_adaboost_params)
    model.fit(x_train, y_train)
    valid_pred = model.predict_proba(x_valid)[:, 1]
    return model, valid_pred


def nn_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame, features: list, categorical_features: list):
    stdscl = StandardScaler()
    x_train_scaled = stdscl.fit_transform(x_train[features])
    x_valid_scaled = stdscl.transform(x_valid[features])
    
    # データをテンソルに変換
    x_train_tensor = torch.tensor(x_train_scaled, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
    x_valid_tensor = torch.tensor(x_valid_scaled, dtype=torch.float32)
    
    train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=CFG.nn_params['batch_size'], shuffle=True)
    
    model = SimpleNN(CFG.nn_params)
    optimizer = optim.Adam(model.parameters(), lr=CFG.nn_params['learning_rate'])
    criterion = nn.BCEWithLogitsLoss()
    
    model.train()
    for epoch in range(CFG.nn_params['epochs']):
        for data, target in train_loader:
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
    
    # バリデーションデータで予測
    model.eval()
    with torch.no_grad():
        valid_pred = torch.sigmoid(model(x_valid_tensor)).numpy().reshape(-1)
    
    return model, valid_pred


#lightgbmでの学習メソッドの定義
def lightgbm_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame, features: list, categorical_features: list):
    lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=categorical_features)
    lgb_valid = lgb.Dataset(x_valid, y_valid, categorical_feature=categorical_features)
    model = lgb.train(
                params = CFG.classification_lgb_params,
                train_set = lgb_train,
                num_boost_round = CFG.num_boost_round,
                valid_sets = [lgb_train, lgb_valid],
                feval = lgb_metric,
                callbacks=[lgb.early_stopping(stopping_rounds=CFG.early_stopping_round,
                                              verbose=CFG.verbose)]
            )
    # Predict validation
    valid_pred = model.predict(x_valid)
    return model, valid_pred

#xgboostでの学習メソッドの定義
def xgboost_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame, features: list, categorical_features: list):
    xgb_train = xgb.DMatrix(data=x_train, label=y_train)
    xgb_valid = xgb.DMatrix(data=x_valid, label=y_valid)
    model = xgb.train(
                CFG.classification_xgb_params,
                dtrain = xgb_train,
                num_boost_round = CFG.num_boost_round,
                evals = [(xgb_train, 'train'), (xgb_valid, 'eval')],
                early_stopping_rounds = CFG.early_stopping_round,
                verbose_eval = CFG.verbose,
                feval = xgb_metric,
                maximize = CFG.metric_maximize_flag,
            )
    # Predict validation
    valid_pred = model.predict(xgb.DMatrix(x_valid))
    return model, valid_pred

#catboostでの学習メソッドの定義
def catboost_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame, features: list, categorical_features: list):
    cat_train = Pool(data=x_train, label=y_train, cat_features=categorical_features)
    cat_valid = Pool(data=x_valid, label=y_valid, cat_features=categorical_features)
    model = CatBoostClassifier(**CFG.classification_cat_params)
    model.fit(cat_train,
              eval_set = [cat_valid],
              early_stopping_rounds = CFG.early_stopping_round,
              verbose = CFG.verbose,
              use_best_model = True)
    # Predict validation
    valid_pred = model.predict_proba(x_valid)[:, 1]
    return model, valid_pred

    
                    
#任意のモデルでのクロスバリデーション学習メソッドの定義
def gradient_boosting_model_cv_training(method: str, train_df: pd.DataFrame, features: list, categorical_features: list):
    # Create a numpy array to store out of folds predictions
    oof_predictions = np.zeros(len(train_df))
    oof_fold = np.zeros(len(train_df))
    kfold = KFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)
    for fold, (train_index, valid_index) in enumerate(kfold.split(train_df)):
        print('-'*50)
        print(f'{method} training fold {fold+1}')

        x_train = train_df[features].iloc[train_index]
        y_train = train_df[CFG.target_col].iloc[train_index]
        x_valid = train_df[features].iloc[valid_index]
        y_valid = train_df[CFG.target_col].iloc[valid_index]
        
        model = None  # モデル変数を初期化する
        valid_pred = None
        
        if method == 'adaboost':
            model, valid_pred = adaboost_training(x_train, y_train, x_valid, y_valid, features, categorical_features)
        if method == 'neuralnetwork':
            model, valid_pred = nn_training(x_train, y_train, x_valid, y_valid, features, categorical_features)
        if method == 'lightgbm':
            model, valid_pred = lightgbm_training(x_train, y_train, x_valid, y_valid, features, categorical_features)
        if method == 'xgboost':
            model, valid_pred = xgboost_training(x_train, y_train, x_valid, y_valid, features, categorical_features)
        if method == 'catboost':
            model, valid_pred = catboost_training(x_train, y_train, x_valid, y_valid, features, categorical_features)
        # Save best model
        pickle.dump(model, open(f'{method}_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl', 'wb'))
        # Add to out of folds array
        oof_predictions[valid_index] = valid_pred
        oof_fold[valid_index] = fold + 1
        del x_train, x_valid, y_train, y_valid, model, valid_pred
        gc.collect()

    # Compute out of folds metric
    score = f1_score(train_df[CFG.target_col], oof_predictions >= 0.5, average='macro')
    print(f'{method} our out of folds CV f1score is {score}')
    # Create a dataframe to store out of folds predictions
    oof_df = pd.DataFrame({CFG.target_col: train_df[CFG.target_col], f'{method}_prediction': oof_predictions, 'fold': oof_fold})
    oof_df.to_csv(f'oof_{method}_seed{CFG.seed}_ver{CFG.VER}.csv', index = False)
#学習メソッドの定義
def Learning(input_df: pd.DataFrame, features: list, categorical_features: list):
    for method in CFG.METHOD_LIST:
        gradient_boosting_model_cv_training(method, input_df, features, categorical_features)

In [130]:
Learning(train_df, features, categorical_features)

--------------------------------------------------
neuralnetwork training fold 1
--------------------------------------------------
neuralnetwork training fold 2
--------------------------------------------------
neuralnetwork training fold 3
--------------------------------------------------
neuralnetwork training fold 4
--------------------------------------------------
neuralnetwork training fold 5
--------------------------------------------------
neuralnetwork training fold 6
--------------------------------------------------
neuralnetwork training fold 7
neuralnetwork our out of folds CV f1score is 0.4716512226190774
--------------------------------------------------
adaboost training fold 1
--------------------------------------------------
adaboost training fold 2
--------------------------------------------------
adaboost training fold 3
--------------------------------------------------
adaboost training fold 4
--------------------------------------------------
adaboost train

lightgbm our out of folds CV f1score is 0.5607074460627226
--------------------------------------------------
xgboost training fold 1
[0]	train-logloss:0.65987	train-f1score:0.09771	eval-logloss:0.65991	eval-f1score:0.09208
[25]	train-logloss:0.33436	train-f1score:0.67461	eval-logloss:0.33735	eval-f1score:0.63857
[50]	train-logloss:0.27694	train-f1score:0.67006	eval-logloss:0.28700	eval-f1score:0.63626
[75]	train-logloss:0.25885	train-f1score:0.68041	eval-logloss:0.27829	eval-f1score:0.64120
[100]	train-logloss:0.24868	train-f1score:0.68964	eval-logloss:0.27656	eval-f1score:0.64572
[125]	train-logloss:0.24120	train-f1score:0.69876	eval-logloss:0.27580	eval-f1score:0.64874
[150]	train-logloss:0.23629	train-f1score:0.70302	eval-logloss:0.27564	eval-f1score:0.64948
[175]	train-logloss:0.23225	train-f1score:0.70764	eval-logloss:0.27568	eval-f1score:0.65045
[200]	train-logloss:0.22882	train-f1score:0.71246	eval-logloss:0.27600	eval-f1score:0.65142
[225]	train-logloss:0.22531	train-f1score:0

[200]	train-logloss:0.22592	train-f1score:0.71443	eval-logloss:0.28752	eval-f1score:0.67481
[214]	train-logloss:0.22407	train-f1score:0.71638	eval-logloss:0.28791	eval-f1score:0.67515
--------------------------------------------------
xgboost training fold 7
[0]	train-logloss:0.65982	train-f1score:0.09712	eval-logloss:0.66010	eval-f1score:0.09563
[25]	train-logloss:0.33351	train-f1score:0.67217	eval-logloss:0.34093	eval-f1score:0.65948
[50]	train-logloss:0.27627	train-f1score:0.67016	eval-logloss:0.29170	eval-f1score:0.64729
[75]	train-logloss:0.25787	train-f1score:0.67929	eval-logloss:0.28301	eval-f1score:0.65490
[100]	train-logloss:0.24739	train-f1score:0.69177	eval-logloss:0.28078	eval-f1score:0.65800
[125]	train-logloss:0.24036	train-f1score:0.69821	eval-logloss:0.28063	eval-f1score:0.66054
[150]	train-logloss:0.23381	train-f1score:0.70431	eval-logloss:0.28078	eval-f1score:0.66334
[175]	train-logloss:0.22860	train-f1score:0.71080	eval-logloss:0.28124	eval-f1score:0.66388
[200]	trai

100:	learn: 0.2736948	test: 0.2693702	best: 0.2693702 (100)	total: 42.6s	remaining: 2m 48s
125:	learn: 0.2714833	test: 0.2684514	best: 0.2684514 (125)	total: 53.4s	remaining: 2m 38s
150:	learn: 0.2697479	test: 0.2680771	best: 0.2680771 (150)	total: 1m 4s	remaining: 2m 29s
175:	learn: 0.2679713	test: 0.2676590	best: 0.2676439 (174)	total: 1m 15s	remaining: 2m 19s
200:	learn: 0.2662342	test: 0.2673579	best: 0.2673579 (200)	total: 1m 26s	remaining: 2m 8s
225:	learn: 0.2647178	test: 0.2672237	best: 0.2671027 (218)	total: 1m 38s	remaining: 1m 58s
250:	learn: 0.2631433	test: 0.2670567	best: 0.2670394 (249)	total: 1m 49s	remaining: 1m 49s
275:	learn: 0.2617138	test: 0.2670857	best: 0.2669502 (265)	total: 2m 1s	remaining: 1m 38s
300:	learn: 0.2602883	test: 0.2670595	best: 0.2669502 (265)	total: 2m 13s	remaining: 1m 28s
325:	learn: 0.2589548	test: 0.2670677	best: 0.2669502 (265)	total: 2m 24s	remaining: 1m 17s
350:	learn: 0.2576638	test: 0.2669812	best: 0.2669117 (337)	total: 2m 36s	remaining: 

In [131]:
def adaboost_inference(x_test):
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model = pickle.load(open(f'adaboost_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl', 'rb'))
        # Predict
        pred = model.predict_proba(x_test)[:, 1]
        test_pred += pred
    return test_pred / CFG.n_folds

def nn_inference( x_test: pd.DataFrame):
    stdscl = StandardScaler()
    x_test = stdscl.fit_transform(x_test)
    x_test_tensor = torch.tensor(x_test, dtype=torch.float32)
    test_pred = np.zeros((x_test.shape[0],))
    for fold in range(CFG.n_folds):
        model = pickle.load(open(f'neuralnetwork_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl', 'rb'))
        # Predict
        model.eval()
        with torch.no_grad():
            pred = torch.sigmoid(model(x_test_tensor)).numpy().reshape(-1)
            test_pred += pred
    return test_pred / CFG.n_folds

def lightgbm_inference(x_test: pd.DataFrame):
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model = pickle.load(open(f'lightgbm_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl', 'rb'))
        # Predict
        pred = model.predict(x_test)
        test_pred += pred
    return test_pred / CFG.n_folds

def xgboost_inference(x_test: pd.DataFrame):
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model = pickle.load(open(f'xgboost_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl', 'rb'))
        # Predict
        pred = model.predict(xgb.DMatrix(x_test))
        test_pred += pred
    return test_pred / CFG.n_folds

def catboost_inference(x_test: pd.DataFrame):
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model = pickle.load(open(f'catboost_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl', 'rb'))
        # Predict
        pred = model.predict_proba(x_test)[:, 1]
        test_pred += pred
    return test_pred / CFG.n_folds

def gradient_boosting_model_inference(method: str, test_df: pd.DataFrame, features: list, categorical_features: list):
    x_test = test_df[features]
    if method == 'adaboost':
        test_pred = adaboost_inference(x_test)
    if method == 'neuralnetwork':
        test_pred = nn_inference(x_test)
    if method == 'lightgbm':
        test_pred = lightgbm_inference(x_test)
    if method == 'xgboost':
        test_pred = xgboost_inference(x_test)
    if method == 'catboost':
        test_pred = catboost_inference(x_test)
    return test_pred

def Predicting(input_df: pd.DataFrame, features: list, categorical_features: list):
    output_df = input_df.copy()
    output_df['pred_prob'] = 0
    for method in CFG.METHOD_LIST:
        output_df[f'{method}_pred_prob'] = gradient_boosting_model_inference(method, input_df, features, categorical_features)
        output_df['pred_prob'] += CFG.model_weight_dict[method] * output_df[f'{method}_pred_prob']
    return output_df

In [132]:
test_df = Predicting(test_df, features, categorical_features)

In [133]:
#後処理の定義
def Postprocessing(train_df: pd.DataFrame(), test_df: pd.DataFrame()) -> (pd.DataFrame(), pd.DataFrame()):
    train_df['pred_prob'] = 0
    for method in CFG.METHOD_LIST:
        oof_df = pd.read_csv(f'oof_{method}_seed{CFG.seed}_ver{CFG.VER}.csv')
        train_df['pred_prob'] += CFG.model_weight_dict[method] * oof_df[f'{method}_prediction']
    best_score = 0
    best_v = 0
    for v in tqdm(np.arange(1000) / 1000):
        score = f1_score(oof_df[CFG.target_col], train_df[f'pred_prob'] >= v, average='macro')
        if score > best_score:
            best_score = score
            best_v = v
    print(best_score, best_v)
    test_df['target'] = np.where(test_df['pred_prob'] >= best_v, 1, 0)
    return train_df, test_df

In [134]:
'''
#後処理の定義、調和平均版 
def Postprocessing(train_df: pd.DataFrame(), test_df: pd.DataFrame()) -> (pd.DataFrame(), pd.DataFrame()): 
    train_df['pred_prob'] = 0 
    weight_sum = 0 
    for method in CFG.METHOD_LIST: 
        oof_df = pd.read_csv(f'oof_{method}_seed{CFG.seed}_ver{CFG.VER}.csv') 
        train_df['pred_prob'] += CFG.model_weight_dict[method] / oof_df[f'{method}_prediction'] 
        weight_sum += CFG.model_weight_dict[method] 
    train_df['pred_prob'] = weight_sum / train_df['pred_prob'] 
    best_score = 0 
    best_v = 0 
    for v in tqdm(np.arange(1000) / 1000):
        score = f1_score(oof_df[CFG.target_col], train_df[f'pred_prob'] >= v, average='macro') 
        if score > best_score: 
            best_score = score 
            best_v = v 
    print(best_score, best_v) 
    test_df['target'] = np.where(test_df['pred_prob'] >= best_v, 1, 0)
    return train_df, test_df
'''

"\n#後処理の定義、調和平均版 \ndef Postprocessing(train_df: pd.DataFrame(), test_df: pd.DataFrame()) -> (pd.DataFrame(), pd.DataFrame()): \n    train_df['pred_prob'] = 0 \n    weight_sum = 0 \n    for method in CFG.METHOD_LIST: \n        oof_df = pd.read_csv(f'oof_{method}_seed{CFG.seed}_ver{CFG.VER}.csv') \n        train_df['pred_prob'] += CFG.model_weight_dict[method] / oof_df[f'{method}_prediction'] \n        weight_sum += CFG.model_weight_dict[method] \n    train_df['pred_prob'] = weight_sum / train_df['pred_prob'] \n    best_score = 0 \n    best_v = 0 \n    for v in tqdm(np.arange(1000) / 1000):\n        score = f1_score(oof_df[CFG.target_col], train_df[f'pred_prob'] >= v, average='macro') \n        if score > best_score: \n            best_score = score \n            best_v = v \n    print(best_score, best_v) \n    test_df['target'] = np.where(test_df['pred_prob'] >= best_v, 1, 0)\n    return train_df, test_df\n"

In [135]:
#後処理
train_df, test_df = Postprocessing(train_df, test_df)

  0%|          | 0/1000 [00:00<?, ?it/s]

0.6882359316201263 0.937


In [136]:
test_df[['target']].to_csv(f'seed{CFG.seed}_ver{CFG.VER}_{CFG.AUTHOR}_submission.csv', header=False)

特徴量の重要度を確認する方法

In [137]:
model = pickle.load(open(f'lightgbm_fold1_seed42_ver{CFG.VER}.pkl', 'rb'))
importance_df = pd.DataFrame(model.feature_importance(), index=features, columns=['importance'])
importance_df['importance'] = importance_df['importance'] / np.sum(importance_df['importance'])
importance_df.sort_values('importance', ascending=False)

Unnamed: 0,importance
City_BankState,0.137037
State_Sector,0.107407
Term_NoEmp,0.092593
NoEmp_SBA_Appv,0.091358
ApprovalFY_Term,0.08642
ApprovalDate,0.081481
FranchiseCode_ApprovalDate,0.07284
NoEmp,0.04321
UrbanRural,0.041975
LowDoc_4.0,0.030864


In [138]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd

# OOF予測を基に新たな特徴量を作成
oof_features = np.zeros((train_df.shape[0], len(CFG.METHOD_LIST)))
for i, method in enumerate(CFG.METHOD_LIST):
    oof_df = pd.read_csv(f'oof_{method}_seed{CFG.seed}_ver{CFG.VER}.csv')
    oof_features[:, i] = oof_df[f'{method}_prediction']

# テストデータの予測を基に特徴量を作成
test_features = np.zeros((test_df.shape[0], len(CFG.METHOD_LIST)))
for i, method in enumerate(CFG.METHOD_LIST):
    test_features[:, i] = test_df[f'{method}_pred_prob']

# 特徴量の標準化
scaler = StandardScaler()
oof_features_scaled = scaler.fit_transform(oof_features)
test_features_scaled = scaler.transform(test_features)

# ロジスティック回帰モデルをパラメータチューニング・学習
logistic = LogisticRegression()
param_grid = {'C': [1]}
grid_search = GridSearchCV(estimator=logistic, param_grid=param_grid, cv=5)
grid_search.fit(oof_features_scaled, train_df[CFG.target_col])
print('Best Parameter:',grid_search.best_params_)
print('Best Score:',grid_search.best_score_)
lr = LogisticRegression(C=grid_search.best_params_['C'])
lr.fit(oof_features_scaled, train_df[CFG.target_col])

# 最適な閾値とその時のF1スコアを探索する関数
def find_best_threshold_and_score(y_true, y_pred_proba):
    best_threshold = 0
    best_score = 0
    for threshold in np.linspace(0, 1, 1001):  # 0.001刻みで閾値を変更
        score = f1_score(y_true, y_pred_proba >= threshold, average='macro')
        if score > best_score:
            best_score = score
            best_threshold = threshold
    return best_threshold, best_score

# 学習データに対する予測確率
train_pred_proba = lr.predict_proba(oof_features_scaled)[:, 1]

# 最適な閾値とスコアを求める
best_threshold, best_score = find_best_threshold_and_score(train_df[CFG.target_col], train_pred_proba)
print(f'Best Threshold: {best_threshold}, Best F1 Score: {best_score}')

# テストデータに対する最終予測
test_pred_proba = lr.predict_proba(test_features_scaled)[:, 1]
test_final_predictions = (test_pred_proba >= best_threshold).astype(int)

# 最終予渲結果をコンペ提出用のフォーマットでCSVファイルに出力
submission_df = pd.DataFrame({'Id': test_df.index, 'target': test_final_predictions}).reset_index(drop=True)
# ここで、インデックスの開始が42307であるため、その値から始めるように調整
submission_df['Id'] = submission_df.index + 42307

submission_df.to_csv(f'stacking_lr_submission_best_score{best_score:.4f}_seed{CFG.seed}_ver{CFG.VER}_{CFG.AUTHOR}.csv', header=False, index=False)


Best Parameter: {'C': 1}
Best Score: 0.9044129346122439
Best Threshold: 0.774, Best F1 Score: 0.6883426774356559
