注意: SVM の学習には非常に時間がかかります（7-fold で約4時間など）。

In [73]:
import sys
# ====================================================
# Library
# ====================================================
import os
import gc
import warnings
warnings.filterwarnings('ignore')
import random
import scipy as sp
import numpy as np
import pandas as pd
from glob import glob
from pathlib import Path
import joblib
import pickle
import itertools
from tqdm.auto import tqdm
import torch
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GroupKFold
from sklearn.metrics import log_loss, roc_auc_score, matthews_corrcoef, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import xgboost as xgb
from catboost import Pool, CatBoostRegressor, CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

pd.set_option('display.max_columns',1000)
pd.set_option('display.max_rows',100)

# ====================================================
# Configurations
# ====================================================
class CFG:
    """Central configuration for the training / inference pipeline.

    Every setting is a plain class attribute and is read as ``CFG.<name>``;
    the class is never instantiated in this file.
    """
    # Experiment version; embedded in model, OOF and submission file names.
    VER = 14
    # Embedded in the submission file names.
    AUTHOR = 'Yuta.K'
    COMPETITION = 'FDUA2'
    # Directory settings (not referenced by the code visible in this file).
    DATA_PATH = Path('/data')
    OOF_DATA_PATH = Path('/oof')
    MODEL_DATA_PATH = Path('/models')
    SUB_DATA_PATH = Path('/submission')
    # Models that are trained, used for inference and blended, in this order.
    METHOD_LIST = ['adaboost', 'lightgbm', 'xgboost', 'catboost']
    # Single source of truth for every random seed below.
    seed = 42
    # Number of KFold splits (and of saved models per method).
    n_folds = 7
    target_col = 'MIS_Status'
    metric = 'f1_score'
    # True because the custom F1 metric is better when larger.
    metric_maximize_flag = True
    num_boost_round = 500
    early_stopping_round = 200
    # Logging period / verbosity handed to the boosting libraries.
    verbose = 25
    classification_lgb_params = {
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.05,
        'seed': seed,
    }
    classification_xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'learning_rate': 0.05,
        'random_state': seed,
    }

    classification_cat_params = {
        'learning_rate': 0.05,
        'iterations': num_boost_round,
        'random_seed': seed,
    }
    classification_adaboost_params = {
        'n_estimators': 100,
        'learning_rate': 1.0,
        # Tied to `seed` like the other models instead of a second literal 42.
        'random_state': seed,
    }
    # Parameters read by svm_training / sgd_training / randomforest_training.
    # Both SVC and SGDClassifier must be configured so that predict_proba is
    # available, because the training functions call it.
    classification_svm_params = {
        'probability': True,
        'random_state': seed,
    }
    classification_sgd_params = {
        'loss': 'modified_huber',
        'random_state': seed,
    }
    classification_randomforest_params = {
        'random_state': seed,
    }

    # Blend weights per method. They add up to 0.70 rather than 1.0, so the
    # blended value is a score, not a probability; the decision threshold is
    # tuned on the same weighted scale in Postprocessing.
    model_weight_dict = {'adaboost': 0.10, 'lightgbm': 0.25, 'xgboost': 0.10, 'catboost': 0.25}
    
# ====================================================
# Seed everything
# ====================================================
def seed_everything(seed):
    """Seed the global Python and NumPy RNGs and export PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to ``random`` and ``numpy.random``; its
            string form is written to the ``PYTHONHASHSEED`` environment
            variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)

seed_everything(CFG.seed)


# ====================================================
# Metric
# ====================================================
# f1_score

# ====================================================
# LightGBM Metric
# ====================================================
def lgb_metric(y_pred, y_true):
    """Custom LightGBM evaluation metric: macro-averaged F1 at a 0.5 cut-off.

    Args:
        y_pred: Prediction array handed over by LightGBM.
        y_true: Despite the name, an object exposing ``get_label()``
            (passed as ``feval`` to ``lgb.train`` in this file).

    Returns:
        Tuple ``(metric_name, value, is_higher_better)``.
    """
    labels = y_true.get_label()
    hard_labels = np.where(y_pred >= 0.5, 1, 0)
    macro_f1 = f1_score(labels, hard_labels, average='macro')
    return 'f1score', macro_f1, CFG.metric_maximize_flag

# ====================================================
# XGBoost Metric
# ====================================================
def xgb_metric(y_pred, y_true):
    """Custom XGBoost evaluation metric: macro-averaged F1 at a 0.5 cut-off.

    Args:
        y_pred: Prediction array handed over by XGBoost.
        y_true: Despite the name, an object exposing ``get_label()``
            (passed as ``feval`` to ``xgb.train`` in this file).

    Returns:
        Tuple ``(metric_name, value)``.
    """
    labels = y_true.get_label()
    hard_labels = np.where(y_pred >= 0.5, 1, 0)
    macro_f1 = f1_score(labels, hard_labels, average='macro')
    return 'f1score', macro_f1

In [74]:
# Load the data; the first CSV column becomes the DataFrame index.
train_df = pd.read_csv('train.csv', index_col=0)
test_df = pd.read_csv('test.csv', index_col=0)
# Columns that are label-encoded with an encoder fitted on the training data only
# (this name is re-assigned later, before the models are trained).
categorical_features = ['RevLineCr', 'LowDoc', 'UrbanRural', 'State', 'Sector']

In [75]:
# Definition of the preprocessing routine
def Preprocessing(input_df: pd.DataFrame) -> pd.DataFrame:
    """Clean the raw loan table and derive the model features.

    Steps, in order: fill missing values, parse the money columns to float,
    binarise ``NewExist`` and build date / macro / combination / indicator
    features. ``input_df`` itself is not modified.

    Args:
        input_df: Raw table. The code reads the columns RevLineCr, LowDoc,
            BankState, DisbursementDate, ApprovalDate, DisbursementGross,
            GrAppv, SBA_Appv, NewExist, State, Sector, City, ApprovalFY,
            Term, FranchiseCode and NoEmp.

    Returns:
        A new DataFrame holding the cleaned columns plus the derived features.
    """
    def deal_missing(input_df: pd.DataFrame) -> pd.DataFrame:
        """Fill missing categorical values with 'UNK' and missing dates with a sentinel."""
        df = input_df.copy()
        for col in ['RevLineCr', 'LowDoc', 'BankState']:
            df[col] = input_df[col].fillna('UNK')
        # The sentinel splits into day=50, month='NaN', year=50 further down.
        for col in ['DisbursementDate', 'ApprovalDate']:
            df[col] = input_df[col].fillna('50-NaN-50')
        return df

    def clean_money(input_df: pd.DataFrame) -> pd.DataFrame:
        """Turn money strings into floats: drop the first character, commas and spaces."""
        df = input_df.copy()
        for col in ['DisbursementGross', 'GrAppv', 'SBA_Appv']:
            df[col] = input_df[col].str[1:].str.replace(',', '').str.replace(' ', '').astype(float)
        return df

    def make_features(input_df: pd.DataFrame) -> pd.DataFrame:
        """Create the date, bankruptcy, combination and indicator features."""
        df = input_df.copy()

        # --- Date features: the date strings are split on '-' into day / month / year.
        df[['DisbursementDay', 'DisbursementMonth', 'DisbursementYear']] = df['DisbursementDate'].str.split('-', expand=True)
        df[['ApprovalDay', 'ApprovalMonth', 'ApprovalYear']] = df['ApprovalDate'].str.split('-', expand=True)
        for col in ['DisbursementDay', 'DisbursementYear', 'ApprovalDay', 'ApprovalYear']:
            df[col] = df[col].astype(int)
        # 'NaN' -> 50 encodes the missing-date sentinel.
        Month_dict = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8,
                      'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12, 'NaN': 50}
        df['DisbursementMonth'] = df['DisbursementMonth'].map(Month_dict)
        df['ApprovalMonth'] = df['ApprovalMonth'].map(Month_dict)
        # Rebuilt as year+month+day without separators, using the year BEFORE the shift below.
        # NOTE(review): without separators different dates can collapse into the same
        # string (e.g. 98/1/1 and 9/8/11 both give '9811') - confirm this is acceptable.
        df['DisbursementDate'] = (df['DisbursementYear'].astype(str)
                                  + df['DisbursementMonth'].astype(str)
                                  + df['DisbursementDay'].astype(str))
        # Two-digit years above 50 are shifted by -100 (98 -> -2); 50, the
        # missing-date sentinel, is left untouched.
        df['DisbursementYear'] = df['DisbursementYear'].apply(lambda x: x - 100 if x > 50 else x)
        df['ApprovalYear'] = df['ApprovalYear'].apply(lambda x: x - 100 if x > 50 else x)
        df['CompanyLong'] = df['DisbursementYear'] - df['ApprovalYear']

        # --- Yearly bankruptcy counts keyed by the shifted year.
        # The values for 74-80 are generated (derived from the unemployment rate), not real figures.
        bankruptcy_by_year = {
            -26: 32700, -25: 52200, -24: 46200, -23: 42300, -22: 36300, -21: 34200, -20: 46200,
            -19: 44000, -18: 48500, -17: 69800, -16: 62500, -15: 64500, -14: 72000, -13: 81500,
            -12: 83000, -11: 64500, -10: 65000, -9: 67000, -8: 71000, -7: 67000, -6: 58000,
            -5: 51000, -4: 52500, -3: 54000, -2: 51000, -1: 41000, 0: 37500, 1: 35992, 2: 39845,
            3: 37548, 4: 36785, 5: 31952, 6: 35292, 7: 21960, 8: 30741, 9: 49091, 10: 61148,
            11: 54212, 12: 46393, 13: 37552, 14: 31671, 15: 26130, 16: 24797, 17: 23591,
            18: 23106, 19: 22157,
        }
        # Replace each year's value by the mean of the following five years.
        # Years are processed in ascending order, so every window still holds raw
        # counts when it is read. The loop starts one year before the first key
        # and stops at year 13; years 14..19 keep their raw yearly count.
        first_year = min(bankruptcy_by_year)
        for offset in range(len(bankruptcy_by_year) - 5):
            year = first_year - 1 + offset
            window = [bankruptcy_by_year[year + 1 + step] for step in range(5)]
            bankruptcy_by_year[year] = sum(window) / 5
        # Value used for the missing-date sentinel year (50).
        bankruptcy_by_year[50] = bankruptcy_by_year[first_year] * 2

        df['Bankraptcy_By_Year'] = df['DisbursementYear'].map(bankruptcy_by_year)

        # --- Combination features (string concatenation of two columns).
        df['State_Sector'] = df['State'].astype(str) + '_' + df['Sector'].astype(str)
        # Geographic combination; cast to str like the other combinations so a
        # missing or non-string value cannot produce NaN or a TypeError.
        df['City_State'] = df['City'].astype(str) + '_' + df['State'].astype(str)
        # Temporal combination
        df['ApprovalFY_Term'] = df['ApprovalFY'].astype(str) + '_' + df['Term'].astype(str)
        df['FranchiseCode_ApprovalDate'] = df['FranchiseCode'].astype(str) + '_' + df['ApprovalDate'].astype(str)
        df['Term_NoEmp'] = df['Term'].astype(str) + '_' + df['NoEmp'].astype(str)
        df['City_BankState'] = df['City'].astype(str) + '_' + df['BankState'].astype(str)
        df['NoEmp_SBA_Appv'] = df['NoEmp'].astype(str) + '_' + df['SBA_Appv'].astype(str)

        # --- 0/1 indicator columns for each LowDoc / RevLineCr value.
        for value in ['Y', 'S', 'N', 'C', 'A', '0', 'UNK']:
            df[f'LowDoc_{value}'] = (df['LowDoc'] == value).astype(int)
        for value in ['Y', 'T', 'N', '0', 'UNK']:
            df[f'RevLineCr_{value}'] = (df['RevLineCr'] == value).astype(int)

        return df

    df = deal_missing(input_df)
    df = clean_money(df)
    # NewExist becomes 1 where the raw value equals 1 and 0 otherwise (including missing).
    df['NewExist'] = np.where(input_df['NewExist'] == 1, 1, 0)
    df = make_features(df)
    return df

In [76]:
# Run the preprocessing on both tables (each call returns a new DataFrame)
train_df = Preprocessing(train_df)
test_df = Preprocessing(test_df)

（以下はPreprocessingに本来組み込むべきだが，コードが煩雑になるので，いったん切り出している．）

In [77]:
# Label encoding
# First group: the encoder is fitted on the training data only, so transforming the
# test data raises if it contains a value that never occurs in train.
for col in categorical_features :
    le = LabelEncoder()
    le.fit(train_df[col])
    train_df[col] = le.transform(train_df[col])
    test_df[col] = le.transform(test_df[col])
    
# Second group: these columns are encoded with an encoder fitted on train and test
# together (see the loop below).
categorical_features_unlabelable = ['ApprovalFY_Term','City_State','City','ApprovalDate','BankState','DisbursementDate','State_Sector',
                                   'FranchiseCode_ApprovalDate','Term_NoEmp','City_BankState','NoEmp_SBA_Appv']
# The triple-quoted string below is an inactive earlier variant (a no-op expression):
# it mapped unseen test values to len(le.classes_).
'''
for col in categorical_features_unlabelable:
    le = LabelEncoder()   
    le.fit(train_df[col])
    train_df[col] = le.transform(train_df[col])
    test_df[col] = test_df[col].apply(lambda x: le.transform([x])[0] if x in le.classes_ else len(le.classes_))
'''
# Active variant: fit on the concatenation of train and test so every test value is known.
for col in categorical_features_unlabelable:
    encoder = LabelEncoder()
    combined = pd.concat([train_df[col], test_df[col]], axis=0)
    encoder.fit(combined)
    train_df[col] = encoder.transform(train_df[col])
    test_df[col] = encoder.transform(test_df[col])

In [78]:
# Build the feature lists
# Columns handed to LightGBM / CatBoost as categorical features
# ('DisbursementDate' is label-encoded above but is not listed here).
categorical_features = ['State', 'Sector','RevLineCr_Y', 'RevLineCr_T', 'RevLineCr_N', 'RevLineCr_0', 'RevLineCr_UNK',
                       'LowDoc_Y', 'LowDoc_S', 'LowDoc_N', 'LowDoc_A', 'LowDoc_C', 'LowDoc_0', 'LowDoc_UNK',
                       'ApprovalFY_Term','City_State','City','ApprovalDate','BankState','State_Sector','UrbanRural',
                        'FranchiseCode_ApprovalDate','Term_NoEmp','City_BankState','NoEmp_SBA_Appv']


# Columns excluded from the model inputs: the target and ApprovalYear.
RemoveList=['MIS_Status','ApprovalYear']
# Every remaining column of train_df is used as a feature.
features = train_df.columns.tolist()
for i in RemoveList:
    print(i)
    # list.remove raises ValueError if the column is absent.
    features.remove(i)

MIS_Status
ApprovalYear


In [79]:
# AdaBoost training
def adaboost_training(
    x_train: pd.DataFrame,
    y_train: pd.DataFrame,
    x_valid: pd.DataFrame,
    y_valid: pd.DataFrame,
    features: list,
    categorical_features: list,
):
    """Fit an AdaBoost classifier on one fold and score its validation part.

    The classifier is built from ``CFG.classification_adaboost_params`` and
    fitted on every column of ``x_train``. ``y_valid``, ``features`` and
    ``categorical_features`` are accepted for a uniform trainer signature but
    are not used here.

    Returns:
        ``(fitted_model, positive_class_probabilities_for_x_valid)``.
    """
    classifier = AdaBoostClassifier(**CFG.classification_adaboost_params)
    classifier.fit(x_train, y_train)
    positive_proba = classifier.predict_proba(x_valid)[:, 1]
    return classifier, positive_proba

# SVM training
def svm_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame, features: list, categorical_features: list):
    """Fit an SVC on one fold and score its validation part.

    The classifier is built from ``CFG.classification_svm_params`` and fitted
    on ``x_train[features]``. ``y_valid`` and ``categorical_features`` are
    accepted for a uniform trainer signature but are not used here.

    Returns:
        ``(fitted_model, positive_class_probabilities_for_x_valid)``.
    """
    # SVC only offers predict_proba when it is constructed with probability=True,
    # so enable it unless the configuration sets the flag explicitly.
    svm_params = dict(CFG.classification_svm_params)
    svm_params.setdefault('probability', True)
    model = SVC(**svm_params)  # initialise the SVM model
    model.fit(x_train[features], y_train)  # train the model
    valid_pred = model.predict_proba(x_valid[features])[:, 1]  # positive-class probability
    return model, valid_pred

# SGDClassifier training
def sgd_training(
    x_train: pd.DataFrame,
    y_train: pd.DataFrame,
    x_valid: pd.DataFrame,
    y_valid: pd.DataFrame,
    features: list,
    categorical_features: list,
):
    """Fit an SGDClassifier on one fold and score its validation part.

    The classifier is built from ``CFG.classification_sgd_params`` and fitted
    on ``x_train[features]``. ``y_valid`` and ``categorical_features`` are
    accepted for a uniform trainer signature but are not used here.

    Returns:
        ``(fitted_model, positive_class_probabilities_for_x_valid)``.
    """
    # NOTE(review): predict_proba is only offered for some SGDClassifier losses -
    # confirm the configured loss supports it.
    classifier = SGDClassifier(**CFG.classification_sgd_params)
    train_matrix = x_train[features]
    valid_matrix = x_valid[features]
    classifier.fit(train_matrix, y_train)
    positive_proba = classifier.predict_proba(valid_matrix)[:, 1]
    return classifier, positive_proba

# RandomForest training
def randomforest_training(
    x_train: pd.DataFrame,
    y_train: pd.DataFrame,
    x_valid: pd.DataFrame,
    y_valid: pd.DataFrame,
    features: list,
    categorical_features: list,
):
    """Fit a random forest on one fold and score its validation part.

    The classifier is built from ``CFG.classification_randomforest_params``
    and fitted on ``x_train[features]``. ``y_valid`` and
    ``categorical_features`` are accepted for a uniform trainer signature but
    are not used here.

    Returns:
        ``(fitted_model, positive_class_probabilities_for_x_valid)``.
    """
    forest = RandomForestClassifier(**CFG.classification_randomforest_params)
    train_matrix = x_train[features]
    valid_matrix = x_valid[features]
    forest.fit(train_matrix, y_train)
    positive_proba = forest.predict_proba(valid_matrix)[:, 1]
    return forest, positive_proba
    
# Training routine for LightGBM
def lightgbm_training(
    x_train: pd.DataFrame,
    y_train: pd.DataFrame,
    x_valid: pd.DataFrame,
    y_valid: pd.DataFrame,
    features: list,
    categorical_features: list,
):
    """Train a LightGBM booster on one fold with early stopping.

    Both the training and the validation set are evaluated each round, with
    ``lgb_metric`` as an additional custom metric. ``features`` is accepted
    for a uniform trainer signature but is not used here.

    Returns:
        ``(booster, predictions_for_x_valid)``.
    """
    train_set = lgb.Dataset(x_train, y_train, categorical_feature=categorical_features)
    valid_set = lgb.Dataset(x_valid, y_valid, categorical_feature=categorical_features)
    early_stopping = lgb.early_stopping(
        stopping_rounds=CFG.early_stopping_round,
        verbose=CFG.verbose,
    )
    booster = lgb.train(
        params=CFG.classification_lgb_params,
        train_set=train_set,
        num_boost_round=CFG.num_boost_round,
        valid_sets=[train_set, valid_set],
        feval=lgb_metric,
        callbacks=[early_stopping],
    )
    # Score the validation fold
    return booster, booster.predict(x_valid)

# Training routine for XGBoost
def xgboost_training(
    x_train: pd.DataFrame,
    y_train: pd.DataFrame,
    x_valid: pd.DataFrame,
    y_valid: pd.DataFrame,
    features: list,
    categorical_features: list,
):
    """Train an XGBoost booster on one fold with early stopping.

    The train and validation matrices are both evaluated every round
    (reported as 'train' and 'eval'), with ``xgb_metric`` as custom metric.
    ``features`` and ``categorical_features`` are accepted for a uniform
    trainer signature but are not used here.

    Returns:
        ``(booster, predictions_for_x_valid)``.
    """
    train_matrix = xgb.DMatrix(data=x_train, label=y_train)
    valid_matrix = xgb.DMatrix(data=x_valid, label=y_valid)
    watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]
    booster = xgb.train(
        CFG.classification_xgb_params,
        dtrain=train_matrix,
        num_boost_round=CFG.num_boost_round,
        evals=watchlist,
        early_stopping_rounds=CFG.early_stopping_round,
        verbose_eval=CFG.verbose,
        feval=xgb_metric,
        maximize=CFG.metric_maximize_flag,
    )
    # Score the validation fold on a label-free matrix
    scoring_matrix = xgb.DMatrix(x_valid)
    return booster, booster.predict(scoring_matrix)

# Training routine for CatBoost
def catboost_training(
    x_train: pd.DataFrame,
    y_train: pd.DataFrame,
    x_valid: pd.DataFrame,
    y_valid: pd.DataFrame,
    features: list,
    categorical_features: list,
):
    """Train a CatBoost classifier on one fold with early stopping.

    The validation pool is used as evaluation set and the best model is kept
    (``use_best_model=True``). ``features`` is accepted for a uniform trainer
    signature but is not used here.

    Returns:
        ``(fitted_model, positive_class_probabilities_for_x_valid)``.
    """
    train_pool = Pool(data=x_train, label=y_train, cat_features=categorical_features)
    valid_pool = Pool(data=x_valid, label=y_valid, cat_features=categorical_features)
    classifier = CatBoostClassifier(**CFG.classification_cat_params)
    classifier.fit(
        train_pool,
        eval_set=[valid_pool],
        early_stopping_rounds=CFG.early_stopping_round,
        verbose=CFG.verbose,
        use_best_model=True,
    )
    # Score the validation fold
    positive_proba = classifier.predict_proba(x_valid)[:, 1]
    return classifier, positive_proba

    
                    
# Cross-validated training for any supported model
def gradient_boosting_model_cv_training(method: str, train_df: pd.DataFrame, features: list, categorical_features: list):
    """Train ``method`` with K-fold cross-validation and persist models and OOF predictions.

    For each of the ``CFG.n_folds`` shuffled folds one model is trained and
    pickled to ``{method}_fold{k}_seed{seed}_ver{VER}.pkl`` in the working
    directory. Afterwards the out-of-fold (OOF) macro F1 at a 0.5 cut-off is
    printed and the OOF predictions are written to
    ``oof_{method}_seed{seed}_ver{VER}.csv``.

    Args:
        method: Name of the model; one of the keys of the trainer table below.
        train_df: Training table containing ``features`` and ``CFG.target_col``.
        features: Columns used as model inputs.
        categorical_features: Columns passed on to the trainers as categorical.

    Raises:
        ValueError: If ``method`` is not a supported model name.
    """
    trainers = {
        'adaboost': adaboost_training,
        'lightgbm': lightgbm_training,
        'xgboost': xgboost_training,
        'catboost': catboost_training,
        'svm': svm_training,
        'sgd': sgd_training,
        'randomforest': randomforest_training,
    }
    # Fail fast instead of pickling a None "model" and writing meaningless OOF values.
    if method not in trainers:
        raise ValueError(f'Unknown method {method!r}; expected one of {sorted(trainers)}')
    train_one_fold = trainers[method]

    # Arrays holding the out-of-fold predictions and the fold each row was validated in
    oof_predictions = np.zeros(len(train_df))
    oof_fold = np.zeros(len(train_df))
    kfold = KFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)
    for fold, (train_index, valid_index) in enumerate(kfold.split(train_df)):
        print('-'*50)
        print(f'{method} training fold {fold+1}')

        x_train = train_df[features].iloc[train_index]
        y_train = train_df[CFG.target_col].iloc[train_index]
        x_valid = train_df[features].iloc[valid_index]
        y_valid = train_df[CFG.target_col].iloc[valid_index]

        model, valid_pred = train_one_fold(x_train, y_train, x_valid, y_valid, features, categorical_features)

        # Save the fold model; the context manager closes the file handle.
        with open(f'{method}_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl', 'wb') as model_file:
            pickle.dump(model, model_file)
        # Store the fold's predictions at the positions of its validation rows
        oof_predictions[valid_index] = valid_pred
        oof_fold[valid_index] = fold + 1
        # Release the per-fold objects before the next fold is built
        del x_train, x_valid, y_train, y_valid, model, valid_pred
        gc.collect()

    # Out-of-fold metric at a fixed 0.5 cut-off
    score = f1_score(train_df[CFG.target_col], oof_predictions >= 0.5, average='macro')
    print(f'{method} our out of folds CV f1score is {score}')
    # Persist the out-of-fold predictions in training-row order
    oof_df = pd.DataFrame({CFG.target_col: train_df[CFG.target_col], f'{method}_prediction': oof_predictions, 'fold': oof_fold})
    oof_df.to_csv(f'oof_{method}_seed{CFG.seed}_ver{CFG.VER}.csv', index = False)
# Definition of the training entry point
def Learning(input_df: pd.DataFrame, features: list, categorical_features: list):
    """Run cross-validated training once per entry of ``CFG.METHOD_LIST``, in list order."""
    for method_name in CFG.METHOD_LIST:
        gradient_boosting_model_cv_training(
            method_name,
            input_df,
            features,
            categorical_features,
        )

In [80]:
# Run cross-validated training for every method in CFG.METHOD_LIST
Learning(train_df, features, categorical_features)

--------------------------------------------------
adaboost training fold 1
--------------------------------------------------
adaboost training fold 2
--------------------------------------------------
adaboost training fold 3
--------------------------------------------------
adaboost training fold 4
--------------------------------------------------
adaboost training fold 5
--------------------------------------------------
adaboost training fold 6
--------------------------------------------------
adaboost training fold 7
adaboost our out of folds CV f1score is 0.6391065744612927
--------------------------------------------------
lightgbm training fold 1
[LightGBM] [Info] Number of positive: 32336, number of negative: 3927
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.123678 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 19034
[L

[50]	train-logloss:0.27675	train-f1score:0.67204	eval-logloss:0.28694	eval-f1score:0.63579
[75]	train-logloss:0.25872	train-f1score:0.67967	eval-logloss:0.27890	eval-f1score:0.63887
[100]	train-logloss:0.24815	train-f1score:0.68899	eval-logloss:0.27688	eval-f1score:0.64724
[125]	train-logloss:0.24082	train-f1score:0.69676	eval-logloss:0.27607	eval-f1score:0.64973
[150]	train-logloss:0.23532	train-f1score:0.70519	eval-logloss:0.27602	eval-f1score:0.65144
[175]	train-logloss:0.23110	train-f1score:0.70842	eval-logloss:0.27584	eval-f1score:0.65216
[200]	train-logloss:0.22724	train-f1score:0.71417	eval-logloss:0.27589	eval-f1score:0.65408
[225]	train-logloss:0.22435	train-f1score:0.71854	eval-logloss:0.27623	eval-f1score:0.65237
[250]	train-logloss:0.22075	train-f1score:0.72378	eval-logloss:0.27626	eval-f1score:0.65479
[275]	train-logloss:0.21616	train-f1score:0.72915	eval-logloss:0.27629	eval-f1score:0.65355
[300]	train-logloss:0.21303	train-f1score:0.73595	eval-logloss:0.27657	eval-f1scor

[200]	train-logloss:0.22566	train-f1score:0.71188	eval-logloss:0.28733	eval-f1score:0.67771
[215]	train-logloss:0.22371	train-f1score:0.71422	eval-logloss:0.28750	eval-f1score:0.67853
--------------------------------------------------
xgboost training fold 7
[0]	train-logloss:0.65982	train-f1score:0.09712	eval-logloss:0.66009	eval-f1score:0.09563
[25]	train-logloss:0.33341	train-f1score:0.67325	eval-logloss:0.34086	eval-f1score:0.66005
[50]	train-logloss:0.27619	train-f1score:0.67256	eval-logloss:0.29152	eval-f1score:0.65862
[75]	train-logloss:0.25776	train-f1score:0.68215	eval-logloss:0.28280	eval-f1score:0.66119
[100]	train-logloss:0.24749	train-f1score:0.69120	eval-logloss:0.28051	eval-f1score:0.66298
[125]	train-logloss:0.24069	train-f1score:0.69848	eval-logloss:0.28044	eval-f1score:0.65900
[150]	train-logloss:0.23421	train-f1score:0.70508	eval-logloss:0.28031	eval-f1score:0.66246
[175]	train-logloss:0.22892	train-f1score:0.71179	eval-logloss:0.28069	eval-f1score:0.66213
[200]	trai

499:	learn: 0.2486217	test: 0.2752142	best: 0.2748466 (450)	total: 2m 39s	remaining: 0us

bestTest = 0.2748466439
bestIteration = 450

Shrink model to first 451 iterations.
--------------------------------------------------
catboost training fold 4
0:	learn: 0.6437064	test: 0.6430300	best: 0.6430300 (0)	total: 254ms	remaining: 2m 6s
25:	learn: 0.3090644	test: 0.3016502	best: 0.3016502 (25)	total: 6.92s	remaining: 2m 6s
50:	learn: 0.2818792	test: 0.2749468	best: 0.2749468 (50)	total: 13.5s	remaining: 1m 58s
75:	learn: 0.2757980	test: 0.2704123	best: 0.2704123 (75)	total: 19.8s	remaining: 1m 50s
100:	learn: 0.2726098	test: 0.2690655	best: 0.2690655 (100)	total: 26.4s	remaining: 1m 44s
125:	learn: 0.2707361	test: 0.2684003	best: 0.2683836 (124)	total: 32.3s	remaining: 1m 35s
150:	learn: 0.2691604	test: 0.2680466	best: 0.2680458 (149)	total: 39s	remaining: 1m 30s
175:	learn: 0.2673552	test: 0.2676483	best: 0.2676483 (175)	total: 45.9s	remaining: 1m 24s
200:	learn: 0.2658064	test: 0.2673558

Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.2761682807
bestIteration = 288

Shrink model to first 289 iterations.
catboost our out of folds CV f1score is 0.6442945431385723


In [81]:
def adaboost_inference(x_test):
    """Average the positive-class probability of the per-fold AdaBoost models.

    Loads ``adaboost_fold{k}_seed{CFG.seed}_ver{CFG.VER}.pkl`` for
    k = 1..CFG.n_folds from the working directory.

    Returns:
        1-D array with one averaged prediction per row of ``x_test``.
    """
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model_path = f'adaboost_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl'
        # NOTE: unpickling can execute arbitrary code; only load files written by this pipeline.
        with open(model_path, 'rb') as model_file:  # closes the handle deterministically
            model = pickle.load(model_file)
        test_pred += model.predict_proba(x_test)[:, 1]
    return test_pred / CFG.n_folds

def svm_inference(x_test: pd.DataFrame):
    """Average the positive-class probability of the per-fold SVM models.

    Loads ``svm_fold{k}_seed{CFG.seed}_ver{CFG.VER}.pkl`` for
    k = 1..CFG.n_folds from the working directory.

    Returns:
        1-D array with one averaged prediction per row of ``x_test``.
    """
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model_path = f'svm_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl'
        # NOTE: unpickling can execute arbitrary code; only load files written by this pipeline.
        with open(model_path, 'rb') as model_file:  # closes the handle deterministically
            model = pickle.load(model_file)
        test_pred += model.predict_proba(x_test)[:, 1]
    return test_pred / CFG.n_folds

def randomforest_inference(x_test):
    """Average the positive-class probability of the per-fold random forest models.

    Loads ``randomforest_fold{k}_seed{CFG.seed}_ver{CFG.VER}.pkl`` for
    k = 1..CFG.n_folds from the working directory.

    Returns:
        1-D array with one averaged prediction per row of ``x_test``.
    """
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model_path = f'randomforest_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl'
        # Load the fold model.
        # NOTE: unpickling can execute arbitrary code; only load files written by this pipeline.
        with open(model_path, 'rb') as model_file:  # closes the handle deterministically
            model = pickle.load(model_file)
        # Predict
        test_pred += model.predict_proba(x_test)[:, 1]
    return test_pred / CFG.n_folds

def sgd_inference(x_test):
    """Average the positive-class probability of the per-fold SGD models.

    Loads ``sgd_fold{k}_seed{CFG.seed}_ver{CFG.VER}.pkl`` for
    k = 1..CFG.n_folds from the working directory.

    Returns:
        1-D array with one averaged prediction per row of ``x_test``.
    """
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model_path = f'sgd_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl'
        # Load the fold model.
        # NOTE: unpickling can execute arbitrary code; only load files written by this pipeline.
        with open(model_path, 'rb') as model_file:  # closes the handle deterministically
            model = pickle.load(model_file)
        # Use predict_proba, exactly as sgd_training does for the out-of-fold
        # predictions. decision_function returns unbounded margins, which would put
        # the test predictions on a different scale than the OOF predictions they
        # are blended and thresholded with. Any model saved by sgd_training supports
        # predict_proba, because training already calls it.
        test_pred += model.predict_proba(x_test)[:, 1]
    return test_pred / CFG.n_folds

def lightgbm_inference(x_test: pd.DataFrame):
    """Average the predictions of the per-fold LightGBM boosters.

    Loads ``lightgbm_fold{k}_seed{CFG.seed}_ver{CFG.VER}.pkl`` for
    k = 1..CFG.n_folds from the working directory.

    Returns:
        1-D array with one averaged prediction per row of ``x_test``.
    """
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model_path = f'lightgbm_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl'
        # NOTE: unpickling can execute arbitrary code; only load files written by this pipeline.
        with open(model_path, 'rb') as model_file:  # closes the handle deterministically
            model = pickle.load(model_file)
        test_pred += model.predict(x_test)
    return test_pred / CFG.n_folds

def xgboost_inference(x_test: pd.DataFrame):
    """Average the predictions of the per-fold XGBoost boosters.

    Loads ``xgboost_fold{k}_seed{CFG.seed}_ver{CFG.VER}.pkl`` for
    k = 1..CFG.n_folds from the working directory.

    Returns:
        1-D array with one averaged prediction per row of ``x_test``.
    """
    test_pred = np.zeros(len(x_test))
    # The test matrix is the same for every fold, so build it once.
    test_matrix = xgb.DMatrix(x_test)
    for fold in range(CFG.n_folds):
        model_path = f'xgboost_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl'
        # NOTE: unpickling can execute arbitrary code; only load files written by this pipeline.
        with open(model_path, 'rb') as model_file:  # closes the handle deterministically
            model = pickle.load(model_file)
        test_pred += model.predict(test_matrix)
    return test_pred / CFG.n_folds

def catboost_inference(x_test: pd.DataFrame):
    """Average the positive-class probability of the per-fold CatBoost models.

    Loads ``catboost_fold{k}_seed{CFG.seed}_ver{CFG.VER}.pkl`` for
    k = 1..CFG.n_folds from the working directory.

    Returns:
        1-D array with one averaged prediction per row of ``x_test``.
    """
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model_path = f'catboost_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl'
        # NOTE: unpickling can execute arbitrary code; only load files written by this pipeline.
        with open(model_path, 'rb') as model_file:  # closes the handle deterministically
            model = pickle.load(model_file)
        test_pred += model.predict_proba(x_test)[:, 1]
    return test_pred / CFG.n_folds

def gradient_boosting_model_inference(method: str, test_df: pd.DataFrame, features: list, categorical_features: list):
    """Return the fold-averaged test predictions of ``method``.

    Args:
        method: Name of the model; one of the keys of the inference table below.
        test_df: Test table; only the ``features`` columns are passed to the model.
        features: Columns used as model inputs.
        categorical_features: Accepted for signature symmetry with training; not used here.

    Returns:
        1-D array with one prediction per row of ``test_df``.

    Raises:
        ValueError: If ``method`` is not a supported model name.
    """
    inference_functions = {
        'adaboost': adaboost_inference,
        'lightgbm': lightgbm_inference,
        'xgboost': xgboost_inference,
        'catboost': catboost_inference,
        'svm': svm_inference,
        'sgd': sgd_inference,
        'randomforest': randomforest_inference,
    }
    # An explicit error replaces the UnboundLocalError an unknown name would otherwise cause.
    if method not in inference_functions:
        raise ValueError(f'Unknown method {method!r}; expected one of {sorted(inference_functions)}')
    x_test = test_df[features]
    return inference_functions[method](x_test)

def Predicting(input_df: pd.DataFrame, features: list, categorical_features: list):
    """Add per-model and blended test predictions to a copy of ``input_df``.

    For every method in ``CFG.METHOD_LIST`` a ``{method}_pred_prob`` column is
    added, and ``pred_prob`` accumulates those columns weighted by
    ``CFG.model_weight_dict``.

    Returns:
        The copy of ``input_df`` with the prediction columns added.
    """
    output_df = input_df.copy()
    output_df['pred_prob'] = 0
    for method in CFG.METHOD_LIST:
        method_column = f'{method}_pred_prob'
        weight = CFG.model_weight_dict[method]
        output_df[method_column] = gradient_boosting_model_inference(
            method, input_df, features, categorical_features
        )
        output_df['pred_prob'] += weight * output_df[method_column]
    return output_df

In [82]:
# Add the per-model and blended test predictions as new columns of test_df
test_df = Predicting(test_df, features, categorical_features)

In [83]:
# Definition of the post-processing step
def Postprocessing(train_df: pd.DataFrame, test_df: pd.DataFrame) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """Tune the decision threshold on the blended OOF predictions and label the test set.

    The out-of-fold (OOF) predictions of every method in ``CFG.METHOD_LIST``
    are read from ``oof_{method}_seed{seed}_ver{VER}.csv`` and blended with
    ``CFG.model_weight_dict``. The threshold in {0.000, 0.001, ..., 0.999}
    with the highest macro F1 on the training target is then applied to
    ``test_df['pred_prob']``.

    Both frames are modified in place (``train_df['pred_prob']`` and
    ``test_df['target']`` are written) and also returned.

    Args:
        train_df: Training table containing ``CFG.target_col``; its rows must
            be in the same order as the rows of the OOF files.
        test_df: Test table containing a ``pred_prob`` column.

    Returns:
        ``(train_df, test_df)``.
    """
    # The target comes from train_df itself rather than from whichever OOF file
    # happened to be read last.
    y_true = train_df[CFG.target_col].to_numpy()
    # The OOF files are written without an index, in training-row order, so they
    # are combined by position; label-based alignment would depend on train_df's index.
    blended_oof = np.zeros(len(train_df))
    for method in CFG.METHOD_LIST:
        oof_df = pd.read_csv(f'oof_{method}_seed{CFG.seed}_ver{CFG.VER}.csv')
        blended_oof += CFG.model_weight_dict[method] * oof_df[f'{method}_prediction'].to_numpy()
    train_df['pred_prob'] = blended_oof

    best_score = 0
    best_v = 0
    for v in tqdm(np.arange(1000) / 1000):
        score = f1_score(y_true, blended_oof >= v, average='macro')
        # Strict '>' keeps the smallest threshold among ties.
        if score > best_score:
            best_score = score
            best_v = v
    print(best_score, best_v)
    test_df['target'] = np.where(test_df['pred_prob'] >= best_v, 1, 0)
    return train_df, test_df

In [84]:
# Post-processing: tune the threshold on the OOF predictions and label the test set
train_df, test_df = Postprocessing(train_df, test_df)

  0%|          | 0/1000 [00:00<?, ?it/s]

0.6901892477685019 0.517


In [90]:
# Write the blended-model submission: the test index and the 0/1 target, without a header row
test_df[['target']].to_csv(f'seed{CFG.seed}_ver{CFG.VER}_{CFG.AUTHOR}_submission.csv', header=False)

以下、スタッキング   
oofのcsvを一度作ってしまえばこれ以下のセルを動かすだけでいい   
重み付けは一番上のリストの要素を変えることで可能

In [86]:
# For reference: CFG.METHOD_LIST = [ 'randomforest', 'adaboost','lightgbm', 'xgboost', 'catboost']
# Try different variations of the list below (repeating a method adds its column again)
method_list_adopted =['adaboost','lightgbm', 'xgboost', 'catboost', 'catboost', 'catboost']

# Build the stacking features from the OOF predictions (one column per list entry)
oof_features = np.zeros((train_df.shape[0], len(method_list_adopted)))
for i, method in enumerate(method_list_adopted):
    oof_df = pd.read_csv(f'oof_{method}_seed{CFG.seed}_ver{CFG.VER}.csv')
    oof_features[:, i] = oof_df[f'{method}_prediction']

# Build the matching features from the test predictions
test_features = np.zeros((test_df.shape[0], len(method_list_adopted)))
for i, method in enumerate(method_list_adopted):
    test_features[:, i] = test_df[f'{method}_pred_prob']

# Standardise the features (scaler fitted on the OOF features only)
scaler = StandardScaler()
oof_features_scaled = scaler.fit_transform(oof_features)
test_features_scaled = scaler.transform(test_features)

# Fit the logistic-regression stacker
lr = LogisticRegression()
lr.fit(oof_features_scaled, train_df[CFG.target_col])

# Search for the best threshold and the F1 score it achieves
def find_best_threshold_and_score(y_true, y_pred_proba):
    """Return ``(threshold, macro_f1)`` for the best cut-off on a 0.001 grid over [0, 1].

    Ties keep the smallest threshold; ``(0, 0)`` is returned when no
    threshold scores above zero.
    """
    winner = (0, 0)  # (threshold, score)
    for candidate in np.linspace(0, 1, 1001):  # step the threshold by 0.001
        candidate_score = f1_score(y_true, y_pred_proba >= candidate, average='macro')
        if candidate_score > winner[1]:
            winner = (candidate, candidate_score)
    return winner

# Predicted probabilities of the logistic-regression stacker on its own training features
train_pred_proba_lr = lr.predict_proba(oof_features_scaled)[:, 1]

# Find the best threshold and its score
best_threshold_lr, best_score_lr = find_best_threshold_and_score(train_df[CFG.target_col], train_pred_proba_lr)
print(f'LR Best Threshold: {best_threshold_lr}, Best F1 Score: {best_score_lr}')


LR Best Threshold: 0.8, Best F1 Score: 0.6891924429158547


In [89]:
# For reference: CFG.METHOD_LIST = ['sgd', 'randomforest', 'adaboost','lightgbm', 'xgboost', 'catboost', 'svm']
# Try different variations of the list below
method_list_adopted = ['catboost','lightgbm','adaboost', 'xgboost']

# Build the stacking features from the OOF predictions (one column per list entry)
oof_features = np.zeros((train_df.shape[0], len(method_list_adopted)))
for i, method in enumerate(method_list_adopted):
    oof_df = pd.read_csv(f'oof_{method}_seed{CFG.seed}_ver{CFG.VER}.csv')
    oof_features[:, i] = oof_df[f'{method}_prediction']

# Build the matching features from the test predictions
test_features = np.zeros((test_df.shape[0], len(method_list_adopted)))
for i, method in enumerate(method_list_adopted):
    test_features[:, i] = test_df[f'{method}_pred_prob']

def create_interaction_features(features):
    """Return the pairwise products of the columns of a 2-D array.

    Args:
        features: 2-D NumPy array of shape ``(n_rows, n_features)``.

    Returns:
        Array of shape ``(n_rows, n_features * (n_features - 1) / 2)`` whose
        columns are ``features[:, i] * features[:, j]`` for every pair
        ``i < j``, ordered (0, 1), (0, 2), ..., (1, 2), ... . With fewer than
        two input columns the result has zero columns, so it can still be
        stacked next to ``features``.
    """
    n_features = features.shape[1]
    products = [
        features[:, i] * features[:, j]
        for i, j in itertools.combinations(range(n_features), 2)
    ]
    # np.column_stack raises on an empty list, so return an explicit (n_rows, 0) array.
    if not products:
        return np.empty((features.shape[0], 0), dtype=features.dtype)
    return np.column_stack(products)

# Interaction features (pairwise products) of the OOF predictions
oof_interaction_features = create_interaction_features(oof_features)

# Interaction features (pairwise products) of the test predictions
test_interaction_features = create_interaction_features(test_features)

# Concatenate the original features and the interaction features column-wise
oof_combined_features = np.hstack([oof_features, oof_interaction_features])
test_combined_features = np.hstack([test_features, test_interaction_features])

# Standardise the features (scaler fitted on the OOF features only)
scaler = StandardScaler()
oof_combined_features_scaled = scaler.fit_transform(oof_combined_features)
test_combined_features_scaled = scaler.transform(test_combined_features)

# Fit the logistic-regression stacker (this re-binds `lr` to the interaction-feature model)
lr = LogisticRegression()
lr.fit(oof_combined_features_scaled, train_df[CFG.target_col])


# Search for the best threshold and the F1 score it achieves
def find_best_threshold_and_score(y_true, y_pred_proba):
    """Return ``(threshold, macro_f1)`` for the best cut-off on a 0.001 grid over [0, 1].

    Ties keep the smallest threshold; ``(0, 0)`` is returned when no
    threshold scores above zero.
    """
    winner = (0, 0)  # (threshold, score)
    for candidate in np.linspace(0, 1, 1001):  # step the threshold by 0.001
        candidate_score = f1_score(y_true, y_pred_proba >= candidate, average='macro')
        if candidate_score > winner[1]:
            winner = (candidate, candidate_score)
    return winner

# Predicted probabilities of the logistic-regression stacker on its own training features
train_pred_proba_lr = lr.predict_proba(oof_combined_features_scaled)[:, 1]

# Find the best threshold and its score
best_threshold_lr, best_score_lr = find_best_threshold_and_score(train_df[CFG.target_col], train_pred_proba_lr)
print(f'LR Best Threshold: {best_threshold_lr}, Best F1 Score: {best_score_lr}')


LR Best Threshold: 0.75, Best F1 Score: 0.6896358223103198


In [87]:
# Final prediction on the test data.
# Reading this file top to bottom, `lr`, `best_threshold_lr` and `best_score_lr` were
# last assigned by the interaction-feature stacking cell, where `lr` is fitted on
# `oof_combined_features_scaled`. The test matrix must therefore be the matching
# `test_combined_features_scaled`; `test_features_scaled` belongs to the earlier plain
# stacker and has a different number of columns.
test_pred_proba_lr = lr.predict_proba(test_combined_features_scaled)[:, 1]
test_final_predictions_lr = (test_pred_proba_lr >= best_threshold_lr).astype(int)
# Write the final predictions to a CSV file in the competition's submission format
submission_df_lr = pd.DataFrame({'Id': test_df.index, 'target': test_final_predictions_lr}).reset_index(drop=True)
# NOTE(review): the Id taken from test_df.index is overwritten with consecutive ids
# starting at 42307 - confirm this equals the test index (the blended submission
# above writes test_df's index directly).
submission_df_lr['Id'] = submission_df_lr.index + 42307
submission_df_lr.to_csv(f'stacking_lr_submission_best_score{best_score_lr:.4f}_seed{CFG.seed}_ver{CFG.VER}_{CFG.AUTHOR}.csv', header=False, index=False)

In [88]:
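# Observations from a few quick experiments:
# Interestingly, with 7 entries in the list the score is the same regardless of the combination.
# Is this 7 related to n_folds = 7?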

# 0.6883853294761878 ['adaboost','lightgbm', 'xgboost', 'catboost', 'catboost', 'catboost', 'catboost']
# 0.688452930421711  ['adaboost','lightgbm', 'xgboost', 'catboost', 'catboost', 'catboost']
# 0.6883853294761878 ['adaboost','lightgbm', 'xgboost', 'catboost', 'catboost']
# 0.6883515386513768 ['adaboost','lightgbm', 'xgboost', 'catboost']
# 0.6883853294761878 ['adaboost','lightgbm','lightgbm', 'xgboost', 'catboost', 'catboost', 'catboost']
# 0.6882502047393781 ['adaboost','lightgbm','lightgbm', 'xgboost', 'catboost', 'catboost']
# 0.6884191267321496 ['adaboost','adaboost','lightgbm', 'xgboost', 'catboost', 'catboost']
# 0.6883853294761878 ['adaboost','adaboost','lightgbm', 'xgboost', 'catboost', 'catboost', 'catboost']
# 0.6882839762854178 ['adaboost','adaboost','lightgbm', 'xgboost', 'catboost', 'catboost', 'catboost', 'catboost']