In [20]:

# ====================================================
# Library
# ====================================================
import sys
import os
import gc
import re
import unicodedata
import warnings
warnings.filterwarnings('ignore')
import random
import copy
import scipy as sp
import numpy as np
import pandas as pd
from glob import glob
from pathlib import Path
import joblib
import pickle
import itertools
from tqdm.auto import tqdm
import category_encoders as ce
import torch
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GroupKFold
from sklearn.metrics import log_loss, roc_auc_score, matthews_corrcoef, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
import xgboost as xgb
from catboost import Pool, CatBoostRegressor, CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from keras.layers import BatchNormalization
from keras.layers import Activation
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.optimizers import Adam
from keras.models import load_model
from keras.callbacks import Callback
from keras.models import clone_model
from sklearn.linear_model import LogisticRegression
import shap
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer


pd.set_option('display.max_columns',1000)
pd.set_option('display.max_rows',100)

# ====================================================
# Configurations
# ====================================================
class CFG:
    VER = 11
    AUTHOR = 'Yuta.K'
    COMPETITION = 'SC2024'
    DATA_PATH = Path('/data')
    OOF_DATA_PATH = Path('/oof')
    MODEL_DATA_PATH = Path('/models')
    SUB_DATA_PATH = Path('/submission')
    METHOD_LIST = ['catboost']
    seed = 42
    n_folds = 7
    target_col = 'ProdTaken'
    metric = 'AUC'
    metric_maximize_flag = True
    num_boost_round = 300
    early_stopping_round = 200
    verbose = 25
    classification_lgb_params = {
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.05,
        'num_leaves' : 15,
        'lambda_l1' : 0.2,
        'lambda_l2' : 0.2,
        'seed': seed,
    }
    classification_xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'learning_rate': 0.05,
        'lambda':2,
        'random_state': seed,
    }

    classification_cat_params = {
        'learning_rate': 0.1,
        'depth':1,
        'l2_leaf_reg' : 6,
        'iterations':1000,
        'random_seed': seed,
    }
    classification_adaboost_params = {
        'n_estimators': 100,
        'learning_rate': 1.0,
        'random_state': 42,
    }
    
    model_weight_dict = {'adaboost': 0.10,'lightgbm': 0.24, 'xgboost': 0.04, 'catboost': 0.72}
    
# ====================================================
# Seed everything
# ====================================================
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

seed_everything(CFG.seed)


# ====================================================
# Metric
# ====================================================
# AUC

In [33]:
#データの読み込み
train_df = pd.read_csv('dataset/train_processed.csv', index_col=0)
test_df = pd.read_csv('dataset/test_processed.csv', index_col=0)
test0 = pd.read_csv('dataset/test.csv', index_col=0)

In [34]:
train_df

Unnamed: 0,Age,TypeofContact,DurationOfPitch,Gender,NumberOfPersonVisiting,NumberOfFollowups,NumberOfTrips,Passport,MonthlyIncome,CarPossesion,NumberOfOffspring,family_members,MonthlyIncome / Age,DurationOfPitch * NumberOfFollowups,MonthlyIncome / family_members,NumberOfPersonVisiting * NumberOfTrips,DurationOfPitch * NumberOfPersonVisiting,Age / NumberOfTrips,PreferredPropertyStar / MonthlyIncome,Income / child,MoneyforOneTrip,AllOfcontact,CityTier_1.0,CityTier_2.0,CityTier_3.0,Occupation_0.0,Occupation_1.0,Occupation_2.0,ProductPitched_0.0,ProductPitched_1.0,ProductPitched_2.0,ProductPitched_3.0,ProductPitched_4.0,PreferredPropertyStar_3.0,PreferredPropertyStar_4.0,PreferredPropertyStar_5.0,PitchSatisfactionScore_1.0,PitchSatisfactionScore_2.0,PitchSatisfactionScore_3.0,PitchSatisfactionScore_4.0,PitchSatisfactionScore_5.0,Designation_0.0,Designation_1.0,Designation_2.0,Designation_3.0,Designation_4.0,Married_0.0,Married_1.0,Married_2.0,Married_3.0,ProdTaken
0,50.000000,1.0,15.0,0.0,1.0,4.0,5.0,1.0,253905.0,0.0,0.0,1.0,5078.100000,60.0,253905.00,5.0,15.0,10.000000,1.181544,2.539050e+09,6.093720e+05,23.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1
1,56.000000,0.0,14.0,0.0,1.0,4.0,2.0,1.0,404475.0,1.0,0.0,1.0,7222.767857,56.0,404475.00,2.0,14.0,28.000000,0.741702,4.044750e+09,2.426850e+06,22.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
2,35.512905,1.0,10.0,1.0,1.0,3.0,4.0,0.0,278145.0,0.0,0.0,2.0,7832.223270,30.0,139072.50,4.0,10.0,8.878226,1.078574,2.781450e+09,8.344350e+05,16.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1
3,37.000000,1.0,18.0,1.0,1.0,3.0,1.0,0.0,326805.0,1.0,0.0,1.0,8832.567568,54.0,326805.00,1.0,18.0,37.000000,1.223971,3.268050e+09,3.921660e+06,24.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
4,48.000000,0.0,17.0,1.0,1.0,3.0,4.0,0.0,258435.0,1.0,0.0,1.0,5384.062500,51.0,258435.00,4.0,17.0,12.000000,1.547778,2.584350e+09,7.753050e+05,23.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3484,40.000000,1.0,26.0,0.0,2.0,3.0,3.0,0.0,258900.0,1.0,0.0,1.0,6472.500000,78.0,258900.00,6.0,52.0,13.333333,1.158749,2.589000e+09,1.035600e+06,32.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1
3485,35.349480,1.0,9.0,0.0,3.0,3.0,5.0,0.0,260415.0,1.0,2.0,4.0,7366.869255,27.0,65103.75,15.0,27.0,7.069896,1.920012,1.302010e+05,6.249960e+05,15.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
3486,31.000000,1.0,14.0,1.0,3.0,2.0,5.0,0.0,317340.0,1.0,1.0,2.0,10236.774194,28.0,158670.00,15.0,42.0,6.200000,0.945358,3.173083e+05,7.616160e+05,18.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
3487,56.000000,0.0,15.0,0.0,3.0,6.0,7.0,1.0,527910.0,0.0,2.0,4.0,9426.964286,90.0,131977.50,21.0,45.0,8.000000,0.568279,2.639418e+05,9.049886e+05,27.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1


In [35]:
test_df

Unnamed: 0,Age,TypeofContact,DurationOfPitch,Gender,NumberOfPersonVisiting,NumberOfFollowups,NumberOfTrips,Passport,MonthlyIncome,CarPossesion,NumberOfOffspring,family_members,MonthlyIncome / Age,DurationOfPitch * NumberOfFollowups,MonthlyIncome / family_members,NumberOfPersonVisiting * NumberOfTrips,DurationOfPitch * NumberOfPersonVisiting,Age / NumberOfTrips,PreferredPropertyStar / MonthlyIncome,Income / child,MoneyforOneTrip,AllOfcontact,CityTier_1.0,CityTier_2.0,CityTier_3.0,Occupation_0.0,Occupation_1.0,Occupation_2.0,ProductPitched_0.0,ProductPitched_1.0,ProductPitched_2.0,ProductPitched_3.0,ProductPitched_4.0,PreferredPropertyStar_3.0,PreferredPropertyStar_4.0,PreferredPropertyStar_5.0,PitchSatisfactionScore_1.0,PitchSatisfactionScore_2.0,PitchSatisfactionScore_3.0,PitchSatisfactionScore_4.0,PitchSatisfactionScore_5.0,Designation_0.0,Designation_1.0,Designation_2.0,Designation_3.0,Designation_4.0,Married_0.0,Married_1.0,Married_2.0,Married_3.0
0,48.000000,1.0,13.0,0.0,1.0,4.0,7.0,0.0,496950.0,1.0,0.0,2.0,10353.125000,52.0,248475.00,7.0,13.0,6.857143,0.603682,4.969500e+09,8.519143e+05,21.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,38.334903,1.0,12.0,1.0,1.0,4.0,4.0,1.0,300000.0,0.0,0.0,2.0,7825.766556,48.0,150000.00,4.0,12.0,9.583726,1.000000,3.000000e+09,9.000000e+05,20.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,25.000000,1.0,9.0,1.0,1.0,4.0,1.0,0.0,260000.0,0.0,0.0,1.0,10400.000000,36.0,260000.00,1.0,9.0,25.000000,1.153846,2.600000e+09,3.120000e+06,17.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,21.000000,0.0,7.0,0.0,1.0,4.0,1.0,0.0,259875.0,1.0,0.0,1.0,12375.000000,28.0,259875.00,1.0,7.0,21.000000,1.539202,2.598750e+09,3.118500e+06,15.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,41.000000,0.0,7.0,0.0,1.0,4.0,1.0,0.0,268830.0,1.0,0.0,1.0,6556.829268,28.0,268830.00,1.0,7.0,41.000000,1.115947,2.688300e+09,3.225960e+06,15.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3484,41.000000,0.0,14.0,1.0,1.0,3.0,2.0,0.0,261840.0,0.0,0.0,2.0,6386.341463,42.0,130920.00,2.0,14.0,20.500000,1.145738,2.618400e+09,1.571040e+06,20.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3485,44.000000,0.0,35.0,0.0,3.0,5.0,3.0,0.0,349770.0,1.0,1.0,3.0,7949.318182,175.0,116590.00,9.0,105.0,14.666667,0.857706,3.497350e+05,1.399080e+06,45.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3486,24.000000,1.0,21.0,0.0,2.0,3.0,2.0,0.0,270000.0,0.0,0.0,1.0,11250.000000,63.0,270000.00,4.0,42.0,12.000000,1.111111,2.700000e+09,1.620000e+06,27.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3487,25.000000,1.0,9.0,0.0,2.0,3.0,2.0,0.0,272430.0,0.0,1.0,3.0,10897.200000,27.0,90810.00,4.0,18.0,12.500000,1.101200,2.724028e+05,1.634580e+06,15.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [36]:
train_df.columns

Index(['Age', 'TypeofContact', 'DurationOfPitch', 'Gender',
       'NumberOfPersonVisiting', 'NumberOfFollowups', 'NumberOfTrips',
       'Passport', 'MonthlyIncome', 'CarPossesion', 'NumberOfOffspring',
       'family_members', 'MonthlyIncome / Age',
       'DurationOfPitch * NumberOfFollowups', 'MonthlyIncome / family_members',
       'NumberOfPersonVisiting * NumberOfTrips',
       'DurationOfPitch * NumberOfPersonVisiting', 'Age / NumberOfTrips',
       'PreferredPropertyStar / MonthlyIncome', 'Income / child',
       'MoneyforOneTrip', 'AllOfcontact', 'CityTier_1.0', 'CityTier_2.0',
       'CityTier_3.0', 'Occupation_0.0', 'Occupation_1.0', 'Occupation_2.0',
       'ProductPitched_0.0', 'ProductPitched_1.0', 'ProductPitched_2.0',
       'ProductPitched_3.0', 'ProductPitched_4.0', 'PreferredPropertyStar_3.0',
       'PreferredPropertyStar_4.0', 'PreferredPropertyStar_5.0',
       'PitchSatisfactionScore_1.0', 'PitchSatisfactionScore_2.0',
       'PitchSatisfactionScore_3.0', 'Pitch

In [41]:
#学習に必要となるリストの作成
default_categorical_features = ['TypeofContact', 'CityTier', 'Occupation', 'Gender', 'ProductPitched', 'PreferredPropertyStar', 'Passport', 'PitchSatisfactionScore', 'Designation', 'Married', 'CarPossesion']
default_numerical_features = ['Age', 'DurationOfPitch', 'NumberOfPersonVisiting', 'NumberOfFollowups', 'NumberOfTrips', 'MonthlyIncome', 'NumberOfOffspring']
added_numerical_features = ['family_members', 'MonthlyIncome / Age', 'DurationOfPitch * NumberOfFollowups', 'MonthlyIncome / family_members', 'NumberOfPersonVisiting * NumberOfTrips',
                           'DurationOfPitch * NumberOfPersonVisiting', 'Age / NumberOfTrips', 'PreferredPropertyStar / MonthlyIncome', 'Income / child', 'MoneyforOneTrip', 'AllOfcontact']
numerical_features = default_numerical_features + added_numerical_features

#特徴量の指定
features = train_df.columns.tolist()
#カテゴリカル特徴量の指定
categorical_features = copy.deepcopy(features)
print(f'numerical_features {numerical_features}')
for i in numerical_features:
    categorical_features.remove(i)
print(f'categorical_features {categorical_features}')

#学習に使用しない特徴量は以下で除外
RemoveList=[CFG.target_col, 'DurationOfPitch * NumberOfFollowups']
for i in RemoveList:
    if i in numerical_features:
        numerical_features.remove(i)
        features.remove(i)
    elif i in categorical_features:
        categorical_features.remove(i)
        features.remove(i)
    else:
        features.remove(i)
print(f'features for training:{features}')

for col in categorical_features:
    train_df[col] = train_df[col].astype(int)
    test_df[col] = test_df[col].astype(int)

model = xgb.XGBClassifier(eval_metric = 'auc')
model.fit(train_df[features],train_df[CFG.target_col])
explainer = shap.Explainer(model)
shap_values = explainer(train_df[features])
shap.plots.waterfall(shap_values[0])

shap.plots.bar(shap_values)

features for training:['Age', 'TypeofContact', 'DurationOfPitch', 'Gender', 'NumberOfPersonVisiting', 'NumberOfFollowups', 'NumberOfTrips', 'Passport', 'MonthlyIncome', 'CarPossesion', 'NumberOfOffspring', 'family_members', 'MonthlyIncome / Age', 'MonthlyIncome / family_members', 'NumberOfPersonVisiting * NumberOfTrips', 'DurationOfPitch * NumberOfPersonVisiting', 'Age / NumberOfTrips', 'PreferredPropertyStar / MonthlyIncome', 'Income / child', 'MoneyforOneTrip', 'AllOfcontact', 'CityTier_1.0', 'CityTier_2.0', 'CityTier_3.0', 'Occupation_0.0', 'Occupation_1.0', 'Occupation_2.0', 'ProductPitched_0.0', 'ProductPitched_1.0', 'ProductPitched_2.0', 'ProductPitched_3.0', 'ProductPitched_4.0', 'PreferredPropertyStar_3.0', 'PreferredPropertyStar_4.0', 'PreferredPropertyStar_5.0', 'PitchSatisfactionScore_1.0', 'PitchSatisfactionScore_2.0', 'PitchSatisfactionScore_3.0', 'PitchSatisfactionScore_4.0', 'PitchSatisfactionScore_5.0', 'Designation_0.0', 'Designation_1.0', 'Designation_2.0', 'Designati

ValueError: list.remove(x): x not in list

In [42]:
#Learning & Predicting

#1段階目の学習
def Pre_Learning(train_df,test_df, features, categorical_features):
    
    #adaboostでの学習メソッドの定義
    def adaboost_training(x_train, y_train, x_valid, y_valid, features, categorical_features):
        model = AdaBoostClassifier(**CFG.classification_adaboost_params)
        model.fit(x_train, y_train)
        # Predict validation
        valid_pred = model.predict_proba(x_valid)[:, 1]
        return model, valid_pred

    #lightgbmでの学習メソッドの定義
    def lightgbm_training(x_train, y_train, x_valid, y_valid, features, categorical_features):
        lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=categorical_features)
        lgb_valid = lgb.Dataset(x_valid, y_valid, categorical_feature=categorical_features)
        model = lgb.train(
                    params = CFG.classification_lgb_params,
                    train_set = lgb_train,
                    num_boost_round = CFG.num_boost_round,
                    valid_sets = [lgb_train, lgb_valid],
                    callbacks=[lgb.early_stopping(stopping_rounds=CFG.early_stopping_round,
                                                  verbose=CFG.verbose)]
                )
        # Predict validation
        valid_pred = model.predict(x_valid)
        return model, valid_pred

    #xgboostでの学習メソッドの定義
    def xgboost_training(x_train, y_train, x_valid, y_valid, features, categorical_features):
        xgb_train = xgb.DMatrix(data=x_train, label=y_train)
        xgb_valid = xgb.DMatrix(data=x_valid, label=y_valid)
        model = xgb.train(
                    CFG.classification_xgb_params,
                    dtrain = xgb_train,
                    num_boost_round = CFG.num_boost_round,
                    evals = [(xgb_train, 'train'), (xgb_valid, 'eval')],
                    early_stopping_rounds = CFG.early_stopping_round,
                    verbose_eval = CFG.verbose,
                    maximize = CFG.metric_maximize_flag,
                )
        # Predict validation
        valid_pred = model.predict(xgb.DMatrix(x_valid))
        return model, valid_pred

    #catboostでの学習メソッドの定義
    def catboost_training(x_train, y_train, x_valid, y_valid, features, categorical_features):
        cat_train = Pool(data=x_train, label=y_train, cat_features=categorical_features)
        cat_valid = Pool(data=x_valid, label=y_valid, cat_features=categorical_features)
        model = CatBoostClassifier(**CFG.classification_cat_params)
        model.fit(cat_train,
                  eval_set = [cat_valid],
                  early_stopping_rounds = CFG.early_stopping_round,
                  verbose = CFG.verbose,
                  use_best_model = True)
        # Predict validation
        valid_pred = model.predict_proba(x_valid)[:, 1]
        return model, valid_pred



    #任意のモデルでのクロスバリデーション学習メソッドの定義
    def gradient_boosting_model_cv_training(method, train_df, features, categorical_features):
        # Create a numpy array to store out of folds predictions
        oof_predictions = np.zeros(len(train_df))
        oof_fold = np.zeros(len(train_df))
        kfold = StratifiedKFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)
        for fold, (train_index, valid_index) in enumerate(kfold.split(train_df[features],train_df[CFG.target_col])):
            print('-'*50)
            print(f'{method} training fold {fold+1}')

            x_train = train_df[features].iloc[train_index]
            y_train = train_df[CFG.target_col].iloc[train_index]
            x_valid = train_df[features].iloc[valid_index]
            y_valid = train_df[CFG.target_col].iloc[valid_index]

            model = None  # モデル変数を初期化する
            valid_pred = None

            if method == 'adaboost':
                model, valid_pred = adaboost_training(x_train, y_train, x_valid, y_valid, features, categorical_features)
            if method == 'lightgbm':
                model, valid_pred = lightgbm_training(x_train, y_train, x_valid, y_valid, features, categorical_features)
            if method == 'xgboost':
                model, valid_pred = xgboost_training(x_train, y_train, x_valid, y_valid, features, categorical_features)
            if method == 'catboost':
                model, valid_pred = catboost_training(x_train, y_train, x_valid, y_valid, features, categorical_features)  
            # Save best model
            pickle.dump(model, open(f'model/{method}_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl', 'wb'))
            # Add to out of folds array
            oof_predictions[valid_index] = valid_pred
            oof_fold[valid_index] = fold + 1
            del x_train, x_valid, y_train, y_valid, model, valid_pred
            gc.collect()

        # Compute out of folds metric
        #score = f1_score(train_df[CFG.target_col], oof_predictions >= 0.5, average='macro')
        score = roc_auc_score(train_df[CFG.target_col], oof_predictions)
        print(f'{method} our out of folds CV AUC is {score}')
        # Create a dataframe to store out of folds predictions
        oof_df = pd.DataFrame({CFG.target_col: train_df[CFG.target_col], f'{method}_prediction': oof_predictions, 'fold': oof_fold})
        oof_df.to_csv(f'oof/oof_{method}_seed{CFG.seed}_ver{CFG.VER}.csv', index = False)

    #adaboostの学習済みモデル読み込み関数
    def adaboost_inference(x_test):
        test_pred = np.zeros(len(x_test))
        for fold in range(CFG.n_folds):
            model = pickle.load(open(f'model/adaboost_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl', 'rb'))
            # Predict
            pred = model.predict_proba(x_test)[:, 1]
            test_pred += pred
        return test_pred / CFG.n_folds

    #lightgbmの学習モデル読み込み関数
    def lightgbm_inference(x_test):
        test_pred = np.zeros(len(x_test))
        for fold in range(CFG.n_folds):
            model = pickle.load(open(f'model/lightgbm_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl', 'rb'))
            # Predict
            pred = model.predict(x_test)
            test_pred += pred
        return test_pred / CFG.n_folds

    #xgboostの学習モデル読み込み関数
    def xgboost_inference(x_test):
        test_pred = np.zeros(len(x_test))
        for fold in range(CFG.n_folds):
            model = pickle.load(open(f'model/xgboost_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl', 'rb'))
            # Predict
            pred = model.predict(xgb.DMatrix(x_test))
            test_pred += pred
        return test_pred / CFG.n_folds

    #catboostの学習モデル読み込み関数
    def catboost_inference(x_test):
        test_pred = np.zeros(len(x_test))
        for fold in range(CFG.n_folds):
            model = pickle.load(open(f'model/catboost_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl', 'rb'))
            # Predict
            pred = model.predict_proba(x_test)[:, 1]
            test_pred += pred
        return test_pred / CFG.n_folds

    #任意のメソッドに対して予測を返す関数
    def gradient_boosting_model_inference(method, test_df, features, categorical_features):
        x_test = test_df[features]
        if method == 'adaboost':
            test_pred = adaboost_inference(x_test)
        if method == 'lightgbm':
            test_pred = lightgbm_inference(x_test)
        if method == 'xgboost':
            test_pred = xgboost_inference(x_test)
        if method == 'catboost':
            test_pred = catboost_inference(x_test)
        return test_pred

    for method in CFG.METHOD_LIST:
        gradient_boosting_model_cv_training(method, train_df, features, categorical_features)
        test_df[f'{method}_pred_prob'] = gradient_boosting_model_inference(method, test_df, features, categorical_features)
        
        

Pre_Learning(train_df,test_df, features, categorical_features)

test_df['target'] = 0
for method in CFG.METHOD_LIST:
    test_df['target'] += test_df[f'{method}_pred_prob']*CFG.model_weight_dict[method]

id_list = [i+3489 for i in range(3489)]

submission = pd.DataFrame({'ID':id_list, 'ProdTaken':test_df['target']})
submission.to_csv(f'prediction/seed{CFG.seed}_ver{CFG.VER}_{CFG.AUTHOR}_submission.csv', index=False, header=None)

--------------------------------------------------
catboost training fold 1


CatBoostError: Invalid type for cat_feature[non-default value idx=0,feature_idx=13]=253905.0 : cat_features must be integer or string, real number values and NaN values should be converted to string.