In [8]:
# ====================================================
# Library
# ====================================================
import sys
import os
import gc
import warnings
warnings.filterwarnings('ignore')
import random
import re
import unicodedata
import scipy as sp
import numpy as np
import pandas as pd
from glob import glob
from pathlib import Path
import joblib
import pickle
import itertools
from tqdm.auto import tqdm
import category_encoders as ce
import torch
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GroupKFold
from sklearn.metrics import log_loss, roc_auc_score, matthews_corrcoef, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
import xgboost as xgb
from catboost import Pool, CatBoostRegressor, CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from keras.layers import BatchNormalization
from keras.layers import Activation
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.optimizers import Adam
from keras.models import load_model
from keras.callbacks import Callback
from keras.models import clone_model
from sklearn.linear_model import LogisticRegression

# Raise pandas' display limits so wide frames are shown without truncation.
pd.set_option('display.max_columns',1000)
pd.set_option('display.max_rows',100)

# ====================================================
# Configurations
# ====================================================
class CFG:
    """Central configuration for the notebook: paths, CV settings and model parameters."""

    # Run identification (used in output file names).
    VER = 100
    AUTHOR = 'Yuta.K'
    COMPETITION = 'SignateCup2024Summer'

    # Directories.
    DATA_PATH = Path('dataset')
    OOF_DATA_PATH = Path('oof')
    MODEL_DATA_PATH = Path('models')
    SUB_DATA_PATH = Path('submission')

    # Base models trained in the first stage ('adaboost' is also supported).
    METHOD_LIST = ['lightgbm', 'xgboost', 'catboost']

    # Reproducibility / cross-validation.
    seed = 42
    n_folds = 2

    # Target and evaluation.
    target_col = 'ProdTaken'
    metric = 'auc'
    metric_maximize_flag = True

    # Boosting schedule shared by the gradient-boosting models.
    num_boost_round = 500
    early_stopping_round = 200
    verbose = 25

    classification_lgb_params = {
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.05,
        'seed': seed,
    }
    classification_xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'learning_rate': 0.05,
        'random_state': seed,
    }
    classification_cat_params = {
        'learning_rate': 0.05,
        'iterations': num_boost_round,
        'random_seed': seed,
    }
    classification_adaboost_params = {
        'n_estimators': 100,
        'learning_rate': 1.0,
        # Reuse the shared seed so that changing `seed` reaches every model.
        'random_state': seed,
    }
    
# ====================================================
# Seed everything
# ====================================================
def seed_everything(seed):
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random`` and NumPy, exports ``PYTHONHASHSEED``, and
    additionally seeds PyTorch and TensorFlow when those packages can be
    imported.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Hash randomization of the running interpreter is fixed at start-up, so
    # this only influences child processes launched afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Deep-learning frameworks keep their own RNG state; seed them when present.
    try:
        import torch
    except ImportError:
        pass
    else:
        torch.manual_seed(seed)

    try:
        import tensorflow as tf
    except ImportError:
        pass
    else:
        tf.random.set_seed(seed)

# Seed the RNGs once, up front, with the configured seed.
seed_everything(CFG.seed)


# ====================================================
# Metric
# ====================================================
# AUC

# ====================================================
# LightGBM Metric
# ====================================================
def lgb_metric(y_pred, y_true):
    """Custom LightGBM evaluation function that reports ROC AUC.

    Args:
        y_pred: Predicted scores for the evaluated dataset.
        y_true: Dataset object exposing the labels through ``get_label()``.

    Returns:
        tuple: ``('auc', score, CFG.metric_maximize_flag)``.
    """
    labels = y_true.get_label()
    auc_value = roc_auc_score(labels, y_pred)
    return 'auc', auc_value, CFG.metric_maximize_flag

# ====================================================
# XGBoost Metric
# ====================================================
def xgb_metric(y_pred, y_true):
    """Custom XGBoost evaluation function that reports ROC AUC.

    Args:
        y_pred: Predicted scores for the evaluated data.
        y_true: Matrix object exposing the labels through ``get_label()``.

    Returns:
        tuple: ``('auc', score)``.
    """
    labels = y_true.get_label()
    auc_value = roc_auc_score(labels, y_pred)
    return 'auc', auc_value

In [9]:
# Load the data; the first CSV column is used as the index.
train_df = pd.read_csv(CFG.DATA_PATH / 'train.csv', index_col=0)
test_df = pd.read_csv(CFG.DATA_PATH / 'test.csv', index_col=0)

In [10]:
def preprocessing(train_df, test_df):
    """Clean the raw train/test frames and turn them into model-ready features.

    The pipeline parses the free-text columns into numbers, splits
    ``customer_info`` into ``married`` / ``car_possesion`` / ``offspring``,
    imputes a fixed set of columns, one-hot encodes ``dummy_col`` and
    standardizes ``std_feature``.

    Args:
        train_df: Raw training frame.
        test_df: Raw test frame.

    Returns:
        tuple: ``(tr_df, te_df)``, processed copies of the inputs. The input
        frames themselves are not modified.
    """
    # ---- Age: numeric -------------------------------------------------
    # Mapping from kanji numerals to their values.
    kanji_to_num = {'一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9,'十': 10, '百': 100, '千': 1000, '万': 10000,'零': 0, '〇': 0}

    def kanji_to_arabic(kanji):
        """Convert a kanji numeral string (e.g. '三十五') to an int."""
        result = 0
        temp = 0
        for char in kanji:
            value = kanji_to_num.get(char)
            if value is None:
                continue
            if value < 10:
                # Digit: start a new number or append to the pending one.
                temp = value if temp == 0 else temp * 10 + value
            else:
                # Unit (十, 百, ...): a missing multiplier means 1.
                if temp == 0:
                    temp = 1
                result += temp * value
                temp = 0
        return result + temp

    def process_age(age):
        """Parse an age written with Arabic and/or kanji digits; None if unusable."""
        if age is None or str(age) == 'nan':
            return None
        age = unicodedata.normalize('NFKC', age)
        age = ''.join(c for c in age if c.isdigit() or c in kanji_to_num)
        if not age:
            # No numeric character at all: treat as missing instead of age 0.
            return None
        if age.isdigit():
            return int(age)
        return kanji_to_arabic(age)

    # ---- TypeofContact: categorical (dummy) ----------------------------
    def TypeofContact_to_dummy(value):
        """Map 'Self Enquiry' -> 1, 'Company Invited' -> 0, anything else -> None."""
        if value == 'Self Enquiry':
            return 1
        elif value == 'Company Invited':
            return 0
        return None

    # CityTier: ordinal scale.

    # ---- DurationOfPitch: numeric --------------------------------------
    def convert_to_minutes(duration):
        """Convert 'N分' / 'N秒' strings to minutes (float); None otherwise."""
        if pd.isnull(duration):
            return None
        # The value may arrive as a float, so force a string before searching.
        duration = str(duration)
        if '分' in duration:
            return float(duration.replace('分', ''))
        elif '秒' in duration:
            # Seconds are converted to (fractional) minutes.
            return float(duration.replace('秒', '')) / 60
        return None

    # ---- Occupation: categorical ---------------------------------------
    def Occupation_to_dummy(value):
        """Map the three known occupations to 2/1/0; anything else -> None."""
        if value == 'Large Business':
            return 2
        elif value == 'Small Business':
            return 1
        elif value == 'Salaried':
            return 0
        return None

    # ---- Gender: categorical -------------------------------------------
    def Gender_dealing(gender):
        """Return 1 for female, 0 for male, None when missing or unrecognized."""
        if pd.isnull(gender):
            return None
        # Normalize width, upper-case, then drop every whitespace character.
        gender = unicodedata.normalize('NFKC', gender).upper().strip()
        gender = ''.join(gender.split())

        # 'FEMALE' must be tested first because it contains 'MALE'.
        if 'FEMALE' in gender:
            return 1
        elif 'MALE' in gender:
            return 0
        else:
            return None

    # NumberOfPersonVisiting: numeric.

    # ---- NumberOfFollowups: numeric ------------------------------------
    def NumberOfFollowups_dealing(input_int):
        """Scale values of 100 or more down by a factor of 100."""
        if input_int >= 100:
            return input_int /100
        else:
            return input_int

    # ---- ProductPitched / Designation: categorical ---------------------
    def standardize_str(input_str):
        """Normalize a label: NFKC, lower-case, no whitespace, look-alike letters mapped to ASCII."""
        if pd.isnull(input_str):
            return input_str
        input_str = unicodedata.normalize('NFKC', input_str).lower().strip()
        input_str = ''.join(input_str.split())
        input_str = input_str.replace('|', 'l').replace('×', 'x').replace('𝘤', 'c').replace('𝖺', 'a').replace('𝙳', 'd')
        # Replace further look-alike characters with plain Latin letters.
        input_str = input_str.replace('ᗞ', 'd').replace('𐊡', 'a').replace('𝘳', 'r').replace('ꓢ', 's').replace('ı', 'i')
        input_str = input_str.replace('β', 'b').replace('в', 'b').replace('с', 'c').replace('տ', 's').replace('ς', 'c')
        input_str = input_str.replace('ꭰ', 'd').replace('ε', 'e').replace('ι', 'i').replace('α', 'a').replace('ո', 'n')
        input_str = input_str.replace('ѕ', 's').replace('μ', 'm').replace('е', 'e').replace('а', 'a').replace('ѵ', 'v')
        input_str = input_str.replace('aasic', 'basic')
        return input_str

    # PreferredPropertyStar: ordinal scale.

    # ---- NumberOfTrips: numeric ----------------------------------------
    def NumberOfTrips_dealing(value):
        """Convert trip-frequency text to trips per year."""
        if pd.isnull(value):
            return None
        # '半年に' must be tested before '年に' because it contains it.
        if '半年に' in value:
            return 2 * int(value.replace('半年に', '').replace('回', ''))
        elif '年に' in value:
            return int(value.replace('年に', '').replace('回', ''))
        elif '四半期に' in value:
            return 4 * int(value.replace('四半期に', '').replace('回', ''))
        else:
            return int(value)

    # Passport: categorical (dummy).

    # PitchSatisfactionScore: ordinal, with some interval-scale character.

    # ---- MonthlyIncome: numeric ----------------------------------------
    def MonthlyIncome_dealing(input_str):
        """Convert income text ('月収N万円', 'N万円' or a plain number) to a float."""
        if pd.isnull(input_str):
            return None
        if '月収' in input_str:
            return 10000 * float(input_str.replace('月収', '').replace('万円', ''))
        elif '万円' in input_str:
            return 10000 * float(input_str.replace('万円', ''))
        else:
            return float(input_str)

    # ---- customer_info --------------------------------------------------
    def customer_info_dealing(input_str):
        """Normalize ``customer_info`` and turn its first two separators into commas."""
        if pd.isnull(input_str):
            return input_str
        input_str = unicodedata.normalize('NFKC', input_str).lower().strip()
        # Treat slashes, Japanese commas and all whitespace variants as a space.
        input_str = input_str.replace('/', ' ').replace('／', ' ').replace('、', ' ')
        input_str = input_str.replace('\u3000', ' ').replace('\t', ' ').replace('\n', ' ')
        # Only the first two gaps become commas so the third field stays intact.
        input_str = re.sub(r'(?<=\S)\s+(?=\S)', ',', input_str, count=2)
        return input_str

    # married: categorical.

    # ---- car_possesion: categorical (dummy) -----------------------------
    def car_possesion_dealing(input_str):
        """Return 0 for 'no car' wordings, 1 for 'has car' wordings, None otherwise."""
        if input_str in ['車未所持', '自動車未所有', '自家用車なし', '乗用車なし', '車なし', '車保有なし', 0]:
            return 0
        elif input_str in ['車所持', '自動車所有', '自家用車あり', '乗用車所持', '車保有', '車あり', 1]:
            return 1
        return None

    # ---- offspring: numeric ---------------------------------------------
    def offspring_dealing(input_str):
        """Extract the number of children (1-3) from the text; 0 when none is found."""
        if not isinstance(input_str, str):
            # Missing third field (None/NaN): no count available.
            return 0
        if '1' in input_str:
            return 1
        elif '2' in input_str:
            return 2
        elif '3' in input_str:
            return 3
        else:
            return 0

    def offspring_identified_dealing(input_str):
        """Return 1 when the text is one of the 'unknown' wordings, else 0."""
        if input_str in ['子供の数不明', '不明', 'わからない', '子育て状況不明', '子の数不詳']:
            return 1
        else:
            return 0

    def dealing_missing_values(input_df):
        """Impute the numeric columns with their mean (median for TypeofContact)."""
        df = input_df.copy()
        df['Age'] = df['Age'].fillna(df['Age'].mean())
        df['TypeofContact'] = df['TypeofContact'].fillna(df['TypeofContact'].median())
        df['DurationOfPitch'] = df['DurationOfPitch'].fillna(df['DurationOfPitch'].mean())
        df['NumberOfFollowups'] = df['NumberOfFollowups'].fillna(df['NumberOfFollowups'].mean())
        df['NumberOfTrips'] = df['NumberOfTrips'].fillna(df['NumberOfTrips'].mean())
        df['MonthlyIncome'] = df['MonthlyIncome'].fillna(df['MonthlyIncome'].mean())
        return df

    def dummy_ex(feature, train_df, test_df):
        """One-hot encode ``feature`` in both frames using categories from their union."""
        ohe = OneHotEncoder(sparse_output=False, categories='auto')
        new_array = pd.concat([train_df[[feature]], test_df[[feature]]], axis=0)
        ohe.fit(new_array)

        # Dummy column names: '<feature>_<category>'.
        columns = [f'{feature}_{v}' for v in ohe.categories_[0]]

        # Keep each frame's own index so the column-wise concat below aligns
        # row by row even when the index is not 0..n-1.
        dummy_vals_train = pd.DataFrame(ohe.transform(train_df[[feature]]), columns=columns, index=train_df.index)
        dummy_vals_test = pd.DataFrame(ohe.transform(test_df[[feature]]), columns=columns, index=test_df.index)

        # Replace the original column with its dummies.
        tr = pd.concat([train_df.drop([feature], axis=1), dummy_vals_train], axis=1)
        te = pd.concat([test_df.drop([feature], axis=1), dummy_vals_test], axis=1)

        return tr, te

    def function_apply(input_df):
        """Apply every per-column cleaner to a copy of ``input_df`` and impute."""
        df = input_df.copy()
        df['Age'] = df['Age'].apply(process_age)
        df['TypeofContact'] = df['TypeofContact'].apply(TypeofContact_to_dummy)
        df['DurationOfPitch'] = df['DurationOfPitch'].apply(convert_to_minutes)
        df['Occupation'] = df['Occupation'].apply(Occupation_to_dummy)
        df['Gender'] = df['Gender'].apply(Gender_dealing)
        df['NumberOfFollowups'] = df['NumberOfFollowups'].apply(NumberOfFollowups_dealing)
        df['ProductPitched'] = df['ProductPitched'].apply(standardize_str)
        df['NumberOfTrips'] = df['NumberOfTrips'].apply(NumberOfTrips_dealing)
        df['Designation'] = df['Designation'].apply(standardize_str)
        df['MonthlyIncome'] = df['MonthlyIncome'].apply(MonthlyIncome_dealing)
        df['customer_info'] = df['customer_info'].apply(customer_info_dealing)
        df[['married', 'car_possesion', 'offspring']] = df['customer_info'].str.split(',', n=2, expand=True)
        df = df.drop(['customer_info'],axis=1)
        df['car_possesion'] = df['car_possesion'].apply(car_possesion_dealing)
        # The 'unknown' flag must be derived from the raw text, i.e. before
        # the column is overwritten with the numeric count.
        offspring_text = df['offspring']
        df['offspring'] = offspring_text.apply(offspring_dealing)
        df['offspring_identified'] = offspring_text.apply(offspring_identified_dealing)
        df = dealing_missing_values(df)
        return df

    dummy_col = ['CityTier', 'Occupation', 'ProductPitched', 'PreferredPropertyStar', 'PitchSatisfactionScore', 'Designation', 'married']
    std_feature = ['Age', 'DurationOfPitch', 'NumberOfPersonVisiting', 'NumberOfFollowups', 'NumberOfTrips', 'MonthlyIncome']

    def function_apply_both(train_df, test_df, std_feature):
        """Clean both frames, one-hot encode ``dummy_col`` and standardize ``std_feature``."""
        tr_df = function_apply(train_df.copy())
        te_df = function_apply(test_df.copy())
        for feature in dummy_col:
            # Both frames must be updated, otherwise test keeps the raw column.
            tr_df, te_df = dummy_ex(feature, tr_df, te_df)
        std_sc = StandardScaler()
        tr_df[std_feature] = std_sc.fit_transform(tr_df[std_feature])
        # Apply the statistics learned on train; never re-fit on test.
        te_df[std_feature] = std_sc.transform(te_df[std_feature])
        return tr_df, te_df

    tr_df, te_df = function_apply_both(train_df, test_df, std_feature)

    print('Preprocessing finished')
    return tr_df, te_df

# NOTE: this rebinds train_df/test_df to the processed frames, so the raw frames are no longer reachable under these names.
train_df, test_df = preprocessing(train_df, test_df)

Preprocessing finished


In [73]:
#Learning & Predicting

# Stage 1: train the base models
def Pre_Learning(train_df,test_df, features, categorical_features):
    """Train every model in ``CFG.METHOD_LIST`` with K-fold CV and predict the test set.

    For each method this writes one pickled model per fold and an
    out-of-fold CSV into the working directory, and adds a
    ``'<method>_pred_prob'`` column to ``test_df`` (the caller's frame is
    modified in place).

    Args:
        train_df: Training frame containing ``features`` and ``CFG.target_col``.
        test_df: Test frame containing ``features``.
        features: Column names used as model inputs.
        categorical_features: Categorical feature specification passed to
            LightGBM and CatBoost.

    Raises:
        ValueError: If ``CFG.METHOD_LIST`` contains an unsupported method name.
    """

    def model_path(method, fold):
        """File name of the pickled model for ``method`` and 0-based ``fold``."""
        return f'{method}_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl'

    # AdaBoost training
    def adaboost_training(x_train, y_train, x_valid, y_valid, features, categorical_features):
        model = AdaBoostClassifier(**CFG.classification_adaboost_params)
        model.fit(x_train, y_train)
        # Predict validation
        valid_pred = model.predict_proba(x_valid)[:, 1]
        return model, valid_pred

    # LightGBM training
    def lightgbm_training(x_train, y_train, x_valid, y_valid, features, categorical_features):
        lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=categorical_features)
        lgb_valid = lgb.Dataset(x_valid, y_valid, categorical_feature=categorical_features)
        model = lgb.train(
                    params = CFG.classification_lgb_params,
                    train_set = lgb_train,
                    num_boost_round = CFG.num_boost_round,
                    valid_sets = [lgb_train, lgb_valid],
                    feval = lgb_metric,
                    callbacks=[lgb.early_stopping(stopping_rounds=CFG.early_stopping_round,
                                                  verbose=CFG.verbose)]
                )
        # Predict validation
        valid_pred = model.predict(x_valid)
        return model, valid_pred

    # XGBoost training
    def xgboost_training(x_train, y_train, x_valid, y_valid, features, categorical_features):
        xgb_train = xgb.DMatrix(data=x_train, label=y_train)
        xgb_valid = xgb.DMatrix(data=x_valid, label=y_valid)
        model = xgb.train(
                    CFG.classification_xgb_params,
                    dtrain = xgb_train,
                    num_boost_round = CFG.num_boost_round,
                    evals = [(xgb_train, 'train'), (xgb_valid, 'eval')],
                    early_stopping_rounds = CFG.early_stopping_round,
                    verbose_eval = CFG.verbose,
                    feval = xgb_metric,
                    maximize = CFG.metric_maximize_flag,
                )
        # Predict validation
        valid_pred = model.predict(xgb.DMatrix(x_valid))
        return model, valid_pred

    # CatBoost training
    def catboost_training(x_train, y_train, x_valid, y_valid, features, categorical_features):
        cat_train = Pool(data=x_train, label=y_train, cat_features=categorical_features)
        cat_valid = Pool(data=x_valid, label=y_valid, cat_features=categorical_features)
        model = CatBoostClassifier(**CFG.classification_cat_params)
        model.fit(cat_train,
                  eval_set = [cat_valid],
                  early_stopping_rounds = CFG.early_stopping_round,
                  verbose = CFG.verbose,
                  use_best_model = True)
        # Predict validation
        valid_pred = model.predict_proba(x_valid)[:, 1]
        return model, valid_pred

    # Method name -> training function.
    trainers = {
        'adaboost': adaboost_training,
        'lightgbm': lightgbm_training,
        'xgboost': xgboost_training,
        'catboost': catboost_training,
    }

    # Cross-validated training for an arbitrary method
    def gradient_boosting_model_cv_training(method, train_df, features, categorical_features):
        if method not in trainers:
            raise ValueError(f'Unsupported method: {method!r}')
        train_fn = trainers[method]

        # Arrays holding the out-of-fold predictions and the fold of each row
        oof_predictions = np.zeros(len(train_df))
        oof_fold = np.zeros(len(train_df))
        kfold = KFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)
        for fold, (train_index, valid_index) in enumerate(kfold.split(train_df)):
            print('-'*50)
            print(f'{method} training fold {fold+1}')

            x_train = train_df[features].iloc[train_index]
            y_train = train_df[CFG.target_col].iloc[train_index]
            x_valid = train_df[features].iloc[valid_index]
            y_valid = train_df[CFG.target_col].iloc[valid_index]

            model, valid_pred = train_fn(x_train, y_train, x_valid, y_valid, features, categorical_features)

            # Save the fold model; the context manager closes the file handle.
            with open(model_path(method, fold), 'wb') as model_file:
                pickle.dump(model, model_file)
            # Add to out of folds array
            oof_predictions[valid_index] = valid_pred
            oof_fold[valid_index] = fold + 1
            del x_train, x_valid, y_train, y_valid, model, valid_pred
            gc.collect()

        # Compute out of folds metric (macro F1 at a fixed 0.5 threshold)
        score = f1_score(train_df[CFG.target_col], oof_predictions >= 0.5, average='macro')
        print(f'{method} our out of folds CV f1score is {score}')
        # Store the out-of-fold predictions
        oof_df = pd.DataFrame({CFG.target_col: train_df[CFG.target_col], f'{method}_prediction': oof_predictions, 'fold': oof_fold})
        oof_df.to_csv(f'oof_{method}_seed{CFG.seed}_ver{CFG.VER}.csv', index = False)

    def load_fold_model(method, fold):
        """Load the pickled model this notebook wrote for ``method`` / ``fold``."""
        # These pickles are produced by the training step above; never point
        # this at files from an untrusted source.
        with open(model_path(method, fold), 'rb') as model_file:
            return pickle.load(model_file)

    # Fold-averaged AdaBoost prediction
    def adaboost_inference(x_test):
        test_pred = np.zeros(len(x_test))
        for fold in range(CFG.n_folds):
            model = load_fold_model('adaboost', fold)
            test_pred += model.predict_proba(x_test)[:, 1]
        return test_pred / CFG.n_folds

    # Fold-averaged LightGBM prediction
    def lightgbm_inference(x_test):
        test_pred = np.zeros(len(x_test))
        for fold in range(CFG.n_folds):
            model = load_fold_model('lightgbm', fold)
            test_pred += model.predict(x_test)
        return test_pred / CFG.n_folds

    # Fold-averaged XGBoost prediction
    def xgboost_inference(x_test):
        test_pred = np.zeros(len(x_test))
        for fold in range(CFG.n_folds):
            model = load_fold_model('xgboost', fold)
            test_pred += model.predict(xgb.DMatrix(x_test))
        return test_pred / CFG.n_folds

    # Fold-averaged CatBoost prediction
    def catboost_inference(x_test):
        test_pred = np.zeros(len(x_test))
        for fold in range(CFG.n_folds):
            model = load_fold_model('catboost', fold)
            test_pred += model.predict_proba(x_test)[:, 1]
        return test_pred / CFG.n_folds

    # Method name -> inference function.
    predictors = {
        'adaboost': adaboost_inference,
        'lightgbm': lightgbm_inference,
        'xgboost': xgboost_inference,
        'catboost': catboost_inference,
    }

    # Return the test prediction for an arbitrary method
    def gradient_boosting_model_inference(method, test_df, features, categorical_features):
        if method not in predictors:
            raise ValueError(f'Unsupported method: {method!r}')
        return predictors[method](test_df[features])

    for method in CFG.METHOD_LIST:
        gradient_boosting_model_cv_training(method, train_df, features, categorical_features)
        test_df[f'{method}_pred_prob'] = gradient_boosting_model_inference(method, test_df, features, categorical_features)
        
        
# Stage 2: stacking with a neural network (and a logistic regression)
def Post_Learning(train_df,test_df):
    """Train the stacking models on the stage-1 predictions.

    Reads the out-of-fold CSVs written for each method in
    ``CFG.METHOD_LIST`` and the ``'<method>_pred_prob'`` columns of
    ``test_df``, adds pairwise products, standardizes, then fits and saves a
    Keras network (``.h5``) and a logistic regression (``.pkl``).

    Side effects:
        Publishes the scaled feature matrices as the module-level globals
        ``oof_combined_features_scaled`` and ``test_combined_features_scaled``.

    Args:
        train_df: Training frame containing ``CFG.target_col``.
        test_df: Test frame containing the stage-1 prediction columns.
    """
    # Build the neural network
    def create_nn_model(input_shape):
        model = Sequential([
            Dense(64, input_shape=(input_shape,)),
            BatchNormalization(),
            Activation('relu'),
            Dropout(0.5),

            Dense(32),
            BatchNormalization(),
            Activation('relu'),
            Dropout(0.5),

            Dense(1, activation='sigmoid')
        ])
        # `learning_rate` is the supported keyword; the old `lr` alias is
        # deprecated and rejected by recent Keras releases.
        optimizer = Adam(learning_rate=0.001)
        model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
        return model

    # Learning-rate schedule: constant for 10 epochs, then exponential decay
    def scheduler(epoch, lr):
        if epoch < 10:
            return lr
        else:
            return float(lr * np.exp(-0.1))

    # Pairwise products of the feature columns
    def create_interaction_features(features):
        n_features = features.shape[1]
        interaction_features = []
        for i in range(n_features):
            for j in range(i + 1, n_features):
                interaction_features.append(features[:, i] * features[:, j])
        if not interaction_features:
            # Fewer than two columns: no pairs, return a zero-width block
            # so the hstack below still works.
            return np.empty((features.shape[0], 0))
        return np.column_stack(interaction_features)

    # Features from the out-of-fold predictions
    oof_features = np.zeros((train_df.shape[0], len(CFG.METHOD_LIST)))
    for i, method in enumerate(CFG.METHOD_LIST):
        oof_df = pd.read_csv(f'oof_{method}_seed{CFG.seed}_ver{CFG.VER}.csv')
        oof_features[:, i] = oof_df[f'{method}_prediction']

    # Features from the test predictions
    test_features = np.zeros((test_df.shape[0], len(CFG.METHOD_LIST)))
    for i, method in enumerate(CFG.METHOD_LIST):
        test_features[:, i] = test_df[f'{method}_pred_prob']

    # Add the pairwise products
    oof_interaction_features = create_interaction_features(oof_features)
    test_interaction_features = create_interaction_features(test_features)

    # Combine the raw predictions with the interaction features
    oof_combined_features = np.hstack([oof_features, oof_interaction_features])
    test_combined_features = np.hstack([test_features, test_interaction_features])

    # Standardize (fit on the OOF features, apply to test)
    global oof_combined_features_scaled, test_combined_features_scaled
    scaler = StandardScaler()
    oof_combined_features_scaled = scaler.fit_transform(oof_combined_features)
    test_combined_features_scaled = scaler.transform(test_combined_features)

    # Train the neural network
    nn_model = create_nn_model(oof_combined_features_scaled.shape[1])
    callbacks_list = [LearningRateScheduler(scheduler)]
    nn_model.fit(oof_combined_features_scaled, train_df[CFG.target_col],
                 validation_split=0.2, epochs=50, batch_size=32, callbacks=callbacks_list, verbose=1)
    nn_model.save(f'nn_stacking_model_seed{CFG.seed}_ver{CFG.VER}.h5')

    # Train the logistic regression
    lr_model = LogisticRegression()
    lr_model.fit(oof_combined_features_scaled, train_df[CFG.target_col])
    # The context manager closes the file handle once the model is written.
    with open(f'lr_stacking_model_seed{CFG.seed}_ver{CFG.VER}.pkl', 'wb') as model_file:
        pickle.dump(lr_model, model_file)

def Learning_and_Predicting(train_df, test_df, features, categorical_features):
    """Run both training stages in order: base models first, then stacking."""
    Pre_Learning(
        train_df=train_df,
        test_df=test_df,
        features=features,
        categorical_features=categorical_features,
    )
    Post_Learning(train_df=train_df, test_df=test_df)

In [74]:
# Run training.
# NOTE(review): `features` and `categorical_features` are not defined in the cells visible here — confirm they are set before this cell runs on a fresh kernel.
Learning_and_Predicting(train_df, test_df, features, categorical_features)

--------------------------------------------------
lightgbm training fold 1
[LightGBM] [Info] Number of positive: 18850, number of negative: 2303
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001695 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13822
[LightGBM] [Info] Number of data points in the train set: 21153, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.891127 -> initscore=2.102300
[LightGBM] [Info] Start training from score 2.102300
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[24]	training's auc: 0.897924	training's f1score: 0.584616	valid_1's auc: 0.753874	valid_1's f1score: 0.544304
--------------------------------------------------
lightgbm training fold 2
[LightGBM] [Info] Number of positive: 18917, number of negative: 2237
[LightGBM] [In

In [75]:
#Postprocessing

def Postprocessing(train_df, test_df, id_offset=4230):
    """Tune a decision threshold per stacking model and write the submissions.

    Loads the saved neural network and logistic regression, predicts on the
    module-level ``oof_combined_features_scaled`` /
    ``test_combined_features_scaled`` matrices (set by ``Post_Learning``),
    picks the threshold maximizing macro F1 on the training predictions and
    writes one headerless submission CSV per model.

    Args:
        train_df: Training frame containing ``CFG.target_col``.
        test_df: Test frame; its length determines the number of submission rows.
        id_offset: Value of the first ``Id`` in the submission; rows are
            numbered consecutively from it.
    """
    # Search the threshold that maximizes macro F1
    def find_best_threshold_and_score(y_true, y_pred_proba):
        best_threshold = 0
        best_score = 0
        for threshold in np.linspace(0, 1, 1001):
            score = f1_score(y_true, y_pred_proba >= threshold, average='macro')
            if score > best_score:
                best_score = score
                best_threshold = threshold
        return best_threshold, best_score

    # Threshold tuning + submission file for one model ('nn' or 'lr')
    def write_submission(tag, train_pred_proba, test_pred_proba):
        best_threshold, best_score = find_best_threshold_and_score(train_df[CFG.target_col], train_pred_proba)
        print(f'{tag.upper()} Best Threshold: {best_threshold}, Best F1 Score: {best_score}')

        test_final_predictions = (test_pred_proba >= best_threshold).astype(int)
        # Competition format: consecutive ids starting at `id_offset`, no header.
        submission_df = pd.DataFrame({'Id': np.arange(len(test_df)) + id_offset, 'target': test_final_predictions})
        submission_df.to_csv(f'stacking_{tag}_submission_best_score{best_score:.4f}_seed{CFG.seed}_ver{CFG.VER}_{CFG.AUTHOR}.csv', header=False, index=False)

    # Neural network: the model input is the scaled stacking feature matrix.
    nn_model = load_model(f'nn_stacking_model_seed{CFG.seed}_ver{CFG.VER}.h5')
    train_pred_proba_nn = nn_model.predict(oof_combined_features_scaled).flatten()
    test_pred_proba_nn = nn_model.predict(test_combined_features_scaled).flatten()
    write_submission('nn', train_pred_proba_nn, test_pred_proba_nn)

    # Logistic regression: the pickle is the one written by Post_Learning;
    # never point this at a file from an untrusted source.
    with open(f'lr_stacking_model_seed{CFG.seed}_ver{CFG.VER}.pkl', 'rb') as model_file:
        lr_model = pickle.load(model_file)
    train_pred_proba_lr = lr_model.predict_proba(oof_combined_features_scaled)[:, 1]
    test_pred_proba_lr = lr_model.predict_proba(test_combined_features_scaled)[:, 1]
    # The threshold is tuned on the LR's own probabilities.
    write_submission('lr', train_pred_proba_lr, test_pred_proba_lr)

In [76]:
# Run prediction.
Postprocessing(train_df, test_df)

TypeError: Postprocessing() takes 0 positional arguments but 3 were given