In [1]:
# ====================================================
# Library
# ====================================================
import sys
import os
import gc
import re
import unicodedata
import warnings
warnings.filterwarnings('ignore')
import random
import copy
import scipy as sp
import numpy as np
import pandas as pd
from glob import glob
from pathlib import Path
import joblib
import pickle
import itertools
from tqdm.auto import tqdm
import category_encoders as ce
import torch
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GroupKFold
from sklearn.metrics import log_loss, roc_auc_score, matthews_corrcoef, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
import xgboost as xgb
from catboost import Pool, CatBoostRegressor, CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from keras.layers import BatchNormalization
from keras.layers import Activation
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.optimizers import Adam
from keras.models import load_model
from keras.callbacks import Callback
from keras.models import clone_model
from sklearn.linear_model import LogisticRegression
import shap
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer


pd.set_option('display.max_columns',1000)
pd.set_option('display.max_rows',100)

# ====================================================
# Configurations
# ====================================================
class CFG:
    VER = 14
    AUTHOR = 'Naoki'
    COMPETITION = 'SC2024'
    DATA_PATH = Path('/data')
    OOF_DATA_PATH = Path('/oof')
    MODEL_DATA_PATH = Path('/models')
    SUB_DATA_PATH = Path('/submission')
    METHOD_LIST = ['catboost']
    seed = 42
    n_folds = 7
    target_col = 'ProdTaken'
    metric = 'AUC'
    metric_maximize_flag = True
    num_boost_round = 300
    early_stopping_round = 200
    verbose = 25
    classification_lgb_params = {
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.10,
        'lambda_l1' : 10,
        'lambda_l2' : 100,
        'max_depth':2,
        'seed': seed,
    }
    classification_xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'learning_rate': 0.20,
        'lambda':1000,
        'alpha':1,
        'max_depth':2,
        'random_state': seed,
    }

    classification_cat_params = {
        'learning_rate': 0.10,
        'depth':1,
        'l2_leaf_reg' : 6,
        'iterations':1000,
        'random_seed': seed,
        'one_hot_max_size':40,
        
    }
    classification_adaboost_params = {
        'n_estimators': 100,
        'learning_rate': 0.5,
        'random_state': 42,
    }
    classification_randomforest_params = {
        'n_estimators': 100,
        'max_depth': 10,
        'min_samples_split': 2,
        'min_samples_leaf': 1,
        'max_features': 'auto',
        'bootstrap': True,
        'random_state': 42
    }
    
    model_weight_dict = {'catboost': 1.00}
    
# ====================================================
# Seed everything
# ====================================================
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

seed_everything(CFG.seed)


# ====================================================
# Metric
# ====================================================
# AUC

#データの読み込み
train_df = pd.read_csv('data/train_processed.csv', index_col=0)
test_df = pd.read_csv('data/test_processed.csv', index_col=0)

#学習に必要となるリストの作成
LabelList = ['TypeofContact','car_possesion','Passport','Gender','PitchSatisfactionScore']
#OneHotList = ['CityTier','Occupation','ProductPitched','PreferredPropertyStar','Designation','married']
OneHotList = []
default_categorical_features = ['TypeofContact','car_possesion','Passport','Gender','PitchSatisfactionScore',
                                'CityTier','Occupation','ProductPitched','PreferredPropertyStar','Designation','married']
default_numerical_features = ['Age','DurationOfPitch','NumberOfPersonVisiting','NumberOfFollowups','NumberOfTrips',
                              'MonthlyIncome','offspring']
NumericalList = ['Age','DurationOfPitch','NumberOfPersonVisiting','NumberOfFollowups','NumberOfTrips','MonthlyIncome','offspring',
                 'family_members','ChildRate', 'MoneyforOneTrip','AllOfcontact','Income_child','Income_person','Income_Age','TripFee',
                'TripFeeAllyear','Income_TripFee','Income_TripFeeAllyear','AdultMembers']

MissList = ['Age','TypeofContact','DurationOfPitch','NumberOfFollowups','NumberOfTrips','MonthlyIncome']

def Preprocessing(train_df, test_df):
    
    def miss_dealing(train_df,test_df):
        #ラベルエンコーディング
        LabelList = ['TypeofContact','Occupation','ProductPitched','Designation','married']
        for col in LabelList:
            encoder = LabelEncoder()
            combined = pd.concat([train_df[col], test_df[col]], axis=0)
            encoder.fit(combined)
            train_df[col] = encoder.transform(train_df[col])
            test_df[col] = encoder.transform(test_df[col])
        features = ['Age','TypeofContact','CityTier','DurationOfPitch','Occupation','Gender','NumberOfPersonVisiting','NumberOfFollowups',
                    'ProductPitched','PreferredPropertyStar','NumberOfTrips','Passport','PitchSatisfactionScore','Designation',
                    'MonthlyIncome','married','car_possesion','offspring']
        train_x = train_df[features]
        imputer = IterativeImputer(max_iter=10, random_state=0)
        imputer.fit(train_x)
        train_x_imputed = imputer.transform(train_x)
        train_df_imputed = pd.DataFrame(train_x_imputed, columns=train_x.columns)
        train_df_imputed[CFG.target_col] = train_df[CFG.target_col]
        test_df_imputed = imputer.transform(test_df)
        test_df_imputed = pd.DataFrame(test_df_imputed, columns=test_df.columns)
        original_index = list(range(3489,6978))
        test_df_imputed.index = original_index
        return train_df_imputed,test_df_imputed
    
    #特徴量作成
    def make_features(input_df):
        df = input_df.copy()
        def count_adult_members(input_str):
            if input_str == 1.0:
                return 2
            else:
                return 1
        print(train_df)
        print(test_df)
        df['MonthlyIncome'] = df['MonthlyIncome']//1000
        ProductPitched_Dict = {0.0:1,3.0:2,1.0:3,4.0:4,2.0:5}
        df['ProductPitched'] = df['ProductPitched'].map(ProductPitched_Dict)
        df['family_members'] = df['married'].apply(count_adult_members) + df['offspring']
        #df['AdultRate'] = 1-df['offspring']/df['NumberOfPersonVisiting']
        df['AdultMembers'] = df['NumberOfPersonVisiting']-df['offspring']
        df['family_ChildRate'] = df['offspring']/df['family_members']
        df['Income_person'] = df['MonthlyIncome']/df['family_members']
        df['Income_child'] =  df['MonthlyIncome']/(df['offspring']+0.0001)
        df['MoneyforOneTrip'] = df['MonthlyIncome']*12/df['NumberOfTrips']
        df['AllOfcontact'] = df['DurationOfPitch'] + df['NumberOfFollowups']*2
        df['Income_Age'] = df['MonthlyIncome'] / df['Age']

        #組み合わせ特徴量
        pair_combination_list = []
        combination_list = []
        pair_combination_list = [('Designation','ProductPitched')]
        #pair_combination_list = [('Designation','ProductPitched'),('married','Passport')]
        #combination_list = [('Passport','car_possesion','married')]
        for a,b in pair_combination_list:
            df[f'{a}_{b}'] = df[a].astype(str) + '_' + df[b].astype(str)
        for a,b,c in combination_list:
            df[f'{a}_{b}_{c}'] = df[a].astype(str) + '_' + df[b].astype(str) + '_' + df[c].astype(str)
            
        return df
        
    train_df, test_df = miss_dealing(train_df, test_df)
    train_df = make_features(train_df)
    test_df = make_features(test_df)
    print(train_df)
    train_df.info()
    # train_df, test_df = encoding(train_df, test_df)
    return train_df, test_df
    
#前処理の実行
train_df, test_df = Preprocessing(train_df,test_df)



#特徴量の指定
features = train_df.columns.tolist()
#学習に使用しない特徴量は以下で除外
RemoveList=[CFG.target_col]
for i in RemoveList:
    features.remove(i)
print(f'features for training:{features}')

#カテゴリカル特徴量の指定
categorical_features = copy.deepcopy(features)
print(NumericalList)
for i in NumericalList:
    if i in categorical_features:
        categorical_features.remove(i)
print(categorical_features)

for col in categorical_features:
    if train_df[col].dtype == 'float64':
        train_df[col] = train_df[col].astype(int)
        test_df[col] = test_df[col].astype(int)


train_df.info()
test_df.info()







FileNotFoundError: [Errno 2] No such file or directory: 'data/train_processed.csv'

basic 4 standard 6 deluxe 9 superdeluxe 13.5 king 20
star 3:0.8 4:2.2 5:6

In [10]:
#Learning & Predicting

#1段階目の学習
def Pre_Learning(train_df,test_df, features, categorical_features):

    def randomforest_training(x_train, y_train, x_valid, y_valid, features, categorical_features):
        # モデルのパラメータは適切に設定する
        model = RandomForestClassifier(**CFG.classification_randomforest_params)
        model.fit(x_train, y_train)
        # バリデーションデータに対する予測確率を計算
        valid_pred = model.predict_proba(x_valid)[:, 1]
        return model, valid_pred

    #adaboostでの学習メソッドの定義
    def adaboost_training(x_train, y_train, x_valid, y_valid, features, categorical_features):
        model = AdaBoostClassifier(**CFG.classification_adaboost_params)
        model.fit(x_train, y_train)
        # Predict validation
        valid_pred = model.predict_proba(x_valid)[:, 1]
        return model, valid_pred

    #lightgbmでの学習メソッドの定義
    def lightgbm_training(x_train, y_train, x_valid, y_valid, features, categorical_features):
        lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=categorical_features)
        lgb_valid = lgb.Dataset(x_valid, y_valid, categorical_feature=categorical_features)
        model = lgb.train(
                    params = CFG.classification_lgb_params,
                    train_set = lgb_train,
                    num_boost_round = CFG.num_boost_round,
                    valid_sets = [lgb_train, lgb_valid],
                    callbacks=[lgb.early_stopping(stopping_rounds=CFG.early_stopping_round,
                                                  verbose=CFG.verbose)]
                )
        # Predict validation
        valid_pred = model.predict(x_valid)
        return model, valid_pred

    #xgboostでの学習メソッドの定義
    def xgboost_training(x_train, y_train, x_valid, y_valid, features, categorical_features):
        xgb_train = xgb.DMatrix(data=x_train, label=y_train)
        xgb_valid = xgb.DMatrix(data=x_valid, label=y_valid)
        model = xgb.train(
                    CFG.classification_xgb_params,
                    dtrain = xgb_train,
                    num_boost_round = CFG.num_boost_round,
                    evals = [(xgb_train, 'train'), (xgb_valid, 'eval')],
                    early_stopping_rounds = CFG.early_stopping_round,
                    verbose_eval = CFG.verbose,
                    maximize = CFG.metric_maximize_flag,
                )
        # Predict validation
        valid_pred = model.predict(xgb.DMatrix(x_valid))
        return model, valid_pred

    #catboostでの学習メソッドの定義
    def catboost_training(x_train, y_train, x_valid, y_valid, features, categorical_features):
        cat_train = Pool(data=x_train, label=y_train, cat_features=categorical_features)
        cat_valid = Pool(data=x_valid, label=y_valid, cat_features=categorical_features)
        model = CatBoostClassifier(**CFG.classification_cat_params)
        model.fit(cat_train,
                  eval_set = [cat_valid],
                  early_stopping_rounds = CFG.early_stopping_round,
                  verbose = CFG.verbose,
                  use_best_model = True)
        # Predict validation
        valid_pred = model.predict_proba(x_valid)[:, 1]
        return model, valid_pred



    #任意のモデルでのクロスバリデーション学習メソッドの定義
    def gradient_boosting_model_cv_training(method, train_df, features, categorical_features):
        # Create a numpy array to store out of folds predictions
        oof_predictions = np.zeros(len(train_df))
        oof_fold = np.zeros(len(train_df))
        kfold = StratifiedKFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)
        for fold, (train_index, valid_index) in enumerate(kfold.split(train_df[features],train_df[CFG.target_col])):
            print('-'*50)
            print(f'{method} training fold {fold+1}')

            x_train = train_df[features].iloc[train_index]
            y_train = train_df[CFG.target_col].iloc[train_index]
            x_valid = train_df[features].iloc[valid_index]
            y_valid = train_df[CFG.target_col].iloc[valid_index]

            model = None  # モデル変数を初期化する
            valid_pred = None
            
            if method == 'randomforest':
                model, valid_pred = randomforest_training(x_train, y_train, x_valid, y_valid, features, categorical_features)
            if method == 'adaboost':
                model, valid_pred = adaboost_training(x_train, y_train, x_valid, y_valid, features, categorical_features)
            if method == 'lightgbm':
                model, valid_pred = lightgbm_training(x_train, y_train, x_valid, y_valid, features, categorical_features)
            if method == 'xgboost':
                model, valid_pred = xgboost_training(x_train, y_train, x_valid, y_valid, features, categorical_features)
            if method == 'catboost':
                model, valid_pred = catboost_training(x_train, y_train, x_valid, y_valid, features, categorical_features)  
            # Save best model
            pickle.dump(model, open(f'model/{method}_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl', 'wb'))
            # Add to out of folds array
            oof_predictions[valid_index] = valid_pred
            oof_fold[valid_index] = fold + 1
            del x_train, x_valid, y_train, y_valid, model, valid_pred
            gc.collect()

        # Compute out of folds metric
        score = roc_auc_score(train_df[CFG.target_col], oof_predictions)
        print(f'{method} our out of folds CV AUC is {score}')
        # Create a dataframe to store out of folds predictions
        oof_df = pd.DataFrame({CFG.target_col: train_df[CFG.target_col], f'{method}_prediction': oof_predictions, 'fold': oof_fold})
        oof_df.to_csv(f'oof/oof_{method}_seed{CFG.seed}_ver{CFG.VER}.csv', index = False)

    #randomforestの学習済みモデル読み込み関数
    def randomforest_inference(x_test):
        test_pred = np.zeros(len(x_test))
        for fold in range(CFG.n_folds):
            model = pickle.load(open(f'model/randomforest_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl', 'rb'))
            pred = model.predict_proba(x_test)[:, 1]
            test_pred += pred
        return test_pred / CFG.n_folds

    #adaboostの学習済みモデル読み込み関数
    def adaboost_inference(x_test):
        test_pred = np.zeros(len(x_test))
        for fold in range(CFG.n_folds):
            model = pickle.load(open(f'model/adaboost_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl', 'rb'))
            # Predict
            pred = model.predict_proba(x_test)[:, 1]
            test_pred += pred
        return test_pred / CFG.n_folds

    #lightgbmの学習モデル読み込み関数
    def lightgbm_inference(x_test):
        test_pred = np.zeros(len(x_test))
        for fold in range(CFG.n_folds):
            model = pickle.load(open(f'model/lightgbm_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl', 'rb'))
            # Predict
            pred = model.predict(x_test)
            test_pred += pred
        return test_pred / CFG.n_folds

    #xgboostの学習モデル読み込み関数
    def xgboost_inference(x_test):
        test_pred = np.zeros(len(x_test))
        for fold in range(CFG.n_folds):
            model = pickle.load(open(f'model/xgboost_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl', 'rb'))
            # Predict
            pred = model.predict(xgb.DMatrix(x_test))
            test_pred += pred
        return test_pred / CFG.n_folds

    #catboostの学習モデル読み込み関数
    def catboost_inference(x_test):
        test_pred = np.zeros(len(x_test))
        for fold in range(CFG.n_folds):
            model = pickle.load(open(f'model/catboost_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl', 'rb'))
            # Predict
            pred = model.predict_proba(x_test)[:, 1]
            test_pred += pred
        return test_pred / CFG.n_folds

    #任意のメソッドに対して予測を返す関数
    def gradient_boosting_model_inference(method, test_df, features, categorical_features):
        x_test = test_df[features]
        if method == 'randomforest':
            test_pred = randomforest_inference(x_test)
        if method == 'adaboost':
            test_pred = adaboost_inference(x_test)
        if method == 'lightgbm':
            test_pred = lightgbm_inference(x_test)
        if method == 'xgboost':
            test_pred = xgboost_inference(x_test)
        if method == 'catboost':
            test_pred = catboost_inference(x_test)
        return test_pred

    for method in CFG.METHOD_LIST:
        gradient_boosting_model_cv_training(method, train_df, features, categorical_features)
        test_df[f'{method}_pred_prob'] = gradient_boosting_model_inference(method, test_df, features, categorical_features)
        
        

Pre_Learning(train_df,test_df, features, categorical_features)

test_df['target'] = 0
for method in CFG.METHOD_LIST:
    test_df['target'] += test_df[f'{method}_pred_prob']*CFG.model_weight_dict[method]
    


--------------------------------------------------
catboost training fold 1
0:	learn: 0.6266713	test: 0.6256005	best: 0.6256005 (0)	total: 1.96ms	remaining: 1.96s
25:	learn: 0.3372874	test: 0.3339366	best: 0.3339366 (25)	total: 37.1ms	remaining: 1.39s
50:	learn: 0.3182567	test: 0.3177697	best: 0.3177697 (50)	total: 68ms	remaining: 1.26s
75:	learn: 0.3087168	test: 0.3088984	best: 0.3087920 (74)	total: 99.5ms	remaining: 1.21s
100:	learn: 0.3026135	test: 0.3053619	best: 0.3053619 (100)	total: 132ms	remaining: 1.18s
125:	learn: 0.2986934	test: 0.3037865	best: 0.3036767 (121)	total: 166ms	remaining: 1.15s
150:	learn: 0.2951815	test: 0.3014562	best: 0.3012390 (145)	total: 199ms	remaining: 1.12s
175:	learn: 0.2923099	test: 0.2988954	best: 0.2986830 (169)	total: 230ms	remaining: 1.08s
200:	learn: 0.2903692	test: 0.2980140	best: 0.2977532 (195)	total: 262ms	remaining: 1.04s
225:	learn: 0.2882616	test: 0.2971395	best: 0.2967319 (224)	total: 295ms	remaining: 1.01s
250:	learn: 0.2867822	test: 0.29

275:	learn: 0.2836310	test: 0.3005585	best: 0.2998436 (142)	total: 367ms	remaining: 964ms
300:	learn: 0.2821052	test: 0.3007558	best: 0.2998436 (142)	total: 399ms	remaining: 928ms
325:	learn: 0.2809393	test: 0.3008542	best: 0.2998436 (142)	total: 431ms	remaining: 892ms
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.2998435975
bestIteration = 142

Shrink model to first 143 iterations.
--------------------------------------------------
catboost training fold 4
0:	learn: 0.6210322	test: 0.6227659	best: 0.6227659 (0)	total: 3.32ms	remaining: 3.32s
25:	learn: 0.3332813	test: 0.3412563	best: 0.3412563 (25)	total: 40.3ms	remaining: 1.51s
50:	learn: 0.3142802	test: 0.3269515	best: 0.3269515 (50)	total: 71.8ms	remaining: 1.33s
75:	learn: 0.3047076	test: 0.3195578	best: 0.3195578 (75)	total: 104ms	remaining: 1.26s
100:	learn: 0.2990286	test: 0.3162530	best: 0.3162530 (100)	total: 138ms	remaining: 1.22s
125:	learn: 0.2951340	test: 0.3140850	best: 0.3140676 (123)	total: 172ms

525:	learn: 0.2820224	test: 0.2691848	best: 0.2691315 (495)	total: 780ms	remaining: 702ms
550:	learn: 0.2814620	test: 0.2695056	best: 0.2691315 (495)	total: 812ms	remaining: 662ms
575:	learn: 0.2809716	test: 0.2693083	best: 0.2691315 (495)	total: 843ms	remaining: 620ms
600:	learn: 0.2804194	test: 0.2698657	best: 0.2691315 (495)	total: 872ms	remaining: 579ms
625:	learn: 0.2798310	test: 0.2696788	best: 0.2691315 (495)	total: 903ms	remaining: 540ms
650:	learn: 0.2793533	test: 0.2693249	best: 0.2691315 (495)	total: 935ms	remaining: 501ms
675:	learn: 0.2788505	test: 0.2696392	best: 0.2691315 (495)	total: 966ms	remaining: 463ms
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.2691314502
bestIteration = 495

Shrink model to first 496 iterations.
--------------------------------------------------
catboost training fold 7
0:	learn: 0.6215854	test: 0.6214136	best: 0.6214136 (0)	total: 3.18ms	remaining: 3.18s
25:	learn: 0.3323266	test: 0.3501806	best: 0.3501806 (25)	total: 40.

In [11]:
train_df['pred_prob'] = 0
for method in CFG.METHOD_LIST:
    oof_df = pd.read_csv(f'oof/oof_{method}_seed{CFG.seed}_ver{CFG.VER}.csv')
    train_df['pred_prob'] += CFG.model_weight_dict[method] * oof_df[f'{method}_prediction']
score = roc_auc_score(train_df[CFG.target_col], train_df[f'pred_prob'])
print(f' CV:{score} model_weight_dict:{CFG.model_weight_dict}')

 CV:0.8511991064031247 model_weight_dict:{'catboost': 1.0}


In [12]:
#提出ファイルの出力 
test_df['target'] = 0 
for method in CFG.METHOD_LIST: 
    test_df['target'] += test_df[f'{method}_pred_prob']*CFG.model_weight_dict[method]

test_df['target'].to_csv(f'prediction/catboost_seed{CFG.seed}_ver{CFG.VER}_{CFG.AUTHOR}_cv{score}_submission.csv', header=False)

In [612]:
model = pickle.load(open(f'model/catboost_fold1_seed{CFG.seed}_ver{CFG.VER}.pkl', 'rb'))
feature_importances = model.get_feature_importance()
feature_importances_df = pd.DataFrame({
    'Feature':train_df[features].columns,
    'Importance': feature_importances}).sort_values(by = 'Importance',ascending=False)
print(feature_importances_df)

non_zero_feature_importances_df = feature_importances_df[feature_importances_df['Importance']==0.0]
non_zero_feature_importances_df_list = non_zero_feature_importances_df['Feature'].tolist()
print(len(non_zero_feature_importances_df_list),non_zero_feature_importances_df_list)

                   Feature  Importance
8           ProductPitched   23.506805
13             Designation   11.907949
11                Passport    9.186766
15                 married    8.578184
14           MonthlyIncome    6.211301
0                      Age    5.708765
5                   Gender    5.366114
23            AllOfcontact    4.657936
2                 CityTier    3.679951
22         MoneyforOneTrip    3.678859
20           Income_person    3.146589
3          DurationOfPitch    2.885465
12  PitchSatisfactionScore    2.413508
10           NumberOfTrips    1.640431
7        NumberOfFollowups    1.267544
24              Income_Age    1.091965
21            Income_child    0.964254
9    PreferredPropertyStar    0.958153
1            TypeofContact    0.938893
16           car_possesion    0.754374
17               offspring    0.740323
4               Occupation    0.392481
18          family_members    0.169743
19              Child_Rate    0.153646
6   NumberOfPersonVisitin