In [4]:
# ====================================================
# Library
# ====================================================
import sys
import os
import gc
import re
import unicodedata
import warnings
warnings.filterwarnings('ignore')
import random
import copy
import scipy as sp
import numpy as np
import pandas as pd
from glob import glob
from pathlib import Path
import joblib
import pickle
import itertools
from tqdm.auto import tqdm
import category_encoders as ce
import torch
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GroupKFold
from sklearn.metrics import log_loss, roc_auc_score, matthews_corrcoef, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from catboost import Pool, CatBoostRegressor, CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from keras.layers import BatchNormalization
from keras.layers import Activation
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.optimizers import Adam
from keras.models import load_model
from keras.callbacks import Callback
from keras.models import clone_model
from sklearn.linear_model import LogisticRegression
import shap
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer


pd.set_option('display.max_columns',1000)
pd.set_option('display.max_rows',100)

# ====================================================
# Configurations
# ====================================================
class CFG:
    VER = 4
    AUTHOR = 'Naoki'
    COMPETITION = 'SC2024'
    DATA_PATH = Path('/data')
    OOF_DATA_PATH = Path('/oof')
    MODEL_DATA_PATH = Path('/models')
    SUB_DATA_PATH = Path('/submission')
    METHOD_LIST = ['lightgbm','catboost','xgboost']
    seed = 42
    n_folds = 7
    target_col = 'ProdTaken'
    metric = 'AUC'
    metric_maximize_flag = True
    num_boost_round = 300
    early_stopping_round = 200
    verbose = 25
    classification_lgb_params = {
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.05,
        'num_leaves' : 15,
        'lambda_l1' : 0.2,
        'lambda_l2' : 0.2,
        'seed': seed,
    }
    classification_xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'learning_rate': 0.05,
        'lambda':2,
        'random_state': seed,
    }

    classification_cat_params = {
        'learning_rate': 0.05,
        'depth':2,
        'l2_leaf_reg' : 3,
        'iterations': num_boost_round,
        'random_seed': seed,
    }
    classification_adaboost_params = {
        'n_estimators': 100,
        'learning_rate': 1.0,
        'random_state': 42,
    }
    
    model_weight_dict = {'adaboost': 0.10,'lightgbm': 0.24, 'xgboost': 0.04, 'catboost': 0.72}
    
# ====================================================
# Seed everything
# ====================================================
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

seed_everything(CFG.seed)


# ====================================================
# Metric
# ====================================================
# AUC

# ====================================================
# LightGBM Metric
# ====================================================
def lgb_metric(y_pred, y_true):
    y_true = y_true.get_label()
    return 'f1score', f1_score(y_true, np.where(y_pred >= 0.5, 1, 0), average='macro'), CFG.metric_maximize_flag

# ====================================================
# XGBoost Metric
# ====================================================
def xgb_metric(y_pred, y_true):
    y_true = y_true.get_label()
    return 'f1score', f1_score(y_true, np.where(y_pred >= 0.5, 1, 0), average='macro')

In [5]:
#データの読み込み
train_df = pd.read_csv('data/train_processed.csv', index_col=0)
test_df = pd.read_csv('data/test_processed.csv', index_col=0)

#学習に必要となるリストの作成
LabelList = ['TypeofContact','car_possesion','Passport','Gender','PitchSatisfactionScore']
OneHotList = ['CityTier','Occupation','ProductPitched','PreferredPropertyStar','Designation','married']
default_categorical_features = ['TypeofContact','car_possesion','Passport','Gender','PitchSatisfactionScore',
                                'CityTier','Occupation','ProductPitched','PreferredPropertyStar','Designation','married']
default_numerical_features = ['Age','DurationOfPitch','NumberOfPersonVisiting','NumberOfFollowups','NumberOfTrips',
                              'MonthlyIncome','offspring']
'''
NumericalList = ['Age','DurationOfPitch','NumberOfPersonVisiting','NumberOfFollowups','NumberOfTrips','MonthlyIncome','offspring',
               'family_members','Income_person','Child_Rate', 'MoneyforOneTrip','AllOfcontact','PitchPoint','Income_child']
'''
NumericalList = ['Age','DurationOfPitch','NumberOfPersonVisiting','NumberOfFollowups','NumberOfTrips','MonthlyIncome','offspring',
               'family_members','Income_person','Child_Rate', 'MoneyforOneTrip','AllOfcontact','PitchPoint','Income_child']

def Preprocessing(train_df, test_df):
    
    def miss_dealing(train_df,test_df):
        #ラベルエンコーディング
        LabelList = ['TypeofContact','CityTier','Occupation','ProductPitched','PreferredPropertyStar','Designation','married']
        for col in LabelList:
            encoder = LabelEncoder()
            combined = pd.concat([train_df[col], test_df[col]], axis=0)
            encoder.fit(combined)
            train_df[col] = encoder.transform(train_df[col])
            test_df[col] = encoder.transform(test_df[col])
        features = ['Age','TypeofContact','CityTier','DurationOfPitch','Occupation','Gender','NumberOfPersonVisiting','NumberOfFollowups',
                    'ProductPitched','PreferredPropertyStar','NumberOfTrips','Passport','PitchSatisfactionScore','Designation',
                    'MonthlyIncome','married','car_possesion','offspring']
        train_x = train_df[features]
        imputer = IterativeImputer(max_iter=10, random_state=0)
        imputer.fit(train_x)
        train_x_imputed = imputer.transform(train_x)
        train_df_imputed = pd.DataFrame(train_x_imputed, columns=train_x.columns)
        train_df_imputed[CFG.target_col] = train_df[CFG.target_col]
        test_df_imputed = imputer.transform(test_df)
        test_df_imputed = pd.DataFrame(test_df_imputed, columns=test_df.columns)
        original_index = list(range(3489,6978))
        test_df_imputed.index = original_index
        return train_df_imputed,test_df_imputed
    
     #特徴量作成
    def make_features(input_df):
        df = input_df.copy()
        def count_adult_members(input_str):
            if input_str == 1.0:
                return 2
            else:
                return 1
        
        df['family_members'] = df['married'].apply(count_adult_members) + df['offspring']
        df['Child_Rate'] = df['offspring']/df['family_members']
        df['Income_person'] = df['MonthlyIncome']/df['family_members']
        df['Income_child'] =  df['MonthlyIncome']/(df['offspring']+0.0001)
        df['MoneyforOneTrip'] = df['MonthlyIncome']*12/df['NumberOfTrips']
        df['AllOfcontact'] = df['DurationOfPitch'] + df['NumberOfFollowups']*2.1
        df['PitchPoint'] = df['DurationOfPitch'] * df['PitchSatisfactionScore']

        

            


        
        return df
    
    

    def encoding(train_df,test_df):
        #ラベルエンコーディング
        for col in LabelList:
            encoder = LabelEncoder()
            combined = pd.concat([train_df[col], test_df[col]], axis=0)
            encoder.fit(combined)
            train_df[col] = encoder.transform(train_df[col])
            test_df[col] = encoder.transform(test_df[col])
        #ワンホットエンコーディング
        train_df2 = train_df.drop([CFG.target_col],axis=1)
        ohe = ce.OneHotEncoder(cols=OneHotList,use_cat_names=True)
        train_df2 = ohe.fit_transform(train_df2)
        test_df = ohe.transform(test_df)
        train_df = pd.concat([train_df2,train_df[CFG.target_col]],axis=1)
        return train_df, test_df
        
    train_df, test_df = miss_dealing(train_df, test_df)
    train_df = make_features(train_df)
    test_df = make_features(test_df)
    print(train_df)
    train_df.info()
    train_df, test_df = encoding(train_df, test_df)
    return train_df, test_df
    
#前処理の実行
train_df, test_df = Preprocessing(train_df,test_df)



#特徴量の指定
features = train_df.columns.tolist()
#学習に使用しない特徴量は以下で除外
RemoveList=[CFG.target_col]
for i in RemoveList:
    features.remove(i)
print(f'features for training:{features}')

#カテゴリカル特徴量の指定
categorical_features = copy.deepcopy(features)
print(NumericalList)
for i in NumericalList:
    categorical_features.remove(i)
print(categorical_features)




            Age  TypeofContact  CityTier  DurationOfPitch  Occupation  Gender  \
0     50.000000            1.0       1.0             15.0         0.0     0.0   
1     56.000000            0.0       0.0             14.0         1.0     0.0   
2     35.814031            1.0       0.0             10.0         0.0     1.0   
3     37.000000            1.0       1.0             18.0         2.0     1.0   
4     48.000000            0.0       2.0             17.0         2.0     1.0   
...         ...            ...       ...              ...         ...     ...   
3484  40.000000            1.0       1.0             26.0         1.0     0.0   
3485  40.000000            1.0       0.0              9.0         0.0     0.0   
3486  31.000000            1.0       0.0             14.0         2.0     1.0   
3487  56.000000            0.0       1.0             15.0         1.0     0.0   
3488  42.000000            1.0       0.0              9.0         2.0     0.0   

      NumberOfPersonVisitin

In [6]:
#パラメータ調整
params = {'depth':[1,2,3,4,5],
         'learning_rate':[0.01,0.02,0.03,0.05,0.7,0.1],
         'l2_leaf_reg':[3,4,5,6,7,8],
         'iterations':[300]}

ctb = cb.CatBoostClassifier(eval_metric='AUC',logging_level='Silent')
ctb_grid_search = GridSearchCV(ctb,params,scoring='roc_auc',cv=3,verbose=2,error_score='raise')
ctb_grid_search.fit(train_df[features],train_df[CFG.target_col])

print(ctb_grid_search.best_params_)
print(ctb_grid_search.best_index_)
print(ctb_grid_search.best_score_)

Fitting 3 folds for each of 180 candidates, totalling 540 fits
[CV] END depth=1, iterations=300, l2_leaf_reg=3, learning_rate=0.01; total time=   1.3s
[CV] END depth=1, iterations=300, l2_leaf_reg=3, learning_rate=0.01; total time=   1.1s
[CV] END depth=1, iterations=300, l2_leaf_reg=3, learning_rate=0.01; total time=   1.2s
[CV] END depth=1, iterations=300, l2_leaf_reg=3, learning_rate=0.02; total time=   1.1s
[CV] END depth=1, iterations=300, l2_leaf_reg=3, learning_rate=0.02; total time=   1.1s
[CV] END depth=1, iterations=300, l2_leaf_reg=3, learning_rate=0.02; total time=   1.1s
[CV] END depth=1, iterations=300, l2_leaf_reg=3, learning_rate=0.03; total time=   1.1s
[CV] END depth=1, iterations=300, l2_leaf_reg=3, learning_rate=0.03; total time=   1.1s
[CV] END depth=1, iterations=300, l2_leaf_reg=3, learning_rate=0.03; total time=   1.1s
[CV] END depth=1, iterations=300, l2_leaf_reg=3, learning_rate=0.05; total time=   1.1s
[CV] END depth=1, iterations=300, l2_leaf_reg=3, learning

[CV] END depth=1, iterations=300, l2_leaf_reg=8, learning_rate=0.02; total time=   1.1s
[CV] END depth=1, iterations=300, l2_leaf_reg=8, learning_rate=0.02; total time=   1.1s
[CV] END depth=1, iterations=300, l2_leaf_reg=8, learning_rate=0.02; total time=   1.0s
[CV] END depth=1, iterations=300, l2_leaf_reg=8, learning_rate=0.03; total time=   1.0s
[CV] END depth=1, iterations=300, l2_leaf_reg=8, learning_rate=0.03; total time=   1.1s
[CV] END depth=1, iterations=300, l2_leaf_reg=8, learning_rate=0.03; total time=   1.1s
[CV] END depth=1, iterations=300, l2_leaf_reg=8, learning_rate=0.05; total time=   1.1s
[CV] END depth=1, iterations=300, l2_leaf_reg=8, learning_rate=0.05; total time=   1.1s
[CV] END depth=1, iterations=300, l2_leaf_reg=8, learning_rate=0.05; total time=   1.0s
[CV] END depth=1, iterations=300, l2_leaf_reg=8, learning_rate=0.7; total time=   1.1s
[CV] END depth=1, iterations=300, l2_leaf_reg=8, learning_rate=0.7; total time=   1.1s
[CV] END depth=1, iterations=300, 

[CV] END depth=2, iterations=300, l2_leaf_reg=7, learning_rate=0.03; total time=   1.4s
[CV] END depth=2, iterations=300, l2_leaf_reg=7, learning_rate=0.03; total time=   1.3s
[CV] END depth=2, iterations=300, l2_leaf_reg=7, learning_rate=0.05; total time=   1.3s
[CV] END depth=2, iterations=300, l2_leaf_reg=7, learning_rate=0.05; total time=   1.3s
[CV] END depth=2, iterations=300, l2_leaf_reg=7, learning_rate=0.05; total time=   1.3s
[CV] END depth=2, iterations=300, l2_leaf_reg=7, learning_rate=0.7; total time=   1.3s
[CV] END depth=2, iterations=300, l2_leaf_reg=7, learning_rate=0.7; total time=   1.4s
[CV] END depth=2, iterations=300, l2_leaf_reg=7, learning_rate=0.7; total time=   1.3s
[CV] END depth=2, iterations=300, l2_leaf_reg=7, learning_rate=0.1; total time=   1.3s
[CV] END depth=2, iterations=300, l2_leaf_reg=7, learning_rate=0.1; total time=   1.3s
[CV] END depth=2, iterations=300, l2_leaf_reg=7, learning_rate=0.1; total time=   1.3s
[CV] END depth=2, iterations=300, l2_l

[CV] END depth=3, iterations=300, l2_leaf_reg=6, learning_rate=0.05; total time=   1.5s
[CV] END depth=3, iterations=300, l2_leaf_reg=6, learning_rate=0.7; total time=   1.5s
[CV] END depth=3, iterations=300, l2_leaf_reg=6, learning_rate=0.7; total time=   1.5s
[CV] END depth=3, iterations=300, l2_leaf_reg=6, learning_rate=0.7; total time=   1.5s
[CV] END depth=3, iterations=300, l2_leaf_reg=6, learning_rate=0.1; total time=   1.5s
[CV] END depth=3, iterations=300, l2_leaf_reg=6, learning_rate=0.1; total time=   1.6s
[CV] END depth=3, iterations=300, l2_leaf_reg=6, learning_rate=0.1; total time=   1.5s
[CV] END depth=3, iterations=300, l2_leaf_reg=7, learning_rate=0.01; total time=   1.5s
[CV] END depth=3, iterations=300, l2_leaf_reg=7, learning_rate=0.01; total time=   1.5s
[CV] END depth=3, iterations=300, l2_leaf_reg=7, learning_rate=0.01; total time=   1.5s
[CV] END depth=3, iterations=300, l2_leaf_reg=7, learning_rate=0.02; total time=   1.6s
[CV] END depth=3, iterations=300, l2_l

[CV] END depth=4, iterations=300, l2_leaf_reg=5, learning_rate=0.1; total time=   1.9s
[CV] END depth=4, iterations=300, l2_leaf_reg=5, learning_rate=0.1; total time=   2.0s
[CV] END depth=4, iterations=300, l2_leaf_reg=5, learning_rate=0.1; total time=   1.6s
[CV] END depth=4, iterations=300, l2_leaf_reg=6, learning_rate=0.01; total time=   1.7s
[CV] END depth=4, iterations=300, l2_leaf_reg=6, learning_rate=0.01; total time=   1.8s
[CV] END depth=4, iterations=300, l2_leaf_reg=6, learning_rate=0.01; total time=   1.7s
[CV] END depth=4, iterations=300, l2_leaf_reg=6, learning_rate=0.02; total time=   1.6s
[CV] END depth=4, iterations=300, l2_leaf_reg=6, learning_rate=0.02; total time=   1.9s
[CV] END depth=4, iterations=300, l2_leaf_reg=6, learning_rate=0.02; total time=   1.6s
[CV] END depth=4, iterations=300, l2_leaf_reg=6, learning_rate=0.03; total time=   2.0s
[CV] END depth=4, iterations=300, l2_leaf_reg=6, learning_rate=0.03; total time=   2.2s
[CV] END depth=4, iterations=300, l

[CV] END depth=5, iterations=300, l2_leaf_reg=5, learning_rate=0.01; total time=   2.5s
[CV] END depth=5, iterations=300, l2_leaf_reg=5, learning_rate=0.01; total time=   2.3s
[CV] END depth=5, iterations=300, l2_leaf_reg=5, learning_rate=0.02; total time=   2.4s
[CV] END depth=5, iterations=300, l2_leaf_reg=5, learning_rate=0.02; total time=   2.5s
[CV] END depth=5, iterations=300, l2_leaf_reg=5, learning_rate=0.02; total time=   2.2s
[CV] END depth=5, iterations=300, l2_leaf_reg=5, learning_rate=0.03; total time=   2.3s
[CV] END depth=5, iterations=300, l2_leaf_reg=5, learning_rate=0.03; total time=   2.5s
[CV] END depth=5, iterations=300, l2_leaf_reg=5, learning_rate=0.03; total time=   2.3s
[CV] END depth=5, iterations=300, l2_leaf_reg=5, learning_rate=0.05; total time=   2.3s
[CV] END depth=5, iterations=300, l2_leaf_reg=5, learning_rate=0.05; total time=   2.6s
[CV] END depth=5, iterations=300, l2_leaf_reg=5, learning_rate=0.05; total time=   2.3s
[CV] END depth=5, iterations=300

In [10]:
kf2 = StratifiedKFold(n_splits=7,shuffle=True,random_state=11)
test_pred_list = []
auc_list = []
preds = []
for fold, (train_idx,valid_idx) in enumerate(kf2.split(train_df[features],train_df[CFG.target_col])):
    x_tr, y_tr = train_df[features].iloc[train_idx, :],train_df[CFG.target_col].iloc[train_idx]
    x_val,y_val = train_df[features].iloc[valid_idx, :],train_df[CFG.target_col].iloc[valid_idx]
    
    clf = cb.CatBoostClassifier(eval_metric='AUC',depth=1,iterations=300,l2_leaf_reg=6,learning_rate=0.1,logging_level='Silent')
    clf.fit(x_tr,y_tr)
    
    y_pred = clf.predict_proba(x_val)[:,1]
    auc = roc_auc_score(y_val, y_pred)
    test_pred = clf.predict_proba(test_df[features])[:,1]
    test_pred_list.append(test_pred)
    print(f'Fold{fold+1}:AUC = {auc:.4f}')
    auc_list.append(auc)

print(np.mean(auc_list))
last_pred = sum(test_pred_list)

Fold1:AUC = 0.8675
Fold2:AUC = 0.8747
Fold3:AUC = 0.8228
Fold4:AUC = 0.8347
Fold5:AUC = 0.8514
Fold6:AUC = 0.8406
Fold7:AUC = 0.8168
0.8440685424459912


In [11]:
test_df['target'] = 0
test_df['target'] += last_pred/7
test_df['target'].to_csv(f'seed{CFG.seed}_ver{CFG.VER}_{CFG.AUTHOR}_submission_fromAiful.csv', header=False)

In [54]:
df1 = pd.read_csv('seed42_ver1_Naoki_submission.csv', header=None,names=['Index','Prediction1'])
df2 = pd.read_csv('seed42_ver1_Naoki_submission_fromAiful.csv', header=None,names=['Index','Prediction2'])

In [55]:
df1['Prediction'] = (df1['Prediction1'] +df2['Prediction2'])/2

In [57]:
original_index = list(range(3489,6978))
df1.index = original_index
df1['Prediction1'].to_csv('df1.csv',  header=False)