In [1]:
!pip3 install scikit-learn tqdm catboost optuna xgboost ipywidgets imblearn



[0m

In [2]:
import pandas as pd
import sklearn
import numpy as np
import random
import os
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import SuccessiveHalvingPruner
from catboost import CatBoostClassifier, Pool

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import StratifiedKFold


In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed=42
seed_everything(seed) # Seed 고정

In [4]:
df_train = pd.read_csv("train.csv") # 학습용 데이터
df_test = pd.read_csv("submission.csv") # 테스트 데이터(제출파일의 데이터)

In [5]:
def category_all(df_train, df_test):
    label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_position",
    "response_corporate",
    "expected_timeline",
    'customer_idx',
    'lead_owner',

    ]
    for i in label_columns:
        df_train[i] = df_train[i].astype('category')
        df_test[i] = df_test[i].astype('category')
    return df_train, df_test    


def preprocessing(df_train, df_test):
    df_train.drop_duplicates(keep='first', inplace=True)
    df_train.reset_index(inplace=True)
    df_train.drop('index',inplace=True, axis=1)
    df_test_id_temp = df_test['id'].copy()
    df_all = pd.concat([df_train,df_test.drop('id',axis=1)])
    df_all.drop('customer_country.1',axis=1,inplace=True)
    df_all.id_strategic_ver.fillna(0., inplace=True)
    df_all.it_strategic_ver.fillna(0., inplace=True)
    df_all.idit_strategic_ver.fillna(0., inplace=True)
    df_all.business_subarea.fillna('NAN', inplace=True)
    df_all.com_reg_ver_win_rate.fillna(0,inplace=True)
    df_all['customer_type'] = df_all['customer_type'].replace({
        'End-Customer':'End Customer'
    })
    df_all.customer_type.fillna('NAN',inplace=True)
    df_all.historical_existing_cnt.fillna(0, inplace=True)
    
    df_all.product_subcategory.fillna('NAN', inplace=True)
    
    df_all.product_modelname.fillna('NAN', inplace=True)

    df_all['expected_timeline'] = df_all['expected_timeline'].replace({
        'less_than_3_months': 'less than 3 months',
        'being followed up': 'being followed up.',
        'less than 6 months': '3 months ~ 6 months',
        '3_months_~_6_months' : '3 months ~ 6 months'
    })

    df_all.expected_timeline.fillna('NAN',inplace=True)
    df_all.ver_win_rate_x.fillna(0,inplace=True)
    df_all.ver_win_ratio_per_bu.fillna(0,inplace=True)
    df_all.business_area.fillna('NAN',inplace=True)
    df_all.customer_job.fillna('NAN', inplace=True)
    df_all.product_category.fillna('NAN', inplace=True)
    
    #처리못한부분
    #customer_country
    df_all.customer_country.fillna('NAN', inplace=True)
    #inquiry_type
    df_all.inquiry_type.fillna('NAN', inplace=True)

    label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_position",
    "response_corporate",
    "expected_timeline",
    ]
    
    ###############################
    #detect special char and to lower
    for i in label_columns:
        df_all[i] = df_all[i].str.lower()
        df_all[i] = df_all[i].str.replace(pat=r'[^\w]', repl=r'', regex=True)
        
    #to int
    df_all['id_strategic_ver'] = df_all['id_strategic_ver'].astype('int64')
    df_all['it_strategic_ver'] = df_all['it_strategic_ver'].astype('int64')
    df_all['idit_strategic_ver'] = df_all['idit_strategic_ver'].astype('int64')
    df_all['lead_desc_length'] = df_all['lead_desc_length'].astype('int64')
    df_all['historical_existing_cnt'] = df_all['historical_existing_cnt'].astype('int64')
    
    #seperate
    tmp_len = len(df_train)
    df_train = df_all.iloc[: tmp_len ]
    df_test = df_all.iloc[tmp_len :]
    df_test = pd.concat([df_test,df_test_id_temp],axis=1)
    
    #RobustScaler
    robustScaler = RobustScaler()
    df_train['ver_win_rate_x'] = robustScaler.fit_transform(df_train[['ver_win_rate_x']])
    df_test['ver_win_rate_x'] = robustScaler.transform(df_test[['ver_win_rate_x']])
    robustScaler = RobustScaler()
    df_train['ver_win_ratio_per_bu'] = robustScaler.fit_transform(df_train[['ver_win_ratio_per_bu']])
    df_test['ver_win_ratio_per_bu'] = robustScaler.transform(df_test[['ver_win_ratio_per_bu']])
    ##############################
    
    return df_train, df_test


df_train, df_test = preprocessing(df_train, df_test)
df_train, df_test = category_all(df_train, df_test)
rate = ((len(df_train)-sum(df_train['is_converted']))/sum(df_train['is_converted']))
x_train, y_train = df_train.drop("is_converted",axis=1), df_train["is_converted"]

In [6]:
categorical_features = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_position",
    "response_corporate",
    "expected_timeline",
   'customer_idx',
   'lead_owner',
    ]

S_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

all_train_pool = []
all_val_pool = []
all_y_val = []
for train_idx, val_idx in S_kfold.split(x_train, list(y_train)):
    tmp_x_train, tmp_x_val = x_train.iloc[train_idx], x_train.iloc[val_idx]
    tmp_y_train, tmp_y_val = y_train[train_idx], y_train[val_idx]
    train_pool = Pool(data=tmp_x_train, label=tmp_y_train, cat_features=categorical_features)
    val_pool = Pool(data=tmp_x_val, label=tmp_y_val, cat_features=categorical_features)
    all_train_pool.append(train_pool)
    all_val_pool.append(val_pool)
    all_y_val.append(list(tmp_y_val))

In [7]:
def objective(trial):
    params = {'iterations':trial.suggest_int("iterations", 10000, 50000),
        'learning_rate' : trial.suggest_float('learning_rate',0.001, 1),
        'reg_lambda': trial.suggest_float('reg_lambda',10,100),
        'random_strength': trial.suggest_float('random_strength',40,100),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf',1,50),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations',1,15),
        'bagging_temperature' :trial.suggest_float('bagging_temperature', 0, 1),
              'one_hot_max_size':trial.suggest_int('one_hot_max_size',1,10),
             }
    
    #score = []
    tmp_score = []
    for train_pool, val_pool, y_true in zip(all_train_pool, all_val_pool, all_y_val):
        clf = CatBoostClassifier(**params,
                                 scale_pos_weight=rate, 
                                 loss_function='Logloss',
                                 eval_metric='F1',
                                 early_stopping_rounds=100,
                                 verbose=False,
                                random_state=seed,
                                 task_type="GPU",
                                 devices='0',
                                 
                                )
        clf.fit(train_pool, eval_set=val_pool)
        
        y_pred = clf.predict(val_pool)
        y_pred = [False if i=='False' else True for i in y_pred]
        tmp_score.append(f1_score(y_true,y_pred,labels=[True, False]))
    tmp_score = np.mean(tmp_score)
    return tmp_score

# Hyperparameter Tuning
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=seed), pruner=SuccessiveHalvingPruner())
study.optimize(objective, n_trials=None)

[I 2024-02-16 19:44:46,090] A new study created in memory with name: no-name-bf567ebd-17d8-4c34-8f9b-54fbaa098b17
[I 2024-02-16 19:45:07,732] Trial 0 finished with value: 0.7706675109463528 and parameters: {'iterations': 24981, 'learning_rate': 0.9507635921035062, 'reg_lambda': 75.87945476302646, 'random_strength': 75.91950905182219, 'min_data_in_leaf': 8, 'leaf_estimation_iterations': 3, 'bagging_temperature': 0.05808361216819946, 'one_hot_max_size': 9}. Best is trial 0 with value: 0.7706675109463528.
[I 2024-02-16 19:45:26,031] Trial 1 finished with value: 0.7636447433088056 and parameters: {'iterations': 34045, 'learning_rate': 0.7083645052182495, 'reg_lambda': 11.85260448662222, 'random_strength': 98.19459112971967, 'min_data_in_leaf': 42, 'leaf_estimation_iterations': 4, 'bagging_temperature': 0.18182496720710062, 'one_hot_max_size': 2}. Best is trial 0 with value: 0.7706675109463528.
[I 2024-02-16 19:45:47,722] Trial 2 finished with value: 0.7665065827936256 and parameters: {'ite

[I 2024-02-16 19:55:14,696] Trial 21 finished with value: 0.7690790517817783 and parameters: {'iterations': 35711, 'learning_rate': 0.4844323020870215, 'reg_lambda': 49.999097286613036, 'random_strength': 93.40912739671238, 'min_data_in_leaf': 16, 'leaf_estimation_iterations': 10, 'bagging_temperature': 0.24828688471803845, 'one_hot_max_size': 6}. Best is trial 15 with value: 0.7794830800652497.
[I 2024-02-16 19:55:41,271] Trial 22 finished with value: 0.7763385433996177 and parameters: {'iterations': 44644, 'learning_rate': 0.5661587419145809, 'reg_lambda': 62.354042351975686, 'random_strength': 92.1411634518281, 'min_data_in_leaf': 19, 'leaf_estimation_iterations': 9, 'bagging_temperature': 0.3595176417317194, 'one_hot_max_size': 5}. Best is trial 15 with value: 0.7794830800652497.
[I 2024-02-16 19:56:08,138] Trial 23 finished with value: 0.7757814838171191 and parameters: {'iterations': 49295, 'learning_rate': 0.4834067841825069, 'reg_lambda': 52.88153230507832, 'random_strength': 9

[I 2024-02-16 20:06:51,168] Trial 42 finished with value: 0.7714970949178253 and parameters: {'iterations': 46258, 'learning_rate': 0.2352944240272729, 'reg_lambda': 69.8533867015491, 'random_strength': 89.20801401324123, 'min_data_in_leaf': 37, 'leaf_estimation_iterations': 11, 'bagging_temperature': 0.21248679433109266, 'one_hot_max_size': 5}. Best is trial 27 with value: 0.782860316614663.
[I 2024-02-16 20:07:20,437] Trial 43 finished with value: 0.7738022830561302 and parameters: {'iterations': 38448, 'learning_rate': 0.3216563317821338, 'reg_lambda': 72.21087477625362, 'random_strength': 95.55065457059318, 'min_data_in_leaf': 40, 'leaf_estimation_iterations': 10, 'bagging_temperature': 0.1294790466413386, 'one_hot_max_size': 7}. Best is trial 27 with value: 0.782860316614663.
[I 2024-02-16 20:07:55,983] Trial 44 finished with value: 0.7661521828945969 and parameters: {'iterations': 44267, 'learning_rate': 0.16357185289429152, 'reg_lambda': 57.27462910639068, 'random_strength': 90.

[I 2024-02-16 20:20:48,920] Trial 63 finished with value: 0.7689850937031164 and parameters: {'iterations': 43135, 'learning_rate': 0.14556040783511237, 'reg_lambda': 81.88372929436893, 'random_strength': 97.72004254357631, 'min_data_in_leaf': 38, 'leaf_estimation_iterations': 15, 'bagging_temperature': 0.3285412125995003, 'one_hot_max_size': 9}. Best is trial 27 with value: 0.782860316614663.
[I 2024-02-16 20:21:25,940] Trial 64 finished with value: 0.7648929485572323 and parameters: {'iterations': 44951, 'learning_rate': 0.25805227081558396, 'reg_lambda': 74.06842549576646, 'random_strength': 94.38407536867626, 'min_data_in_leaf': 32, 'leaf_estimation_iterations': 1, 'bagging_temperature': 0.21366782765066203, 'one_hot_max_size': 8}. Best is trial 27 with value: 0.782860316614663.
[I 2024-02-16 20:21:57,847] Trial 65 finished with value: 0.7830489349135604 and parameters: {'iterations': 46955, 'learning_rate': 0.34890567536531836, 'reg_lambda': 45.51401491880683, 'random_strength': 8

[I 2024-02-16 20:31:14,422] Trial 84 finished with value: 0.7792545808763294 and parameters: {'iterations': 42963, 'learning_rate': 0.36994096400895177, 'reg_lambda': 43.6119185365073, 'random_strength': 92.17005893169534, 'min_data_in_leaf': 39, 'leaf_estimation_iterations': 4, 'bagging_temperature': 0.2426627328525148, 'one_hot_max_size': 10}. Best is trial 65 with value: 0.7830489349135604.
[I 2024-02-16 20:31:46,599] Trial 85 finished with value: 0.7715343389824764 and parameters: {'iterations': 47583, 'learning_rate': 0.2129861234366769, 'reg_lambda': 24.41237552102089, 'random_strength': 96.71593428418795, 'min_data_in_leaf': 31, 'leaf_estimation_iterations': 5, 'bagging_temperature': 0.029689731393968324, 'one_hot_max_size': 9}. Best is trial 65 with value: 0.7830489349135604.
[I 2024-02-16 20:32:24,496] Trial 86 finished with value: 0.7801116694088764 and parameters: {'iterations': 10190, 'learning_rate': 0.25349458945149383, 'reg_lambda': 65.45081465209253, 'random_strength': 

[I 2024-02-16 20:42:45,758] Trial 105 finished with value: 0.7814166320650154 and parameters: {'iterations': 12144, 'learning_rate': 0.29100285960610983, 'reg_lambda': 29.331520737169164, 'random_strength': 90.63438113086649, 'min_data_in_leaf': 36, 'leaf_estimation_iterations': 11, 'bagging_temperature': 0.18169985710805586, 'one_hot_max_size': 7}. Best is trial 65 with value: 0.7830489349135604.
[I 2024-02-16 20:43:12,552] Trial 106 finished with value: 0.7729689270542979 and parameters: {'iterations': 12542, 'learning_rate': 0.453391397165425, 'reg_lambda': 29.78030131977984, 'random_strength': 91.01638124033065, 'min_data_in_leaf': 34, 'leaf_estimation_iterations': 15, 'bagging_temperature': 0.18256520889700112, 'one_hot_max_size': 7}. Best is trial 65 with value: 0.7830489349135604.
[I 2024-02-16 20:43:38,635] Trial 107 finished with value: 0.7761559601575915 and parameters: {'iterations': 47141, 'learning_rate': 0.3947230478257327, 'reg_lambda': 19.747704452941583, 'random_streng

[I 2024-02-16 20:53:21,647] Trial 126 finished with value: 0.780435713745508 and parameters: {'iterations': 16787, 'learning_rate': 0.4835095639223064, 'reg_lambda': 35.76477657088772, 'random_strength': 86.61944274759729, 'min_data_in_leaf': 36, 'leaf_estimation_iterations': 11, 'bagging_temperature': 0.8067625001650762, 'one_hot_max_size': 4}. Best is trial 114 with value: 0.7841663256627658.
[I 2024-02-16 20:53:52,099] Trial 127 finished with value: 0.7714777182674079 and parameters: {'iterations': 10014, 'learning_rate': 0.32290030495727823, 'reg_lambda': 50.966676826591915, 'random_strength': 99.95119678171541, 'min_data_in_leaf': 39, 'leaf_estimation_iterations': 8, 'bagging_temperature': 0.012129019673649283, 'one_hot_max_size': 1}. Best is trial 114 with value: 0.7841663256627658.
[I 2024-02-16 20:54:22,650] Trial 128 finished with value: 0.7695226737722464 and parameters: {'iterations': 15431, 'learning_rate': 0.29045452708935443, 'reg_lambda': 30.82476436107409, 'random_stren

[I 2024-02-16 21:04:22,108] Trial 147 finished with value: 0.7696348893034722 and parameters: {'iterations': 48558, 'learning_rate': 0.22484320068920605, 'reg_lambda': 23.320387232862743, 'random_strength': 90.16411423342736, 'min_data_in_leaf': 38, 'leaf_estimation_iterations': 15, 'bagging_temperature': 0.7431441426780121, 'one_hot_max_size': 4}. Best is trial 138 with value: 0.7845736970222855.
[I 2024-02-16 21:04:53,614] Trial 148 finished with value: 0.7815949058378397 and parameters: {'iterations': 47195, 'learning_rate': 0.28304465870017315, 'reg_lambda': 16.544996405150442, 'random_strength': 96.8041613464572, 'min_data_in_leaf': 37, 'leaf_estimation_iterations': 13, 'bagging_temperature': 0.15264807019307355, 'one_hot_max_size': 9}. Best is trial 138 with value: 0.7845736970222855.
[I 2024-02-16 21:05:27,210] Trial 149 finished with value: 0.7816645436841284 and parameters: {'iterations': 44907, 'learning_rate': 0.27438364673906235, 'reg_lambda': 17.230054302840013, 'random_st

[I 2024-02-16 21:15:25,079] Trial 168 finished with value: 0.7699819409580972 and parameters: {'iterations': 10688, 'learning_rate': 0.400083053732348, 'reg_lambda': 23.322920050119084, 'random_strength': 94.4092473770131, 'min_data_in_leaf': 43, 'leaf_estimation_iterations': 14, 'bagging_temperature': 0.12064523120710566, 'one_hot_max_size': 2}. Best is trial 138 with value: 0.7845736970222855.
[I 2024-02-16 21:15:54,203] Trial 169 finished with value: 0.7730581780219677 and parameters: {'iterations': 37743, 'learning_rate': 0.362412849196494, 'reg_lambda': 20.321670035029257, 'random_strength': 99.9650171689519, 'min_data_in_leaf': 45, 'leaf_estimation_iterations': 14, 'bagging_temperature': 0.06191956892344329, 'one_hot_max_size': 3}. Best is trial 138 with value: 0.7845736970222855.
[I 2024-02-16 21:16:22,375] Trial 170 finished with value: 0.774250238263862 and parameters: {'iterations': 45917, 'learning_rate': 0.33169377968015235, 'reg_lambda': 27.91310610059571, 'random_strength

[I 2024-02-16 21:26:00,232] Trial 189 finished with value: 0.7668769255633752 and parameters: {'iterations': 43805, 'learning_rate': 0.3067136275201253, 'reg_lambda': 99.37178010942415, 'random_strength': 96.0114762393522, 'min_data_in_leaf': 35, 'leaf_estimation_iterations': 14, 'bagging_temperature': 0.1749897321823015, 'one_hot_max_size': 6}. Best is trial 180 with value: 0.7870537764656482.
[I 2024-02-16 21:26:28,053] Trial 190 finished with value: 0.7740021319139492 and parameters: {'iterations': 28704, 'learning_rate': 0.4240015036169938, 'reg_lambda': 56.37467045287884, 'random_strength': 91.3336873198279, 'min_data_in_leaf': 40, 'leaf_estimation_iterations': 13, 'bagging_temperature': 0.018108424538238264, 'one_hot_max_size': 5}. Best is trial 180 with value: 0.7870537764656482.
[I 2024-02-16 21:26:57,136] Trial 191 finished with value: 0.7711184499145686 and parameters: {'iterations': 44438, 'learning_rate': 0.27892083440307547, 'reg_lambda': 15.596863102571522, 'random_streng

[I 2024-02-16 21:36:19,158] Trial 210 finished with value: 0.7760846476604913 and parameters: {'iterations': 49079, 'learning_rate': 0.3401609939816302, 'reg_lambda': 73.80809930415727, 'random_strength': 91.97006861290745, 'min_data_in_leaf': 38, 'leaf_estimation_iterations': 4, 'bagging_temperature': 0.085826997582416, 'one_hot_max_size': 6}. Best is trial 180 with value: 0.7870537764656482.
[I 2024-02-16 21:36:51,543] Trial 211 finished with value: 0.7785414176323296 and parameters: {'iterations': 45396, 'learning_rate': 0.2724532043524644, 'reg_lambda': 17.487324229070577, 'random_strength': 81.85407844248687, 'min_data_in_leaf': 36, 'leaf_estimation_iterations': 13, 'bagging_temperature': 0.15742891801724268, 'one_hot_max_size': 10}. Best is trial 180 with value: 0.7870537764656482.
[I 2024-02-16 21:37:22,236] Trial 212 finished with value: 0.7726135482294296 and parameters: {'iterations': 44491, 'learning_rate': 0.2932920809742731, 'reg_lambda': 19.244572726168805, 'random_streng

[I 2024-02-16 21:46:55,965] Trial 231 finished with value: 0.7807685882342783 and parameters: {'iterations': 44587, 'learning_rate': 0.2669368243055013, 'reg_lambda': 17.75390320290915, 'random_strength': 80.96948806233503, 'min_data_in_leaf': 37, 'leaf_estimation_iterations': 13, 'bagging_temperature': 0.15265410195963955, 'one_hot_max_size': 6}. Best is trial 180 with value: 0.7870537764656482.
[I 2024-02-16 21:47:24,692] Trial 232 finished with value: 0.770436507082479 and parameters: {'iterations': 45391, 'learning_rate': 0.28345066227618615, 'reg_lambda': 15.449253655539273, 'random_strength': 83.19476638575586, 'min_data_in_leaf': 38, 'leaf_estimation_iterations': 12, 'bagging_temperature': 0.13806576424643033, 'one_hot_max_size': 9}. Best is trial 180 with value: 0.7870537764656482.
[I 2024-02-16 21:47:52,866] Trial 233 finished with value: 0.7734405353205961 and parameters: {'iterations': 43358, 'learning_rate': 0.31055009319669535, 'reg_lambda': 19.08985245196101, 'random_stre

[I 2024-02-16 21:57:45,829] Trial 252 finished with value: 0.7622950471958849 and parameters: {'iterations': 11413, 'learning_rate': 0.5858104724331825, 'reg_lambda': 18.42732351903679, 'random_strength': 91.02797908320484, 'min_data_in_leaf': 35, 'leaf_estimation_iterations': 14, 'bagging_temperature': 0.05552678010374994, 'one_hot_max_size': 10}. Best is trial 180 with value: 0.7870537764656482.
[I 2024-02-16 21:58:12,659] Trial 253 finished with value: 0.7685380613271693 and parameters: {'iterations': 46650, 'learning_rate': 0.38973089851889503, 'reg_lambda': 26.438538296081752, 'random_strength': 92.85461887832746, 'min_data_in_leaf': 43, 'leaf_estimation_iterations': 13, 'bagging_temperature': 0.12122589640865103, 'one_hot_max_size': 1}. Best is trial 180 with value: 0.7870537764656482.
[I 2024-02-16 21:58:43,194] Trial 254 finished with value: 0.7759852124403293 and parameters: {'iterations': 48230, 'learning_rate': 0.3671435990577671, 'reg_lambda': 51.84627266474433, 'random_str

[I 2024-02-16 22:08:35,816] Trial 273 finished with value: 0.7839196801469485 and parameters: {'iterations': 11793, 'learning_rate': 0.3073659051778353, 'reg_lambda': 71.61408829075414, 'random_strength': 92.52588628612175, 'min_data_in_leaf': 46, 'leaf_estimation_iterations': 12, 'bagging_temperature': 0.21563516244825573, 'one_hot_max_size': 6}. Best is trial 180 with value: 0.7870537764656482.
[I 2024-02-16 22:09:06,226] Trial 274 finished with value: 0.7739925813759485 and parameters: {'iterations': 11198, 'learning_rate': 0.31215925844265263, 'reg_lambda': 71.70554728387165, 'random_strength': 92.64604814492532, 'min_data_in_leaf': 49, 'leaf_estimation_iterations': 8, 'bagging_temperature': 0.20977127029092654, 'one_hot_max_size': 6}. Best is trial 180 with value: 0.7870537764656482.
[I 2024-02-16 22:09:36,701] Trial 275 finished with value: 0.7740060542542098 and parameters: {'iterations': 12315, 'learning_rate': 0.3561990564260028, 'reg_lambda': 67.87722808176882, 'random_streng

[I 2024-02-16 22:19:16,129] Trial 294 finished with value: 0.7771128687197428 and parameters: {'iterations': 12523, 'learning_rate': 0.4534585163098695, 'reg_lambda': 67.55018997029497, 'random_strength': 71.47005789027267, 'min_data_in_leaf': 42, 'leaf_estimation_iterations': 14, 'bagging_temperature': 0.23667863316933138, 'one_hot_max_size': 7}. Best is trial 180 with value: 0.7870537764656482.
[I 2024-02-16 22:19:43,862] Trial 295 finished with value: 0.7752331229257 and parameters: {'iterations': 24139, 'learning_rate': 0.38817095343526103, 'reg_lambda': 31.753772598991755, 'random_strength': 84.84753160207639, 'min_data_in_leaf': 48, 'leaf_estimation_iterations': 12, 'bagging_temperature': 0.26365590455362964, 'one_hot_max_size': 6}. Best is trial 180 with value: 0.7870537764656482.
[I 2024-02-16 22:20:13,881] Trial 296 finished with value: 0.7695926336627495 and parameters: {'iterations': 11224, 'learning_rate': 0.358279123392465, 'reg_lambda': 71.7051187806203, 'random_strength'

[I 2024-02-16 22:29:37,248] Trial 315 finished with value: 0.7785976853300961 and parameters: {'iterations': 13919, 'learning_rate': 0.3140101462305582, 'reg_lambda': 70.05963500494265, 'random_strength': 91.14787277985111, 'min_data_in_leaf': 41, 'leaf_estimation_iterations': 11, 'bagging_temperature': 0.10417111082400293, 'one_hot_max_size': 6}. Best is trial 180 with value: 0.7870537764656482.
[I 2024-02-16 22:30:08,626] Trial 316 finished with value: 0.7847080593798645 and parameters: {'iterations': 44121, 'learning_rate': 0.3558798864976951, 'reg_lambda': 21.914950585845737, 'random_strength': 95.03841405930997, 'min_data_in_leaf': 42, 'leaf_estimation_iterations': 14, 'bagging_temperature': 0.21353140017225522, 'one_hot_max_size': 6}. Best is trial 180 with value: 0.7870537764656482.
[I 2024-02-16 22:30:37,260] Trial 317 finished with value: 0.7716490921161586 and parameters: {'iterations': 43696, 'learning_rate': 0.29493185366361735, 'reg_lambda': 12.753510231843311, 'random_str

[I 2024-02-16 22:39:56,393] Trial 336 finished with value: 0.7850834961230954 and parameters: {'iterations': 41494, 'learning_rate': 0.3266530830484625, 'reg_lambda': 17.09276260116473, 'random_strength': 97.9721216182192, 'min_data_in_leaf': 40, 'leaf_estimation_iterations': 13, 'bagging_temperature': 0.15058749892925447, 'one_hot_max_size': 5}. Best is trial 180 with value: 0.7870537764656482.
[I 2024-02-16 22:40:25,002] Trial 337 finished with value: 0.7781492102760923 and parameters: {'iterations': 39991, 'learning_rate': 0.32523963455109783, 'reg_lambda': 16.745997642667554, 'random_strength': 98.81306198653557, 'min_data_in_leaf': 42, 'leaf_estimation_iterations': 13, 'bagging_temperature': 0.15263561538298276, 'one_hot_max_size': 5}. Best is trial 180 with value: 0.7870537764656482.
[I 2024-02-16 22:40:53,759] Trial 338 finished with value: 0.7731282115312601 and parameters: {'iterations': 40407, 'learning_rate': 0.35158116064156114, 'reg_lambda': 18.344677619675057, 'random_str

[I 2024-02-16 22:50:21,758] Trial 357 finished with value: 0.7771544221038382 and parameters: {'iterations': 14165, 'learning_rate': 0.33348476451674774, 'reg_lambda': 21.30281459997919, 'random_strength': 95.84374143333977, 'min_data_in_leaf': 42, 'leaf_estimation_iterations': 14, 'bagging_temperature': 0.2378176922213915, 'one_hot_max_size': 3}. Best is trial 348 with value: 0.7883235810800945.
[I 2024-02-16 22:50:53,823] Trial 358 finished with value: 0.7821762357561914 and parameters: {'iterations': 13132, 'learning_rate': 0.38210706134996464, 'reg_lambda': 27.753188749214125, 'random_strength': 99.14031773256217, 'min_data_in_leaf': 38, 'leaf_estimation_iterations': 15, 'bagging_temperature': 0.15004656613055525, 'one_hot_max_size': 5}. Best is trial 348 with value: 0.7883235810800945.
[I 2024-02-16 22:51:23,595] Trial 359 finished with value: 0.7743010536679785 and parameters: {'iterations': 10768, 'learning_rate': 0.35411744097543607, 'reg_lambda': 25.738208431989833, 'random_st

[I 2024-02-16 23:01:00,750] Trial 378 finished with value: 0.7740035896225332 and parameters: {'iterations': 44318, 'learning_rate': 0.4235253637841984, 'reg_lambda': 13.451801559851939, 'random_strength': 55.79175335818753, 'min_data_in_leaf': 39, 'leaf_estimation_iterations': 12, 'bagging_temperature': 0.1851780602659578, 'one_hot_max_size': 6}. Best is trial 348 with value: 0.7883235810800945.
[I 2024-02-16 23:01:29,237] Trial 379 finished with value: 0.7746495032087746 and parameters: {'iterations': 42849, 'learning_rate': 0.3643409082031719, 'reg_lambda': 21.387618487220603, 'random_strength': 96.450560288132, 'min_data_in_leaf': 42, 'leaf_estimation_iterations': 14, 'bagging_temperature': 0.12968408733055692, 'one_hot_max_size': 6}. Best is trial 348 with value: 0.7883235810800945.
[I 2024-02-16 23:01:56,152] Trial 380 finished with value: 0.7776060587944125 and parameters: {'iterations': 40654, 'learning_rate': 0.39214438849942135, 'reg_lambda': 16.954937418183214, 'random_stren

[I 2024-02-16 23:11:31,098] Trial 399 finished with value: 0.7741524703948821 and parameters: {'iterations': 11707, 'learning_rate': 0.3685361541874605, 'reg_lambda': 40.8171422881791, 'random_strength': 99.01111088297742, 'min_data_in_leaf': 33, 'leaf_estimation_iterations': 13, 'bagging_temperature': 0.09807974126222352, 'one_hot_max_size': 7}. Best is trial 348 with value: 0.7883235810800945.
[I 2024-02-16 23:12:04,770] Trial 400 finished with value: 0.7831941247018882 and parameters: {'iterations': 32000, 'learning_rate': 0.31340321731016063, 'reg_lambda': 20.920543228258012, 'random_strength': 97.24626782515188, 'min_data_in_leaf': 34, 'leaf_estimation_iterations': 14, 'bagging_temperature': 0.15838166275329738, 'one_hot_max_size': 3}. Best is trial 348 with value: 0.7883235810800945.
[I 2024-02-16 23:12:35,422] Trial 401 finished with value: 0.7797181724948348 and parameters: {'iterations': 30555, 'learning_rate': 0.3045657331544735, 'reg_lambda': 20.448909219036345, 'random_stre

[I 2024-02-16 23:22:08,762] Trial 420 finished with value: 0.7743840326391702 and parameters: {'iterations': 12217, 'learning_rate': 0.3425204637062674, 'reg_lambda': 18.00536307376704, 'random_strength': 94.03002656930879, 'min_data_in_leaf': 36, 'leaf_estimation_iterations': 13, 'bagging_temperature': 0.2366719868916272, 'one_hot_max_size': 6}. Best is trial 348 with value: 0.7883235810800945.
[I 2024-02-16 23:22:41,364] Trial 421 finished with value: 0.7814683672091556 and parameters: {'iterations': 43518, 'learning_rate': 0.30245364175671685, 'reg_lambda': 22.67750619954064, 'random_strength': 96.34913774044396, 'min_data_in_leaf': 35, 'leaf_estimation_iterations': 14, 'bagging_temperature': 0.45745954735499084, 'one_hot_max_size': 5}. Best is trial 348 with value: 0.7883235810800945.
[I 2024-02-16 23:23:08,655] Trial 422 finished with value: 0.774679222182807 and parameters: {'iterations': 41639, 'learning_rate': 0.35963047732510683, 'reg_lambda': 19.27144198226467, 'random_streng

[I 2024-02-16 23:32:20,287] Trial 441 finished with value: 0.7736167990638472 and parameters: {'iterations': 14493, 'learning_rate': 0.38523254703122045, 'reg_lambda': 34.70221798082079, 'random_strength': 89.52777822144421, 'min_data_in_leaf': 44, 'leaf_estimation_iterations': 11, 'bagging_temperature': 0.07557675855654854, 'one_hot_max_size': 5}. Best is trial 348 with value: 0.7883235810800945.
[I 2024-02-16 23:32:51,928] Trial 442 finished with value: 0.7789892203458828 and parameters: {'iterations': 16798, 'learning_rate': 0.32630632199031046, 'reg_lambda': 28.700021498077948, 'random_strength': 88.77395207163079, 'min_data_in_leaf': 46, 'leaf_estimation_iterations': 13, 'bagging_temperature': 0.1204593049611649, 'one_hot_max_size': 5}. Best is trial 348 with value: 0.7883235810800945.
[I 2024-02-16 23:33:20,431] Trial 443 finished with value: 0.7782101219949698 and parameters: {'iterations': 27995, 'learning_rate': 0.4839134276585, 'reg_lambda': 32.346019438835754, 'random_streng

[I 2024-02-16 23:43:10,499] Trial 462 finished with value: 0.7808902774227644 and parameters: {'iterations': 42112, 'learning_rate': 0.3106147277170381, 'reg_lambda': 12.22056777457418, 'random_strength': 95.55526442869673, 'min_data_in_leaf': 46, 'leaf_estimation_iterations': 14, 'bagging_temperature': 0.4234603059971702, 'one_hot_max_size': 5}. Best is trial 348 with value: 0.7883235810800945.
[I 2024-02-16 23:43:40,398] Trial 463 finished with value: 0.7747738222329543 and parameters: {'iterations': 42424, 'learning_rate': 0.3688830127188443, 'reg_lambda': 30.091092444861818, 'random_strength': 78.45172932537419, 'min_data_in_leaf': 45, 'leaf_estimation_iterations': 15, 'bagging_temperature': 0.6841517969712203, 'one_hot_max_size': 7}. Best is trial 348 with value: 0.7883235810800945.
[I 2024-02-16 23:44:14,037] Trial 464 finished with value: 0.7759364295168728 and parameters: {'iterations': 39735, 'learning_rate': 0.2610179592719025, 'reg_lambda': 17.66464200533609, 'random_strengt

[I 2024-02-16 23:53:46,694] Trial 483 finished with value: 0.7793260352508253 and parameters: {'iterations': 41447, 'learning_rate': 0.31543149516845753, 'reg_lambda': 16.079533690971974, 'random_strength': 93.98057984105307, 'min_data_in_leaf': 39, 'leaf_estimation_iterations': 14, 'bagging_temperature': 0.215704924265498, 'one_hot_max_size': 5}. Best is trial 348 with value: 0.7883235810800945.
[I 2024-02-16 23:54:15,947] Trial 484 finished with value: 0.7759069922875844 and parameters: {'iterations': 18485, 'learning_rate': 0.35050793084720094, 'reg_lambda': 17.34615195407127, 'random_strength': 95.77270130734266, 'min_data_in_leaf': 38, 'leaf_estimation_iterations': 15, 'bagging_temperature': 0.3284816405868943, 'one_hot_max_size': 6}. Best is trial 348 with value: 0.7883235810800945.
[I 2024-02-16 23:54:49,534] Trial 485 finished with value: 0.7736307006568618 and parameters: {'iterations': 24621, 'learning_rate': 0.2276496896225486, 'reg_lambda': 36.324398108793766, 'random_stren

[W 2024-02-17 00:02:23,084] Trial 500 failed with value None.


KeyboardInterrupt: 

In [8]:
best_param = study.best_params
best_param

{'iterations': 15350,
 'learning_rate': 0.3409005979799443,
 'reg_lambda': 19.507665859119093,
 'random_strength': 98.49018453709503,
 'min_data_in_leaf': 41,
 'leaf_estimation_iterations': 13,
 'bagging_temperature': 0.22209159086426983,
 'one_hot_max_size': 4}

In [9]:
best_models = []
for train_pool, val_pool in zip(all_train_pool, all_val_pool):
    best_model = CatBoostClassifier(**best_param,
                                 random_state=seed,
                                 scale_pos_weight=rate, 
                                 loss_function='Logloss',
                                 eval_metric='F1',
                                 task_type="GPU",
                                 devices='0',
                                verbose=False,
                                   early_stopping_rounds=100)
    
    best_model.fit(train_pool, eval_set=val_pool)
    best_models.append(best_model)

In [10]:
def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [11]:
train_pred = [best_model.predict_proba(df_train.drop("is_converted",axis=1)) for best_model in best_models]
y = list(df_train['is_converted'])
train_pred = np.mean(train_pred[:],axis=0)
threshold = min([num[1] if y[i] else 1 for i, num in enumerate(train_pred)])
train_y_pred = [True if i[1]>=0.5 else False for i in train_pred]
get_clf_eval(y, train_y_pred)

오차행렬:
 [[ 4612     8]
 [  604 50556]]

정확도: 0.9890
정밀도: 0.8842
재현율: 0.9983
F1: 0.9378


In [12]:
categorical_features =  [
                        "customer_country",
                        "business_subarea",
                        "business_area",
                        "business_unit",
                        "customer_type",
                        "enterprise",
                        "customer_job",
                        "inquiry_type",
                        "product_category",
                        "product_subcategory",
                        "product_modelname",
                        "customer_position",
                        "response_corporate",
                        "expected_timeline",
                        'customer_idx',
                        'lead_owner',
                        'lead_owner',
                    ]
# 
# 예측에 필요한 데이터 분리
x_test = df_test.drop(["is_converted", "id"], axis=1)
test_pool = Pool(data=x_test, cat_features=categorical_features)


In [13]:
# 예측 수행 (soft voting)
a_pred = [best_model.predict_proba(test_pool) for best_model in best_models]
pred = np.mean(a_pred[:], axis=0)
#pred = [True if i[1]>=threshold else False for i in pred]
pred = np.argmax(pred, axis=1)
sum(pred)

1669

In [14]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = pred

# 제출 파일 저장
df_sub.to_csv("submission-onehot.csv", index=False)

In [15]:
real = pd.read_csv(r'submission(0.767).csv')
get_clf_eval(real['is_converted'], pred)

오차행렬:
 [[1635  137]
 [  34 3465]]

정확도: 0.9676
정밀도: 0.9796
재현율: 0.9227
F1: 0.9503
