In [1]:
!pip3 install scikit-learn tqdm catboost optuna xgboost ipywidgets imblearn



[0m

In [2]:
import pandas as pd
import sklearn
import numpy as np
import random
import os
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import SuccessiveHalvingPruner
from catboost import CatBoostClassifier, Pool

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import StratifiedKFold


In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed=42
seed_everything(seed) # Seed 고정

In [4]:
df_train = pd.read_csv("train.csv") # 학습용 데이터
df_test = pd.read_csv("submission.csv") # 테스트 데이터(제출파일의 데이터)

In [5]:
def category_all(df_train, df_test):
    label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_position",
    "response_corporate",
    "expected_timeline",
    'customer_idx',
    'lead_owner',

    ]
    for i in label_columns:
        df_train[i] = df_train[i].astype('category')
        df_test[i] = df_test[i].astype('category')
    return df_train, df_test    


def preprocessing(df_train, df_test):
    df_train.drop_duplicates(keep='first', inplace=True)
    df_train.reset_index(inplace=True)
    df_train.drop('index',inplace=True, axis=1)
    df_test_id_temp = df_test['id'].copy()
    df_all = pd.concat([df_train,df_test.drop('id',axis=1)])
    df_all.drop('customer_country.1',axis=1,inplace=True)
    df_all.id_strategic_ver.fillna(0., inplace=True)
    df_all.it_strategic_ver.fillna(0., inplace=True)
    df_all.idit_strategic_ver.fillna(0., inplace=True)
    df_all.business_subarea.fillna('NAN', inplace=True)
    df_all.com_reg_ver_win_rate.fillna(0,inplace=True)
    df_all['customer_type'] = df_all['customer_type'].replace({
        'End-Customer':'End Customer'
    })
    df_all.customer_type.fillna('NAN',inplace=True)
    df_all.historical_existing_cnt.fillna(0, inplace=True)
    
    df_all.product_subcategory.fillna('NAN', inplace=True)
    
    df_all.product_modelname.fillna('NAN', inplace=True)

    df_all['expected_timeline'] = df_all['expected_timeline'].replace({
        'less_than_3_months': 'less than 3 months',
        'being followed up': 'being followed up.',
        'less than 6 months': '3 months ~ 6 months',
        '3_months_~_6_months' : '3 months ~ 6 months'
    })

    df_all.expected_timeline.fillna('NAN',inplace=True)
    df_all.ver_win_rate_x.fillna(0,inplace=True)
    df_all.ver_win_ratio_per_bu.fillna(0,inplace=True)
    df_all.business_area.fillna('NAN',inplace=True)
    df_all.customer_job.fillna('NAN', inplace=True)
    df_all.product_category.fillna('NAN', inplace=True)
    
    #처리못한부분
    #customer_country
    df_all.customer_country.fillna('NAN', inplace=True)
    #inquiry_type
    df_all.inquiry_type.fillna('NAN', inplace=True)

    label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_position",
    "response_corporate",
    "expected_timeline",
    ]
    
    ###############################
    #detect special char and to lower
    for i in label_columns:
        df_all[i] = df_all[i].str.lower()
        df_all[i] = df_all[i].str.replace(pat=r'[^\w]', repl=r'', regex=True)
        
    #to int
    df_all['id_strategic_ver'] = df_all['id_strategic_ver'].astype('int64')
    df_all['it_strategic_ver'] = df_all['it_strategic_ver'].astype('int64')
    df_all['idit_strategic_ver'] = df_all['idit_strategic_ver'].astype('int64')
    df_all['lead_desc_length'] = df_all['lead_desc_length'].astype('int64')
    df_all['historical_existing_cnt'] = df_all['historical_existing_cnt'].astype('int64')
    
    #seperate
    tmp_len = len(df_train)
    df_train = df_all.iloc[: tmp_len ]
    df_test = df_all.iloc[tmp_len :]
    df_test = pd.concat([df_test,df_test_id_temp],axis=1)
    
    #RobustScaler
    robustScaler = RobustScaler()
    df_train['ver_win_rate_x'] = robustScaler.fit_transform(df_train[['ver_win_rate_x']])
    df_test['ver_win_rate_x'] = robustScaler.transform(df_test[['ver_win_rate_x']])
    robustScaler = RobustScaler()
    df_train['ver_win_ratio_per_bu'] = robustScaler.fit_transform(df_train[['ver_win_ratio_per_bu']])
    df_test['ver_win_ratio_per_bu'] = robustScaler.transform(df_test[['ver_win_ratio_per_bu']])
    ##############################
    
    return df_train, df_test


df_train, df_test = preprocessing(df_train, df_test)
df_train, df_test = category_all(df_train, df_test)
rate = ((len(df_train)-sum(df_train['is_converted']))/sum(df_train['is_converted']))
x_train, y_train = df_train.drop("is_converted",axis=1), df_train["is_converted"]

In [6]:
categorical_features = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_position",
    "response_corporate",
    "expected_timeline",
   'customer_idx',
   'lead_owner',
    ]

S_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

all_train_pool = []
all_val_pool = []
all_y_val = []
for train_idx, val_idx in S_kfold.split(x_train, list(y_train)):
    tmp_x_train, tmp_x_val = x_train.iloc[train_idx], x_train.iloc[val_idx]
    tmp_y_train, tmp_y_val = y_train[train_idx], y_train[val_idx]
    train_pool = Pool(data=tmp_x_train, label=tmp_y_train, cat_features=categorical_features)
    val_pool = Pool(data=tmp_x_val, label=tmp_y_val, cat_features=categorical_features)
    all_train_pool.append(train_pool)
    all_val_pool.append(val_pool)
    all_y_val.append(list(tmp_y_val))

In [10]:
def objective(trial):
    params = {'iterations':trial.suggest_int("iterations", 10000, 50000),
        'learning_rate' : trial.suggest_float('learning_rate',0.001, 1),
        'reg_lambda': trial.suggest_float('reg_lambda',10,100),
        'random_strength': trial.suggest_float('random_strength',40,100),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf',1,50),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations',1,15),
        'bagging_temperature' :trial.suggest_float('bagging_temperature', 0, 1),
             }
    
    #score = []
    tmp_score = []
    for train_pool, val_pool, y_true in zip(all_train_pool, all_val_pool, all_y_val):
        clf = CatBoostClassifier(**params,
                                 scale_pos_weight=rate, 
                                 loss_function='Logloss',
                                 eval_metric='F1',
                                 early_stopping_rounds=100,
                                 verbose=False,
                                random_state=seed,
                                 task_type="GPU",
                                 devices='0',
                                 max_bin=254,
                                 max_ctr_complexity=15
                                )
        clf.fit(train_pool, eval_set=val_pool)
        
        y_pred = clf.predict(val_pool)
        y_pred = [False if i=='False' else True for i in y_pred]
        tmp_score.append(f1_score(y_true,y_pred,labels=[True, False]))
    tmp_score = np.mean(tmp_score)
    return tmp_score

# Hyperparameter Tuning
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=seed), pruner=SuccessiveHalvingPruner())
study.optimize(objective, n_trials=None)

[I 2024-02-16 15:39:14,867] A new study created in memory with name: no-name-4c1a155f-0b8f-4cab-819b-a6e34b98205e
[I 2024-02-16 15:39:34,535] Trial 0 finished with value: 0.7483384199519001 and parameters: {'iterations': 24981, 'learning_rate': 0.9507635921035062, 'reg_lambda': 75.87945476302646, 'random_strength': 75.91950905182219, 'min_data_in_leaf': 8, 'leaf_estimation_iterations': 3, 'bagging_temperature': 0.05808361216819946}. Best is trial 0 with value: 0.7483384199519001.
[I 2024-02-16 15:40:01,446] Trial 1 finished with value: 0.7690482912079661 and parameters: {'iterations': 44647, 'learning_rate': 0.6015138967314656, 'reg_lambda': 73.7265320016441, 'random_strength': 41.23506965774815, 'min_data_in_leaf': 49, 'leaf_estimation_iterations': 13, 'bagging_temperature': 0.21233911067827616}. Best is trial 1 with value: 0.7690482912079661.
[I 2024-02-16 15:40:37,054] Trial 2 finished with value: 0.7697394434074869 and parameters: {'iterations': 17273, 'learning_rate': 0.1842211053

[I 2024-02-16 15:51:26,196] Trial 22 finished with value: 0.7607275809146967 and parameters: {'iterations': 42360, 'learning_rate': 0.8079546759717183, 'reg_lambda': 38.92640400265166, 'random_strength': 65.65688497906464, 'min_data_in_leaf': 19, 'leaf_estimation_iterations': 3, 'bagging_temperature': 0.21419854542017464}. Best is trial 13 with value: 0.7789647517804476.
[I 2024-02-16 15:51:54,800] Trial 23 finished with value: 0.7706975431257188 and parameters: {'iterations': 27285, 'learning_rate': 0.4826154637598834, 'reg_lambda': 18.436601843256003, 'random_strength': 81.45041274305098, 'min_data_in_leaf': 28, 'leaf_estimation_iterations': 1, 'bagging_temperature': 0.301914107979389}. Best is trial 13 with value: 0.7789647517804476.
[I 2024-02-16 15:52:18,318] Trial 24 finished with value: 0.7623062384350077 and parameters: {'iterations': 35751, 'learning_rate': 0.630224376744384, 'reg_lambda': 31.293749052488643, 'random_strength': 73.994215315079, 'min_data_in_leaf': 37, 'leaf_es

[I 2024-02-16 16:03:11,907] Trial 44 finished with value: 0.763599563004232 and parameters: {'iterations': 16968, 'learning_rate': 0.403044609149666, 'reg_lambda': 15.5786225009222, 'random_strength': 88.82264920836806, 'min_data_in_leaf': 50, 'leaf_estimation_iterations': 14, 'bagging_temperature': 0.6739335978249624}. Best is trial 38 with value: 0.785822641432165.
[I 2024-02-16 16:03:48,346] Trial 45 finished with value: 0.7760676338626424 and parameters: {'iterations': 25914, 'learning_rate': 0.2382408252934814, 'reg_lambda': 27.668171765195382, 'random_strength': 97.23454929437897, 'min_data_in_leaf': 14, 'leaf_estimation_iterations': 9, 'bagging_temperature': 0.5194613896214049}. Best is trial 38 with value: 0.785822641432165.
[I 2024-02-16 16:04:45,055] Trial 46 finished with value: 0.7607484347997759 and parameters: {'iterations': 23574, 'learning_rate': 0.07733650564531327, 'reg_lambda': 24.143063270326593, 'random_strength': 92.64988888745843, 'min_data_in_leaf': 21, 'leaf_es

[I 2024-02-16 16:16:37,046] Trial 66 finished with value: 0.7739455366007537 and parameters: {'iterations': 28713, 'learning_rate': 0.11859320684971696, 'reg_lambda': 19.676745723354784, 'random_strength': 63.55838589461722, 'min_data_in_leaf': 5, 'leaf_estimation_iterations': 8, 'bagging_temperature': 0.4034588022582052}. Best is trial 38 with value: 0.785822641432165.
[I 2024-02-16 16:17:13,096] Trial 67 finished with value: 0.7768576315088646 and parameters: {'iterations': 24613, 'learning_rate': 0.2226373062944105, 'reg_lambda': 14.24331916460639, 'random_strength': 57.38072321349491, 'min_data_in_leaf': 3, 'leaf_estimation_iterations': 10, 'bagging_temperature': 0.4501067088084593}. Best is trial 38 with value: 0.785822641432165.
[I 2024-02-16 16:18:07,925] Trial 68 finished with value: 0.7687840418102685 and parameters: {'iterations': 23004, 'learning_rate': 0.0890589467527047, 'reg_lambda': 29.016161827037962, 'random_strength': 66.05664837553414, 'min_data_in_leaf': 6, 'leaf_es

[I 2024-02-16 16:31:04,831] Trial 88 finished with value: 0.7547383730202817 and parameters: {'iterations': 21712, 'learning_rate': 0.9310858711619221, 'reg_lambda': 46.72263697159579, 'random_strength': 55.84516587478416, 'min_data_in_leaf': 14, 'leaf_estimation_iterations': 11, 'bagging_temperature': 0.5347149265658392}. Best is trial 77 with value: 0.7883078826223228.
[I 2024-02-16 16:31:41,145] Trial 89 finished with value: 0.783845036035684 and parameters: {'iterations': 23675, 'learning_rate': 0.2784189813911719, 'reg_lambda': 18.72795163341968, 'random_strength': 68.42214020636946, 'min_data_in_leaf': 16, 'leaf_estimation_iterations': 12, 'bagging_temperature': 0.4341208862744799}. Best is trial 77 with value: 0.7883078826223228.
[I 2024-02-16 16:32:29,085] Trial 90 finished with value: 0.7746747152151955 and parameters: {'iterations': 23666, 'learning_rate': 0.13574581572722588, 'reg_lambda': 19.530982961781966, 'random_strength': 69.87538072902214, 'min_data_in_leaf': 17, 'lea

[I 2024-02-16 16:44:01,863] Trial 110 finished with value: 0.7642505900314618 and parameters: {'iterations': 13238, 'learning_rate': 0.3797986102858604, 'reg_lambda': 14.458430809732755, 'random_strength': 59.7078187353013, 'min_data_in_leaf': 8, 'leaf_estimation_iterations': 15, 'bagging_temperature': 0.9180578089083132}. Best is trial 77 with value: 0.7883078826223228.
[I 2024-02-16 16:44:35,836] Trial 111 finished with value: 0.7763635684675428 and parameters: {'iterations': 19137, 'learning_rate': 0.3075251558533492, 'reg_lambda': 16.479019421907083, 'random_strength': 94.24146135326095, 'min_data_in_leaf': 14, 'leaf_estimation_iterations': 14, 'bagging_temperature': 0.47152628489667375}. Best is trial 77 with value: 0.7883078826223228.
[I 2024-02-16 16:45:13,842] Trial 112 finished with value: 0.7808045224593346 and parameters: {'iterations': 18093, 'learning_rate': 0.26949454279098156, 'reg_lambda': 25.905738763076176, 'random_strength': 96.54825064389746, 'min_data_in_leaf': 17,

[I 2024-02-16 16:57:24,203] Trial 132 finished with value: 0.7670011964863672 and parameters: {'iterations': 29297, 'learning_rate': 0.21347801866845156, 'reg_lambda': 27.556050330406872, 'random_strength': 98.48817568921551, 'min_data_in_leaf': 14, 'leaf_estimation_iterations': 9, 'bagging_temperature': 0.4754894939529663}. Best is trial 77 with value: 0.7883078826223228.
[I 2024-02-16 16:57:57,395] Trial 133 finished with value: 0.781396757161329 and parameters: {'iterations': 24277, 'learning_rate': 0.3585616721355533, 'reg_lambda': 25.374494501511755, 'random_strength': 94.86029731473894, 'min_data_in_leaf': 16, 'leaf_estimation_iterations': 11, 'bagging_temperature': 0.4323270187088218}. Best is trial 77 with value: 0.7883078826223228.
[I 2024-02-16 16:58:33,205] Trial 134 finished with value: 0.7829656716649728 and parameters: {'iterations': 23337, 'learning_rate': 0.35975413778739085, 'reg_lambda': 24.945548836889756, 'random_strength': 94.85253531785746, 'min_data_in_leaf': 16,

[I 2024-02-16 17:09:21,941] Trial 154 finished with value: 0.7788057918797359 and parameters: {'iterations': 27072, 'learning_rate': 0.3122557173620377, 'reg_lambda': 24.32037928686217, 'random_strength': 72.37724939836964, 'min_data_in_leaf': 16, 'leaf_estimation_iterations': 10, 'bagging_temperature': 0.33013486189445274}. Best is trial 77 with value: 0.7883078826223228.
[I 2024-02-16 17:09:54,770] Trial 155 finished with value: 0.775989136213888 and parameters: {'iterations': 23830, 'learning_rate': 0.2845569452396682, 'reg_lambda': 23.01387712881359, 'random_strength': 50.609486960479416, 'min_data_in_leaf': 15, 'leaf_estimation_iterations': 11, 'bagging_temperature': 0.43380366304985385}. Best is trial 77 with value: 0.7883078826223228.
[I 2024-02-16 17:10:26,838] Trial 156 finished with value: 0.7817015344329723 and parameters: {'iterations': 25102, 'learning_rate': 0.34681503064467123, 'reg_lambda': 20.802904920637687, 'random_strength': 67.38836070658867, 'min_data_in_leaf': 13

[I 2024-02-16 17:20:33,256] Trial 176 finished with value: 0.7727393335428399 and parameters: {'iterations': 27010, 'learning_rate': 0.32721609412512015, 'reg_lambda': 29.512369691251347, 'random_strength': 98.12020378013823, 'min_data_in_leaf': 16, 'leaf_estimation_iterations': 10, 'bagging_temperature': 0.45396788115399667}. Best is trial 77 with value: 0.7883078826223228.
[I 2024-02-16 17:21:05,702] Trial 177 finished with value: 0.7760650903161046 and parameters: {'iterations': 23109, 'learning_rate': 0.28198397552937027, 'reg_lambda': 21.76688097786791, 'random_strength': 76.7672091547406, 'min_data_in_leaf': 14, 'leaf_estimation_iterations': 11, 'bagging_temperature': 0.4689214568522545}. Best is trial 77 with value: 0.7883078826223228.
[I 2024-02-16 17:21:32,695] Trial 178 finished with value: 0.7665621151504804 and parameters: {'iterations': 21384, 'learning_rate': 0.30466325261615984, 'reg_lambda': 15.974101150859589, 'random_strength': 95.22377233008345, 'min_data_in_leaf': 1

[I 2024-02-16 17:32:42,234] Trial 198 finished with value: 0.7810428101404236 and parameters: {'iterations': 29264, 'learning_rate': 0.32180268832077874, 'reg_lambda': 20.195170906042904, 'random_strength': 68.06613041145958, 'min_data_in_leaf': 13, 'leaf_estimation_iterations': 8, 'bagging_temperature': 0.3927201451682081}. Best is trial 77 with value: 0.7883078826223228.
[I 2024-02-16 17:33:13,884] Trial 199 finished with value: 0.7760540852230448 and parameters: {'iterations': 23937, 'learning_rate': 0.3458751994266657, 'reg_lambda': 25.182635994649424, 'random_strength': 96.88715087511662, 'min_data_in_leaf': 15, 'leaf_estimation_iterations': 12, 'bagging_temperature': 0.3445359967432458}. Best is trial 77 with value: 0.7883078826223228.
[I 2024-02-16 17:33:43,064] Trial 200 finished with value: 0.7714426754949817 and parameters: {'iterations': 23147, 'learning_rate': 0.2943690028878639, 'reg_lambda': 28.724009494744074, 'random_strength': 66.7608031532315, 'min_data_in_leaf': 17, 

[I 2024-02-16 17:45:28,097] Trial 220 finished with value: 0.7778715142623777 and parameters: {'iterations': 23296, 'learning_rate': 0.3036684954055384, 'reg_lambda': 11.08680256334732, 'random_strength': 92.68869430561145, 'min_data_in_leaf': 16, 'leaf_estimation_iterations': 10, 'bagging_temperature': 0.6898900719772225}. Best is trial 77 with value: 0.7883078826223228.
[I 2024-02-16 17:46:05,213] Trial 221 finished with value: 0.7776806457860415 and parameters: {'iterations': 26102, 'learning_rate': 0.262780067901218, 'reg_lambda': 24.42142470887767, 'random_strength': 62.51960015599097, 'min_data_in_leaf': 17, 'leaf_estimation_iterations': 11, 'bagging_temperature': 0.6508523380652741}. Best is trial 77 with value: 0.7883078826223228.
[I 2024-02-16 17:46:41,303] Trial 222 finished with value: 0.7747238328503613 and parameters: {'iterations': 23759, 'learning_rate': 0.22899106090269117, 'reg_lambda': 22.56512860505928, 'random_strength': 61.9440535458609, 'min_data_in_leaf': 18, 'le

[I 2024-02-16 17:58:24,910] Trial 242 finished with value: 0.7777053534319888 and parameters: {'iterations': 22700, 'learning_rate': 0.20318127596541635, 'reg_lambda': 14.983677378742865, 'random_strength': 93.9821110974436, 'min_data_in_leaf': 17, 'leaf_estimation_iterations': 14, 'bagging_temperature': 0.5441005794567956}. Best is trial 77 with value: 0.7883078826223228.
[I 2024-02-16 17:59:00,964] Trial 243 finished with value: 0.7754953236109344 and parameters: {'iterations': 20346, 'learning_rate': 0.23761690585873568, 'reg_lambda': 17.755890841722046, 'random_strength': 95.62320900666415, 'min_data_in_leaf': 17, 'leaf_estimation_iterations': 11, 'bagging_temperature': 0.4864283944091803}. Best is trial 77 with value: 0.7883078826223228.
[I 2024-02-16 17:59:44,115] Trial 244 finished with value: 0.7815196312251309 and parameters: {'iterations': 18940, 'learning_rate': 0.21242518884694683, 'reg_lambda': 22.426540268252452, 'random_strength': 97.09037302403154, 'min_data_in_leaf': 1

[I 2024-02-16 18:10:56,182] Trial 264 finished with value: 0.7742555318101461 and parameters: {'iterations': 22358, 'learning_rate': 0.29036246301325513, 'reg_lambda': 10.761562884534513, 'random_strength': 67.34400391275712, 'min_data_in_leaf': 9, 'leaf_estimation_iterations': 12, 'bagging_temperature': 0.49073783100253915}. Best is trial 77 with value: 0.7883078826223228.
[I 2024-02-16 18:11:28,337] Trial 265 finished with value: 0.7787351209971887 and parameters: {'iterations': 24994, 'learning_rate': 0.3279808817329875, 'reg_lambda': 16.418894760025424, 'random_strength': 70.11082119200911, 'min_data_in_leaf': 15, 'leaf_estimation_iterations': 13, 'bagging_temperature': 0.3985880857557527}. Best is trial 77 with value: 0.7883078826223228.
[I 2024-02-16 18:12:05,006] Trial 266 finished with value: 0.7793447315095686 and parameters: {'iterations': 20689, 'learning_rate': 0.24022260184460115, 'reg_lambda': 13.146743807756746, 'random_strength': 98.7145293869678, 'min_data_in_leaf': 16

[I 2024-02-16 18:22:43,730] Trial 286 finished with value: 0.7788860713791323 and parameters: {'iterations': 23477, 'learning_rate': 0.3208767968596584, 'reg_lambda': 16.58294352304987, 'random_strength': 97.28058744630198, 'min_data_in_leaf': 13, 'leaf_estimation_iterations': 11, 'bagging_temperature': 0.4206429606137407}. Best is trial 77 with value: 0.7883078826223228.
[I 2024-02-16 18:23:13,533] Trial 287 finished with value: 0.7767204533052123 and parameters: {'iterations': 26612, 'learning_rate': 0.3412236458190727, 'reg_lambda': 24.25297063538629, 'random_strength': 91.36539906181561, 'min_data_in_leaf': 16, 'leaf_estimation_iterations': 9, 'bagging_temperature': 0.3228009252798999}. Best is trial 77 with value: 0.7883078826223228.
[I 2024-02-16 18:23:44,744] Trial 288 finished with value: 0.774304554143413 and parameters: {'iterations': 22073, 'learning_rate': 0.30791303477147036, 'reg_lambda': 20.545871665074277, 'random_strength': 95.84074470885929, 'min_data_in_leaf': 19, 'l

[I 2024-02-16 18:34:38,507] Trial 308 finished with value: 0.7752210341393653 and parameters: {'iterations': 32907, 'learning_rate': 0.3076808488082788, 'reg_lambda': 36.505696361906786, 'random_strength': 62.743783971373176, 'min_data_in_leaf': 34, 'leaf_estimation_iterations': 11, 'bagging_temperature': 0.4315228026825911}. Best is trial 77 with value: 0.7883078826223228.
[I 2024-02-16 18:35:10,244] Trial 309 finished with value: 0.7782302258784985 and parameters: {'iterations': 24113, 'learning_rate': 0.3475924075757327, 'reg_lambda': 17.40083115011097, 'random_strength': 61.32201673473314, 'min_data_in_leaf': 40, 'leaf_estimation_iterations': 11, 'bagging_temperature': 0.3806793035829447}. Best is trial 77 with value: 0.7883078826223228.
[I 2024-02-16 18:35:40,466] Trial 310 finished with value: 0.7772200322474581 and parameters: {'iterations': 29681, 'learning_rate': 0.37929469931894905, 'reg_lambda': 19.111945945549728, 'random_strength': 70.11795872038559, 'min_data_in_leaf': 19

[I 2024-02-16 18:47:06,313] Trial 330 finished with value: 0.772267268806959 and parameters: {'iterations': 24294, 'learning_rate': 0.24620075040358969, 'reg_lambda': 26.617459065558272, 'random_strength': 63.504751733968064, 'min_data_in_leaf': 14, 'leaf_estimation_iterations': 13, 'bagging_temperature': 0.5346360204133299}. Best is trial 77 with value: 0.7883078826223228.
[I 2024-02-16 18:47:36,097] Trial 331 finished with value: 0.7730983456090349 and parameters: {'iterations': 36297, 'learning_rate': 0.2960970538293015, 'reg_lambda': 21.043738476851097, 'random_strength': 64.64923098234198, 'min_data_in_leaf': 12, 'leaf_estimation_iterations': 11, 'bagging_temperature': 0.48806397220170045}. Best is trial 77 with value: 0.7883078826223228.
[I 2024-02-16 18:48:10,874] Trial 332 finished with value: 0.7756926416326513 and parameters: {'iterations': 27269, 'learning_rate': 0.2795287328599572, 'reg_lambda': 24.995757616062264, 'random_strength': 68.72015809593788, 'min_data_in_leaf': 1

[I 2024-02-16 18:59:33,577] Trial 352 finished with value: 0.7706905725656015 and parameters: {'iterations': 29010, 'learning_rate': 0.33350402636156584, 'reg_lambda': 23.025233592874507, 'random_strength': 67.63373818307245, 'min_data_in_leaf': 14, 'leaf_estimation_iterations': 11, 'bagging_temperature': 0.5109472437168077}. Best is trial 77 with value: 0.7883078826223228.
[I 2024-02-16 19:00:09,216] Trial 353 finished with value: 0.7797424702965157 and parameters: {'iterations': 20037, 'learning_rate': 0.27899003616855184, 'reg_lambda': 25.736319981225268, 'random_strength': 63.602228082702524, 'min_data_in_leaf': 11, 'leaf_estimation_iterations': 13, 'bagging_temperature': 0.059460573661901425}. Best is trial 77 with value: 0.7883078826223228.
[I 2024-02-16 19:00:41,927] Trial 354 finished with value: 0.7719283199314382 and parameters: {'iterations': 25719, 'learning_rate': 0.31260278368647154, 'reg_lambda': 35.93556721137761, 'random_strength': 92.71439457869265, 'min_data_in_leaf'

KeyboardInterrupt: 

In [11]:
best_param = study.best_params
best_param

{'iterations': 27823,
 'learning_rate': 0.20454576828135052,
 'reg_lambda': 27.154801823716518,
 'random_strength': 62.12702060181847,
 'min_data_in_leaf': 12,
 'leaf_estimation_iterations': 11,
 'bagging_temperature': 0.49467104316518706}

In [15]:
best_models = []
for train_pool, val_pool in zip(all_train_pool, all_val_pool):
    best_model = CatBoostClassifier(**best_param,
                                 random_state=seed,
                                 scale_pos_weight=rate, 
                                 loss_function='Logloss',
                                 eval_metric='F1',
                                 task_type="GPU",
                                 devices='0',
                                verbose=False,
                                   early_stopping_rounds=100,
                                    max_bin=254,
                                 max_ctr_complexity=15
                                   )
    
    best_model.fit(train_pool, eval_set=val_pool)
    best_models.append(best_model)

In [16]:
def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [17]:
train_pred = [best_model.predict_proba(df_train.drop("is_converted",axis=1)) for best_model in best_models]
y = list(df_train['is_converted'])
train_pred = np.mean(train_pred[:],axis=0)
threshold = min([num[1] if y[i] else 1 for i, num in enumerate(train_pred)])
train_y_pred = [True if i[1]>=0.5 else False for i in train_pred]
get_clf_eval(y, train_y_pred)

오차행렬:
 [[ 4609    11]
 [  788 50372]]

정확도: 0.9857
정밀도: 0.8540
재현율: 0.9976
F1: 0.9202


In [18]:
categorical_features =  [
                        "customer_country",
                        "business_subarea",
                        "business_area",
                        "business_unit",
                        "customer_type",
                        "enterprise",
                        "customer_job",
                        "inquiry_type",
                        "product_category",
                        "product_subcategory",
                        "product_modelname",
                        "customer_position",
                        "response_corporate",
                        "expected_timeline",
                        'customer_idx',
                        'lead_owner',
                    ]
# 
# 예측에 필요한 데이터 분리
x_test = df_test.drop(["is_converted", "id"], axis=1)
test_pool = Pool(data=x_test, cat_features=categorical_features)


In [19]:
# 예측 수행 (soft voting)
a_pred = [best_model.predict_proba(test_pool) for best_model in best_models]
pred = np.mean(a_pred[:], axis=0)
#pred = [True if i[1]>=threshold else False for i in pred]
pred = np.argmax(pred, axis=1)
sum(pred)

1754

In [20]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = pred

# 제출 파일 저장
df_sub.to_csv("submission_max.csv", index=False)

In [21]:
real = pd.read_csv(r'submission(0.767).csv')
get_clf_eval(real['is_converted'], pred)

오차행렬:
 [[1693   79]
 [  61 3438]]

정확도: 0.9734
정밀도: 0.9652
재현율: 0.9554
F1: 0.9603
