In [261]:
!pip3 install lightgbm scikit-learn tqdm catboost optuna xgboost ipywidgets imblearn
# 모델을 위해 필요한 라이브러리 설치(우수 예시에만 있는 부분)



[0m

In [262]:
k_fold_num = 5
seed=42

In [263]:
import pandas as pd
import sklearn
import numpy as np
from lightgbm import LGBMClassifier
import random
import os
from imblearn.over_sampling import SMOTE
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import SuccessiveHalvingPruner
import warnings
warnings.filterwarnings('ignore')


In [264]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(seed) # Seed 고정

In [265]:
df_train = pd.read_csv("train.csv") # 학습용 데이터
df_test = pd.read_csv("submission.csv") # 테스트 데이터(제출파일의 데이터)

In [266]:
def label_encoding(series: pd.Series) -> pd.Series:
    """범주형 데이터를 시리즈 형태로 받아 숫자형 데이터로 변환합니다."""
    
    my_dict = {}

    # 모든 요소를 문자열로 변환
    series = series.astype(str)

    for idx, value in enumerate(sorted(series.unique())):
        my_dict[value] = idx
    series = series.map(my_dict)

    return series

def manage_label_encoding(df_train, df_test):
    tmp = len(df_train)
    label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_position",
    "response_corporate",
    "expected_timeline",
    ]
    
    df_all = pd.concat([df_train[label_columns], df_test[label_columns]])

    for col in label_columns:
        df_all[col] = label_encoding(df_all[col])
    
    for col in label_columns:  
        df_train[col] = df_all.iloc[: tmp][col]
        df_test[col] = df_all.iloc[tmp :][col]

    return df_train, df_test


def category_all(df_train, df_test):
    label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_position",
    "response_corporate",
    "expected_timeline",
    ]
    for i in label_columns:
        df_train[i] = df_train[i].astype('category')
        df_test[i] = df_test[i].astype('category')
    return df_train, df_test    
        
    
def preprocessing(df_train, df_test):
    df_train.drop_duplicates(keep='first', inplace=True)
    df_train.reset_index(inplace=True)
    df_train.drop('index',inplace=True, axis=1)
    df_test_id_temp = df_test['id'].copy()
    df_all = pd.concat([df_train,df_test.drop('id',axis=1)])
    df_all.drop('customer_country.1',axis=1,inplace=True)
    df_all.id_strategic_ver.fillna(0., inplace=True)
    df_all.it_strategic_ver.fillna(0., inplace=True)
    df_all.idit_strategic_ver.fillna(0., inplace=True)
    df_all.business_subarea.fillna('NAN', inplace=True)
    df_all.com_reg_ver_win_rate.fillna(0,inplace=True)
    df_all['customer_type'] = df_all['customer_type'].replace({
        'End-Customer':'End Customer'
    })
    df_all.customer_type.fillna('NAN',inplace=True)
    df_all.historical_existing_cnt.fillna(0, inplace=True)
    
    df_all.product_subcategory.fillna('NAN', inplace=True)
    
    df_all.product_modelname.fillna('NAN', inplace=True)

    df_all['expected_timeline'] = df_all['expected_timeline'].replace({
        'less_than_3_months': 'less than 3 months',  # merge into 'less than 3 months'
        'being followed up': 'being followed up.',  # merge into 'being followed up.'
        'less than 6 months': '3 months ~ 6 months',  # merge into '3 months ~ 6 months'
        '3_months_~_6_months' : '3 months ~ 6 months'
    })
    df_train['expected_timeline'] = df_train['expected_timeline'].replace({
        'less_than_3_months': 'less than 3 months',  # merge into 'less than 3 months'
        'being followed up': 'being followed up.',  # merge into 'being followed up.'
        'less than 6 months': '3 months ~ 6 months',  # merge into '3 months ~ 6 months'
        '3_months_~_6_months' : '3 months ~ 6 months'
    })
    
    top_expected_timelines = df_train['expected_timeline'].value_counts().nlargest(6).index
    df_all['expected_timeline'] = df_all['expected_timeline'].apply(lambda x: x if x in top_expected_timelines else 'Others')
    
    df_all.ver_win_rate_x.fillna(0,inplace=True)
    df_all.ver_win_ratio_per_bu.fillna(0,inplace=True)
    df_all.business_area.fillna('NAN',inplace=True)
    df_all.customer_job.fillna('NAN', inplace=True)
    df_all.product_category.fillna('NAN', inplace=True)
    
    #처리못한부분
    #customer_country
    df_all.customer_country.fillna('NAN', inplace=True)
    #inquiry_type
    df_all.inquiry_type.fillna('NAN', inplace=True)

    tmp_len = len(df_train)
    df_train = df_all.iloc[: tmp_len ]
    df_test = df_all.iloc[tmp_len :]
    df_test = pd.concat([df_test,df_test_id_temp],axis=1)
    return df_train, df_test
df_train, df_test = preprocessing(df_train, df_test)
df_train, df_test = manage_label_encoding(df_train,df_test)

In [267]:
rate = ((len(df_train)-sum(df_train['is_converted']))/sum(df_train['is_converted']))
def custom_split(df_train, num=5, idx=0, smote=False, false_rate=1):
    global rate

    #shuffle
    true_df_train = sklearn.utils.shuffle(df_train[df_train["is_converted"]==True], random_state=seed)
    false_df_train = sklearn.utils.shuffle(df_train[df_train["is_converted"]==False], random_state=seed)
    
    #k-fold num
    slice_num = len(true_df_train) // num
    
    #True:False = 1:1
    tmp_train = pd.concat([true_df_train[0:slice_num*idx],
                           true_df_train[slice_num*(idx+1):],
                           false_df_train[0:int((slice_num*false_rate)*idx)],
                           false_df_train[int((slice_num*false_rate)*(idx+1)):]])
    tmp_val = pd.concat([true_df_train[slice_num*idx:slice_num*(idx+1)],
                         false_df_train[int((slice_num*false_rate)*idx):int((slice_num*false_rate)*(idx+1))]])
    #shuffle
    tmp_train = sklearn.utils.shuffle(tmp_train, random_state=seed)
    tmp_val = sklearn.utils.shuffle(tmp_val, random_state=seed)

    #split x and y
    x_train, y_train = tmp_train.drop("is_converted",axis=1).fillna(0), tmp_train["is_converted"]
    x_val, y_val = tmp_val.drop("is_converted",axis=1).fillna(0), tmp_val["is_converted"]
    
    if(smote):
        sm = SMOTE(random_state=seed)
        x_train, y_train = sm.fit_resample(x_train, y_train)
    
    return x_train, y_train, x_val, y_val


all_x_trains = []
all_y_trains = []
all_x_vals = []
all_y_vals = []
for idx in range(k_fold_num):
    bundle = custom_split(df_train, num=k_fold_num, idx=idx, smote=False, false_rate=rate)
    all_x_trains.append(bundle[0])
    all_y_trains.append(bundle[1])
    all_x_vals.append(bundle[2])
    all_y_vals.append(bundle[3])

In [241]:
# params = {
#         'num_leaves': trial.suggest_int('num_leaves', 20,40), 
#         'max_depth': -1,
#         'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1), 
#         'n_estimators': trial.suggest_int('n_estimators', 50,200), 
#         'scale_pos_weight':rate,
#         'min_child_samples': trial.suggest_int('min_child_samples', 5, 80), 
#         'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.7, 1.0),
#         'reg_alpha': trial.suggest_uniform('reg_alpha', 0.0, 1.0),
#         'reg_lambda': trial.suggest_uniform('reg_lambda', 0.0, 10.0),
#         'objective':'binary',
#         'random_state': seed,
#         'type':'gpu',
#         'verbose':-1,
#         'n_jobs':-1,
#     }

def objective(trial):
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 20,40), 
        'max_depth': -1,
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1), 
        'n_estimators': trial.suggest_int('n_estimators', 50,200), 
        'scale_pos_weight':rate,
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 80), 
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.7, 1.0),
        'reg_alpha': trial.suggest_uniform('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_uniform('reg_lambda', 0.0, 10.0),
        'objective':'binary',
        'random_state': seed,
        'type':'gpu',
        'verbose':-1,
        'n_jobs':-1,
    }
    
    score = []
    for all_x_train, all_y_train, all_x_val, all_y_val in zip(all_x_trains, all_y_trains, all_x_vals, all_y_vals):
        clf = LGBMClassifier(**params)
        clf.fit(all_x_train, all_y_train)
        
        y_pred = clf.predict(all_x_val)
        y_true = all_y_val
        score.append((f1_score(y_true,y_pred,labels=[True,False]) + recall_score(y_true,y_pred))/2)
    score = np.mean(score)
    return score

# Hyperparameter Tuning
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=seed), pruner=SuccessiveHalvingPruner())
study.optimize(objective, n_trials=None)

[I 2024-02-07 19:32:30,143] A new study created in memory with name: no-name-7606cd3f-edf6-4ed4-979a-13f590276c35
[I 2024-02-07 19:32:31,362] Trial 0 finished with value: 0.8274774594024914 and parameters: {'num_leaves': 27, 'learning_rate': 0.0951207163345817, 'n_estimators': 160, 'min_child_samples': 50, 'colsample_bytree': 0.7468055921327309, 'reg_alpha': 0.15599452033620265, 'reg_lambda': 0.5808361216819946}. Best is trial 0 with value: 0.8274774594024914.
[I 2024-02-07 19:32:32,618] Trial 1 finished with value: 0.8222501925021044 and parameters: {'num_leaves': 38, 'learning_rate': 0.06051038616257767, 'n_estimators': 156, 'min_child_samples': 6, 'colsample_bytree': 0.9909729556485982, 'reg_alpha': 0.8324426408004217, 'reg_lambda': 2.1233911067827616}. Best is trial 0 with value: 0.8274774594024914.
[I 2024-02-07 19:32:33,382] Trial 2 finished with value: 0.7692359196854787 and parameters: {'num_leaves': 23, 'learning_rate': 0.01915704647548995, 'n_estimators': 95, 'min_child_sampl

[I 2024-02-07 19:33:01,601] Trial 23 finished with value: 0.8334546215952425 and parameters: {'num_leaves': 37, 'learning_rate': 0.0996263130561193, 'n_estimators': 168, 'min_child_samples': 56, 'colsample_bytree': 0.7401439072496075, 'reg_alpha': 0.6980966400054012, 'reg_lambda': 3.947312295189467}. Best is trial 22 with value: 0.8342217991514131.
[I 2024-02-07 19:33:02,926] Trial 24 finished with value: 0.8177015272089934 and parameters: {'num_leaves': 38, 'learning_rate': 0.04751239301678558, 'n_estimators': 150, 'min_child_samples': 61, 'colsample_bytree': 0.8556091826468576, 'reg_alpha': 0.8523599597695553, 'reg_lambda': 3.0045620412859497}. Best is trial 22 with value: 0.8342217991514131.
[I 2024-02-07 19:33:04,520] Trial 25 finished with value: 0.8298653764606787 and parameters: {'num_leaves': 39, 'learning_rate': 0.06487672736026631, 'n_estimators': 169, 'min_child_samples': 49, 'colsample_bytree': 0.7987829060845949, 'reg_alpha': 0.6408941435366401, 'reg_lambda': 1.15715448077

[I 2024-02-07 19:33:37,499] Trial 47 finished with value: 0.841587748951589 and parameters: {'num_leaves': 40, 'learning_rate': 0.0938351135048438, 'n_estimators': 192, 'min_child_samples': 23, 'colsample_bytree': 0.7136533199436713, 'reg_alpha': 0.8271918736528732, 'reg_lambda': 0.051896066649379555}. Best is trial 44 with value: 0.84272449135544.
[I 2024-02-07 19:33:39,528] Trial 48 finished with value: 0.7928310185101454 and parameters: {'num_leaves': 40, 'learning_rate': 0.009846005190651805, 'n_estimators': 193, 'min_child_samples': 23, 'colsample_bytree': 0.7083574843399362, 'reg_alpha': 0.999573833946649, 'reg_lambda': 0.02684344386975024}. Best is trial 44 with value: 0.84272449135544.
[I 2024-02-07 19:33:40,998] Trial 49 finished with value: 0.8403350692624618 and parameters: {'num_leaves': 36, 'learning_rate': 0.09559572686395684, 'n_estimators': 192, 'min_child_samples': 5, 'colsample_bytree': 0.7128715420060665, 'reg_alpha': 0.8353069060023212, 'reg_lambda': 0.6190548940188

[I 2024-02-07 19:34:12,107] Trial 71 finished with value: 0.8392822110150202 and parameters: {'num_leaves': 40, 'learning_rate': 0.09454276070344991, 'n_estimators': 189, 'min_child_samples': 16, 'colsample_bytree': 0.7420892620157201, 'reg_alpha': 0.9990185216343221, 'reg_lambda': 0.0035722531393971857}. Best is trial 44 with value: 0.84272449135544.
[I 2024-02-07 19:34:13,640] Trial 72 finished with value: 0.8388560379072099 and parameters: {'num_leaves': 40, 'learning_rate': 0.09078093949324598, 'n_estimators': 189, 'min_child_samples': 23, 'colsample_bytree': 0.7727730806674553, 'reg_alpha': 0.9738648208854954, 'reg_lambda': 0.043662748277107574}. Best is trial 44 with value: 0.84272449135544.
[I 2024-02-07 19:34:15,131] Trial 73 finished with value: 0.8422759321960724 and parameters: {'num_leaves': 38, 'learning_rate': 0.09950263582743463, 'n_estimators': 199, 'min_child_samples': 26, 'colsample_bytree': 0.7438424364197337, 'reg_alpha': 0.9874522478050064, 'reg_lambda': 0.93649023

[I 2024-02-07 19:34:47,611] Trial 95 finished with value: 0.8395570993985855 and parameters: {'num_leaves': 38, 'learning_rate': 0.09149586216485707, 'n_estimators': 192, 'min_child_samples': 5, 'colsample_bytree': 0.706755119549911, 'reg_alpha': 0.9808428389435637, 'reg_lambda': 0.49487531001170115}. Best is trial 44 with value: 0.84272449135544.
[I 2024-02-07 19:34:49,078] Trial 96 finished with value: 0.8397050087853524 and parameters: {'num_leaves': 37, 'learning_rate': 0.09997764169009228, 'n_estimators': 197, 'min_child_samples': 13, 'colsample_bytree': 0.7462320726523681, 'reg_alpha': 0.9267371257609474, 'reg_lambda': 0.9115924212773289}. Best is trial 44 with value: 0.84272449135544.
[I 2024-02-07 19:34:50,600] Trial 97 finished with value: 0.8305047773597669 and parameters: {'num_leaves': 38, 'learning_rate': 0.05359948827465781, 'n_estimators': 178, 'min_child_samples': 10, 'colsample_bytree': 0.7179081338196742, 'reg_alpha': 0.8200662585171203, 'reg_lambda': 0.75264306548216

[I 2024-02-07 19:35:25,319] Trial 119 finished with value: 0.8370114187617448 and parameters: {'num_leaves': 37, 'learning_rate': 0.09386409577540292, 'n_estimators': 193, 'min_child_samples': 23, 'colsample_bytree': 0.761791325953788, 'reg_alpha': 0.878998273134457, 'reg_lambda': 0.43533907873993494}. Best is trial 44 with value: 0.84272449135544.
[I 2024-02-07 19:35:26,796] Trial 120 finished with value: 0.8389406499010722 and parameters: {'num_leaves': 39, 'learning_rate': 0.0913029037666722, 'n_estimators': 190, 'min_child_samples': 13, 'colsample_bytree': 0.7398339870891124, 'reg_alpha': 0.9989385986234911, 'reg_lambda': 0.012314830529804963}. Best is trial 44 with value: 0.84272449135544.
[I 2024-02-07 19:35:28,308] Trial 121 finished with value: 0.8417309107613195 and parameters: {'num_leaves': 36, 'learning_rate': 0.09657410196335434, 'n_estimators': 197, 'min_child_samples': 6, 'colsample_bytree': 0.7147730955937667, 'reg_alpha': 0.9668907636942033, 'reg_lambda': 0.96206294253

[I 2024-02-07 19:36:00,716] Trial 143 finished with value: 0.8374588388499298 and parameters: {'num_leaves': 38, 'learning_rate': 0.09068474993766444, 'n_estimators': 194, 'min_child_samples': 7, 'colsample_bytree': 0.7493779585511734, 'reg_alpha': 0.24737245322400603, 'reg_lambda': 1.0182804229632714}. Best is trial 44 with value: 0.84272449135544.
[I 2024-02-07 19:36:02,224] Trial 144 finished with value: 0.8375993894503446 and parameters: {'num_leaves': 37, 'learning_rate': 0.09418880436547988, 'n_estimators': 185, 'min_child_samples': 14, 'colsample_bytree': 0.741060969632804, 'reg_alpha': 0.9419979743822123, 'reg_lambda': 2.2724712335976873}. Best is trial 44 with value: 0.84272449135544.
[I 2024-02-07 19:36:03,768] Trial 145 finished with value: 0.8384760487780764 and parameters: {'num_leaves': 40, 'learning_rate': 0.09651382032228707, 'n_estimators': 197, 'min_child_samples': 10, 'colsample_bytree': 0.727637160496846, 'reg_alpha': 0.9975596561783374, 'reg_lambda': 0.311013480962

[I 2024-02-07 19:36:38,042] Trial 167 finished with value: 0.8402231494843146 and parameters: {'num_leaves': 39, 'learning_rate': 0.09996539534155678, 'n_estimators': 193, 'min_child_samples': 16, 'colsample_bytree': 0.7141969002969443, 'reg_alpha': 0.9106330672950746, 'reg_lambda': 0.8636648363600091}. Best is trial 153 with value: 0.8441547497383001.
[I 2024-02-07 19:36:39,572] Trial 168 finished with value: 0.8408890457502183 and parameters: {'num_leaves': 40, 'learning_rate': 0.09732438067052872, 'n_estimators': 185, 'min_child_samples': 14, 'colsample_bytree': 0.7085437077857736, 'reg_alpha': 0.956620677318449, 'reg_lambda': 1.1693831568626742}. Best is trial 153 with value: 0.8441547497383001.
[I 2024-02-07 19:36:40,902] Trial 169 finished with value: 0.8376383521447839 and parameters: {'num_leaves': 39, 'learning_rate': 0.09359455057010425, 'n_estimators': 181, 'min_child_samples': 15, 'colsample_bytree': 0.706823332241189, 'reg_alpha': 0.9598615559558467, 'reg_lambda': 1.044986

[I 2024-02-07 19:37:16,033] Trial 191 finished with value: 0.8391321339296545 and parameters: {'num_leaves': 38, 'learning_rate': 0.09583113024938811, 'n_estimators': 190, 'min_child_samples': 19, 'colsample_bytree': 0.7202722488434423, 'reg_alpha': 0.9479699834263362, 'reg_lambda': 0.5274810791213237}. Best is trial 153 with value: 0.8441547497383001.
[I 2024-02-07 19:37:17,681] Trial 192 finished with value: 0.8425059992120542 and parameters: {'num_leaves': 39, 'learning_rate': 0.09994725895959351, 'n_estimators': 197, 'min_child_samples': 12, 'colsample_bytree': 0.7276813726873107, 'reg_alpha': 0.41566657842218313, 'reg_lambda': 0.3479824479732311}. Best is trial 153 with value: 0.8441547497383001.
[I 2024-02-07 19:37:18,829] Trial 193 finished with value: 0.8334374780141391 and parameters: {'num_leaves': 39, 'learning_rate': 0.09825825482217708, 'n_estimators': 146, 'min_child_samples': 13, 'colsample_bytree': 0.730526834716509, 'reg_alpha': 0.4270012164550897, 'reg_lambda': 0.3533

[I 2024-02-07 19:37:51,295] Trial 215 finished with value: 0.8332297204176818 and parameters: {'num_leaves': 38, 'learning_rate': 0.06332838469484281, 'n_estimators': 184, 'min_child_samples': 13, 'colsample_bytree': 0.7267618972919524, 'reg_alpha': 0.9755307984459485, 'reg_lambda': 0.008082318441187036}. Best is trial 153 with value: 0.8441547497383001.
[I 2024-02-07 19:37:52,611] Trial 216 finished with value: 0.8348484852386365 and parameters: {'num_leaves': 30, 'learning_rate': 0.09852491346703497, 'n_estimators': 195, 'min_child_samples': 10, 'colsample_bytree': 0.7192302433497181, 'reg_alpha': 0.35580631261083584, 'reg_lambda': 0.6829575354148958}. Best is trial 153 with value: 0.8441547497383001.
[I 2024-02-07 19:37:54,571] Trial 217 finished with value: 0.8412473199906912 and parameters: {'num_leaves': 39, 'learning_rate': 0.09321409943867057, 'n_estimators': 192, 'min_child_samples': 27, 'colsample_bytree': 0.7102892987241893, 'reg_alpha': 0.5307350812970748, 'reg_lambda': 0.2

[I 2024-02-07 19:38:26,451] Trial 239 finished with value: 0.8389962927348978 and parameters: {'num_leaves': 39, 'learning_rate': 0.08620002148022185, 'n_estimators': 197, 'min_child_samples': 26, 'colsample_bytree': 0.7204893069110768, 'reg_alpha': 0.8586085787286228, 'reg_lambda': 0.01832052895318942}. Best is trial 153 with value: 0.8441547497383001.
[I 2024-02-07 19:38:27,988] Trial 240 finished with value: 0.838752583952312 and parameters: {'num_leaves': 39, 'learning_rate': 0.09070129704605338, 'n_estimators': 193, 'min_child_samples': 30, 'colsample_bytree': 0.917263027075369, 'reg_alpha': 0.8934853049138023, 'reg_lambda': 0.3780416002826814}. Best is trial 153 with value: 0.8441547497383001.
[I 2024-02-07 19:38:29,519] Trial 241 finished with value: 0.8415165530908719 and parameters: {'num_leaves': 38, 'learning_rate': 0.09415644917099143, 'n_estimators': 189, 'min_child_samples': 28, 'colsample_bytree': 0.7142614123330485, 'reg_alpha': 0.8662846345978282, 'reg_lambda': 0.54462

[I 2024-02-07 19:39:02,937] Trial 263 finished with value: 0.8358331063323347 and parameters: {'num_leaves': 39, 'learning_rate': 0.09642958614792392, 'n_estimators': 184, 'min_child_samples': 47, 'colsample_bytree': 0.8048244663829056, 'reg_alpha': 0.9996977121888848, 'reg_lambda': 0.2744906234240697}. Best is trial 153 with value: 0.8441547497383001.
[I 2024-02-07 19:39:04,468] Trial 264 finished with value: 0.8391587165533775 and parameters: {'num_leaves': 38, 'learning_rate': 0.09100445670951182, 'n_estimators': 198, 'min_child_samples': 27, 'colsample_bytree': 0.7134849844794818, 'reg_alpha': 0.843436033233521, 'reg_lambda': 0.006221965027997933}. Best is trial 153 with value: 0.8441547497383001.
[I 2024-02-07 19:39:06,040] Trial 265 finished with value: 0.8354495577946499 and parameters: {'num_leaves': 38, 'learning_rate': 0.06814798493384669, 'n_estimators': 194, 'min_child_samples': 63, 'colsample_bytree': 0.7172372985554725, 'reg_alpha': 0.43925266842390087, 'reg_lambda': 0.70

[I 2024-02-07 19:39:38,792] Trial 287 finished with value: 0.8392532135907225 and parameters: {'num_leaves': 40, 'learning_rate': 0.08787690630819713, 'n_estimators': 192, 'min_child_samples': 18, 'colsample_bytree': 0.7241598088734605, 'reg_alpha': 0.47504993545272184, 'reg_lambda': 0.16233660267932326}. Best is trial 153 with value: 0.8441547497383001.
[I 2024-02-07 19:39:40,315] Trial 288 finished with value: 0.8372777293967436 and parameters: {'num_leaves': 40, 'learning_rate': 0.09235067924187312, 'n_estimators': 185, 'min_child_samples': 14, 'colsample_bytree': 0.8690659512629741, 'reg_alpha': 0.7950835215044604, 'reg_lambda': 0.2172857364288563}. Best is trial 153 with value: 0.8441547497383001.
[I 2024-02-07 19:39:41,582] Trial 289 finished with value: 0.8364851286891322 and parameters: {'num_leaves': 36, 'learning_rate': 0.09183589908740052, 'n_estimators': 164, 'min_child_samples': 12, 'colsample_bytree': 0.7380106292718273, 'reg_alpha': 0.859239730431566, 'reg_lambda': 0.317

[I 2024-02-07 19:40:14,767] Trial 311 finished with value: 0.8392101398231443 and parameters: {'num_leaves': 40, 'learning_rate': 0.09968741806425735, 'n_estimators': 198, 'min_child_samples': 34, 'colsample_bytree': 0.7455933623349053, 'reg_alpha': 0.2783638555144615, 'reg_lambda': 0.013719353419856112}. Best is trial 153 with value: 0.8441547497383001.
[I 2024-02-07 19:40:16,317] Trial 312 finished with value: 0.8376408317056953 and parameters: {'num_leaves': 40, 'learning_rate': 0.0969648063599541, 'n_estimators': 200, 'min_child_samples': 44, 'colsample_bytree': 0.7170862236140807, 'reg_alpha': 0.21067153927072022, 'reg_lambda': 0.4590044504508374}. Best is trial 153 with value: 0.8441547497383001.
[I 2024-02-07 19:40:17,110] Trial 313 finished with value: 0.8257748771019463 and parameters: {'num_leaves': 40, 'learning_rate': 0.09671506520790239, 'n_estimators': 94, 'min_child_samples': 33, 'colsample_bytree': 0.7337281227099363, 'reg_alpha': 0.2632564403455743, 'reg_lambda': 0.176

[I 2024-02-07 19:40:52,554] Trial 335 finished with value: 0.8385218784223856 and parameters: {'num_leaves': 39, 'learning_rate': 0.09553552753099305, 'n_estimators': 196, 'min_child_samples': 18, 'colsample_bytree': 0.7170752216498854, 'reg_alpha': 0.2330721356469065, 'reg_lambda': 0.3879425440050244}. Best is trial 153 with value: 0.8441547497383001.
[I 2024-02-07 19:40:54,300] Trial 336 finished with value: 0.8376719707488116 and parameters: {'num_leaves': 40, 'learning_rate': 0.098232546050673, 'n_estimators': 200, 'min_child_samples': 23, 'colsample_bytree': 0.7323577953206983, 'reg_alpha': 0.26113083538497184, 'reg_lambda': 0.19300997730205383}. Best is trial 153 with value: 0.8441547497383001.
[I 2024-02-07 19:40:55,791] Trial 337 finished with value: 0.8373472143720416 and parameters: {'num_leaves': 38, 'learning_rate': 0.09982234066766629, 'n_estimators': 193, 'min_child_samples': 68, 'colsample_bytree': 0.7233736254539299, 'reg_alpha': 0.219473406148152, 'reg_lambda': 0.70977

[I 2024-02-07 19:41:28,552] Trial 359 finished with value: 0.8393901827162302 and parameters: {'num_leaves': 37, 'learning_rate': 0.09793909185142635, 'n_estimators': 197, 'min_child_samples': 17, 'colsample_bytree': 0.7082595031811232, 'reg_alpha': 0.9767783700715377, 'reg_lambda': 0.5577911273718444}. Best is trial 153 with value: 0.8441547497383001.
[I 2024-02-07 19:41:29,604] Trial 360 finished with value: 0.8354173487730027 and parameters: {'num_leaves': 40, 'learning_rate': 0.0946830743991245, 'n_estimators': 137, 'min_child_samples': 14, 'colsample_bytree': 0.7204493964424795, 'reg_alpha': 0.9993610907786192, 'reg_lambda': 0.016310645386495518}. Best is trial 153 with value: 0.8441547497383001.
[I 2024-02-07 19:41:31,012] Trial 361 finished with value: 0.8340293071935274 and parameters: {'num_leaves': 30, 'learning_rate': 0.0962416631701923, 'n_estimators': 197, 'min_child_samples': 11, 'colsample_bytree': 0.7276569749769414, 'reg_alpha': 0.945376607406027, 'reg_lambda': 4.23733

[I 2024-02-07 19:42:05,208] Trial 383 finished with value: 0.8327306801309511 and parameters: {'num_leaves': 37, 'learning_rate': 0.0943318692287303, 'n_estimators': 161, 'min_child_samples': 13, 'colsample_bytree': 0.8347529002363205, 'reg_alpha': 0.9367478502124491, 'reg_lambda': 0.7482722675616587}. Best is trial 153 with value: 0.8441547497383001.
[I 2024-02-07 19:42:06,727] Trial 384 finished with value: 0.8396683119716689 and parameters: {'num_leaves': 39, 'learning_rate': 0.09987564554980148, 'n_estimators': 192, 'min_child_samples': 9, 'colsample_bytree': 0.715816985691191, 'reg_alpha': 0.3005033991208481, 'reg_lambda': 0.3297854525954159}. Best is trial 153 with value: 0.8441547497383001.
[I 2024-02-07 19:42:08,263] Trial 385 finished with value: 0.8404298355886717 and parameters: {'num_leaves': 40, 'learning_rate': 0.09696173485888078, 'n_estimators': 197, 'min_child_samples': 15, 'colsample_bytree': 0.7224579406551511, 'reg_alpha': 0.9838953457574189, 'reg_lambda': 0.4744741

[I 2024-02-07 19:42:40,454] Trial 407 finished with value: 0.8406653083414183 and parameters: {'num_leaves': 38, 'learning_rate': 0.09238212492596991, 'n_estimators': 200, 'min_child_samples': 26, 'colsample_bytree': 0.7320941886649887, 'reg_alpha': 0.348715781166263, 'reg_lambda': 0.42181694822735916}. Best is trial 153 with value: 0.8441547497383001.
[I 2024-02-07 19:42:41,918] Trial 408 finished with value: 0.8387414526673966 and parameters: {'num_leaves': 37, 'learning_rate': 0.09631335990819088, 'n_estimators': 189, 'min_child_samples': 16, 'colsample_bytree': 0.7218224000828833, 'reg_alpha': 0.28240369212783084, 'reg_lambda': 3.12287658241277}. Best is trial 153 with value: 0.8441547497383001.
[I 2024-02-07 19:42:43,545] Trial 409 finished with value: 0.8395929233108153 and parameters: {'num_leaves': 40, 'learning_rate': 0.09530315129609584, 'n_estimators': 194, 'min_child_samples': 15, 'colsample_bytree': 0.7006035895264149, 'reg_alpha': 0.309214003906851, 'reg_lambda': 0.198648

KeyboardInterrupt: 

In [268]:
best_param = study.best_params
best_param['verbose'] = -1
best_param['random_state'] = seed
best_param['objective'] = 'binary'
best_param['verbose'] = -1
best_param['scale_pos_weight'] = rate
best_param

{'num_leaves': 39,
 'learning_rate': 0.09211688808142465,
 'n_estimators': 192,
 'min_child_samples': 10,
 'colsample_bytree': 0.7003587174473991,
 'reg_alpha': 0.9874391587667168,
 'reg_lambda': 0.003041692669265003,
 'verbose': -1,
 'random_state': 42,
 'objective': 'binary',
 'scale_pos_weight': 11.073593073593074}

In [269]:
best_models = []
for all_x_train, all_y_train in zip(all_x_trains, all_y_trains):
    best_model = LGBMClassifier(**best_param)
    best_model.fit(all_x_train, all_y_train)
    best_models.append(best_model)

In [270]:
def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [271]:
train_pred = np.mean([best_model.predict_proba(df_train.drop("is_converted",axis=1).fillna(0)) for best_model in best_models], axis=0)
y = df_train['is_converted']
threshold = min([num[1] if y[i] else 1 for i, num in enumerate(train_pred)])
train_y_pred = [True if i[1]>=threshold else False for i in train_pred]
get_clf_eval(df_train['is_converted'], train_y_pred)

오차행렬:
 [[ 4620     0]
 [ 2611 48549]]

정확도: 0.9532
정밀도: 0.6389
재현율: 1.0000
F1: 0.7797


In [272]:
train_y_pred = [True if i[1]>=0.5 else False for i in train_pred]
get_clf_eval(df_train['is_converted'], train_y_pred)

오차행렬:
 [[ 4599    21]
 [ 1532 49628]]

정확도: 0.9722
정밀도: 0.7501
재현율: 0.9955
F1: 0.8555


In [273]:
x_train=df_train.drop("is_converted",axis=1).fillna(0)
y_train = df_train["is_converted"]
f1_sum = 0
for i in range(k_fold_num):
    pred = best_models[i].predict(all_x_vals[i])
    get_clf_eval(all_y_vals[i], pred)
    f1_sum += f1_score(all_y_vals[i], pred, labels=[True, False])
print(f'\n\n{f1_sum/k_fold_num}')

오차행렬:
 [[ 862   62]
 [ 366 9866]]

정확도: 0.9616
정밀도: 0.7020
재현율: 0.9329
F1: 0.8011
오차행렬:
 [[ 825   99]
 [ 395 9837]]

정확도: 0.9557
정밀도: 0.6762
재현율: 0.8929
F1: 0.7696
오차행렬:
 [[ 853   71]
 [ 415 9817]]

정확도: 0.9564
정밀도: 0.6727
재현율: 0.9232
F1: 0.7783
오차행렬:
 [[ 830   94]
 [ 411 9821]]

정확도: 0.9547
정밀도: 0.6688
재현율: 0.8983
F1: 0.7667
오차행렬:
 [[ 848   76]
 [ 457 9775]]

정확도: 0.9522
정밀도: 0.6498
재현율: 0.9177
F1: 0.7609


0.7753224864895871


In [274]:
# 예측에 필요한 데이터 분리
x_test = df_test.drop(["is_converted", "id"], axis=1).fillna(0)

In [275]:
# 예측 수행 (soft voting)
print(format(threshold,'f'))
a_pred = [best_model.predict_proba(x_test) for best_model in best_models]
pred = np.mean(a_pred, axis=0)
pred = [True if i[1]>threshold else False for i in pred]
sum(pred)

0.351002


1586

In [276]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

In [277]:
real = pd.read_csv('73per.csv')
get_clf_eval(real['is_converted'], pred)

오차행렬:
 [[1572  582]
 [  14 3103]]

정확도: 0.8869
정밀도: 0.9912
재현율: 0.7298
F1: 0.8406
