In [20]:
!pip3 install lightgbm scikit-learn tqdm catboost optuna xgboost ipywidgets imblearn
# 모델을 위해 필요한 라이브러리 설치(우수 예시에만 있는 부분)



[0m

In [21]:
k_fold_num = 5
seed=42

In [22]:
import pandas as pd
import sklearn
import numpy as np
from lightgbm import LGBMClassifier
import random
import os
from imblearn.over_sampling import SMOTE
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import SuccessiveHalvingPruner
import warnings
warnings.filterwarnings('ignore')


In [23]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(seed) # Seed 고정

In [24]:
df_train = pd.read_csv("train.csv") # 학습용 데이터
df_test = pd.read_csv("submission.csv") # 테스트 데이터(제출파일의 데이터)

In [25]:
def label_encoding(series: pd.Series) -> pd.Series:
    """범주형 데이터를 시리즈 형태로 받아 숫자형 데이터로 변환합니다."""
    
    my_dict = {}

    # 모든 요소를 문자열로 변환
    series = series.astype(str)

    for idx, value in enumerate(sorted(series.unique())):
        my_dict[value] = idx
    series = series.map(my_dict)

    return series

def category_all(df_train, df_test):
    label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_position",
    "response_corporate",
    "expected_timeline",
    ]
    for i in label_columns:
        df_train[i] = df_train[i].astype('category')
        df_test[i] = df_test[i].astype('category')
    return df_train, df_test    
        
def manage_label_encoding(df_train, df_test):
    tmp = len(df_train)
    label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_position",
    "response_corporate",
    "expected_timeline",
    ]
    
    df_all = pd.concat([df_train[label_columns], df_test[label_columns]])

    for col in label_columns:
        df_all[col] = label_encoding(df_all[col])
    
    for col in label_columns:  
        df_train[col] = df_all.iloc[: tmp][col]
        df_test[col] = df_all.iloc[tmp :][col]

    return df_train, df_test

def preprocessing(df_train, df_test):
    df_train.drop_duplicates(keep='first', inplace=True)
    df_train.reset_index(inplace=True)
    df_train.drop('index',inplace=True, axis=1)
    df_test_id_temp = df_test['id'].copy()
    df_all = pd.concat([df_train,df_test.drop('id',axis=1)])
    df_all.drop('customer_country.1',axis=1,inplace=True)
    df_all.id_strategic_ver.fillna(0., inplace=True)
    df_all.it_strategic_ver.fillna(0., inplace=True)
    df_all.idit_strategic_ver.fillna(0., inplace=True)
    df_all.business_subarea.fillna('NAN', inplace=True)
    df_all.com_reg_ver_win_rate.fillna(0,inplace=True)
    df_all['customer_type'] = df_all['customer_type'].replace({
        'End-Customer':'End Customer'
    })
    df_all.customer_type.fillna('NAN',inplace=True)
    df_all.historical_existing_cnt.fillna(0, inplace=True)
    
    df_all.product_subcategory.fillna('NAN', inplace=True)
    
    df_all.product_modelname.fillna('NAN', inplace=True)

    df_all['expected_timeline'] = df_all['expected_timeline'].replace({
        'less_than_3_months': 'less than 3 months',  # merge into 'less than 3 months'
        'being followed up': 'being followed up.',  # merge into 'being followed up.'
        'less than 6 months': '3 months ~ 6 months',  # merge into '3 months ~ 6 months'
        '3_months_~_6_months' : '3 months ~ 6 months'
    })
    df_train['expected_timeline'] = df_train['expected_timeline'].replace({
        'less_than_3_months': 'less than 3 months',  # merge into 'less than 3 months'
        'being followed up': 'being followed up.',  # merge into 'being followed up.'
        'less than 6 months': '3 months ~ 6 months',  # merge into '3 months ~ 6 months'
        '3_months_~_6_months' : '3 months ~ 6 months'
    })
    
    top_expected_timelines = df_train['expected_timeline'].value_counts().nlargest(6).index
    df_all['expected_timeline'] = df_all['expected_timeline'].apply(lambda x: x if x in top_expected_timelines else 'Others')
    
    df_all.ver_win_rate_x.fillna(0,inplace=True)
    df_all.ver_win_ratio_per_bu.fillna(0,inplace=True)
    df_all.business_area.fillna('NAN',inplace=True)
    df_all.customer_job.fillna('NAN', inplace=True)
    df_all.product_category.fillna('NAN', inplace=True)
    
    #처리못한부분
    #customer_country
    df_all.customer_country.fillna('NAN', inplace=True)
    #inquiry_type
    df_all.inquiry_type.fillna('NAN', inplace=True)

    tmp_len = len(df_train)
    df_train = df_all.iloc[: tmp_len ]
    df_test = df_all.iloc[tmp_len :]
    df_test = pd.concat([df_test,df_test_id_temp],axis=1)
    return df_train, df_test
df_train, df_test = preprocessing(df_train, df_test)
df_train, df_test = category_all(df_train,df_test)

In [26]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 55780 entries, 0 to 55779
Data columns (total 28 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   bant_submit              55780 non-null  float64 
 1   customer_country         55780 non-null  category
 2   business_unit            55780 non-null  category
 3   com_reg_ver_win_rate     55780 non-null  float64 
 4   customer_idx             55780 non-null  int64   
 5   customer_type            55780 non-null  category
 6   enterprise               55780 non-null  category
 7   historical_existing_cnt  55780 non-null  float64 
 8   id_strategic_ver         55780 non-null  float64 
 9   it_strategic_ver         55780 non-null  float64 
 10  idit_strategic_ver       55780 non-null  float64 
 11  customer_job             55780 non-null  category
 12  lead_desc_length         55780 non-null  int64   
 13  inquiry_type             55780 non-null  category
 14  product_cat

In [28]:
rate = ((len(df_train)-sum(df_train['is_converted']))/sum(df_train['is_converted']))
def custom_split(df_train, num=5, idx=0, smote=False, false_rate=1):
    global rate

    #shuffle
    true_df_train = sklearn.utils.shuffle(df_train[df_train["is_converted"]==True], random_state=seed)
    false_df_train = sklearn.utils.shuffle(df_train[df_train["is_converted"]==False], random_state=seed)
    
    #k-fold num
    slice_num = len(true_df_train) // num
    
    #True:False = 1:1
    tmp_train = pd.concat([true_df_train[0:slice_num*idx],
                           true_df_train[slice_num*(idx+1):],
                           false_df_train[0:int((slice_num*false_rate)*idx)],
                           false_df_train[int((slice_num*false_rate)*(idx+1)):]])
    tmp_val = pd.concat([true_df_train[slice_num*idx:slice_num*(idx+1)],
                         false_df_train[int((slice_num*false_rate)*idx):int((slice_num*false_rate)*(idx+1))]])
    #shuffle
    tmp_train = sklearn.utils.shuffle(tmp_train, random_state=seed)
    tmp_val = sklearn.utils.shuffle(tmp_val, random_state=seed)

    #split x and y
    x_train, y_train = tmp_train.drop("is_converted",axis=1), tmp_train["is_converted"]
    x_val, y_val = tmp_val.drop("is_converted",axis=1), tmp_val["is_converted"]
    
    if(smote):
        sm = SMOTE(random_state=seed)
        x_train, y_train = sm.fit_resample(x_train, y_train)
    
    return x_train, y_train, x_val, y_val


all_x_trains = []
all_y_trains = []
all_x_vals = []
all_y_vals = []
for idx in range(k_fold_num):
    bundle = custom_split(df_train, num=k_fold_num, idx=idx, smote=False, false_rate=rate)
    all_x_trains.append(bundle[0])
    all_y_trains.append(bundle[1])
    all_x_vals.append(bundle[2])
    all_y_vals.append(bundle[3])

In [29]:
# params = {
#         'num_leaves': trial.suggest_int('num_leaves', 20,40), 
#         'max_depth': -1,
#         'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1), 
#         'n_estimators': trial.suggest_int('n_estimators', 50,200), 
#         'scale_pos_weight':rate,
#         'min_child_samples': trial.suggest_int('min_child_samples', 5, 80), 
#         'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.7, 1.0),
#         'reg_alpha': trial.suggest_uniform('reg_alpha', 0.0, 1.0),
#         'reg_lambda': trial.suggest_uniform('reg_lambda', 0.0, 10.0),
#         'objective':'binary',
#         'random_state': seed,
#         'type':'gpu',
#         'verbose':-1,
#         'n_jobs':-1,
#     }

def objective(trial):
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 20,40), 
        'max_depth': -1,
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1), 
        'n_estimators': trial.suggest_int('n_estimators', 50,200), 
        'scale_pos_weight':rate,
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 80), 
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.7, 1.0),
        'reg_alpha': trial.suggest_uniform('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_uniform('reg_lambda', 0.0, 10.0),
        'objective':'binary',
        'random_state': seed,
        'type':'gpu',
        'verbose':-1,
        'n_jobs':-1,
    }
    
    score = []
    for all_x_train, all_y_train, all_x_val, all_y_val in zip(all_x_trains, all_y_trains, all_x_vals, all_y_vals):
        clf = LGBMClassifier(**params)
        clf.fit(all_x_train, all_y_train, feature_name='auto', categorical_feature = 'auto', )
        
        y_pred = clf.predict(all_x_val)
        y_true = all_y_val
        score.append((f1_score(y_true,y_pred,labels=[True,False]) + recall_score(y_true,y_pred))/2)
    score = np.mean(score)
    return score

# Hyperparameter Tuning
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=seed), pruner=SuccessiveHalvingPruner())
study.optimize(objective, n_trials=None)

[I 2024-02-07 23:32:42,797] A new study created in memory with name: no-name-ac6a6e86-c283-4b5a-9700-517c75cceb93
[I 2024-02-07 23:32:44,426] Trial 0 finished with value: 0.8174005731833329 and parameters: {'num_leaves': 27, 'learning_rate': 0.0951207163345817, 'n_estimators': 160, 'min_child_samples': 50, 'colsample_bytree': 0.7468055921327309, 'reg_alpha': 0.15599452033620265, 'reg_lambda': 0.5808361216819946}. Best is trial 0 with value: 0.8174005731833329.
[I 2024-02-07 23:32:46,382] Trial 1 finished with value: 0.811614082020571 and parameters: {'num_leaves': 38, 'learning_rate': 0.06051038616257767, 'n_estimators': 156, 'min_child_samples': 6, 'colsample_bytree': 0.9909729556485982, 'reg_alpha': 0.8324426408004217, 'reg_lambda': 2.1233911067827616}. Best is trial 0 with value: 0.8174005731833329.
[I 2024-02-07 23:32:47,308] Trial 2 finished with value: 0.7748214999039584 and parameters: {'num_leaves': 23, 'learning_rate': 0.01915704647548995, 'n_estimators': 95, 'min_child_sample

[I 2024-02-07 23:33:20,603] Trial 23 finished with value: 0.8156926417650661 and parameters: {'num_leaves': 34, 'learning_rate': 0.08983954790675995, 'n_estimators': 152, 'min_child_samples': 80, 'colsample_bytree': 0.7720235478621211, 'reg_alpha': 0.32319077934642837, 'reg_lambda': 3.153235668875576}. Best is trial 0 with value: 0.8174005731833329.
[I 2024-02-07 23:33:21,784] Trial 24 finished with value: 0.810259348078349 and parameters: {'num_leaves': 28, 'learning_rate': 0.05622901200725067, 'n_estimators': 104, 'min_child_samples': 71, 'colsample_bytree': 0.7273220767955038, 'reg_alpha': 0.0020509222467936516, 'reg_lambda': 0.11659823073936887}. Best is trial 0 with value: 0.8174005731833329.
[I 2024-02-07 23:33:24,301] Trial 25 finished with value: 0.8166371228558174 and parameters: {'num_leaves': 37, 'learning_rate': 0.07274744072977374, 'n_estimators': 131, 'min_child_samples': 59, 'colsample_bytree': 0.8581773546522253, 'reg_alpha': 0.15014749200811028, 'reg_lambda': 1.5733268

[I 2024-02-07 23:34:02,760] Trial 47 finished with value: 0.8138002869105495 and parameters: {'num_leaves': 22, 'learning_rate': 0.09975144644171624, 'n_estimators': 158, 'min_child_samples': 6, 'colsample_bytree': 0.8096310110161492, 'reg_alpha': 0.24062824856887163, 'reg_lambda': 7.4232813634123325}. Best is trial 46 with value: 0.817493543789908.
[I 2024-02-07 23:34:03,961] Trial 48 finished with value: 0.8086106268004869 and parameters: {'num_leaves': 21, 'learning_rate': 0.08378547843462288, 'n_estimators': 136, 'min_child_samples': 22, 'colsample_bytree': 0.8355123560345178, 'reg_alpha': 0.27924942234950884, 'reg_lambda': 1.775959033882839}. Best is trial 46 with value: 0.817493543789908.
[I 2024-02-07 23:34:05,403] Trial 49 finished with value: 0.8145132985289203 and parameters: {'num_leaves': 23, 'learning_rate': 0.07319028540874534, 'n_estimators': 150, 'min_child_samples': 5, 'colsample_bytree': 0.7945192330776911, 'reg_alpha': 0.12819592256350065, 'reg_lambda': 2.59189219160

[I 2024-02-07 23:34:46,819] Trial 71 finished with value: 0.8175133750211139 and parameters: {'num_leaves': 38, 'learning_rate': 0.09080660744250252, 'n_estimators': 183, 'min_child_samples': 29, 'colsample_bytree': 0.7395214766865522, 'reg_alpha': 0.34935860739756264, 'reg_lambda': 3.4826985974940934}. Best is trial 62 with value: 0.8198483720840374.
[I 2024-02-07 23:34:49,142] Trial 72 finished with value: 0.8175540028245036 and parameters: {'num_leaves': 39, 'learning_rate': 0.09128370825608169, 'n_estimators': 186, 'min_child_samples': 30, 'colsample_bytree': 0.7442525734321122, 'reg_alpha': 0.3692045534294375, 'reg_lambda': 3.4710825540850223}. Best is trial 62 with value: 0.8198483720840374.
[I 2024-02-07 23:34:51,135] Trial 73 finished with value: 0.8157919835257399 and parameters: {'num_leaves': 37, 'learning_rate': 0.09728133075080796, 'n_estimators': 172, 'min_child_samples': 37, 'colsample_bytree': 0.7069452067559144, 'reg_alpha': 0.49470018428818774, 'reg_lambda': 4.3441325

[I 2024-02-07 23:35:39,988] Trial 95 finished with value: 0.8176768413836368 and parameters: {'num_leaves': 38, 'learning_rate': 0.09522088138870208, 'n_estimators': 190, 'min_child_samples': 19, 'colsample_bytree': 0.7461592988675835, 'reg_alpha': 0.3872318573109201, 'reg_lambda': 2.8292042100477848}. Best is trial 62 with value: 0.8198483720840374.
[I 2024-02-07 23:35:41,955] Trial 96 finished with value: 0.8158432857742113 and parameters: {'num_leaves': 39, 'learning_rate': 0.08362908025854403, 'n_estimators': 184, 'min_child_samples': 24, 'colsample_bytree': 0.7119457828653948, 'reg_alpha': 0.10360275086763902, 'reg_lambda': 3.053033310037748}. Best is trial 62 with value: 0.8198483720840374.
[I 2024-02-07 23:35:43,765] Trial 97 finished with value: 0.8146539330304815 and parameters: {'num_leaves': 40, 'learning_rate': 0.0891076314175422, 'n_estimators': 169, 'min_child_samples': 12, 'colsample_bytree': 0.7357342768250354, 'reg_alpha': 0.2711942118982726, 'reg_lambda': 1.4301678167

[I 2024-02-07 23:36:21,776] Trial 119 finished with value: 0.8174389166811655 and parameters: {'num_leaves': 38, 'learning_rate': 0.07526880847202597, 'n_estimators': 182, 'min_child_samples': 30, 'colsample_bytree': 0.7175149426715572, 'reg_alpha': 0.3547636545637351, 'reg_lambda': 1.742715332748829}. Best is trial 62 with value: 0.8198483720840374.
[I 2024-02-07 23:36:23,509] Trial 120 finished with value: 0.8184340789500792 and parameters: {'num_leaves': 31, 'learning_rate': 0.09578646025770517, 'n_estimators': 197, 'min_child_samples': 32, 'colsample_bytree': 0.7020580792081009, 'reg_alpha': 0.42206937324678195, 'reg_lambda': 2.85473390493158}. Best is trial 62 with value: 0.8198483720840374.
[I 2024-02-07 23:36:25,223] Trial 121 finished with value: 0.8168602624409133 and parameters: {'num_leaves': 31, 'learning_rate': 0.09596014775171674, 'n_estimators': 191, 'min_child_samples': 32, 'colsample_bytree': 0.7074879640699778, 'reg_alpha': 0.3979964425935091, 'reg_lambda': 2.48614357

[I 2024-02-07 23:37:07,250] Trial 143 finished with value: 0.8169027743902479 and parameters: {'num_leaves': 40, 'learning_rate': 0.04441121507443595, 'n_estimators': 183, 'min_child_samples': 8, 'colsample_bytree': 0.7339463533057308, 'reg_alpha': 0.48409792259167944, 'reg_lambda': 6.9228090321031255}. Best is trial 62 with value: 0.8198483720840374.
[I 2024-02-07 23:37:09,251] Trial 144 finished with value: 0.8165292351086972 and parameters: {'num_leaves': 38, 'learning_rate': 0.05354780797262938, 'n_estimators': 197, 'min_child_samples': 10, 'colsample_bytree': 0.7466194696444725, 'reg_alpha': 0.4501890340291301, 'reg_lambda': 0.806197436559237}. Best is trial 62 with value: 0.8198483720840374.
[I 2024-02-07 23:37:10,994] Trial 145 finished with value: 0.8136159783039829 and parameters: {'num_leaves': 29, 'learning_rate': 0.050944089176379596, 'n_estimators': 193, 'min_child_samples': 5, 'colsample_bytree': 0.818331306617822, 'reg_alpha': 0.5413528447556695, 'reg_lambda': 2.61588573

[I 2024-02-07 23:37:55,985] Trial 167 finished with value: 0.8151706368399084 and parameters: {'num_leaves': 37, 'learning_rate': 0.05730480899255881, 'n_estimators': 188, 'min_child_samples': 20, 'colsample_bytree': 0.7279654623720566, 'reg_alpha': 0.4807878370379124, 'reg_lambda': 1.8924118244533101}. Best is trial 163 with value: 0.8203972011375779.
[I 2024-02-07 23:37:57,902] Trial 168 finished with value: 0.8152423125679128 and parameters: {'num_leaves': 36, 'learning_rate': 0.09086384867682037, 'n_estimators': 179, 'min_child_samples': 25, 'colsample_bytree': 0.738161227237905, 'reg_alpha': 0.38777843921573474, 'reg_lambda': 1.213346267684896}. Best is trial 163 with value: 0.8203972011375779.
[I 2024-02-07 23:38:00,687] Trial 169 finished with value: 0.8147967179039373 and parameters: {'num_leaves': 38, 'learning_rate': 0.09446548000324358, 'n_estimators': 182, 'min_child_samples': 28, 'colsample_bytree': 0.7569563345942781, 'reg_alpha': 0.4114431203443037, 'reg_lambda': 3.06141

[I 2024-02-07 23:38:54,112] Trial 191 finished with value: 0.8175764869832907 and parameters: {'num_leaves': 35, 'learning_rate': 0.06785043106121386, 'n_estimators': 189, 'min_child_samples': 33, 'colsample_bytree': 0.7091414294246835, 'reg_alpha': 0.4644807285607045, 'reg_lambda': 1.643812785445303}. Best is trial 163 with value: 0.8203972011375779.
[I 2024-02-07 23:38:56,690] Trial 192 finished with value: 0.8194357114968718 and parameters: {'num_leaves': 35, 'learning_rate': 0.0679499192273949, 'n_estimators': 184, 'min_child_samples': 28, 'colsample_bytree': 0.7036505400510636, 'reg_alpha': 0.43890162504458513, 'reg_lambda': 2.6226862437459864}. Best is trial 163 with value: 0.8203972011375779.
[I 2024-02-07 23:38:58,950] Trial 193 finished with value: 0.816577418861602 and parameters: {'num_leaves': 35, 'learning_rate': 0.07345829451075725, 'n_estimators': 191, 'min_child_samples': 31, 'colsample_bytree': 0.7020244631972221, 'reg_alpha': 0.44501081662479325, 'reg_lambda': 2.64929

[W 2024-02-07 23:39:43,014] Trial 213 failed with value None.


KeyboardInterrupt: 

In [30]:
best_param = study.best_params
best_param['verbose'] = -1
best_param['random_state'] = seed
best_param['objective'] = 'binary'
best_param['verbose'] = -1
best_param['scale_pos_weight'] = rate
best_param

{'num_leaves': 38,
 'learning_rate': 0.09405113587997047,
 'n_estimators': 191,
 'min_child_samples': 17,
 'colsample_bytree': 0.7367765197862572,
 'reg_alpha': 0.3822819694158223,
 'reg_lambda': 2.291905899445273,
 'verbose': -1,
 'random_state': 42,
 'objective': 'binary',
 'scale_pos_weight': 11.073593073593074}

In [31]:
best_models = []
for all_x_train, all_y_train in zip(all_x_trains, all_y_trains):
    best_model = LGBMClassifier(**best_param)
    best_model.fit(all_x_train, all_y_train, feature_name='auto', categorical_feature = 'auto',)
    best_models.append(best_model)

In [32]:
def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [34]:
train_pred = np.mean([best_model.predict_proba(df_train.drop("is_converted",axis=1)) for best_model in best_models], axis=0)
y = df_train['is_converted']
threshold = min([num[1] if y[i] else 1 for i, num in enumerate(train_pred)])
train_y_pred = [True if i[1]>=threshold else False for i in train_pred]
get_clf_eval(df_train['is_converted'], train_y_pred)

오차행렬:
 [[ 4620     0]
 [ 1175 49985]]

정확도: 0.9789
정밀도: 0.7972
재현율: 1.0000
F1: 0.8872


In [35]:
train_y_pred = [True if i[1]>=0.5 else False for i in train_pred]
get_clf_eval(df_train['is_converted'], train_y_pred)

오차행렬:
 [[ 4616     4]
 [  853 50307]]

정확도: 0.9846
정밀도: 0.8440
재현율: 0.9991
F1: 0.9151


In [36]:
x_train=df_train.drop("is_converted",axis=1)
y_train = df_train["is_converted"]
f1_sum = 0
for i in range(k_fold_num):
    pred = best_models[i].predict(all_x_vals[i])
    get_clf_eval(all_y_vals[i], pred)
    f1_sum += f1_score(all_y_vals[i], pred, labels=[True, False])
print(f'\n\n{f1_sum/k_fold_num}')

오차행렬:
 [[ 801  123]
 [ 256 9976]]

정확도: 0.9660
정밀도: 0.7578
재현율: 0.8669
F1: 0.8087
오차행렬:
 [[ 752  172]
 [ 269 9963]]

정확도: 0.9605
정밀도: 0.7365
재현율: 0.8139
F1: 0.7733
오차행렬:
 [[ 792  132]
 [ 294 9938]]

정확도: 0.9618
정밀도: 0.7293
재현율: 0.8571
F1: 0.7881
오차행렬:
 [[ 800  124]
 [ 275 9957]]

정확도: 0.9642
정밀도: 0.7442
재현율: 0.8658
F1: 0.8004
오차행렬:
 [[ 785  139]
 [ 303 9929]]

정확도: 0.9604
정밀도: 0.7215
재현율: 0.8496
F1: 0.7803


0.7901450516258053


In [37]:
# 예측에 필요한 데이터 분리
x_test = df_test.drop(["is_converted", "id"], axis=1)

In [41]:
# 예측 수행 (soft voting)
print(format(threshold,'f'))
a_pred = [best_model.predict_proba(x_test) for best_model in best_models]
pred = np.mean(a_pred, axis=0)
pred = [True if i[1]>threshold else False for i in pred]
sum(pred)

0.396372


1310

In [42]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

In [40]:
real = pd.read_csv('73per.csv')
get_clf_eval(real['is_converted'], pred)

오차행렬:
 [[1256  898]
 [  54 3063]]

정확도: 0.8194
정밀도: 0.9588
재현율: 0.5831
F1: 0.7252
