In [1]:
# *** version History 
# ------------------- Done -------------------------
# ver1. : default -> metric = "F1"
# ver2. : Changing training metric  --->  from 'F1' to "AUC"
#   => Max_iter=3000 까지 수렴하지 않고 계속 학습되는 현상 발생 
#      -> Parameter Optimisation for searching Global minimum issue 
#   => 성능 개선 
#      -> 확실히, 'AUC' 최대 기준의 model을 찾고, F1-maximisation threshold 가 현재 채점기준에서 우수 
#          -> 데이터가, 'label' 균형 관점에서 sampling 되어 -> 'Independent' 불균형이 큰 문제에서도 
#              Threshold 하향 조정이 성능 개선을 이끈 것으로 연관지어 의심해볼 만 함. 
# ver3. : A 속성에 대한 사람 == 컨텐츠 bool 컬럼 추가 
#    ver3.1 : E (순서형, 0~11)속성에 대한 비교 추가 + 
#    ver3.2 : E 속성 순서형에 대한, Binning 조정 
# ver4. : 순서형 자료에 대한, numeric cols 반영 
# ver4. : (*Validity 탐색 필요*, 열람시간대, 열람 요일별 판별력 반영) 
#      -> temporal based-profiling => 단일 변수 분포로는 너무나 판별력(Distinctive power)이 적어보임. 
#    # ver6. : 똑같이 타고 들어가서, 속성 2개 짜리 별도로 모형에 추가해서 
#    #   -> (모형 1: 여러 변수) & (모형 2: 두개 짜리 변수) 에 대한 앙상블해서 스코어 보기 
#    #       => 0.70257 
#    # ver7. : '열람일시'에 대한 변수 분할로 catboost 태우기 + ver6. 과 앙상블 하기 
#    #       => 0.70285
# ----------------- 2nd approaches --------------------
# ver8. : 파생변수 생성 + best-model-architecture로 적합 
#   -> (생성한 속성 pool에서) Feature_selection 하기 
# ver9. : 연관규칙분석 -> Metric Exploration 

---------------------------------
### * Validity Test for Data Re-sampling 

제공된 데이터셋은 'Label' 기준으로, 너무나 잘 균형잡힌 상태(i.e., 열람: 미열람 = 거의 50: 50).   
학습 후, Predicted label 결정을 위한 threshold 하향 조정 시에 성능이 더 좋은 현상으로도,   
test-set 에서의 label 불균형을 고려해볼 수 있고, 직관적으로도 열람, 미열람 비율이 제공된   
데이터셋 만큼이나 균형잡혀 있게 보기 어려울 것으로 판단   
>  __독립변수 기준에서의 데이터 re-sampling이, 학습-모형 일반화 측면에서 개선을 보이는지 테스트 수행__ 

---------------------------------

* 결과적으로는 ... - 
    * 모수에 해당할 원-데이터 분포에 대해서 추정 근거가 없이, 특정&일부 독립변수 기준의 up/re-sampling은   
        오히려 분포에 대한 왜곡이 커지고 -> 그래서 더욱이, 모델의 학습이 왜곡된 분포에 깊이 최적화 하면서 예측 성능이 더욱 하락했음.    
        -> 즉, (Data-resampling + Boosting) 조합이, (특정 변수에 치우친 분포 왜곡 + 그 분포에 대한 과적합) 으로 이어짐. 
-------

### Drive mount (For Colab)

In [3]:
# from google.colab import drive 
# drive.mount('./content', force_remount=True)

In [4]:
# cd ./content/MyDrive/Colab\ Notebooks

In [5]:
# cd ./01.JobCare/scripts/

### Set Global variables   

In [6]:
# DATA_PATH = "/content/drive/MyDrive/dacon/job_care/data/"
# SUBMIT_PATH = "/content/drive/MyDrive/dacon/job_care/submit/"

DATA_PATH = '../data/JobCare_data/'
SUBMIT_PATH = '../submission/'
SEED = 100    # Seed for reproducibility 

### catboost install

In [7]:
# !pip install catboost

### Library import 

In [8]:
# !pip install seaborn
# !pip install torch
# !pip install imblearn
# !pip install mlxtend

In [9]:
import os
import sys
import platform
import random
import math
import copy 
from typing import List ,Dict, Tuple

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn 
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold , KFold
from sklearn.metrics import f1_score 

from catboost import Pool, CatBoostClassifier 
from catboost import FeaturesData

import mlxtend
from mlxtend.preprocessing import TransactionEncoder 
from mlxtend.frequent_patterns import fpgrowth

# import torch
# import torch.nn as nn
# import torch.optim as optim
# import torch.nn.init as init

print(f"- os: {platform.platform()}")
print(f"- python: {sys.version}")
print(f"- pandas: {pd.__version__}")
print(f"- numpy: {np.__version__}")
print(f"- sklearn: {sklearn.__version__}")

- os: Linux-5.11.0-46-generic-x86_64-with-debian-bullseye-sid
- python: 3.7.12 | packaged by conda-forge | (default, Oct 26 2021, 06:08:21) 
[GCC 9.4.0]
- pandas: 1.1.5
- numpy: 1.21.5
- sklearn: 1.0.1


In [10]:
# Config for pandas displaying options 
pd.options.display.max_columns = 80

--------------------
* __Data Wrangling & Get master tables__  
--------------------

#### Load Dataset

In [11]:
# Directory holding the original competition dataset.
# Reuses DATA_PATH from the global-variables cell so the location is
# configured in exactly one place (the value is the same string as before).
data_dir = DATA_PATH

# Directory where prediction (submission) CSV files are written.
subm_dir = SUBMIT_PATH

In [12]:
# Raw train / test records
train = pd.read_csv(f'{data_dir}train.csv')
test = pd.read_csv(f'{data_dir}test.csv')

# Hierarchical code tables for attributes D, H and L
feature_D_code = pd.read_csv(f'{data_dir}속성_D_코드.csv')
feature_H_code = pd.read_csv(f'{data_dir}속성_H_코드.csv')
feature_L_code = pd.read_csv(f'{data_dir}속성_L_코드.csv')

#### Data handling 

In [13]:
# # drop 'Unnamed:5'
# feature_D_code = feature_D_code.loc[:, feature_D_code.columns.str.contains('속성 D')]

# Rename the Korean code-table headers to short ASCII names of the form
# {attribute}_{ALL|DET|SML|MED|LAG}.
D_code_map = {
    '속성 D 코드': 'D_ALL', 
    '속성 D 세분류코드': 'D_DET', 
    '속성 D 소분류코드': 'D_SML', 
    '속성 D 중분류코드': 'D_MED', 
    '속성 D 대분류코드': 'D_LAG'
}

H_code_map = {
    '속성 H 코드': 'H_ALL', 
    '속성 H 중분류코드': 'H_MED', 
    '속성 H 대분류코드': 'H_LAG'
}

L_code_map = {
    '속성 L 코드': 'L_ALL', 
    '속성 L 세분류코드': 'L_DET', 
    '속성 L 소분류코드': 'L_SML', 
    '속성 L 중분류코드': 'L_MED', 
    '속성 L 대분류코드': 'L_LAG' 
}

# DataFrame.rename leaves labels that are missing from the mapping untouched,
# so this cell can be re-run safely. Index.map(dict) would instead replace every
# unmapped label (already-renamed columns, stray extra columns) with NaN.
feature_D_code = feature_D_code.rename(columns=D_code_map)
feature_H_code = feature_H_code.rename(columns=H_code_map)
feature_L_code = feature_L_code.rename(columns=L_code_map)

In [14]:
# Datetime handling (str to dt)
train['contents_open_dt'] = pd.to_datetime(train['contents_open_dt'], format="%Y-%m-%d %H:%M:%S")
test['contents_open_dt'] = pd.to_datetime(test['contents_open_dt'], format="%Y-%m-%d %H:%M:%S")

In [15]:
print(f" * Train-set 'contents_open_dt' Min(): {train['contents_open_dt'].min()},  Max():{train['contents_open_dt'].max()}")
print(f" *  test-set 'contents_open_dt' Min(): {test['contents_open_dt'].min()},  Max():{test['contents_open_dt'].max()}")

 * Train-set 'contents_open_dt' Min(): 2020-01-01 00:01:03,  Max():2020-11-30 23:59:56
 *  test-set 'contents_open_dt' Min(): 2020-12-01 00:00:07,  Max():2020-12-31 23:59:08


#### * Master tables

In [16]:
# Split the training frame into its person-side and contents-side columns
person_train = train.loc[:, train.columns.str.contains('person_')]
contnt_train = train.loc[:, train.columns.str.contains('contents_')]

# Sanity figures recorded by the author:
# >>> (~person_train.duplicated()).sum()
# 300,177    # -> same w/ nunique(person_rn)
# >>> (~(contnt_train.drop('contents_open_dt', axis=1).duplicated())).sum()
# 283,393    # -> same w/ nunique(contents_rn)

# One row per distinct attribute combination (the first occurrence is kept);
# the open timestamp is dropped from the contents side before de-duplicating
person_master = person_train.drop_duplicates(keep='first')
contnt_master = contnt_train.drop(columns='contents_open_dt').drop_duplicates(keep='first')

# Move the key column to the front, leaving the other columns in their order
person_master = person_master[['person_rn'] + [col_ for col_ in person_master.columns if col_ != 'person_rn']]
contnt_master = contnt_master[['contents_rn'] + [col_ for col_ in contnt_master.columns if col_ != 'contents_rn']]

In [17]:
# person_master

In [18]:
# contnt_master

---------------------------------
* __FeatureGenerator__ 
---------------------------------

> __def getDerivativeFeatures(df):__

In [19]:
def _concat_flag_initials(frame, flag_cols):
    """Return one string per row built from the first character of each flag column.

    Every column in ``flag_cols`` is cast to ``str`` and reduced to its first
    character (a boolean column yields 'T' or 'F'); the characters are joined
    row-wise in the order of ``flag_cols``, e.g. 'TFT'.
    """
    if len(flag_cols) == 0:
        raise ValueError("No flag columns to combine")
    initials = [frame[col_].astype(str).str[0] for col_ in flag_cols]
    combined = initials[0]
    for part_ in initials[1:]:
        combined = combined + part_    # element-wise string concatenation
    return combined


def getDerivativeFeatures(df):
    """Append the derived feature columns to a raw train/test frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw records. The code reads 'contents_open_dt' through the ``.dt``
        accessor (so it must already be datetime), the person/contents
        attribute columns, and the columns whose names start with 'd_' / 'h_'.

    Returns
    -------
    (pandas.DataFrame, dict)
        A copy of ``df`` with the derived columns appended, and a dict mapping
        a group name (e.g. 'open_dt_cols', 'D_bool_cols') to the list of
        column names that belong to that group.

    Notes
    -----
    Reads the module-level code tables ``feature_D_code``, ``feature_H_code``
    and ``feature_L_code``; they must already carry the renamed columns
    ('D_ALL', 'D_LAG', 'H_ALL', 'L_ALL', ...).
    """
    # copy DataFrame so the caller's frame is left untouched
    data_df = df.copy()

    # ------------------------------------
    ### * 'contents_open_dt' -> split into dt.components
    ###    -> Col_prefix : "open_dt_"
    # ------------------------------------
    open_dt = data_df['contents_open_dt']
    data_df['open_dt_quarter'] = open_dt.dt.quarter
    data_df['open_dt_month'] = open_dt.dt.month
    # Series.dt.week is deprecated (it emits a FutureWarning); isocalendar().week
    # is its replacement. That accessor returns a nullable UInt32 column, so it is
    # cast back to the plain dtype the old accessor produced.
    iso_week = open_dt.dt.isocalendar().week
    data_df['open_dt_week'] = iso_week.astype('float64') if iso_week.hasnans else iso_week.astype('int64')
    data_df['open_dt_day'] = open_dt.dt.day
    data_df['open_dt_weekday'] = open_dt.dt.weekday
    # data_df['open_dt_day_name'] = data_df['contents_open_dt'].dt.day_name().str[:3]
    data_df['open_dt_hour'] = open_dt.dt.hour
    data_df['open_dt_minute'] = open_dt.dt.minute

    # Columns' list
    open_dt_cols = list(data_df.columns[data_df.columns.str.contains('open_dt_')])

    # ------------------------------------
    ### * All combination for each equal-attributes (: [a, a_1], [c], [j, j_1], [e])
    ###    -> Col_prefix : "deriv_"
    # ------------------------------------
    # * --- <str> ---
    # person_attribute_a & contents_attribute_a
    data_df['deriv_a_a'] = data_df['person_attribute_a'].astype(str) + '-' + data_df['contents_attribute_a'].astype(str)

    # person_attribute_a & person_attribute_a_1
    data_df['deriv_person_a_a_1'] = data_df['person_attribute_a'].astype(str) + '-' + data_df['person_attribute_a_1'].astype(str)

    # person_prefer_c & contents_attribute_c
    data_df['deriv_c_c'] = data_df['person_prefer_c'].astype(str) + '-' + data_df['contents_attribute_c'].astype(str)

    # contents_attribute_j & contents_attribute_j_1
    data_df['deriv_contents_j_j_1'] = data_df['contents_attribute_j'].astype(str) + '-' + data_df['contents_attribute_j_1'].astype(str)

    # * --- <numeric> ---
    # person_prefer_e & contents_attribute_e (signed difference)
    data_df['deriv_e_diff'] = data_df['person_prefer_e'] - data_df['contents_attribute_e']

    # Columns' list
    deriv_cols = list(data_df.columns[data_df.columns.str.contains('deriv_')])    # * All cols

    deriv_str_cols = deriv_cols.copy()
    deriv_str_cols.remove('deriv_e_diff')    # * str cols
    deriv_num_cols = ['deriv_e_diff']    #  * numeric cols

    # ------------------------------------
    ### * '..._yn T/F Combination (: d, h)
    ###    -> cols_suffix : "_yn_comb"
    # ------------------------------------
    # d/h_match_yn cols
    d_yn_cols = data_df.columns[data_df.columns.str.contains('^d_')]
    h_yn_cols = data_df.columns[data_df.columns.str.contains('^h_')]

    # create T/F combination col (one letter per flag column, e.g. 'TTF')
    data_df['d_yn_comb'] = _concat_flag_initials(data_df, d_yn_cols)
    data_df['h_yn_comb'] = _concat_flag_initials(data_df, h_yn_cols)

    # extend with the joint combination of both flag groups
    data_df['d+h_yn_comb'] = 'D:' + data_df['d_yn_comb'] + '/H:' + data_df['h_yn_comb']

    # Columns' list
    yn_comb_cols = list(data_df.columns[data_df.columns.str.contains('_yn_comb')])    # * All cols

    # ------------------------------------
    ### * "person_prefer_{H, D}_{1, 2, 3}" - Coherency
    ###    -> cols_format : "deriv_{D_LAG, D_SML, H_MED, etc.}_sim"
    # ------------------------------------
    # *feat- code handling
    # -----------------------
    feat_D = feature_D_code.astype(str).copy()
    feat_H = feature_H_code.astype(str).copy()
    feat_L = feature_L_code.astype(str).copy()

    # Prefix every level with its parent levels so that each code is unique
    # across the whole hierarchy (e.g. MED becomes "<LAG>-<MED>").
    feat_D['D_MED'] = feat_D['D_LAG']+'-'+feat_D['D_MED']
    feat_D['D_SML'] = feat_D['D_MED']+'-'+feat_D['D_SML']
    feat_D['D_DET'] = feat_D['D_SML']+'-'+feat_D['D_DET']

    feat_H['H_MED'] = feat_H['H_LAG']+'-'+feat_H['H_MED']

    feat_L['L_MED'] = feat_L['L_LAG']+'-'+feat_L['L_MED']
    feat_L['L_SML'] = feat_L['L_MED']+'-'+feat_L['L_SML']
    feat_L['L_DET'] = feat_L['L_SML']+'-'+feat_L['L_DET']

    # * "deriv_{attribute_class_lv}_sim"
    # -----------------------
    # Bucket for deriv (pd.concat below builds a new frame, so no extra copy is needed)
    deriv_feat_df = [data_df]
    deriv_feat_cols_dict = {}    # Columns' dict

    # * person_prefer_{d, h}_{1, 2, 3} Comparison cols
    for feat_, feat_df, cat_lvs_ in zip(["D", 'H'], [feat_D, feat_H], [['D_LAG', 'D_MED', 'D_SML'], ['H_LAG', 'H_MED']]):
        person_prefer_cols = data_df.columns[data_df.columns.str.contains(f'person_prefer_{feat_.lower()}')]

        # Preference codes as strings, matching the str-typed code tables
        # (identical for every level, so it is prepared once per attribute)
        person_prefer_view = data_df[person_prefer_cols].astype(str)

        person_prefer_frame = []

        for cat_lv_ in cat_lvs_:
            # Mapping dictionary: full code -> code at the target level
            cat_lv_mapper = dict(zip(feat_df[f'{feat_}_ALL'], feat_df[cat_lv_]))
            # feat - by target_level - converting
            person_prefer_cnvt = pd.concat([person_prefer_view[col_].map(cat_lv_mapper) for col_ in person_prefer_cols], axis=1)
            # Bool for equal-class - <bool> : pairwise comparison of preferences 1/2/3
            frames = [
                (person_prefer_cnvt.iloc[:, lf_] == person_prefer_cnvt.iloc[:, rt_]).rename(f"deriv_{cat_lv_}_{str(lf_+1)}{str(rt_+1)}")
                for lf_, rt_ in [(0, 1), (0, 2), (1, 2)]
            ]
            person_prefer_lv_yn = pd.concat(frames, axis=1)

            # Similarity Strength -<numeric> : number of matching pairs (0..3)
            person_prefer_lv_yn[f"deriv_{cat_lv_}_sim"] = person_prefer_lv_yn.sum(axis=1)
            person_prefer_frame.append(person_prefer_lv_yn)    # Append for "a-feat-by-each-lvs"

        person_prefer_deriv_df = pd.concat(person_prefer_frame, axis=1)    # Concat for "a-feat-by-all-lvs"

        # Columns' list
        mask_for_num_ = person_prefer_deriv_df.columns.str.contains("_sim")

        deriv_feat_cols_dict[f'{feat_}_bool_cols'] = list(person_prefer_deriv_df.columns[~mask_for_num_])
        deriv_feat_cols_dict[f'{feat_}_num_cols'] = list(person_prefer_deriv_df.columns[mask_for_num_])

        deriv_feat_df.append(person_prefer_deriv_df)

    # * concat all-feat
    data_df = pd.concat(deriv_feat_df, axis=1)

    # ------------------------------------
    ### * {[Person-Content] suitability} & {D/H "Preference-coherency"}
    ###    -> cols_suffix : "_sim_yn_comb"
    # ------------------------------------

    # d/h similarity cols
    d_sim_cols = data_df.columns[data_df.columns.str.contains('D_..._sim', regex=True)]
    h_sim_cols = data_df.columns[data_df.columns.str.contains('H_..._sim', regex=True)]

    # T-F Combination joined with the similarity strength
    d_yn_comb_with_sim = pd.concat([data_df['d_yn_comb']+'_'+data_df[d_sim_col_].astype(str) for d_sim_col_ in d_sim_cols], axis=1)
    h_yn_comb_with_sim = pd.concat([data_df['h_yn_comb']+'_'+data_df[h_sim_col_].astype(str) for h_sim_col_ in h_sim_cols], axis=1)

    d_yn_comb_with_sim.columns = d_sim_cols+'_yn_comb'
    h_yn_comb_with_sim.columns = h_sim_cols+'_yn_comb'

    # append all
    data_df = pd.concat([data_df, d_yn_comb_with_sim, h_yn_comb_with_sim], axis=1)

    # Columns' list
    sim_yn_comb_cols = d_yn_comb_with_sim.columns.tolist() + h_yn_comb_with_sim.columns.tolist()

    # ------------------------------------
    #### * 'contents_attribute_l' -> split into all class-lv
    ###    -> cols_format : "contents_attribute_L_{LAG, MED}"
    # ------------------------------------
    contents_l_code = data_df['contents_attribute_l'].astype(str)

    # convert to 'L_LAG' code
    data_df['contents_attribute_L_LAG'] = contents_l_code.map(dict(zip(feat_L['L_ALL'], feat_L['L_LAG'])))

    # convert to 'L_MED' code
    data_df['contents_attribute_L_MED'] = contents_l_code.map(dict(zip(feat_L['L_ALL'], feat_L['L_MED'])))

    # Columns' list
    attr_l_cols = ['contents_attribute_L_LAG', 'contents_attribute_L_MED']

    # -----------------------------------
    all_deriv_cols_dict = {
        "open_dt_cols": open_dt_cols,
        'deriv_str_cols': deriv_str_cols,
        'deriv_num_cols': deriv_num_cols,
        "yn_comb_cols": yn_comb_cols,
        "sim_yn_comb_cols": sim_yn_comb_cols,
        "attr_l_cols": attr_l_cols
    }

    # adds the '{D,H}_bool_cols' / '{D,H}_num_cols' groups
    all_deriv_cols_dict.update(deriv_feat_cols_dict)

    # -----------------------------------
    return(data_df, all_deriv_cols_dict)


In [20]:
train_data, deriv_cols = getDerivativeFeatures(train)
test_data, _ = getDerivativeFeatures(test)

  # This is added back by InteractiveShellApp.init_path()


In [21]:
print(f" * train: {train.shape}  ->  {train_data.shape}  ")
print(f" * test : {test.shape}  ->  {test_data.shape}  ")

 * train: (501951, 35)  ->  (501951, 77)  
 * test : (46404, 34)  ->  (46404, 76)  


In [22]:
# Overview table of the derived columns: one column per feature group,
# one row per slot ("col_1", "col_2", ...); shorter groups are padded with missing values
colnum_max = max(map(len, deriv_cols.values()))
slot_labels = [f"col_{slot_}" for slot_ in range(1, colnum_max + 1)]

deriv_cols_info = pd.DataFrame.from_dict(deriv_cols, orient='index', columns=slot_labels).T

In [23]:
deriv_cols_info

Unnamed: 0,open_dt_cols,deriv_str_cols,deriv_num_cols,yn_comb_cols,sim_yn_comb_cols,attr_l_cols,D_bool_cols,D_num_cols,H_bool_cols,H_num_cols
col_1,open_dt_quarter,deriv_a_a,deriv_e_diff,d_yn_comb,deriv_D_LAG_sim_yn_comb,contents_attribute_L_LAG,deriv_D_LAG_12,deriv_D_LAG_sim,deriv_H_LAG_12,deriv_H_LAG_sim
col_2,open_dt_month,deriv_person_a_a_1,,h_yn_comb,deriv_D_MED_sim_yn_comb,contents_attribute_L_MED,deriv_D_LAG_13,deriv_D_MED_sim,deriv_H_LAG_13,deriv_H_MED_sim
col_3,open_dt_week,deriv_c_c,,d+h_yn_comb,deriv_D_SML_sim_yn_comb,,deriv_D_LAG_23,deriv_D_SML_sim,deriv_H_LAG_23,
col_4,open_dt_day,deriv_contents_j_j_1,,,deriv_H_LAG_sim_yn_comb,,deriv_D_MED_12,,deriv_H_MED_12,
col_5,open_dt_weekday,,,,deriv_H_MED_sim_yn_comb,,deriv_D_MED_13,,deriv_H_MED_13,
col_6,open_dt_hour,,,,,,deriv_D_MED_23,,deriv_H_MED_23,
col_7,open_dt_minute,,,,,,deriv_D_SML_12,,,
col_8,,,,,,,deriv_D_SML_13,,,
col_9,,,,,,,deriv_D_SML_23,,,


---------------------------------
* __Upsampling__   
---------------------------------

In [24]:
import imblearn

In [25]:
# NOTE(review): wildcard imports hide where a name comes from. Within the cells
# visible here only SMOTE is referenced from these modules — consider importing
# it explicitly once it is confirmed that no later cell needs other samplers.
from imblearn.under_sampling import *
from imblearn.over_sampling import *

In [26]:
train_origin = copy.deepcopy(train)

-------------------

### Category \#31: Personalities 

-----------------------------

In [27]:
train

Unnamed: 0,id,d_l_match_yn,d_m_match_yn,d_s_match_yn,h_l_match_yn,h_m_match_yn,h_s_match_yn,person_attribute_a,person_attribute_a_1,person_attribute_b,person_prefer_c,person_prefer_d_1,person_prefer_d_2,person_prefer_d_3,person_prefer_e,person_prefer_f,person_prefer_g,person_prefer_h_1,person_prefer_h_2,person_prefer_h_3,contents_attribute_i,contents_attribute_a,contents_attribute_j_1,contents_attribute_j,contents_attribute_c,contents_attribute_k,contents_attribute_l,contents_attribute_d,contents_attribute_m,contents_attribute_e,contents_attribute_h,person_rn,contents_rn,contents_open_dt,target
0,0,True,True,True,False,False,False,1,4,3,5,275,370,369,8,1,1,4,95,59,3,3,10,2,1,2,1608,275,1,4,139,618822,354805,2020-01-17 12:09:36,1
1,1,False,False,False,True,True,False,1,3,4,1,114,181,175,4,1,1,131,101,96,1,3,5,1,1,2,1608,275,1,4,133,571659,346213,2020-06-18 17:48:52,0
2,2,False,False,False,True,False,False,2,0,3,5,464,175,452,3,1,1,54,263,56,3,1,10,2,1,1,1600,94,1,4,53,399816,206408,2020-07-08 20:00:10,0
3,3,False,False,False,True,False,False,2,0,2,5,703,705,704,3,1,1,72,227,2,1,3,5,1,1,2,1608,275,5,3,74,827967,572323,2020-01-13 18:09:34,0
4,4,True,True,True,False,False,False,1,3,4,5,275,370,369,4,1,1,214,210,209,1,1,10,2,1,2,1608,275,1,4,74,831614,573899,2020-03-09 20:39:22,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501946,501946,False,False,False,True,False,False,1,1,2,2,1192,935,1228,3,1,1,59,4,95,3,3,5,1,1,2,354,147,1,5,65,503156,285850,2020-03-13 12:55:52,1
501947,501947,True,True,False,True,False,False,1,6,2,1,118,113,110,4,1,1,105,142,95,3,3,10,2,1,2,163,120,1,4,142,676255,456996,2020-01-20 11:51:51,1
501948,501948,True,True,True,True,False,False,1,7,4,1,147,46,145,4,1,1,59,127,139,3,1,5,1,1,2,438,147,2,7,65,484528,293258,2020-08-05 17:27:24,1
501949,501949,True,False,False,True,False,False,1,1,2,1,46,147,145,4,1,1,251,49,258,3,2,5,1,1,2,660,147,3,4,259,456330,273797,2020-06-15 09:23:21,1


In [28]:
num_cols = ['person_attribute_a_1', 'person_attribute_b', 'person_prefer_e', 'contents_attribute_e']
cat_cols = ['person_attribute_a', 'person_prefer_c', 'person_prefer_d_1', 'person_prefer_d_2', 'person_prefer_d_3',
       # 'person_prefer_f', 'person_prefer_g',
       'person_prefer_h_1', 'person_prefer_h_2', 'person_prefer_h_3',
       'contents_attribute_i', 'contents_attribute_a',
       'contents_attribute_j_1', 'contents_attribute_j',
       'contents_attribute_c', 'contents_attribute_k', 'contents_attribute_l',
       'contents_attribute_d', 'contents_attribute_m', 'contents_attribute_h']
bol_cols = ['d_l_match_yn', 'd_m_match_yn', 'd_s_match_yn', 'h_l_match_yn', 'h_m_match_yn', 'h_s_match_yn']
idx_cols = ['id', 'person_rn', 'contents_rn'] 

mis_cols = ['contents_open_dt']

trgt_col = ['target']

In [29]:
train[num_cols] = train[num_cols].astype(float)
train[cat_cols] = train[cat_cols].astype(str)
train[bol_cols] = train[bol_cols].astype(int).astype(str)
train[idx_cols] = train[idx_cols].astype(str)
train[mis_cols] = train[mis_cols].astype(str)
train[trgt_col] = train[trgt_col].astype(str)

In [30]:
# Columns handed to the sampler: numeric + categorical + boolean + target
# (the id columns and the open timestamp are left out)
x_feats = [*num_cols, *cat_cols, *bol_cols, *trgt_col]

# ------------------------------
# The column whose classes are balanced takes the role of the sampler's "y",
# so it is taken out of the feature list
upsamp_feat = 'd_s_match_yn'
x_feats.remove(upsamp_feat)
y_feat = upsamp_feat

# ------------------------------

X = train[x_feats].to_numpy()
y = train[y_feat]

In [31]:
X_samp, y_samp = SMOTE(random_state=4).fit_resample(X, y)

In [32]:
X_samp.shape

(853388, 28)

In [33]:
y_samp.shape

(853388,)

In [34]:
train_ups = pd.DataFrame(X_samp, columns=x_feats)
train_ups[upsamp_feat] = y_samp

In [35]:
train_ups = train_ups.round()

In [36]:
# train_ups[num_cols] = train_ups[num_cols].astype(int).astype(float)
# train_ups[cat_cols] = train_ups[cat_cols].astype(int).astype(str)
# train_ups[bol_cols] = train_ups[bol_cols].astype(int).astype(bool)
# # train_ups[idx_cols] = train_ups[idx_cols].astype(str)
# # train_ups[mis_cols] = train_ups[mis_cols].astype(str)
# train_ups[trgt_col] = train_ups[trgt_col].astype(int).astype(str)

----------------------------------------
* Adaptation

## Load data

In [37]:
# train_data = pd.read_csv(f'{DATA_PATH}train.csv')
test_data = pd.read_csv(f'{DATA_PATH}test.csv')

# code_d = pd.read_csv(f'{DATA_PATH}속성_D_코드.csv').iloc[:,:-1]
code_d = pd.read_csv(f'{DATA_PATH}속성_D_코드.csv')
code_h = pd.read_csv(f'{DATA_PATH}속성_H_코드.csv')
code_l = pd.read_csv(f'{DATA_PATH}속성_L_코드.csv')

# train_data.shape , test_data.shape

In [38]:
train_data = copy.deepcopy(train_ups)

### rename columns for handling 

In [39]:
code_d.columns= ["attribute_d","attribute_d_d","attribute_d_s","attribute_d_m","attribute_d_l"]
# code_h.columns= ["attribute_h","attribute_h_p"]
code_h.columns= ["attribute_h","attribute_h_m","attribute_h_l"]
code_l.columns= ["attribute_l","attribute_l_d","attribute_l_s","attribute_l_m","attribute_l_l"]

### Merge features for Class Comparison

In [40]:
def merge_codes(df:pd.DataFrame,df_code:pd.DataFrame,col:str)->pd.DataFrame:
    """Left-join a code table onto ``df`` using ``col`` as the key.

    The first column of ``df_code`` is treated as the join key and is renamed
    to ``col``; every other column is prefixed with ``f"{col}_"`` so the same
    code table can be merged several times for different key columns.

    Parameters
    ----------
    df : frame that contains the key column ``col``.
    df_code : code table whose first column holds the key values.
    col : name of the key column in ``df``.

    Returns
    -------
    A new frame with the rows of ``df`` plus the prefixed code columns
    (missing values where a key has no match). Neither input is modified.
    """
    # Build the new labels explicitly. Writing into ``df_code.columns.values``
    # would mutate the Index's backing array without pandas noticing, which is
    # not a supported way to rename a column.
    renamed_cols = [col] + [f"{col}_{name_}" for name_ in df_code.columns[1:]]
    df_code = df_code.copy()
    df_code.columns = renamed_cols
    return pd.merge(df,df_code,how="left",on=col)

## Function for Data-Preprocessing 

In [41]:
def preprocess_data(
                    df:pd.DataFrame,is_train:bool = True, cols_merge:List[Tuple[str,pd.DataFrame]] = None, cols_equi:List[Tuple[str,str]] = None,
                    cols_drop:List[str] = None
                    )->Tuple[pd.DataFrame,np.ndarray]:
    """Turn a raw frame into model features (and labels for training data).

    Steps, in order: split off 'target' (training only), merge the code tables,
    cast boolean columns to 0/1 integers, add equality indicator columns, and
    remove the columns listed in ``cols_drop``.

    Parameters
    ----------
    df : input frame; it is copied, never modified.
    is_train : when True, the 'target' column is removed from the features and
        returned as a NumPy array.
    cols_merge : ``(key_column, code_table)`` pairs passed to ``merge_codes``.
        ``None`` (default) means no merge.
    cols_equi : ``(col1, col2)`` pairs; each adds an integer column named
        ``f"{col1}_{col2}"`` that is 1 where both columns are equal.
        ``None`` (default) means no indicator columns.
    cols_drop : columns to remove; names that are absent are ignored.
        ``None`` (default) means
        ``["id", "person_prefer_f", "person_prefer_g", "contents_open_dt"]``.

    Returns
    -------
    ``(features, labels)`` where ``labels`` is ``None`` when ``is_train`` is False.
    """
    # None sentinels replace the mutable list defaults of the signature
    if cols_merge is None:
        cols_merge = []
    if cols_equi is None:
        cols_equi = []
    if cols_drop is None:
        cols_drop = ["id","person_prefer_f","person_prefer_g" ,"contents_open_dt"]

    df = df.copy()

    y_data = None

    if is_train:
        y_data = df["target"].to_numpy()
        df = df.drop(columns="target")

    for col, df_code in cols_merge:
        df = merge_codes(df,df_code,col)

    # Boolean flags become 0/1 integers
    bool_cols = df.select_dtypes(bool).columns.tolist()
    if bool_cols:
        df[bool_cols] = df[bool_cols].astype(int)

    for col1, col2 in cols_equi:
        df[f"{col1}_{col2}"] = (df[col1] == df[col2] ).astype(int)

    # Keep every column that is not listed in cols_drop. A mask is used instead
    # of DataFrame.drop so that names missing from the frame do not raise; the
    # explicit copy makes the result an independent frame that callers can add
    # columns to without chained-assignment warnings.
    df = df.loc[:, ~df.columns.isin(list(cols_drop))].copy()

    return (df , y_data)

## Set Colnames

In [42]:
# 소분류 중분류 대분류 속성코드 merge 컬럼명 및 데이터 프레임 리스트
cols_merge = [
              ("person_prefer_d_1" , code_d),
              ("person_prefer_d_2" , code_d),
              ("person_prefer_d_3" , code_d),
              ("contents_attribute_d" , code_d),
              ("person_prefer_h_1" , code_h),
              ("person_prefer_h_2" , code_h),
              ("person_prefer_h_3" , code_h),
              ("contents_attribute_h" , code_h),
              ("contents_attribute_l" , code_l),
]

# 회원 속성과 콘텐츠 속성의 동일한 코드 여부에 대한 컬럼명 리스트
cols_equi = [

    ("contents_attribute_c","person_prefer_c"),
    # ("contents_attribute_e","person_prefer_e"),    # disable - Ver. 3-1 

    ("person_prefer_d_1_attribute_d_s" , "contents_attribute_d_attribute_d_s"),   # Additional 
    ("person_prefer_d_1_attribute_d_m" , "contents_attribute_d_attribute_d_m"),   # Additional 
    ("person_prefer_d_1_attribute_d_l" , "contents_attribute_d_attribute_d_l"),   # Additional 
    ("person_prefer_d_2_attribute_d_s" , "contents_attribute_d_attribute_d_s"),
    ("person_prefer_d_2_attribute_d_m" , "contents_attribute_d_attribute_d_m"),
    ("person_prefer_d_2_attribute_d_l" , "contents_attribute_d_attribute_d_l"),
    ("person_prefer_d_3_attribute_d_s" , "contents_attribute_d_attribute_d_s"),
    ("person_prefer_d_3_attribute_d_m" , "contents_attribute_d_attribute_d_m"),
    ("person_prefer_d_3_attribute_d_l" , "contents_attribute_d_attribute_d_l"),

    # ("person_prefer_h_1_attribute_h_p" , "contents_attribute_h_attribute_h_p"),
    # ("person_prefer_h_2_attribute_h_p" , "contents_attribute_h_attribute_h_p"),
    # ("person_prefer_h_3_attribute_h_p" , "contents_attribute_h_attribute_h_p"),
    ("person_prefer_h_1_attribute_h_m" , "contents_attribute_h_attribute_h_m"),
    ("person_prefer_h_1_attribute_h_l" , "contents_attribute_h_attribute_h_l"),
    ("person_prefer_h_2_attribute_h_m" , "contents_attribute_h_attribute_h_m"),
    ("person_prefer_h_2_attribute_h_l" , "contents_attribute_h_attribute_h_l"),
    ("person_prefer_h_3_attribute_h_m" , "contents_attribute_h_attribute_h_m"),
    ("person_prefer_h_3_attribute_h_l" , "contents_attribute_h_attribute_h_l"), 
    # Additional attr_'A' - in ver3. / in ver5.
    ("contents_attribute_a","person_attribute_a")
    
]

# 학습에 필요없는 컬럼 리스트
cols_drop = ["id","person_prefer_f","person_prefer_g" ,"contents_open_dt", "contents_rn", ]

## Preprocessing for Train/Test-set 

In [43]:
# Apply the shared preprocessing to the training frame (train_data, which at this
# point is the up-sampled frame) and to the raw test frame.
# NOTE(review): the recorded output reports 73 feature columns for train but 74
# for test, so the two frames do not end up with the same column set. Presumably
# the test frame still carries 'person_rn', which was not part of the up-sampled
# frame and is not listed in cols_drop — verify before feeding x_test to a model.
x_train, y_train = preprocess_data(train_data, cols_merge = cols_merge , cols_equi= cols_equi , cols_drop = cols_drop)
x_test, _ = preprocess_data(test_data,is_train = False, cols_merge = cols_merge , cols_equi= cols_equi  , cols_drop = cols_drop)
x_train.shape , y_train.shape , x_test.shape

((853388, 73), (853388,), (46404, 74))

### Ver 3.1 - 속성 e 에 대한 수치형 편차 비교 

In [44]:
# attr_e_cols = x_train.columns[x_train.columns.str.contains('_e')]
# x_train[attr_e_cols]
# attr_e_diff = x_train['person_prefer_e'] - x_train['contents_attribute_e']
# attr_e_diff_abs = (attr_e_diff).abs()

# Keep only the absolute person-vs-contents gap on attribute 'e'
x_train['attribute_e_diff_abs'] = x_train['person_prefer_e'].sub(x_train['contents_attribute_e']).abs()
x_test['attribute_e_diff_abs'] = x_test['person_prefer_e'].sub(x_test['contents_attribute_e']).abs()
# Signed variant, kept for reference:
# x_train['attribute_e_diff_abs'] = (x_train['person_prefer_e'] - x_train['contents_attribute_e'])
# x_test['attribute_e_diff_abs'] = (x_test['person_prefer_e'] - x_test['contents_attribute_e'])
# The raw 'e' columns are removed once the gap has been derived
x_train = x_train.drop(columns=['person_prefer_e', 'contents_attribute_e'])
x_test = x_test.drop(columns=['person_prefer_e', 'contents_attribute_e'])

In [45]:
x_train['attribute_e_diff_abs']

0         4.0
1         0.0
2         1.0
3         0.0
4         0.0
         ... 
853383    2.0
853384    0.0
853385    1.0
853386    1.0
853387    1.0
Name: attribute_e_diff_abs, Length: 853388, dtype: float64

## Feature Grouping 

In [46]:
# Categorical candidates: every column with more than two distinct values.  - in ver5
# TODO: also try (>=); columns with exactly two values include the True/False flags.
cat_features = x_train.columns[x_train.nunique() > 2].tolist()

# * Nominal vs. ordinal split
#   - ver4  : ['person_attribute_a_1', 'person_attribute_b', 'person_prefer_e', 'contents_prefer_e']
#   - ver3.1: ['attribute_e_diff_abs']
#   - ver5  : the list below
ordinal_feats = ['person_attribute_a_1', 'person_attribute_b', 'attribute_e_diff_abs']

# Remove the numeric (ordinal) columns from the categorical list.
# The filter keeps the DataFrame column order. A set difference would return the
# names in an arbitrary order that can change between kernel sessions (string
# hashing is randomized per process), so the feature order handed to the model
# would not be reproducible even with a fixed SEED.
cat_features = [col_ for col_ in cat_features if col_ not in ordinal_feats]

print(len(cat_features))    # <- number of categorical features in use

43


In [47]:
# ordinal_feats -> to -> 'float' type 
x_train[ordinal_feats] = x_train[ordinal_feats].astype(np.float32)
x_test[ordinal_feats] = x_test[ordinal_feats].astype(np.float32)

In [49]:
# x_train = x_train.round(0)
# x_test = x_test.round(0)

## Parameter setting 

In [50]:
is_holdout = False
n_splits = 5
iterations = 10000
# iterations = 3600
patience = 50
learning_rate=0.075

max_ctr_complexity = 6
min_data_in_leaf = 10

cv = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)

## Training

In [51]:
# help(CatBoostClassifier)

In [52]:
# K-fold training loop: one CatBoost model per fold, early-stopped on validation AUC.
# Per-fold results are collected in the three lists below.
scores_F1 = []
scores_AUC = []
models = []

# cv.split yields positional (train, validation) index arrays for each fold
for tri, vai in cv.split(x_train):
    print("="*50)
    # -----------------------------------------------------------
    # * Assign train/valid set -> CatBoost Pool built from FeaturesData
    x_tra = x_train.iloc[tri].copy() 
    train_labels = y_train[tri].copy() 
    x_val = x_train.iloc[vai].copy() 
    valid_labels = y_train[vai].copy() 
    
    # NOTE(review): the name train_data held the training DataFrame in earlier
    # cells and is rebound to a Pool here; re-running those earlier cells after
    # this loop would therefore see a Pool instead of a DataFrame.
    train_data = Pool(
        data=FeaturesData(
            num_feature_data=x_tra[ordinal_feats].astype(np.float32).values,
            # Author's notes: passing .astype('object') values raised an error
            # ([342, 0, ...], dtype=object); casting to str works (['342', '0', ...])
            # but is slow.
            # NOTE(review): x_train was built from a float array (the sampler output),
            # so the string form of a category is presumably '342.0' rather than '342'
            # — confirm that the test-side encoding produces the same strings.
            cat_feature_data=x_tra[cat_features].astype(str).values,
            num_feature_names=ordinal_feats, 
            cat_feature_names=cat_features
            ),
        label=train_labels
    )
    
    valid_data = Pool(
        data=FeaturesData(
            num_feature_data=x_val[ordinal_feats].astype(np.float32).values,
            cat_feature_data=x_val[cat_features].astype(str).values,
            num_feature_names=ordinal_feats, 
            cat_feature_names=cat_features
            ),
        label=valid_labels
    )
    # -----------------------------------------------------------
    
    # list for prediction
    # NOTE(review): never filled or read inside this loop.
    preds = []

    # * ver.2 : metric = 'AUC'
    # Categorical columns are declared through the Pool above, so cat_features is
    # not passed to the constructor (the ver.2 call passed it here instead).
    model = CatBoostClassifier(iterations=iterations, 
                               random_state=SEED, 
                               task_type="GPU", 
                               eval_metric="AUC", 
                               one_hot_max_size=4,
                               # ----------------------------
                               learning_rate=learning_rate,
                               max_ctr_complexity=max_ctr_complexity, 
                               min_data_in_leaf=min_data_in_leaf
                              )
    
    # Fit on the fold's training Pool; training stops early once the validation
    # metric has not improved for `patience` rounds. Progress is logged every 400 iterations.
    model.fit(train_data, 
            eval_set=valid_data, 
            early_stopping_rounds=patience, 
            verbose = 400
        )
    
    models.append(model)
    # F1 of the predicted labels on the validation fold
    # (presumably at the model's default decision threshold — TODO confirm)
    scores_F1.append(f1_score(valid_labels, model.predict(valid_data)))
    # Best validation AUC reported by the model
    scores_AUC.append(model.get_best_score()["validation"]["AUC"])
    
    # Hold-out mode: train on the first fold only
    if is_holdout:
        break    

0:	learn: 0.6096936	test: 0.6109032	best: 0.6109032 (0)	total: 86ms	remaining: 14m 20s
400:	learn: 0.7604676	test: 0.8004113	best: 0.8004113 (400)	total: 34.5s	remaining: 13m 45s
800:	learn: 0.7709754	test: 0.8091072	best: 0.8091072 (800)	total: 1m 8s	remaining: 13m 11s
1200:	learn: 0.7773415	test: 0.8128812	best: 0.8128909 (1197)	total: 1m 43s	remaining: 12m 36s
1600:	learn: 0.7819937	test: 0.8146911	best: 0.8146911 (1600)	total: 2m 17s	remaining: 12m 1s
2000:	learn: 0.7860966	test: 0.8160514	best: 0.8160514 (2000)	total: 2m 51s	remaining: 11m 26s
2400:	learn: 0.7899029	test: 0.8170342	best: 0.8170342 (2400)	total: 3m 25s	remaining: 10m 51s
2800:	learn: 0.7932531	test: 0.8177810	best: 0.8177814 (2799)	total: 4m	remaining: 10m 17s
3200:	learn: 0.7966535	test: 0.8187046	best: 0.8187068 (3198)	total: 4m 34s	remaining: 9m 43s
3600:	learn: 0.7996124	test: 0.8191912	best: 0.8191996 (3594)	total: 5m 8s	remaining: 9m 8s
4000:	learn: 0.8025452	test: 0.8195798	best: 0.8195871 (3992)	total: 5m 4

## CV Results 

In [57]:
# Per-fold validation scores held in `scores_F1` / `scores_AUC`, each followed by its mean.
print(
    f" * scores_F1 : {scores_F1}",
    f" * The Avg. of F1 scores : {np.mean(scores_F1)}",
    sep="\n",
)
print(
    f" * scores_AUC : {scores_AUC}",
    f" * The Avg. of AUC scores : {np.mean(scores_AUC)}",
    sep="\n",
)
# Reference numbers kept from the original cell (presumably an earlier run -- TODO confirm):
#  * scores_F1 : [0.6842252192412471, 0.6844616302110387, 0.6835157199930519, 0.6803709668031276, 0.6793908668805676]
#  * The Avg. of F1 scores : 0.6823928806258065
#  * scores_AUC : [0.7385962903499603, 0.7373865246772766, 0.7384682893753052, 0.7354912161827087, 0.735406219959259]
#  * The Avg. of AUC scores : 0.7370697081089019

 * scores_F1 : [0.7680229525299948, 0.7675531415823108, 0.7672518885126335, 0.7686766621515723, 0.7666229631017043]
 * The Avg. of F1 scores : 0.7676255215756431
 * scores_AUC : [0.8208600282669067, 0.8210325539112091, 0.8198061585426331, 0.8216703236103058, 0.81931471824646]
 * The Avg. of AUC scores : 0.820536756515503


### Threshold Optimization 

In [59]:
# Coarse threshold sweep: for every CV fold, score that fold's fitted model on its
# validation split at thresholds 0.10, 0.15, ..., 0.85 and keep one list of F1 values
# per fold in `scores_by_thres_`.
scores_by_thres_ = []

# zip() stops at the shorter input, so a hold-out run (training loop left after the
# first fold, i.e. a single entry in `models`) no longer raises IndexError on the
# second split the way `models[i]` did.
for model_, (_tri, vai) in zip(models, cv.split(x_train)):
    # Only the validation Pool is read below. The training Pool that used to be rebuilt
    # on every fold was never used, and its astype(str) conversion is slow.
    x_val = x_train.iloc[vai].copy()
    valid_labels = y_train[vai].copy()

    valid_data = Pool(
        data=FeaturesData(
            num_feature_data=x_val[ordinal_feats].astype(np.float32).values,
            # Categories are passed as strings; per the original note, handing over
            # the raw values as dtype=object raised an error.
            cat_feature_data=x_val[cat_features].astype(str).values,
            num_feature_names=ordinal_feats,
            cat_feature_names=cat_features
        ),
        label=valid_labels
    )
    pred_proba = model_.predict_proba(valid_data)[:, 1]    # probability column 1

    sco_by_thres_ = []
    for thrs_ in range(10, 90, 5):    # threshold : 0.1 ~ 0.85
        threshold = thrs_ / 100
        pred = np.where(pred_proba >= threshold, 1, 0)
        sco_by_thres_.append(f1_score(valid_labels, pred))

    scores_by_thres_.append(sco_by_thres_)


In [60]:
# scores_by_thres_ = [] 

# for i,(tri, vai) in enumerate( cv.split(x_train)):
#     pred_proba = models[i].predict_proba(x_train.iloc[vai])[:, 1]

#     sco_by_thres_ = []
#     for thrs_ in range(10, 90, 5):    # threshold : 0.1 ~ 0.85 

#         threshold = thrs_/100
#         pred = np.where(pred_proba >= threshold , 1, 0)
#         score = f1_score(y_train[vai],pred)

#         sco_by_thres_.append(score)

#     scores_by_thres_.append(sco_by_thres_)


In [61]:
# F1 table of the coarse sweep: after the transpose, rows are thresholds
# ('thres_0.1' ... 'thres_0.85') and columns are the entries of `scores_by_thres_`.
f1_thres = pd.DataFrame(
    scores_by_thres_,
    columns=[f"thres_{str(step_ / 100)}" for step_ in range(10, 90, 5)],
).T
f1_thres

Unnamed: 0,0,1,2,3,4
thres_0.1,0.724333,0.724156,0.726402,0.727557,0.72435
thres_0.15,0.735332,0.735323,0.7376,0.73859,0.735454
thres_0.2,0.746393,0.745858,0.747961,0.748624,0.746095
thres_0.25,0.756926,0.755785,0.757597,0.758852,0.755995
thres_0.3,0.765784,0.764678,0.766417,0.767069,0.764617
thres_0.35,0.773051,0.771821,0.773734,0.774347,0.772436
thres_0.4,0.775933,0.775487,0.776853,0.777843,0.775808
thres_0.45,0.775242,0.775099,0.775334,0.776493,0.774595
thres_0.5,0.768023,0.767553,0.767252,0.768677,0.766623
thres_0.55,0.750433,0.750153,0.750391,0.752209,0.749522


In [62]:
# Row label (threshold) with the highest F1 in each column of the table.
f1_thres.idxmax(axis=0)

0    thres_0.4
1    thres_0.4
2    thres_0.4
3    thres_0.4
4    thres_0.4
dtype: object

In [64]:
# Fine threshold sweep: same procedure as the coarse sweep, but on the grid
# 0.300, 0.305, ..., 0.395. Overwrites `scores_by_thres_` with one F1 list per fold.
#
# NOTE(review): this grid stops at 0.395, while the coarse sweep's recorded optimum is
# 0.4 and the recorded fine optimum sits on the upper edge (0.39 / 0.395). Consider
# extending the grid past 0.4 -- the column labels built from the same range in the
# following DataFrame cell must then be changed together with it.
scores_by_thres_ = []

# zip() stops at the shorter input, so a hold-out run (a single entry in `models`)
# no longer raises IndexError on the second split the way `models[i]` did.
for model_, (_tri, vai) in zip(models, cv.split(x_train)):
    # Only the validation Pool is read below. The training Pool that used to be rebuilt
    # on every fold was never used, and its astype(str) conversion is slow.
    x_val = x_train.iloc[vai].copy()
    valid_labels = y_train[vai].copy()

    valid_data = Pool(
        data=FeaturesData(
            num_feature_data=x_val[ordinal_feats].astype(np.float32).values,
            # Categories are passed as strings; per the original note, handing over
            # the raw values as dtype=object raised an error.
            cat_feature_data=x_val[cat_features].astype(str).values,
            num_feature_names=ordinal_feats,
            cat_feature_names=cat_features
        ),
        label=valid_labels
    )
    pred_proba = model_.predict_proba(valid_data)[:, 1]    # probability column 1

    sco_by_thres_ = []
    for thrs_ in range(300, 400, 5):    # threshold : 0.3 ~ 0.395
        threshold = thrs_ / 1000
        pred = np.where(pred_proba >= threshold, 1, 0)
        sco_by_thres_.append(f1_score(valid_labels, pred))

    scores_by_thres_.append(sco_by_thres_)


In [63]:
# scores_by_thres_ = [] 

# for i,(tri, vai) in enumerate( cv.split(x_train)):
#     pred_proba = models[i].predict_proba(x_train.iloc[vai])[:, 1]

#     sco_by_thres_ = []
#     for thrs_ in range(300, 400, 5):    # threshold : 0.3 ~ 0.395 

#         threshold = thrs_/1000
#         pred = np.where(pred_proba >= threshold , 1, 0)
#         score = f1_score(y_train[vai],pred)

#         sco_by_thres_.append(score)
    
#     scores_by_thres_.append(sco_by_thres_)


In [65]:
# F1 table of the fine sweep (rows 'thres_0.3' ... 'thres_0.395' after the transpose),
# then the best threshold label for each column.
f1_thres = pd.DataFrame(
    scores_by_thres_,
    columns=[f"thres_{str(step_ / 1000)}" for step_ in range(300, 400, 5)],    # threshold : 0.3 ~ 0.395
).T
f1_thres.idxmax()

0    thres_0.395
1     thres_0.39
2    thres_0.395
3    thres_0.395
4    thres_0.395
dtype: object

In [66]:
# f1_thres.idxmax()

In [67]:
# Convert each winning row label (text after the last '_', e.g. 'thres_0.395' -> 0.395)
# back into a numeric threshold.
thresholds_list = [float(label_.rpartition('_')[2]) for label_ in f1_thres.idxmax()]
thresholds_list

[0.395, 0.39, 0.395, 0.395, 0.395]

## Adjusting the threshold 

In [68]:
# Mean of the per-fold best thresholds in `thresholds_list`.
np.asarray(thresholds_list).mean()

0.394

# Check the result of adjusting the threshold

In [None]:
# For every fold: re-score the validation split with that fold's own best threshold,
# and collect the fold model's test-set probabilities in `pred_list`.
pred_list = []
scores = []

# Numeric threshold per fold, parsed from the best row label of `f1_thres`.
thresholds_list = [float(idx_.split('_')[-1]) for idx_ in f1_thres.idxmax()]

# The test Pool has no label and is the same for every fold, so it is built once.
test_data = Pool(
    data=FeaturesData(
        num_feature_data=x_test[ordinal_feats].astype(np.float32).values,
        cat_feature_data=x_test[cat_features].astype(str).values,
        num_feature_names=ordinal_feats,
        cat_feature_names=cat_features
    )
)

# zip() pairs model, threshold and split directly and stops at the shortest input,
# so no positional `models[i]` lookup is needed.
for model_, thres_, (_tri, vai) in zip(models, thresholds_list, cv.split(x_train)):
    # Only the validation Pool is read below. The training Pool that used to be rebuilt
    # on every fold was never used, and its astype(str) conversion is slow.
    x_val = x_train.iloc[vai].copy()
    valid_labels = y_train[vai].copy()

    valid_data = Pool(
        data=FeaturesData(
            num_feature_data=x_val[ordinal_feats].astype(np.float32).values,
            # Categories are passed as strings; per the original note, handing over
            # the raw values as dtype=object raised an error.
            cat_feature_data=x_val[cat_features].astype(str).values,
            num_feature_names=ordinal_feats,
            cat_feature_names=cat_features
        ),
        label=valid_labels
    )

    # Validation F1 at this fold's threshold.
    pred_proba = model_.predict_proba(valid_data)[:, 1]
    pred = np.where(pred_proba >= thres_, 1, 0)
    scores.append(f1_score(valid_labels, pred))

    # Test-set probabilities (column 1); kept under their own name instead of
    # overwriting the validation labels in `pred`.
    test_proba = model_.predict_proba(test_data)[:, 1]
    pred_list.append(test_proba)

print(f" * scores_F1 of Validation set : {scores}")
print(f" * The Avg. of F1 scores : {np.mean(scores)}")

# Reference numbers kept from the original cell (presumably an earlier run -- TODO confirm):
#  * scores_F1 of Validation set : [0.7192216943016904, 0.7181671176291647, 0.7174627080896608, 0.7179099969797644, 0.7157041836146495]
#  * The Avg. of F1 scores : 0.717693140122986

 * scores_F1 of Validation set : [0.7757541183677096, 0.775375523729185, 0.7766848611559347, 0.7777710385241656, 0.7757345900728168]
 * The Avg. of F1 scores : 0.7762640263699623


In [70]:
# pred_list = []
# scores = []

# thresholds_list = [float(idx_.split('_')[-1]) for idx_ in f1_thres.idxmax()]
# # >>> [0.37, 0.36, 0.36, 0.35, 0.37]

# for i, (thres_, (tri, vai)) in enumerate(zip(thresholds_list, cv.split(x_train))):

#     pred_proba = models[i].predict_proba(x_train.iloc[vai])[:, 1]
#     pred = np.where(pred_proba >= thres_ , 1, 0)
#     score = f1_score(y_train[vai],pred)
#     scores.append(score)
#     pred = models[i].predict_proba(x_test)[:, 1] 
#     pred_list.append(pred)

# print(f" * scores_F1 of Validation set : {scores}")
# print(f" * The Avg. of F1 scores : {np.mean(scores)}")

In [72]:
# Model 8_component1.
# Average the test probabilities in `pred_list` element-wise, then label with the mean
# of the per-fold best thresholds.
pred_proba = np.asarray(pred_list).mean(axis=0)
pred = (pred_proba >= np.mean(thresholds_list)).astype(int)
print(f"* best Threshold : {np.mean(thresholds_list)}")

* best Threshold : 0.394


In [90]:
# Summary statistics of the averaged test probabilities.
pd.Series(data=pred_proba).describe()

count    46404.000000
mean         0.633534
std          0.047330
min          0.423795
25%          0.604547
50%          0.644055
75%          0.668520
max          0.753534
dtype: float64

In [73]:
# Write two submission files from the sample template: probabilities first, then hard
# labels. Snapshots of both arrays are kept under the *_m8_c1 names.
sample_submission = pd.read_csv(SUBMIT_PATH + 'sample_submission.csv')

pred_proba_m8_c1 = pred_proba.copy()
sample_submission['target'] = pred_proba
sample_submission.to_csv(SUBMIT_PATH + 'imblearn_proba.csv', index=False)

pred_m8_c1 = pred.copy()
sample_submission['target'] = pred
sample_submission.to_csv(SUBMIT_PATH + 'imblearn_pred.csv', index=False)

In [78]:
# Results: class counts of the labels currently stored in the 'target' column.
sample_submission.loc[:, 'target'].value_counts()

1    46404
Name: target, dtype: int64

In [87]:
# Threshold = 0.5
# Label the averaged test probabilities with a fixed cut-off instead of the tuned one.
fixed_thres = 0.5

pred_proba = np.asarray(pred_list).mean(axis=0)
pred = (pred_proba >= fixed_thres).astype(int)
print(f" * Fixed Threshold : {fixed_thres}")

 * Fixed Threshold : 0.5


In [88]:
# Write the probability and hard-label submissions under the *_th05 file names.
# Note: this reassigns pred_proba_m8_c1 / pred_m8_c1 with the current arrays.
sample_submission = pd.read_csv(SUBMIT_PATH + 'sample_submission.csv')

pred_proba_m8_c1 = pred_proba.copy()
sample_submission['target'] = pred_proba
sample_submission.to_csv(SUBMIT_PATH + 'imblearn_proba_th05.csv', index=False)

pred_m8_c1 = pred.copy()
sample_submission['target'] = pred
sample_submission.to_csv(SUBMIT_PATH + 'imblearn_pred_th05.csv', index=False)

In [89]:
# Results: class counts of the labels currently stored in the 'target' column.
sample_submission.loc[:, 'target'].value_counts()

1    46042
0      362
Name: target, dtype: int64

In [91]:
# Threshold = 0.6
# Label the averaged test probabilities with a fixed cut-off instead of the tuned one.
fixed_thres = 0.6

pred_proba = np.asarray(pred_list).mean(axis=0)
pred = (pred_proba >= fixed_thres).astype(int)
print(f" * Fixed Threshold : {fixed_thres}")

 * Fixed Threshold : 0.6


In [92]:
# Write the probability and hard-label submissions under the *_th06 file names.
# Note: this reassigns pred_proba_m8_c1 / pred_m8_c1 with the current arrays.
sample_submission = pd.read_csv(SUBMIT_PATH + 'sample_submission.csv')

pred_proba_m8_c1 = pred_proba.copy()
sample_submission['target'] = pred_proba
sample_submission.to_csv(SUBMIT_PATH + 'imblearn_proba_th06.csv', index=False)

pred_m8_c1 = pred.copy()
sample_submission['target'] = pred
sample_submission.to_csv(SUBMIT_PATH + 'imblearn_pred_th06.csv', index=False)

In [97]:
# Results: class counts of the labels currently stored in the 'target' column.
sample_submission.loc[:, 'target'].value_counts()

1    35795
0    10609
Name: target, dtype: int64

In [96]:
# Class balance of the training labels (count per distinct value of y_train).
pd.Series(data=y_train).value_counts()

1.0    464475
0.0    388913
dtype: int64