In [1]:
# *** version History 
# ------------------- Done -------------------------
# ver1. : default -> metric = "F1"
# ver2. : training metric -> "AUC"
#   => Max_iter=3000 까지 수렴하지 않고 계속 학습되는 현상 발생 
#      -> 5-fold 중에서, 3,4번째 제외하고 모두 끝이 안났음 
#         -> 3, 4번째도, 2990번 즈음에야 Shrink 함 
# ver3. : A 속성에 대한 사람==컨텐츠 bool 컬럼 추가 
#   ver3.1 : E (순서형, 0~11)속성에 대한 비교 추가 + 
#   ver3.2 : E 속성 순서형에 대한, Binning 조정 
# ver4. : 순서형 자료에 대한, numeric cols 반영 
# ver4. : (*Validity 탐색 필요*, 열람시간대, 열람 요일별 판별력 반영) 

# ver6. : 똑같이 타고 들어가서, 속성 2개 짜리 별도로 모형에 추가해서 
#   -> (모형 1: 여러 변수) & (모형 2: 두개 짜리 변수) 에 대한 앙상블해서 스코어 보기 
#       => 0.70257 
# ver7. : '열람일시'에 대한 변수 분할로 catboost 태우기 + ver6. 과 앙상블 하기 
#       => 0.70285
# ----------------- On Progress --------------------
# ver8. : 파생변수 생성 + best-model-architecture로 적합 
#   -> (생성한 속성 pool에서) Feature_selection 하기 

### Set Global variables   

In [5]:
# Google Colab locations, kept for reference (currently disabled)
# DATA_PATH = "/content/drive/MyDrive/dacon/job_care/data/"
# SUBMIT_PATH = "/content/drive/MyDrive/dacon/job_care/submit/"

# Repository-relative locations.
# SUBMIT_PATH is joined to file names by plain string concatenation, so keep the trailing slash.
DATA_PATH = '../data/JobCare_data/'
SUBMIT_PATH = '../submission/'
SEED = 42    # Random seed handed to KFold and CatBoostClassifier for reproducibility

### catboost install

In [6]:
# !pip install catboost

### Library import 

In [7]:
# !pip install seaborn

In [8]:
import os
import sys
import platform
import random
import math
from typing import List ,Dict, Tuple

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn 
from sklearn.model_selection import StratifiedKFold , KFold
from sklearn.metrics import f1_score 

from catboost import Pool, CatBoostClassifier 
from catboost import FeaturesData

# Record the runtime environment so that results can be traced back to exact versions.
env_report = {
    "os": platform.platform(),
    "python": sys.version,
    "pandas": pd.__version__,
    "numpy": np.__version__,
    "sklearn": sklearn.__version__,
}
for label_, version_ in env_report.items():
    print(f"- {label_}: {version_}")

- os: Linux-5.11.0-43-generic-x86_64-with-debian-bullseye-sid
- python: 3.7.12 | packaged by conda-forge | (default, Oct 26 2021, 06:08:21) 
[GCC 9.4.0]
- pandas: 1.1.5
- numpy: 1.19.5
- sklearn: 1.0.1


--------------------
* Derivatives 
--------------------

#### Load Dataset

In [9]:
# Original dataset directory.
# Reuses DATA_PATH from the configuration cell so the location is defined in exactly one place.
data_dir = DATA_PATH

# Directory for the prediction CSV files (same value as SUBMIT_PATH from the configuration cell).
subm_dir = SUBMIT_PATH

In [10]:
# Competition tables
train, test = (pd.read_csv(data_dir + file_name_) for file_name_ in ('train.csv', 'test.csv'))

# Code tables for attributes D, H and L (one row per code, with its class levels)
feature_D_code, feature_H_code, feature_L_code = (
    pd.read_csv(data_dir + file_name_)
    for file_name_ in ('속성_D_코드.csv', '속성_H_코드.csv', '속성_L_코드.csv')
)

#### Data handling 

In [11]:
# # drop 'Unnamed:5'
# feature_D_code = feature_D_code.loc[:, feature_D_code.columns.str.contains('속성 D')]

# Rename the Korean column labels of the code tables to short ASCII names.
#   ALL = full code, DET = detail class, SML = small class, MED = medium class, LAG = large class
D_code_map = {
    '속성 D 코드': 'D_ALL',
    '속성 D 세분류코드': 'D_DET',
    '속성 D 소분류코드': 'D_SML',
    '속성 D 중분류코드': 'D_MED',
    '속성 D 대분류코드': 'D_LAG'
}

H_code_map = {
    '속성 H 코드': 'H_ALL',
    '속성 H 중분류코드': 'H_MED',
    '속성 H 대분류코드': 'H_LAG'
}

L_code_map = {
    '속성 L 코드': 'L_ALL',
    '속성 L 세분류코드': 'L_DET',
    '속성 L 소분류코드': 'L_SML',
    '속성 L 중분류코드': 'L_MED',
    '속성 L 대분류코드': 'L_LAG'
}

# `rename` leaves labels that are missing from the mapping untouched. Assigning
# `columns.map(mapping)` instead turns every unmapped label into NaN, which wiped all
# column names when this cell was executed a second time (the new names are not keys).
feature_D_code = feature_D_code.rename(columns=D_code_map)
feature_H_code = feature_H_code.rename(columns=H_code_map)
feature_L_code = feature_L_code.rename(columns=L_code_map)

In [12]:
# Parse the open-timestamp strings into datetime values (same fixed format in both splits)
for split_df_ in (train, test):
    split_df_['contents_open_dt'] = pd.to_datetime(split_df_['contents_open_dt'], format="%Y-%m-%d %H:%M:%S")

In [13]:
# Show the time span covered by each split
for tag_, split_df_ in ((" * Train-set", train), (" *  test-set", test)):
    opened_ = split_df_['contents_open_dt']
    print(f"{tag_} 'contents_open_dt' Min(): {opened_.min()},  Max():{opened_.max()}")

 * Train-set 'contents_open_dt' Min(): 2020-01-01 00:01:03,  Max():2020-11-30 23:59:56
 *  test-set 'contents_open_dt' Min(): 2020-12-01 00:00:07,  Max():2020-12-31 23:59:08


#### * Master tables

In [14]:
# Person / contents master tables: one row per distinct attribute combination.
person_train = train.loc[:, train.columns.str.contains('person_')]
contnt_train = train.loc[:, train.columns.str.contains('contents_')]

# Recorded checks from an earlier run:
# >>> (~person_train.duplicated()).sum()
# 300,177    # -> same as nunique(person_rn)
# >>> (~(contnt_train.drop('contents_open_dt', axis=1).duplicated())).sum()
# 283,393    # -> same as nunique(contents_rn)

person_master = person_train.drop_duplicates(keep='first')
# The open timestamp is dropped before de-duplicating the contents side
contnt_master = contnt_train.drop(columns='contents_open_dt').drop_duplicates(keep='first')

# Move the key column to the front, keeping the remaining columns in their existing order
person_master = person_master[['person_rn'] + [col_ for col_ in person_master.columns if col_ != 'person_rn']]
contnt_master = contnt_master[['contents_rn'] + [col_ for col_ in contnt_master.columns if col_ != 'contents_rn']]

In [15]:
# person_master

In [16]:
# contnt_master

---------------------------------
## * FeatureGenerator
---------------------------------

In [17]:
def getDerivativeFeatures(df):
    """Build derived features on a copy of ``df`` and report the generated column groups.

    The function reads the module-level code tables ``feature_D_code``, ``feature_H_code``
    and ``feature_L_code``; their columns must already be renamed to ``*_ALL`` / ``*_LAG`` /
    ``*_MED`` / ``*_SML`` / ``*_DET``.

    Parameters
    ----------
    df : pd.DataFrame
        Train or test frame. It must provide a datetime-typed ``contents_open_dt`` column,
        the ``person_*`` / ``contents_*`` attribute columns referenced below and the
        match-flag columns whose names start with ``d_`` / ``h_``.

    Returns
    -------
    tuple
        ``(data_df, all_derivatives_cols_dict)``: the enriched copy of ``df`` and a
        dictionary grouping the names of the generated columns.
    """
    # Work on a copy so the caller's frame is never modified
    data_df = df.copy()

    # ------------------------------------
    ### * 'contents_open_dt' -> split into dt.components
    ###    -> Col_prefix : "open_dt_"
    # ------------------------------------
    data_df['open_dt_quarter'] = data_df['contents_open_dt'].dt.quarter
    data_df['open_dt_month'] = data_df['contents_open_dt'].dt.month
    # `Series.dt.week` is deprecated (removed in pandas 2.0); `isocalendar().week` is its
    # replacement. It returns a nullable UInt32 column, so cast back to what `.dt.week`
    # produced: int64, or float64 when missing timestamps are present.
    iso_week = data_df['contents_open_dt'].dt.isocalendar().week
    data_df['open_dt_week'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
    data_df['open_dt_day'] = data_df['contents_open_dt'].dt.day
    data_df['open_dt_weekday'] = data_df['contents_open_dt'].dt.weekday
    # data_df['open_dt_day_name'] = data_df['contents_open_dt'].dt.day_name().str[:3]
    data_df['open_dt_hour'] = data_df['contents_open_dt'].dt.hour
    data_df['open_dt_minute'] = data_df['contents_open_dt'].dt.minute

    # Columns' list ('contents_open_dt' itself does not contain the 'open_dt_' prefix)
    open_dt_cols = list(data_df.columns[data_df.columns.str.contains('open_dt_')])

    # ------------------------------------
    ### * All combination for each equal-attributes (: [a, a_1], [c], [j, j_1], [e])
    ###    -> Col_prefix : "derivatives_"
    # ------------------------------------
    # * --- <str> ---
    # person_attribute_a & contents_attribute_a
    data_df['derivatives_a_a'] = data_df['person_attribute_a'].astype(str) + '-' + data_df['contents_attribute_a'].astype(str)

    # person_attribute_a & person_attribute_a_1
    data_df['derivatives_person_a_a_1'] = data_df['person_attribute_a'].astype(str) + '-' + data_df['person_attribute_a_1'].astype(str)

    # person_prefer_c & contents_attribute_c
    data_df['derivatives_c_c'] = data_df['person_prefer_c'].astype(str) + '-' + data_df['contents_attribute_c'].astype(str)

    # contents_attribute_j & contents_attribute_j_1
    data_df['derivatives_contents_j_j_1'] = data_df['contents_attribute_j'].astype(str) + '-' + data_df['contents_attribute_j_1'].astype(str)

    # * --- <numeric> ---
    # person_prefer_e & contents_attribute_e
    data_df['derivatives_e_diff'] = data_df['person_prefer_e'] - data_df['contents_attribute_e']

    # Columns' list
    derivatives_cols = list(data_df.columns[data_df.columns.str.contains('derivatives_')])    # * All cols

    derivatives_str_cols = derivatives_cols.copy()
    derivatives_str_cols.remove('derivatives_e_diff')    # * str cols
    derivatives_num_cols = ['derivatives_e_diff']    #  * numeric cols

    derivatives_cols_dict = {'derivatives_str_cols': derivatives_str_cols, 'derivatives_num_cols': derivatives_num_cols}

    # ------------------------------------
    ### * '..._yn T/F Combination (: d, h)
    ###    -> cols_suffix : "_yn_comb"
    # ------------------------------------
    # d/h_match_yn cols.
    # Both selections must happen before 'd_yn_comb' / 'h_yn_comb' are added, otherwise the
    # '^d_' / '^h_' patterns would also pick up the combination columns themselves.
    d_yn_cols = data_df.columns[data_df.columns.str.contains('^d_')]
    h_yn_cols = data_df.columns[data_df.columns.str.contains('^h_')]

    # create T/F combination col: first character of every flag, concatenated row-wise
    data_df['d_yn_comb'] = pd.concat([data_df[yn_col_].astype(str).str[0] for yn_col_ in d_yn_cols], axis=1).T.sum()
    data_df['h_yn_comb'] = pd.concat([data_df[yn_col_].astype(str).str[0] for yn_col_ in h_yn_cols], axis=1).T.sum()

    # expand to a joint combination of both flag groups
    data_df['d+h_yn_comb'] = 'D:' + data_df['d_yn_comb'] + '/H:' + data_df['h_yn_comb']

    # Columns' list
    yn_comb_cols = list(data_df.columns[data_df.columns.str.contains('_yn_comb')])    # * All cols

    # ------------------------------------
    ### * "person_prefer_{H, D}_{1, 2, 3}" - Coherency
    ###    -> cols_format : "derivatives_{D_LAG, D_SML, H_MED, etc.}_sim"
    # ------------------------------------
    # *feat- code handling
    # -----------------------
    feat_D = feature_D_code.astype(str).copy()
    feat_H = feature_H_code.astype(str).copy()
    feat_L = feature_L_code.astype(str).copy()

    # Prefix every level with its parent levels so that a lower-level code is only equal
    # when the whole path down to that level is equal
    feat_D['D_MED'] = feat_D['D_LAG']+'-'+feat_D['D_MED']
    feat_D['D_SML'] = feat_D['D_MED']+'-'+feat_D['D_SML']
    feat_D['D_DET'] = feat_D['D_SML']+'-'+feat_D['D_DET']

    feat_H['H_MED'] = feat_H['H_LAG']+'-'+feat_H['H_MED']

    feat_L['L_MED'] = feat_L['L_LAG']+'-'+feat_L['L_MED']
    feat_L['L_SML'] = feat_L['L_MED']+'-'+feat_L['L_SML']
    feat_L['L_DET'] = feat_L['L_SML']+'-'+feat_L['L_DET']

    # * "derivatives_{attribute_class_lv}_sim"
    # -----------------------
    # Bucket for derivatives
    data_df_0 = data_df.copy()
    derivatives_feat_df = [data_df_0]
    derivatives_feat_cols_dict = {}    # Columns' dict

    # * person_prefer_{d, h}_{1, 2, 3} Comparison cols
    for feat_, feat_df, cat_lvs_ in zip(["D", 'H'], [feat_D, feat_H], [['D_LAG', 'D_MED', 'D_SML'], ['H_LAG', 'H_MED']]):
        person_prefer_cols = data_df.columns[data_df.columns.str.contains(f'person_prefer_{feat_.lower()}')]

        # The string view of [prefer_{1, 2, 3}] does not depend on the level: build it once
        person_prefer_view = data_df[person_prefer_cols].astype(str)

        person_prefer_frame = []

        for cat_lv_ in cat_lvs_:
            # Mapping dictionary: full code -> code at the target level
            # (dict(zip(...)) gives the same result as the former row-by-row iterrows() loop)
            cat_lv_mapper = dict(zip(feat_df[f'{feat_}_ALL'], feat_df[cat_lv_]))
            # feat - by target_level - converting
            person_prefer_cnvt = pd.concat([person_prefer_view[col_].map(cat_lv_mapper) for col_ in person_prefer_cols], axis=1)
            # Bool for equal-class - <bool> : pairwise comparison of preference 1/2/3
            frames = [(person_prefer_cnvt.iloc[:, lf_]==person_prefer_cnvt.iloc[:, rt_]).rename(f"derivatives_{cat_lv_}_{str(lf_+1)}{str(rt_+1)}") for lf_, rt_ in [(0, 1), (0, 2), (1, 2)]]
            person_prefer_lv_yn = pd.concat(frames, axis=1)

            # Similarity Strength -<numeric> : number of equal pairs (0..3)
            person_prefer_lv_yn[f"derivatives_{cat_lv_}_sim"] = person_prefer_lv_yn.sum(axis=1)
            person_prefer_frame.append(person_prefer_lv_yn)    # Append for "a-feat-by-each-lvs"

        person_prefer_deriv_df = pd.concat(person_prefer_frame, axis=1)    # Concat for "a-feat-by-all-lvs"

        # Columns' list
        mask_for_num_ = person_prefer_deriv_df.columns.str.contains("_sim")

        derivatives_feat_cols_dict[f'{feat_}_bool_cols'] = list(person_prefer_deriv_df.columns[~mask_for_num_])
        derivatives_feat_cols_dict[f'{feat_}_num_cols'] = list(person_prefer_deriv_df.columns[mask_for_num_])

        derivatives_feat_df.append(person_prefer_deriv_df)

    # * concat all-feat
    data_df = pd.concat(derivatives_feat_df, axis=1)

    # ------------------------------------
    ### * {[Person-Content] suitability} & {D/H "Preference-coherency"}
    ###    -> cols_suffix : "_sim_yn_comb"
    # ------------------------------------

    # similarity-strength cols of each attribute
    d_sim_cols = data_df.columns[data_df.columns.str.contains('D_..._sim', regex=True)]
    h_sim_cols = data_df.columns[data_df.columns.str.contains('H_..._sim', regex=True)]

    # T-F Combination joined with the similarity strength
    d_yn_comb_with_sim = pd.concat([data_df['d_yn_comb']+'_'+data_df[d_sim_col_].astype(str) for d_sim_col_ in d_sim_cols], axis=1)
    h_yn_comb_with_sim = pd.concat([data_df['h_yn_comb']+'_'+data_df[h_sim_col_].astype(str) for h_sim_col_ in h_sim_cols], axis=1)

    d_yn_comb_with_sim.columns = d_sim_cols+'_yn_comb'
    h_yn_comb_with_sim.columns = h_sim_cols+'_yn_comb'

    # append all
    data_df = pd.concat([data_df, d_yn_comb_with_sim, h_yn_comb_with_sim], axis=1)

    # Columns' list
    sim_yn_comb_cols = d_yn_comb_with_sim.columns.tolist() +  h_yn_comb_with_sim.columns.tolist()

    # ------------------------------------
    #### * 'contents_attribute_l' -> split into all class-lv
    ###    -> cols_format : "contents_attribute_L_{LAG, MED}"
    # ------------------------------------
    contents_l_code = data_df['contents_attribute_l'].astype(str)

    # convert to 'L_LAG' code
    attr_l_mapper = dict(zip(feat_L['L_ALL'], feat_L['L_LAG']))
    data_df['contents_attribute_L_LAG'] = contents_l_code.map(attr_l_mapper)

    # convert to 'L_MED' code
    attr_l_mapper = dict(zip(feat_L['L_ALL'], feat_L['L_MED']))
    data_df['contents_attribute_L_MED'] = contents_l_code.map(attr_l_mapper)

    # Columns' list
    attr_l_cols = ['contents_attribute_L_LAG', 'contents_attribute_L_MED']

    # -----------------------------------
    all_derivatives_cols_dict = {
        "open_dt_cols": open_dt_cols,
        "derivatives_cols_dict": derivatives_cols_dict,
        "yn_comb_cols": yn_comb_cols,
        "derivatives_feat_cols_dict": derivatives_feat_cols_dict,
        "sim_yn_comb_cols": sim_yn_comb_cols,
        "attr_l_cols": attr_l_cols
    }
    # -----------------------------------
    return(data_df, all_derivatives_cols_dict)


In [18]:
# Shapes of the raw train / test frames before feature generation
print(train.shape)
print(test.shape)

(501951, 35)
(46404, 34)


In [19]:
# Generate the derived features for both splits.
# Only the column-group dictionary from the training call is kept; the one returned for
# the test frame is discarded.
train_data, deriv_cols = getDerivativeFeatures(train)
test_data, _ = getDerivativeFeatures(test)

  # This is added back by InteractiveShellApp.init_path()


In [20]:
# Shapes after feature generation (row counts must match the raw frames)
print(train_data.shape)
print(test_data.shape)

(501951, 77)
(46404, 76)


----------------------------
* Baseline 
----------------------------

In [21]:
def preprocess_data(
    df:pd.DataFrame, is_train:bool=True,
    cols_drop:List[str] = None
):
    """Split off the target, cast boolean columns to int and drop excluded columns.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is copied, never modified in place.
    is_train : bool, default True
        When True the ``"target"`` column is removed from the features and returned as
        a NumPy array. When False no target is expected and ``None`` is returned for it.
    cols_drop : list of str, optional
        Columns to remove from the features. Defaults to
        ``["id", "person_prefer_f", "person_prefer_g", "contents_open_dt"]``.
        Every listed column must exist in ``df``.

    Returns
    -------
    tuple
        ``(features, y_data)`` where ``y_data`` is ``None`` for ``is_train=False``.

    Raises
    ------
    KeyError
        If ``"target"`` (train mode) or one of ``cols_drop`` is missing from ``df``.
    """
    # A None sentinel replaces the former mutable list default, which was one list object
    # shared by every call of the function.
    if cols_drop is None:
        cols_drop = ["id", "person_prefer_f", "person_prefer_g", "contents_open_dt"]

    df = df.copy()
    y_data = None

    if is_train:
        y_data = df["target"].to_numpy()
        df = df.drop(columns="target")

    # * Bool -> int
    bool_cols = df.select_dtypes(bool).columns.tolist()
    if bool_cols:
        df[bool_cols] = df[bool_cols].astype(int)

    # * Excluded cols
    df = df.drop(columns=list(cols_drop))

    return(df, y_data)

# Set Colnames

In [22]:
# # 소분류 중분류 대분류 속성코드 merge 컬럼명 및 데이터 프레임 리스트
# cols_merge = [
#               ("person_prefer_d_1" , code_d),
#               ("person_prefer_d_2" , code_d),
#               ("person_prefer_d_3" , code_d),
#               ("contents_attribute_d" , code_d),
#               ("person_prefer_h_1" , code_h),
#               ("person_prefer_h_2" , code_h),
#               ("person_prefer_h_3" , code_h),
#               ("contents_attribute_h" , code_h),
#               ("contents_attribute_l" , code_l),
# ]

# # 회원 속성과 콘텐츠 속성의 동일한 코드 여부에 대한 컬럼명 리스트
# cols_equi = [

#     ("contents_attribute_c","person_prefer_c"),
#     ("contents_attribute_e","person_prefer_e"),    # disable - Ver. 3-1 # Return in model 6 

#     ("person_prefer_d_1_attribute_d_s" , "contents_attribute_d_attribute_d_s"),   # Additional 
#     ("person_prefer_d_1_attribute_d_m" , "contents_attribute_d_attribute_d_m"),   # Additional 
#     ("person_prefer_d_1_attribute_d_l" , "contents_attribute_d_attribute_d_l"),   # Additional 
#     ("person_prefer_d_2_attribute_d_s" , "contents_attribute_d_attribute_d_s"),
#     ("person_prefer_d_2_attribute_d_m" , "contents_attribute_d_attribute_d_m"),
#     ("person_prefer_d_2_attribute_d_l" , "contents_attribute_d_attribute_d_l"),
#     ("person_prefer_d_3_attribute_d_s" , "contents_attribute_d_attribute_d_s"),
#     ("person_prefer_d_3_attribute_d_m" , "contents_attribute_d_attribute_d_m"),
#     ("person_prefer_d_3_attribute_d_l" , "contents_attribute_d_attribute_d_l"),

#     # ("person_prefer_h_1_attribute_h_p" , "contents_attribute_h_attribute_h_p"),
#     # ("person_prefer_h_2_attribute_h_p" , "contents_attribute_h_attribute_h_p"),
#     # ("person_prefer_h_3_attribute_h_p" , "contents_attribute_h_attribute_h_p"),
#     ("person_prefer_h_1_attribute_h_m" , "contents_attribute_h_attribute_h_m"),
#     ("person_prefer_h_1_attribute_h_l" , "contents_attribute_h_attribute_h_l"),
#     ("person_prefer_h_2_attribute_h_m" , "contents_attribute_h_attribute_h_m"),
#     ("person_prefer_h_2_attribute_h_l" , "contents_attribute_h_attribute_h_l"),
#     ("person_prefer_h_3_attribute_h_m" , "contents_attribute_h_attribute_h_m"),
#     ("person_prefer_h_3_attribute_h_l" , "contents_attribute_h_attribute_h_l"), 
#     # Additional attr_'A' - in ver3. / in ver5.
#     ("contents_attribute_a","person_attribute_a")
    
# ]

# Columns excluded from the model input
cols_drop = ["id","person_prefer_f","person_prefer_g" ,"contents_open_dt", "person_rn", "contents_rn"]

# Preprocessing for Train/Test-set 

In [23]:
# Earlier call variant (merge-code / equality columns), kept for reference:
# x_train, y_train = preprocess_data(train_data, cols_merge=cols_merge, cols_equi=cols_equi, cols_drop=cols_drop)
# x_test, _ = preprocess_data(test_data, is_train=False, cols_merge=cols_merge, cols_equi=cols_equi, cols_drop=cols_drop)

# Split features / target and drop the excluded columns for both splits
x_train, y_train = preprocess_data(train_data, cols_drop=cols_drop)
x_test, _ = preprocess_data(test_data, is_train=False, cols_drop=cols_drop)

# Shapes of train features, train target and test features
x_train.shape , y_train.shape , x_test.shape

((501951, 70), (501951,), (46404, 70))

# 범주형 컬럼 리스트(catboost 파라미터에 넣을 용도)

In [24]:
# pd.options.display.max_rows = 100
# pd.options.display.max_columns = 100

In [25]:
# Number of distinct values per training feature
x_train.nunique()

d_l_match_yn                      2
d_m_match_yn                      2
d_s_match_yn                      2
h_l_match_yn                      2
h_m_match_yn                      2
                                 ..
derivatives_D_SML_sim_yn_comb    12
derivatives_H_LAG_sim_yn_comb    12
derivatives_H_MED_sim_yn_comb    12
contents_attribute_L_LAG         21
contents_attribute_L_MED         79
Length: 70, dtype: int64

In [26]:
# ------------------------- 
### *** feature-group variation 
# ------------------------- 
### * Cardinality-based column grouping 
# 1. df.nunique().gt( 2 ) & df.nunique().le( 50 )
# 2. df.nunique().eq( 2 ) 
# 3. df.nunique().gt( 50 ) 

### * Theme-based column grouping 
# 1. Only-1-in-the-group 
#    : 
# 2. Only-a-few-in-the-group 
#    : (by, Chi2-best-features() )
# (3. every-features-and-by-the-group)
#    : 

# ------------------------- 
### *** Model evaluation 
# ------------------------- 
# 1. Direct fitting -> compare F1-metric & Optimal-Threshold
# 2. FeatureSelection() -> Select Most Appropriate features 


In [27]:
# -------------------------
### *** Feature-group variation
# -------------------------
### * Cardinality-based column grouping (distinct values counted on the training features)
# 1. CB1 : more than 2 and at most 50 distinct values
# 2. CB2 : exactly 2 distinct values
# 3. CB3 : more than 50 distinct values
# -------------------------
x_nuniq_ = x_train.nunique()

is_binary_ = x_nuniq_ == 2
is_high_card_ = x_nuniq_ > 50
is_mid_card_ = (x_nuniq_ > 2) & ~is_high_card_

CB1_feats = list(x_nuniq_.index[is_mid_card_])
CB2_feats = list(x_nuniq_.index[is_binary_])
CB3_feats = list(x_nuniq_.index[is_high_card_])

# * Features treated as numeric in every group
ordinal_feats = ['person_attribute_a_1', 'person_attribute_b', 'person_prefer_e', 'contents_attribute_e']

# Cast them to float32 in both splits so they are not handled as categorical codes
for split_df_ in (x_train, x_test):
    split_df_[ordinal_feats] = split_df_[ordinal_feats].astype(np.float32)

# ----------- for iter_ ----------------------
# # Remove the numeric features from the categorical feature list
# cat_features = list(set(cat_features) - set(ordinal_feats))

# print(len(cat_features))    # <- number of categorical features in use

# 학습

In [28]:
# from catboost import FeaturesData

In [29]:
# Keep untouched copies of the full feature / target sets
x_train_0 = x_train.copy()
y_train_0 = y_train.copy()
x_test_0  = x_test.copy()
# -------------------------------
# * From here on x_train / y_train / x_test are re-bound to per-group subsets and are
#   no longer the full originals; use the *_0 copies when the full data is needed.

In [30]:

# * Training parameters
# -------------------------
is_holdout = False     # True -> stop after the first fold
n_splits = 5
iterations = 10000     # upper bound on boosting rounds (an earlier run used 3600)
patience = 50          # early_stopping_rounds: rounds without improvement on the eval set

cv = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)

# * Result containers, keyed by feature-group name
# -------------------------
scores_F1_total = {}
scores_AUC_total = {}
models_total = {}

gp_names = ["CB1_feats", "CB2_feats", "CB3_feats"]
gp_feats = [CB1_feats, CB2_feats, CB3_feats]

for name_, col_group_ in zip(gp_names, gp_feats):
    print("="*50)
    print(f" * feature-group : {name_}")

    # * dataset partitioning: restrict the features to the current group
    # --------------------------
    x_train = x_train_0[col_group_].copy()
    # x_test  = x_test_0[col_group_].copy()
    y_train = y_train_0.copy()

    # * Per-fold results of this group
    # --------------------------
    scores_F1 = []
    scores_AUC = []
    models = []

    # * Assigning features
    # --------------------------
    # Every column of the group that is not an ordinal (numeric) feature is categorical.
    # A list comprehension keeps the column order of the group; the former set difference
    # produced a hash-dependent order.
    cat_features = [col_ for col_ in col_group_ if col_ not in ordinal_feats]
    print(f" * The # of - Total feats: {len(col_group_)} = Cat feats: {len(cat_features)} + Num feats: {len(col_group_)-len(cat_features)} ")
    print("="*50)

    # * Training with k-fold cv
    # --------------------------
    for tri, vai in cv.split(x_train):
        # ver.2 : eval_metric = 'AUC' (the baseline used eval_metric = 'F1')
        model = CatBoostClassifier(
            iterations=iterations,
            random_state=SEED,
            task_type="GPU",
            eval_metric="AUC",
            cat_features=cat_features,
            one_hot_max_size=4,
            # learning_rate=0.075
            )

        model.fit(x_train.iloc[tri], y_train[tri],
                eval_set=[(x_train.iloc[vai], y_train[vai])],
                early_stopping_rounds=patience,
                verbose = 1000
            )

        models.append(model)
        # F1 of model.predict() on the validation fold
        scores_F1.append(f1_score(y_train[vai], model.predict(x_train.iloc[vai])))
        # Best validation AUC reported by CatBoost
        scores_AUC.append(model.get_best_score()["validation"]["AUC"])

        if is_holdout:
            break

    # * Append Results
    scores_F1_total[f'{name_}'] = scores_F1
    scores_AUC_total[f'{name_}'] = scores_AUC
    models_total[f'{name_}'] = models
    

 * feature-group : CB1_feats
 * The # of - Total feats: 35 = Cat feats: 31 + Num feats: 4 
Learning rate set to 0.016489
0:	learn: 0.6090565	test: 0.6102459	best: 0.6102459 (0)	total: 34.1ms	remaining: 5m 40s
1000:	learn: 0.6442518	test: 0.6410028	best: 0.6410028 (1000)	total: 30.5s	remaining: 4m 34s
2000:	learn: 0.6526265	test: 0.6438344	best: 0.6438344 (2000)	total: 1m	remaining: 4m 1s
3000:	learn: 0.6589061	test: 0.6449666	best: 0.6449691 (2997)	total: 1m 29s	remaining: 3m 28s
4000:	learn: 0.6647154	test: 0.6457004	best: 0.6457031 (3994)	total: 1m 59s	remaining: 2m 58s
5000:	learn: 0.6701416	test: 0.6461352	best: 0.6461352 (5000)	total: 2m 29s	remaining: 2m 28s
bestTest = 0.6461516023
bestIteration = 5025
Shrink model to first 5026 iterations.
Learning rate set to 0.016489
0:	learn: 0.6085952	test: 0.6080912	best: 0.6080912 (0)	total: 33.7ms	remaining: 5m 36s
1000:	learn: 0.6446623	test: 0.6393586	best: 0.6393586 (1000)	total: 30.4s	remaining: 4m 32s
2000:	learn: 0.6529219	test: 0.6

# CV 결과 확인

In [68]:
# scores_F1_total
# scores_AUC_total
# models_total

In [34]:
gp_names = ["CB1_feats", "CB2_feats", "CB3_feats"]

# Per-fold scores and their mean for every feature group
for name_ in gp_names:
    print("="*50)
    print(f" * feature-group : {name_}")

    scores_F1, scores_AUC = list(scores_F1_total[name_]), list(scores_AUC_total[name_])
    # -------------------------------------------------
    for metric_, fold_scores_ in (("F1", scores_F1), ("AUC", scores_AUC)):
        print(f" * scores_{metric_} : {np.round(fold_scores_, 6)}")
        print(f" * The Avg. of {metric_} scores : {np.mean(fold_scores_).round(6)}")


 * feature-group : CB1_feats
 * scores_F1 : [0.619827 0.618949 0.617994 0.617395 0.618025]
 * The Avg. of F1 scores : 0.618438
 * scores_AUC : [0.646152 0.64394  0.644987 0.642109 0.644194]
 * The Avg. of AUC scores : 0.644276
 * feature-group : CB2_feats
 * scores_F1 : [0.592573 0.593317 0.596608 0.593911 0.592455]
 * The Avg. of F1 scores : 0.593773
 * scores_AUC : [0.607127 0.606583 0.607472 0.60355  0.605518]
 * The Avg. of AUC scores : 0.60605
 * feature-group : CB3_feats
 * scores_F1 : [0.656975 0.656238 0.656526 0.651421 0.651017]
 * The Avg. of F1 scores : 0.654435
 * scores_AUC : [0.715667 0.716109 0.714681 0.713656 0.712608]
 * The Avg. of AUC scores : 0.714544


### 최적 Threshold 값 탐색 

In [40]:
# List the stored feature-group names:
# for m_ in models_total: 
#     print(m_)
    
# Fold models stored for one feature group
models_total['CB1_feats']

[<catboost.core.CatBoostClassifier at 0x7f4f69e8cdd0>,
 <catboost.core.CatBoostClassifier at 0x7f4f69f89e90>,
 <catboost.core.CatBoostClassifier at 0x7f4f69e88bd0>,
 <catboost.core.CatBoostClassifier at 0x7f4f69e836d0>,
 <catboost.core.CatBoostClassifier at 0x7f4f69f89f50>]

In [97]:
gp_names = ["CB1_feats", "CB2_feats", "CB3_feats"]
gp_feats = [CB1_feats, CB2_feats, CB3_feats]

threshold_total = {}    # best threshold of every fold, per feature group
score_total = {}        # F1 at that threshold, per feature group
proba_total = {}        # test-set positive-class probabilities of every fold model

# Coarse grid in percent: 0.10 .. 0.85 in steps of 0.05
thres_range = range(10, 90, 5)


def _f1_by_threshold(y_true, pos_proba, grid, scale):
    """Return the F1 score of every cut-off ``value / scale`` as a Series indexed by ``grid``."""
    return pd.Series(
        [f1_score(y_true, np.where(pos_proba >= thrs_ / scale, 1, 0)) for thrs_ in grid],
        index=grid,
    )


for name_, col_group_ in zip(gp_names, gp_feats):
    # * dataset partitioning
    x_train = x_train_0[col_group_].copy()
    x_test  = x_test_0[col_group_].copy()
    y_train = y_train_0.copy()

    models = models_total[name_].copy()
    # -------------------------------------------------
    thres_bucket = [] # * bucket for "Best-Threshold" for each fold
    score_bucket = [] # * bucket for "Best-F1" for each fold
    proba_bucket = [] # * bucket for "test-pred_proba" for each fold

    # `cv` uses a fixed random_state, so split() yields the same folds as during training.
    # zip() pairs each stored model with its fold and stops at the last stored model;
    # indexing models[i] over all splits raised IndexError after an `is_holdout` run.
    for model_, (_, vai) in zip(models, cv.split(x_train)):
        pred_proba = model_.predict_proba(x_train.iloc[vai])[:, 1]

        # * Optimal Threshold Search - lv0 (threshold : from 0.1 to 0.85)
        thrs_lv0 = _f1_by_threshold(y_train[vai], pred_proba, thres_range, 100).idxmax()

        # * Optimal Threshold Search - lv1 (threshold-from-lv0 +/- 0.1, step 0.005)
        thres_range_lv1 = range((thrs_lv0-10)*10, (thrs_lv0+10)*10, 5)
        best_thres_ser = _f1_by_threshold(y_train[vai], pred_proba, thres_range_lv1, 1000)

        # * get_prob_of_testset
        test_pred_proba = model_.predict_proba(x_test)[:, 1]

        thres_bucket.append(np.round(best_thres_ser.idxmax()/1000, 3)) # < best threshold
        score_bucket.append(best_thres_ser.max()) # < best F1 - by best threshold
        proba_bucket.append(test_pred_proba) # < "test-pred_proba" for each fold

    threshold_total[f"{name_}"] = thres_bucket
    score_total[f"{name_}"] = score_bucket
    proba_total[f"{name_}"] = proba_bucket

In [98]:
# threshold_total

In [99]:
# score_total

In [100]:
# proba_total

### threshold 정의 & best-F1 score 평균 

In [101]:
# One threshold per feature group: the mean of the per-fold best thresholds
best_threshold_total = {}
for name_, thes_list_ in threshold_total.items():
    best_threshold_total[name_] = np.mean(thes_list_).round(4)
print(f" * Threshold for each variants: {best_threshold_total}")

# ---------------------------------------------------

gp_names = ["CB1_feats", "CB2_feats", "CB3_feats"]

# Compare the F1 from model.predict() with the F1 at the tuned threshold
for name_ in gp_names:
    print("="*50)
    print(f" * feature-group : {name_}")
    scores_F1_prev = list(scores_F1_total[name_])
    scores_F1_adjs = list(score_total[name_])
    # -------------------------------------------------
    print(f" * scores_F1 - previous : {np.round(scores_F1_prev, 6)}")
    print(f" * scores_F1 - adjusted : {np.round(scores_F1_adjs, 6)}")
    avg_prev_ = np.mean(scores_F1_prev).round(4)
    avg_adjs_ = np.mean(scores_F1_adjs).round(4)
    print(f" * The Avg. of F1 - previous : {avg_prev_}  ->  adjusted : {avg_adjs_}")


 * Threshold for each variants: {'CB1_feats': 0.344, 'CB2_feats': 0.338, 'CB3_feats': 0.348}
 * feature-group : CB1_feats
 * scores_F1 - previous : [0.619827 0.618949 0.617994 0.617395 0.618025]
 * scores_F1 - adjusted : [0.683717 0.68235  0.681472 0.681878 0.679754]
 * The Avg. of F1 - previous : 0.6184  ->  adjusted : 0.6818
 * feature-group : CB2_feats
 * scores_F1 - previous : [0.592573 0.593317 0.596608 0.593911 0.592455]
 * scores_F1 - adjusted : [0.670466 0.670582 0.667999 0.669412 0.668046]
 * The Avg. of F1 - previous : 0.5938  ->  adjusted : 0.6693
 * feature-group : CB3_feats
 * scores_F1 - previous : [0.656975 0.656238 0.656526 0.651421 0.651017]
 * scores_F1 - adjusted : [0.70721  0.706155 0.705719 0.706063 0.704415]
 * The Avg. of F1 - previous : 0.6544  ->  adjusted : 0.7059


### 최종 Pred

In [167]:
# 

gp_names = ["CB1_feats", "CB2_feats", "CB3_feats"]

# 1) Per feature group: average the fold probabilities, then cut at the group's threshold
proba_by_variants = {}
pred_by_variants = {}
for n_ in gp_names:
    proba_by_variants[n_] = np.mean(proba_total[n_], axis=0)
    pred_by_variants[n_] = np.where(proba_by_variants[n_] >= best_threshold_total[n_], 1, 0)
# ------------------------
# Recorded example:
# >>> pred_by_variants
# {'CB1_feats': array([1, 1, 1, ..., 1, 1, 1]),
#  'CB2_feats': array([1, 1, 1, ..., 1, 1, 1]),
#  'CB3_feats': array([0, 0, 1, ..., 1, 1, 1])}
# ------------------------

# 2) Soft ensemble: average the group probabilities and cut at the mean group threshold
proba = np.mean(list(proba_by_variants.values()), axis=0)
best_threshold_ = np.mean(list(best_threshold_total.values()))
pred = np.where(proba >= np.round(best_threshold_, 4), 1, 0)
# print(f" * best_threshold : {np.round(best_threshold_, 4)} ")
# ------------------------
# Recorded example:
# >>> pred
# array([1, 1, 1, ..., 1, 1, 1])
#  * best_threshold : 0.3433
# ------------------------

# 3) Hard voting: positive when at least half of the groups predict 1
pred_sum = np.sum(list(pred_by_variants.values()), axis=0)
pred_by_vote = np.where(pred_sum >= len(gp_names)/2, 1, 0)
# ------------------------
# Recorded example:
# >>> pred_by_vote
# array([1, 1, 1, ..., 1, 1, 1])
# ------------------------


In [170]:
# Preview: fill the submission template with the averaged CB1 probabilities
sample_submission = pd.read_csv(SUBMIT_PATH+f'sample_submission.csv')
sample_submission['target'] = proba_by_variants['CB1_feats']
sample_submission
# proba_by_variants['CB1_feats']

Unnamed: 0,id,target
0,0,0.456602
1,1,0.463039
2,2,0.457938
3,3,0.563599
4,4,0.582852
...,...,...
46399,46399,0.556573
46400,46400,0.500873
46401,46401,0.596213
46402,46402,0.697427


In [168]:
pred

array([1, 1, 1, ..., 1, 1, 1])

### * 제출 파일 

In [171]:
sample_submission = pd.read_csv(SUBMIT_PATH + 'sample_submission.csv')

gp_names = ["CB1_feats", "CB2_feats", "CB3_feats"]

# For every feature group write the probability file first, then the hard-prediction file
for name_ in gp_names:
    for kind_, values_by_group_ in (("proba", proba_by_variants), ("pred", pred_by_variants)):
        subm_df = sample_submission.assign(target=values_by_group_[name_])
        subm_df.to_csv(SUBMIT_PATH + f'CatBoost_model8_{name_}_{kind_}.csv', index=False)



In [172]:
sample_submission = pd.read_csv(SUBMIT_PATH + 'sample_submission.csv')

name_total_ = 'CB'

# Ensemble over all feature groups: probability file first, then the hard-prediction file
for kind_, values_ in (("proba", proba), ("pred", pred)):
    subm_df = sample_submission.assign(target=values_)
    subm_df.to_csv(SUBMIT_PATH + f'CatBoost_model8_{name_total_}_{kind_}.csv', index=False)



#### * Submission History 
------------------

In [None]:
res_ = """
* Threshold for each variants: {'CB1_feats': 0.344, 'CB2_feats': 0.338, 'CB3_feats': 0.348}
==================================================
 * feature-group : CB1_feats
 * scores_F1 - previous : [0.619827 0.618949 0.617994 0.617395 0.618025]
 * scores_F1 - adjusted : [0.683717 0.68235  0.681472 0.681878 0.679754]
 * The Avg. of F1 - previous : 0.6184  ->  adjusted : 0.6818              -> Test : 0.6818-894732
==================================================
 * feature-group : CB2_feats
 * scores_F1 - previous : [0.592573 0.593317 0.596608 0.593911 0.592455]
 * scores_F1 - adjusted : [0.670466 0.670582 0.667999 0.669412 0.668046]
 * The Avg. of F1 - previous : 0.5938  ->  adjusted : 0.6693              -> Test : 0.6710-332843	
==================================================
 * feature-group : CB3_feats
 * scores_F1 - previous : [0.656975 0.656238 0.656526 0.651421 0.651017]
 * scores_F1 - adjusted : [0.70721  0.706155 0.705719 0.706063 0.704415]
 * The Avg. of F1 - previous : 0.6544  ->  adjusted : 0.7059              -> Test : 0.6941-091509	
 """

# Print the manually recorded submission log held in `res_`
print(res_)


* Threshold for each variants: {'CB1_feats': 0.344, 'CB2_feats': 0.338, 'CB3_feats': 0.348}
 * feature-group : CB1_feats
 * scores_F1 - previous : [0.619827 0.618949 0.617994 0.617395 0.618025]
 * scores_F1 - adjusted : [0.683717 0.68235  0.681472 0.681878 0.679754]
 * The Avg. of F1 - previous : 0.6184  ->  adjusted : 0.6818              -> Test : 0.6818-894732
 * feature-group : CB2_feats
 * scores_F1 - previous : [0.592573 0.593317 0.596608 0.593911 0.592455]
 * scores_F1 - adjusted : [0.670466 0.670582 0.667999 0.669412 0.668046]
 * The Avg. of F1 - previous : 0.5938  ->  adjusted : 0.6693              -> Test : 0.6710-332843	
 * feature-group : CB3_feats
 * scores_F1 - previous : [0.656975 0.656238 0.656526 0.651421 0.651017]
 * scores_F1 - adjusted : [0.70721  0.706155 0.705719 0.706063 0.704415]
 * The Avg. of F1 - previous : 0.6544  ->  adjusted : 0.7059              -> Test : 0.6941-091509	
 
