In [5]:
# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

import json, os
import gc

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve
from catboost import CatBoostClassifier, Pool


from utils import seed_everything, get_config, print_score
from preprocess import feature_engineering

# Root directory of the project
PROJECT_PATH = "/opt/ml/online_retail_prediction"

# Directory where the input data is stored
BASE_DATA_PATH = "/opt/ml/online_retail_prediction/input"


class CFG:
    """Experiment configuration, held as plain class attributes.

    The setup cell of this notebook passes this class to ``get_config`` before
    any other code reads it.
    """
    # NOTE(review): the settings from `objective` through `n_jobs` look like
    # LightGBM-style options; in the visible code only `n_estimators` and
    # `early_stopping_rounds` are read (by the CatBoost parameter dicts) —
    # confirm whether the others are still needed.
    objective = "binary"
    boosting_type = "gbdt"
    metric = "auc"
    feature_fraction = 0.8
    bagging_fraction = 0.8
    bagging_freq = 1
    n_estimators = 10000
    early_stopping_rounds = 100
    verbose = -1
    n_jobs = -1

    # Passed to feature_engineering in the (currently commented-out) dataset build step
    predict_year_month = '2011-12'
    seed = 42 # random seed (seed_everything, StratifiedKFold and CatBoost all read it)
    total_thres = 300 # total threshold, also passed to feature_engineering
    folds = 10 # number of folds for k-fold cross-validation
    model = 'CatBoost' # model name
    description = 'Modeling' # free-text description of the run

    train_data_path = os.path.join(BASE_DATA_PATH, 'train.csv') # training CSV file
    sample_submission_path = os.path.join(BASE_DATA_PATH, 'sample_submission.csv') # sample submission CSV file
    config_path = './config/config.json'
    docs_path = os.path.join(PROJECT_PATH, 'docs') # where results and visualizations are saved
    model_path = os.path.join(PROJECT_PATH, 'models') # where trained models are saved


def get_data():
    """Load the raw training CSV located at ``CFG.train_data_path``.

    Returns:
        pandas.DataFrame: the file contents, with the ``order_date`` column
        parsed as datetimes.
    """
    date_columns = ['order_date']
    return pd.read_csv(CFG.train_data_path, parse_dates=date_columns)


def get_from_dataset():
    """Load the previously saved train/test datasets from ``<docs_path>/dataset``.

    Returns:
        tuple: ``(train, test, y, features)`` where ``train`` and ``test`` are
        the DataFrames read from ``train.csv`` and ``test.csv``, ``y`` is the
        ``label`` column of ``train``, and ``features`` is the Index of the
        ``train`` columns left after removing ``customer_id``, ``label`` and
        ``year_month``.
    """
    dataset_dir = os.path.join(CFG.docs_path, 'dataset')
    train = pd.read_csv(os.path.join(dataset_dir, 'train.csv'))
    test = pd.read_csv(os.path.join(dataset_dir, 'test.csv'))

    # Identifier, target and period columns are not model inputs.
    non_feature_cols = ['customer_id', 'label', 'year_month']
    features = train.drop(columns=non_feature_cols).columns
    y = train['label']

    return train, test, y, features


def train_model(x_tr, y_tr, x_val, y_val, model_params=None, categorical_features=None):
    """Fit a CatBoostClassifier on one training split, validating on another.

    Args:
        x_tr, y_tr: training features and labels.
        x_val, y_val: validation features and labels, passed as ``eval_set``.
        model_params: keyword arguments for ``CatBoostClassifier``. When
            ``None``, a default set built from ``CFG`` is used.
        categorical_features: forwarded to ``fit`` as ``cat_features``;
            ``None`` (the default) or an empty list means no categorical columns.

    Returns:
        The fitted ``CatBoostClassifier``.
    """
    # Identity check: `== None` would invoke the argument's own __eq__.
    if model_params is None:
        model_params = {
            "n_estimators": CFG.n_estimators,
            "early_stopping_rounds": CFG.early_stopping_rounds,
            "eval_metric": "AUC",
            "loss_function": "Logloss",
            "random_seed": CFG.seed,
            "use_best_model": True,
            "verbose": False,
            # Candidate settings that were tried and left disabled:
            # "learning_rate": 0.1,
            # "bootstrap_type": "Bernoulli",
            # 'learning_rate': 0.03740217342845925,
            # 'depth': 8,
            # 'subsample': 0.8725078356449699,
            # 'rsm': 0.15,
        }

    model = CatBoostClassifier(**model_params)
    model.fit(
        x_tr, y_tr,
        eval_set=(x_val, y_val),
        # Passed here as well so it applies even when caller-supplied
        # model_params do not include it.
        use_best_model=True,
        cat_features=categorical_features,
    )

    return model


def _rows_at(obj, positions):
    """Select rows by integer position from a pandas object or a plain array."""
    return obj.iloc[positions] if hasattr(obj, 'iloc') else obj[positions]


def make_cat_oof_prediction(train, y, test, features, categorical_features, model_params, folds=10):
    """Train CatBoost with stratified k-fold CV and produce OOF and test predictions.

    For every fold a model is trained via ``train_model``, the validation rows
    are predicted (out-of-fold), and the test set is predicted; test
    predictions are averaged over folds. The OOF ROC curve is saved to
    ``<CFG.docs_path>/ROC_curce_oof.png``.

    Args:
        train: DataFrame holding the training rows (must contain ``features``).
        y: binary labels aligned row-for-row with ``train``.
        test: DataFrame holding the test rows (must contain ``features``).
        features: column names used as model inputs.
        categorical_features: forwarded to ``train_model``.
        model_params: CatBoost parameters forwarded to ``train_model``
            (``None`` selects its defaults).
        folds: number of stratified folds.

    Returns:
        tuple: ``(y_oof, test_preds, fi, score)`` — the out-of-fold
        probabilities, the fold-averaged test probabilities, a DataFrame of
        per-fold feature importances plus their mean (``importance``), and the
        mean of the per-fold validation AUCs.
    """
    x_train = train[features]
    x_test = test[features]

    # Fold-averaged predictions for the test set
    test_preds = np.zeros(x_test.shape[0])

    # Out-of-fold validation predictions
    y_oof = np.zeros(x_train.shape[0])

    # Running mean of the per-fold validation AUC
    score = 0

    # Per-fold feature importances
    fi = pd.DataFrame()
    fi['feature'] = features

    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=CFG.seed)

    for fold, (tr_idx, val_idx) in enumerate(skf.split(x_train, y)):
        # split() yields integer positions, so index positionally; label-based
        # lookup would pick wrong or missing rows when the index is not 0..n-1.
        x_tr, x_val = x_train.iloc[tr_idx], x_train.iloc[val_idx]
        y_tr, y_val = _rows_at(y, tr_idx), _rows_at(y, val_idx)

        print(f'fold: {fold+1}, x_tr.shape: {x_tr.shape}, x_val.shape: {x_val.shape}')

        # Train the CatBoost model for this fold
        model = train_model(x_tr, y_tr, x_val, y_val, model_params, categorical_features)

        # Positive-class probabilities for the validation rows
        val_preds = np.array(model.predict_proba(x_val))[:, 1]

        # y_oof is a NumPy array, so positional assignment is correct here
        y_oof[val_idx] = val_preds

        # Report this fold's validation score
        print(f"Fold {fold + 1}")
        print_score(y_val, val_preds)
        print('-'*80)

        # Accumulate the mean validation AUC across folds
        score += roc_auc_score(y_val, val_preds) / folds

        # Accumulate the fold-averaged test prediction
        test_preds += np.array(model.predict_proba(x_test))[:, 1] / folds

        # Store this fold's feature importances
        fi[f'fold_{fold+1}'] = model.get_feature_importance()

        del x_tr, x_val, y_tr, y_val
        gc.collect()

    print(f"\nMean AUC = {score}") # mean of the per-fold validation scores
    print(f"OOF AUC = {roc_auc_score(y, y_oof)}") # score over all out-of-fold predictions

    # ROC curve of the out-of-fold predictions. A dedicated figure is used and
    # closed after saving so repeated calls (e.g. one per tuning trial) neither
    # draw on top of each other nor keep figures alive. The curve is therefore
    # written to the PNG only and no longer rendered inline.
    fpr, tpr, _ = roc_curve(y, y_oof)

    fig, ax = plt.subplots()
    ax.plot(fpr, tpr, linewidth=2)
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlabel('FPR = 1 - TNR')
    ax.set_ylabel('TPR = Recall')
    # File name kept as-is so anything reading the existing artifact still finds it.
    fig.savefig(os.path.join(CFG.docs_path, 'ROC_curce_oof.png'))
    plt.close(fig)

    # Average the per-fold feature importances
    fi_cols = [col for col in fi.columns if 'fold_' in col]
    fi['importance'] = fi[fi_cols].mean(axis=1)

    return y_oof, test_preds, fi, score


def inference(test_preds):
    """Write the submission file from the given test predictions.

    Reads the sample submission at ``CFG.sample_submission_path``, sets its
    ``probability`` column to ``test_preds`` and saves the result (without the
    index) to ``<CFG.docs_path>/result/output.csv``.
    """
    output_path = os.path.join(CFG.docs_path ,'result', 'output.csv')

    # Start from the sample submission and fill in the predicted probabilities
    submission = pd.read_csv(CFG.sample_submission_path).assign(probability=test_preds)

    submission.to_csv(output_path, index=False)

In [6]:
# Parse config.json (get_config comes from utils and receives the CFG class)
get_config(CFG)

# Fix the random seed so model training is reproducible
seed_everything(CFG.seed)

# Load train.csv
data = get_data()

# Run feature engineering and save the resulting datasets.
# Disabled: the saved datasets are loaded below instead, so `data` is only
# consumed by these commented-out lines.
# train_data, test_data = feature_engineering(data, CFG.predict_year_month, CFG.total_thres)
# train_data.to_csv(os.path.join(CFG.docs_path ,'dataset', 'train.csv'), index=False)
# test_data.to_csv(os.path.join(CFG.docs_path ,'dataset', 'test.csv'), index=False)
# Categorical feature columns handed to CatBoost (none)
cate_cols = []

# Load the previously saved datasets
train, test, y, features = get_from_dataset()

# Train and predict with CatBoost using out-of-fold cross-validation
# (call shown with the function's current signature and 4-value return)
# y_oof, test_preds, fi, score = make_cat_oof_prediction(train, y, test, features, categorical_features=cate_cols, model_params=None, folds=CFG.folds)

# Build the submission file
# inference(test_preds)

## Define the Optuna Objective Function

In [18]:
def objective(trial):
    """Optuna objective: mean per-fold validation AUC for sampled CatBoost settings.

    Samples ``learning_rate`` and ``subsample`` from the trial, runs the
    out-of-fold CatBoost training on the notebook-level ``train``, ``y``,
    ``test``, ``features`` and ``cate_cols``, and returns the resulting score.

    Args:
        trial: the ``optuna.trial.Trial`` supplying hyperparameter suggestions.

    Returns:
        The mean validation AUC across ``CFG.folds`` folds (to be maximized).
    """
    model_params = {
        "n_estimators": CFG.n_estimators,
        "early_stopping_rounds": CFG.early_stopping_rounds,
        "eval_metric": 'AUC',
        "loss_function": 'Logloss',
        "random_seed": CFG.seed,
        "use_best_model": True,
        "verbose": False,
        # Bootstrap type used together with the sampled `subsample` rate
        "bootstrap_type": "Bernoulli",

        # suggest_float replaces the deprecated suggest_uniform; with only
        # low/high given it samples uniformly from the same range.
        "learning_rate": trial.suggest_float('learning_rate', 0.03, 0.07),  # uniform in [0.03, 0.07]
        "subsample": trial.suggest_float('subsample', 0.5, 1.00),  # uniform in [0.5, 1.0]

        # Other search dimensions that were tried and left disabled:
        # "max_leaves": trial.suggest_int('max_leaves', 64, 256),
        # "max_bin": trial.suggest_int('max_bin', 128, 256),
        # "depth": trial.suggest_int('depth', 6, 8),
        # "rsm": trial.suggest_float('rsm', 0.5, 1.00),
        # "l2_leaf_reg": trial.suggest_float('l2_leaf_reg', 0.1, 5.0),
        # "bagging_temperature": trial.suggest_int('bagging_temperature', 0, 1000),
    }

    # Only the mean fold score is needed; predictions and importances are discarded.
    _, _, _, score = make_cat_oof_prediction(
        train, y, test, features,
        folds=CFG.folds,
        model_params=model_params,
        categorical_features=cate_cols,
    )

    return score

## Use optuna to maximize validation score

In [None]:
import optuna

# Seed the sampler so the hyperparameter search itself is repeatable; TPE is
# the sampler create_study uses by default, so only the seeding changes.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=CFG.seed),
)
study.optimize(objective, n_trials=100)  # search hyperparameters for 100 trials

## Get Tuning Results

In [20]:
# Show the 30 best trials, highest objective value first
(
    study.trials_dataframe()
    .sort_values(by="value", ascending=False)
    .head(30)
)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_learning_rate,params_subsample,state
59,59,0.840993,2021-04-22 11:08:30.978762,2021-04-22 11:10:43.677009,0 days 00:02:12.698247,0.035713,0.986727,COMPLETE
31,31,0.840603,2021-04-22 10:24:56.651362,2021-04-22 10:27:00.916513,0 days 00:02:04.265151,0.040953,0.711796,COMPLETE
71,71,0.839367,2021-04-22 11:29:11.293470,2021-04-22 11:30:57.165123,0 days 00:01:45.871653,0.034692,0.942298,COMPLETE
27,27,0.839082,2021-04-22 10:19:29.986263,2021-04-22 10:20:44.171772,0 days 00:01:14.185509,0.069721,0.99033,COMPLETE
0,0,0.839032,2021-04-22 09:36:52.640896,2021-04-22 09:38:33.504970,0 days 00:01:40.864074,0.042751,0.705116,COMPLETE
78,78,0.838743,2021-04-22 11:41:12.808810,2021-04-22 11:42:58.019583,0 days 00:01:45.210773,0.037719,0.997507,COMPLETE
54,54,0.838735,2021-04-22 11:00:38.988260,2021-04-22 11:02:01.274477,0 days 00:01:22.286217,0.044891,0.953997,COMPLETE
81,81,0.838724,2021-04-22 11:46:03.494658,2021-04-22 11:47:40.568273,0 days 00:01:37.073615,0.039192,0.949893,COMPLETE
63,63,0.838671,2021-04-22 11:16:03.105071,2021-04-22 11:17:44.824586,0 days 00:01:41.719515,0.034372,0.983635,COMPLETE
53,53,0.838427,2021-04-22 10:58:55.137436,2021-04-22 11:00:38.987351,0 days 00:01:43.849915,0.040578,0.849717,COMPLETE
