## dataload

In [3]:
# Notebook-wide imports and global configuration.
import mlflow
import mlflow.lightgbm
from datetime import datetime
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'
import pandas as pd
import os
import random
import warnings
# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings('ignore')
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import numpy as np
import eli5
from eli5.sklearn import PermutationImportance
from shap import TreeExplainer, Explanation
from shap.plots import waterfall
import sys
# Extend the module search path so the `utils` import on the next line can be
# resolved from ../LGBM (relative to the kernel's working directory).
sys.path.append('../LGBM')
from utils import custom_train_test_split, lgbm_predict, post_slack, title2filename
from xgboost import XGBClassifier
# NOTE(review): presumably read by MLflow to fill in the run's user name — confirm.
os.environ["LOGNAME"]='cwj'
# Seed passed to custom_train_test_split and logged with every MLflow run.
SEED=13

## 1. Load data
data_dir = '/opt/ml/input/data'  # directory holding the input data
after_fe_path = os.path.join(data_dir, 'base_lgbm2.pkl')
df = pd.read_pickle(after_fe_path)

# Select rows by the value of the `kind` column.
train_df = df[df['kind'] == 'train']
test = df[df['kind'] == 'test']  # test data

# Split the training rows (custom_train_test_split is imported from utils).
train, valid1, valid2 = custom_train_test_split(train_df, ratio=0.7, seed=SEED)

# Test rows whose answer is known (answerCode != -1) are reused as extra training rows.
train2 = test[test['answerCode'] != -1]
train = pd.concat([train, train2])  # merge into the training set
train.shape, valid1.shape, valid2.shape, test.shape

# Separate the 'answerCode' target from the remaining columns for every split.
# The targets are kept as single-column DataFrames.
x_train, y_train = train.drop(columns='answerCode'), train[['answerCode']]
x_valid1, y_valid1 = valid1.drop(columns='answerCode'), valid1[['answerCode']]
x_valid2, y_valid2 = valid2.drop(columns='answerCode'), valid2[['answerCode']]

x_train.shape, y_train.shape, x_valid1.shape, y_valid1.shape, x_valid2.shape, y_valid2.shape

((1845539, 87), (680417, 87), (1974, 87), (260114, 87))

((1845539, 86), (1845539, 1), (680417, 86), (680417, 1), (1974, 86), (1974, 1))

In [None]:
# Show the column labels of the loaded DataFrame.
df.columns

In [4]:
# Connect to the MLflow tracking server
remote_server_uri="http://118.67.134.110:30005"
mlflow.set_tracking_uri(remote_server_uri)
client = mlflow.tracking.MlflowClient()
experiment_name = "XGBM_parameter_opt"
# Look the experiment up first and create it only when it does not exist yet.
# get_experiment_by_name returns None for an unknown name, so no exception has
# to be swallowed and genuine failures (server unreachable, auth, ...) surface
# with their original traceback instead of being masked.
experiment = client.get_experiment_by_name(experiment_name)
if experiment is None:
    experiment_id = client.create_experiment(experiment_name)
else:
    experiment_id = experiment.experiment_id
experiment_id

'625714156968354363'

In [4]:
# Preview the first three rows of the loaded DataFrame.
df.head(3)

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,kind,uidIdx,assIdx,testIdx,...,mid_category_solvesec_o,mid_category_solvesec_x,problem_count,tag_count,RepeatedTime,prior_KnowledgeTag_frequency,problem_position,solve_order,retest,solved_disorder
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,train,0,5354,975,...,470.865257,374.856236,7,2,0.0,0,0.142857,1,0,0
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,train,0,5355,975,...,470.865257,374.856236,7,2,1.386294,0,0.285714,2,0,0
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,train,0,5356,975,...,470.865257,374.856236,7,2,2.197225,1,0.428571,3,0,0


## XGBM 학습 및 예측

In [5]:
### Feature configuration
# Columns used as model inputs (the order matters: the categorical indices
# below are positions in this list).
xgbm_FEATS = [
    'uidIdx', 'assIdx', 'testIdx', 'KnowledgeTag',
    'user_correct_answer', 'user_total_answer',
    'big_category', 'mid_category', 'problem_num',
    'month', 'day', 'dayname', 'hour',
    'test_mean', 'test_sum', 'test_std',
    'tag_std', 'tag_mean', 'tag_sum',
    'solvesec_3600', 'time_category',
    'solvesec_cumsum', 'solvecumsum_category',
    'big_category_cumconut', 'big_category_user_cum_acc',
    'mid_category_cumconut', 'mid_category_user_cum_acc',
    'assess_count',
    'elo_assessmentItemID', 'elo_problem_num',
]

# Names of the features treated as categorical.
xgbm_cat_feats = [
    'uidIdx', 'assIdx', 'testIdx', 'KnowledgeTag',
    'big_category', 'mid_category', 'problem_num',
    'dayname', 'month', 'time_category', 'solvecumsum_category',
]

# Positions of the categorical features inside xgbm_FEATS, in xgbm_FEATS order.
xgbm_cat_feats_idx = [
    position
    for position, feature in enumerate(xgbm_FEATS)
    if feature in xgbm_cat_feats
]

# Keyword arguments for the estimator constructor.
xgbm_init_params = dict(learning_rate=0.023, n_estimators=500)

# Keyword arguments intended for fit().
xgbm_fit_params = dict(categorical_feature=xgbm_cat_feats_idx)

## optuna
### 하이퍼 파라미터
- optuna.trial.Trial.suggest_categorical() : 리스트 범위 내에서 값을 선택한다.
- optuna.trial.Trial.suggest_int() : 범위 내에서 정수형 값을 선택한다.<br>
"max_depth": trial.suggest_int("max_depth", 1, 20),
- optuna.trial.Trial.suggest_float() : 범위 내에서 소수형 값을 선택한다.<br>
"reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 9e-2),
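- optuna.trial.Trial.suggest_uniform() : 범위 내에서 균일분포 값을 선택한다. (Optuna 3.0부터 deprecated — suggest_float() 사용 권장)
- optuna.trial.Trial.suggest_discrete_uniform() : 범위 내에서 이산 균일분포 값을 선택한다. (Optuna 3.0부터 deprecated — suggest_float(..., step=q) 사용 권장)
- optuna.trial.Trial.suggest_loguniform() : 범위 내에서 로그 스케일 균일분포 값을 선택한다. (Optuna 3.0부터 deprecated — suggest_float(..., log=True) 사용 권장)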


In [13]:
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler

def objective(trial: Trial) -> float:
    """Optuna objective: train one XGBoost classifier and score it on valid2.

    Samples ``learning_rate`` and ``colsample_bytree`` from ``trial``, fits an
    ``XGBClassifier`` on the notebook-level ``x_train`` / ``y_train`` inside a
    dedicated MLflow run, logs the validation AUC and writes test predictions
    through ``lgbm_predict``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        ROC AUC of the predicted positive-class probabilities on valid2.
    """
    ### Feature configuration
    # Features used for training
    xgbm_FEATS = ['uidIdx','assIdx','testIdx','KnowledgeTag',
                  'user_correct_answer',
                  'user_total_answer',
                  'month','day','dayname','hour',
                  'test_mean','test_sum','test_std',
                  'tag_mean',
                  'tag_sum','tag_std',
                  'solvesec_3600', 'time_category',
                  'solvesec_cumsum', 'solvecumsum_category',

                  'big_category',
                  'big_category_cumconut',
                  'big_category_user_cum_acc', 'big_category_answer',
                  'big_category_cum_solvesec', 'big_category_mean_solvesec',
                  'bg_category_solvesec', 'bg_category_solvesec_o', 'bg_category_solvesec_x',

                  'mid_category',
                  'mid_category_cumconut', 'mid_category_user_cum_acc',

                  'assess_count',
                  'elo_assessmentItemID','elo_problem_num',
                  'ass_solvesec', 'ass_solvesec_o', 'ass_solvesec_x',

                  'problem_num',
                  'problem_num_solvesec',
                  'problem_num_solvesec_o','problem_num_solvesec_x']

    xgbm_init_params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "n_estimators" : 500,
        "colsample_bytree":trial.suggest_float("colsample_bytree", 0.5, 1),
        # Passed to the constructor rather than fit(): the fit() argument is
        # deprecated since XGBoost 1.6 and no longer accepted in 2.x.
        "eval_metric": "auc",
    }

    mlflow.xgboost.autolog()

    date = datetime.now().strftime('%m/%d %a')
    title=f"🌈({date})[XGBM Test] 피처: {len(xgbm_FEATS)}개"
    using_feats=", ".join(xgbm_FEATS)
    desc=f"사용된 피처({len(xgbm_FEATS)})\n{using_feats}"

    with mlflow.start_run(run_name=title, description=desc,experiment_id=experiment_id):
        model = XGBClassifier(**xgbm_init_params)
        model.fit(
            x_train[xgbm_FEATS],
            y_train,
            eval_set=[(x_valid2[xgbm_FEATS],y_valid2)],
            verbose=100,
        )
        # Score with positive-class probabilities. predict() returns hard 0/1
        # labels, which collapses the ROC curve to a single operating point and
        # makes the 0.5 threshold below meaningless.
        preds = model.predict_proba(x_valid2[xgbm_FEATS])[:, 1]
        acc = accuracy_score(y_valid2, np.where(preds >= 0.5, 1, 0))
        auc2 = roc_auc_score(y_valid2, preds)
        print(f'VALID AUC : {auc2}, ACC : {acc}\n')
        mlflow.log_metric("VAL AUC",auc2)
        mlflow.log_metric("VAL Set SEED",SEED)
        file_name = title2filename(title)
        # Called for its side effect; the returned predictions are not used here.
        # NOTE(review): lgbm_predict is defined in utils — presumably it writes
        # the test predictions to the given CSV file; confirm.
        lgbm_predict(test, model, xgbm_FEATS, f'{file_name}.csv')

    return auc2
        
# q = pd.Series(test_preds).plot(kind='kde')

In [None]:
from optuna.integration.mlflow import MLflowCallback

# TPE sampler with a fixed seed; the study maximizes the value returned by
# the objective.
sampler = TPESampler(seed=42)
study = optuna.create_study(
    sampler=sampler,
    direction="maximize",
    study_name="XGBM_parameter_opt",
)

print("Optimize a model...")


# Same tracking-server URI that the MLflow connection cell assigns; the callback
# factory below reads the URI through mlflow.get_tracking_uri() instead.
remote_server_uri="http://118.67.134.110:30005"
def make_mlflow_callback():
    """Build the Optuna MLflowCallback bound to the current MLflow tracking URI.

    Returns:
        An ``MLflowCallback`` configured with the metric name "VAL AUC" and
        ``create_experiment=True``.
    """
    callback_options = {
        "tracking_uri": mlflow.get_tracking_uri(),
        "metric_name": "VAL AUC",
        "create_experiment": True,
    }
    return MLflowCallback(**callback_options)

mlflow_cb = make_mlflow_callback()

# Run 50 trials of `objective`, invoking the MLflow callback after each one.
study.optimize(
    objective,
    n_trials=50,
    callbacks=[mlflow_cb],
    show_progress_bar=True,
)

# Report the best trial: the study direction is "maximize", so this is the
# highest value returned by the objective and the parameters that produced it.
best_trial = study.best_trial
print(f'study.best_trial.value : {best_trial.value}')
print(f'study.best_params : {study.best_params}')


[32m[I 2022-12-08 06:07:00,310][0m A new study created in memory with name: XGBM_parameter_opt[0m


Optimize a model...


  0%|          | 0/50 [00:00<?, ?it/s]

[0]	validation_0-auc:0.80410
[100]	validation_0-auc:0.82263
[200]	validation_0-auc:0.82773
[300]	validation_0-auc:0.82998
[400]	validation_0-auc:0.83130


In [7]:
# Permutation importance of every feature on the valid2 split, scored by ROC AUC.
# NOTE(review): `model` must already exist in the kernel namespace. No visible
# cell defines it at top level (inside `objective` it is a local variable) —
# confirm where it comes from before relying on Restart & Run All.
permutation_scorer = PermutationImportance(
    model,
    scoring="roc_auc",
    n_iter=1,
    random_state=42,
    cv=None,
    refit=False,
)
perm = permutation_scorer.fit(x_valid2[xgbm_FEATS], y_valid2)
eli5.show_weights(perm, feature_names=xgbm_FEATS, top=len(xgbm_FEATS))

Weight,Feature
0.2122  ± 0.0000,elo_assessmentItemID
0.0622  ± 0.0000,solvesec_3600
0.0419  ± 0.0000,mid_category_user_cum_acc
0.0181  ± 0.0000,big_category_user_cum_acc
0.0161  ± 0.0000,assIdx
0.0144  ± 0.0000,elo_problem_num
0.0028  ± 0.0000,test_mean
0.0012  ± 0.0000,big_category_cumconut
0.0011  ± 0.0000,uidIdx
0.0010  ± 0.0000,user_correct_answer
