In [22]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score, roc_curve

import sys, os
root_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(root_dir)
from dataloader.data_loader import data_loader  # 경로를 dataloader 폴더 기준으로 지정
from model.train import train_model, test, final_train_model

# Data Load

In [46]:
df_select_ma_shift = pd.read_csv('../data/train_yh.csv')

# Model Training

## 상승,하락 예측
- 상승,하락 예측 prob을 새로운 변수로 활용

### 0. train valid split

In [47]:
# 타겟 변수 생성 (예: 4개 클래스로 분류)
df_select_ma_shift['target_direction'] = np.where(df_select_ma_shift['target'] >= 2, 1, 0)

In [48]:
train_df = df_select_ma_shift[df_select_ma_shift._type == 'train'].drop(columns = '_type')
test_df = df_select_ma_shift[df_select_ma_shift._type == 'test'].drop(columns = '_type')

In [49]:
drop_colunm = ["target", "ID", "target_direction"]
target_colunm = 'target_direction'
x_train_random, x_valid_random, y_train_random, y_valid_random = data_split("random", train_df, drop_colunm, target_colunm) # random split
x_train_ts, x_valid_ts, y_train_ts, y_valid_ts = data_split("time", train_df, drop_colunm, target_colunm) # time series split

### 1. XGBoost
- random_accuracy = 0.5199
- ts_accuracy = 0.5403

In [52]:
import xgboost as xgb
# XGBoost params
xgb_params = {
    "objective": "binary:logistic",
    "max_depth": 6,
    "learning_rate": 0.02,
    "n_estimators": 100,
    "random_state": 42,
    'verbosity': 0
}

In [53]:
# random split version
train_data, valid_data = _Dataset('XGB', x_train_random, x_valid_random, y_train_random, y_valid_random)
xgb_random, y_pred_random, accuracy_random, auroc_random = train_model('XGB', xgb_params, x_train_random, x_valid_random, y_train_random, y_valid_random, train_data, valid_data)

# time series split version
train_data, valid_data = _Dataset('XGB', x_train_ts, x_valid_ts, y_train_ts, y_valid_ts)
xgb_ts, y_pred_ts, accuracy_ts, auroc_ts = train_model('XGB', xgb_params,x_train_ts, x_valid_ts, y_train_ts, y_valid_ts, train_data, valid_data)

[0]	validation_0-logloss:0.69258
[1]	validation_0-logloss:0.69227
[2]	validation_0-logloss:0.69195
[3]	validation_0-logloss:0.69178
[4]	validation_0-logloss:0.69140
[5]	validation_0-logloss:0.69130
[6]	validation_0-logloss:0.69120
[7]	validation_0-logloss:0.69093
[8]	validation_0-logloss:0.69076
[9]	validation_0-logloss:0.69059
[10]	validation_0-logloss:0.69051
[11]	validation_0-logloss:0.69036
[12]	validation_0-logloss:0.69006
[13]	validation_0-logloss:0.68994
[14]	validation_0-logloss:0.68992
[15]	validation_0-logloss:0.68954
[16]	validation_0-logloss:0.68952
[17]	validation_0-logloss:0.68961
[18]	validation_0-logloss:0.68939
[19]	validation_0-logloss:0.68904
[20]	validation_0-logloss:0.68933
[21]	validation_0-logloss:0.68930
[22]	validation_0-logloss:0.68944
[23]	validation_0-logloss:0.68945
[24]	validation_0-logloss:0.68957
[25]	validation_0-logloss:0.68989
[26]	validation_0-logloss:0.69009
[27]	validation_0-logloss:0.69015
[28]	validation_0-logloss:0.69033
[29]	validation_0-loglos

In [54]:
print(f"random split - acc: {accuracy_random}")
print(f"ts split - acc: {accuracy_ts}")

random split - acc: 0.5199771689497716
ts split - acc: 0.5403005464480874


### 2. LGBM
- random_accuracy = 0.5331
- ts_accuracy = 0.5389

In [55]:
# lgb params
lgb_params = {
        "boosting_type": "gbdt",
        "objective": "binary",
        "metric": "binary_logloss",
        "num_leaves": 30,
        "learning_rate": 0.02,
        "n_estimators": 30,
        "random_state": 42,
        'verbose': -1
    }

In [56]:
# random split version
train_data, valid_data = _Dataset('LGBM', x_train_random, x_valid_random, y_train_random, y_valid_random)
lgb_random, y_pred_random, accuracy_random, auroc_random = train_model('LGBM', lgb_params, x_train_random, x_valid_random, y_train_random, y_valid_random, train_data, valid_data)

# time series split version
train_data, valid_data = _Dataset('LGBM', x_train_ts, x_valid_ts, y_train_ts, y_valid_ts)
lgb_ts, y_pred_ts, accuracy_ts, auroc_ts = train_model('LGBM', lgb_params,x_train_ts, x_valid_ts, y_train_ts, y_valid_ts, train_data, valid_data)

Error calculating AUC: y should be a 1d array, got an array of shape (1752, 2) instead.
Error calculating AUC: y should be a 1d array, got an array of shape (1464, 2) instead.


In [57]:
print(f"random split - acc: {accuracy_random}")
print(f"ts split - acc: {accuracy_ts}")

random split - acc: 0.5331050228310502
ts split - acc: 0.5389344262295082


### 3. Catboost (최종)
- random_accuracy = 0.5319
- ts_accuracy = 0.5648

In [58]:
# CatBoost params
cat_params = {
    'iterations': 200,
    'learning_rate': 0.02,
    'depth': 6,
    'loss_function': 'Logloss',
    'eval_metric': 'Logloss',
    'random_seed': 42,
    'verbose': 0
}

In [59]:
# random split version
train_data, valid_data = _Dataset('CatBoost', x_train_random, x_valid_random, y_train_random, y_valid_random)
cat_random, y_pred_random, accuracy_random, auroc_random = train_model('CatBoost', cat_params, x_train_random, x_valid_random, y_train_random, y_valid_random, train_data, valid_data)

# time series split version
train_data, valid_data = _Dataset('CatBoost', x_train_ts, x_valid_ts, y_train_ts, y_valid_ts)
cat_ts, y_pred_ts, accuracy_ts, auroc_ts = train_model('CatBoost', cat_params,x_train_ts, x_valid_ts, y_train_ts, y_valid_ts, train_data, valid_data)

Error calculating AUC: y should be a 1d array, got an array of shape (1752, 2) instead.
Error calculating AUC: y should be a 1d array, got an array of shape (1464, 2) instead.


In [61]:
print(f"random split - acc: {accuracy_random}")
print(f"ts split - acc: {accuracy_ts}")

random split - acc: 0.5319634703196348
ts split - acc: 0.5648907103825137


### dir_prob 변수 생성

In [62]:
# random split 버전 
dir_prob_train = cat_random.predict_proba(x_train_random)[:,1]
dir_prob_valid = cat_random.predict_proba(x_valid_random)[:,1]
dir_prob_random = np.concatenate((dir_prob_train,dir_prob_valid))

In [63]:
# 테스트셋용 
x_test_full = test_df.drop(columns = ['ID', 'target', 'target_direction'])
dir_prob_random_test = cat_random.predict_proba(x_test_full)[:,1] # random 용

In [70]:
# train_df['dir_prob_ts'] = dir_prob_ts
train_df['dir_prob'] = dir_prob_random
test_df['dir_prob'] = dir_prob_random_test

## 최종 예측

### 0. train valid split

In [71]:
drop_colunm = ["target", "ID", "target_direction"]
target_colunm = 'target'
x_train_cv, x_valid_cv, y_train_cv, y_valid_cv = data_split("randomcv", train_df, drop_colunm, target_colunm)

### 1. LGBM
- cv accuracy = 0.4296

In [82]:
lgb_params = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "metric": "multi_logloss",
    "num_class": 4,
    "num_leaves": 97,
    "learning_rate": 0.015732043600075817,
    "n_estimators": 52,
    "random_state": 42,
    "verbose": 0,
}

In [74]:
from tqdm import tqdm
acc_lst_cat = []
auroc_lst_cat = []
for i in tqdm(range(5)):
    x_train, x_valid, y_train, y_valid = x_train_cv[i], x_valid_cv[i], y_train_cv[i], y_valid_cv[i]
    train_data, valid_data = _Dataset('LGBM', x_train, x_valid, y_train, y_valid)
    catboost_model, y_valid_pred, accuracy, auroc = train_model('LGBM', lgb_params, x_train, x_valid, y_train, y_valid, train_data, valid_data)
    acc_lst_cat.append(accuracy)
    auroc_lst_cat.append(auroc)

100%|██████████| 5/5 [02:50<00:00, 34.07s/it]


In [75]:
print(f"- cv acc: {np.mean(acc_lst_cat)}, auroc: {np.mean(auroc_lst_cat)}")

- cv acc: 0.42968036529680365, auroc: 0.6142678576181666


### 2. XGBoost (최종)
- cv accuracy = 0.4496

In [76]:
xgb_params = {
        "learning_rate":0.03698033367974952,
        "max_depth" : 3,
        "n_estimators" : 50,
        "num_class" : 4,
        "random_state": 42,                          
        "verbosity": 0,
        "objective": "multi:softprob",                
        "num_class": 4,  
}

In [77]:
from tqdm import tqdm
acc_lst_cat = []
auroc_lst_cat = []
for i in tqdm(range(5)):
    x_train, x_valid, y_train, y_valid = x_train_cv[i], x_valid_cv[i], y_train_cv[i], y_valid_cv[i]
    train_data, valid_data = _Dataset('XGB', x_train, x_valid, y_train, y_valid)
    catboost_model, y_valid_pred, accuracy, auroc = train_model('XGB', xgb_params, x_train, x_valid, y_train, y_valid, train_data, valid_data)
    acc_lst_cat.append(accuracy)
    auroc_lst_cat.append(auroc)

  0%|          | 0/5 [00:00<?, ?it/s]

[0]	validation_0-mlogloss:1.37489
[1]	validation_0-mlogloss:1.36398
[2]	validation_0-mlogloss:1.35368
[3]	validation_0-mlogloss:1.34393
[4]	validation_0-mlogloss:1.33473
[5]	validation_0-mlogloss:1.32581
[6]	validation_0-mlogloss:1.31719
[7]	validation_0-mlogloss:1.30915
[8]	validation_0-mlogloss:1.30131
[9]	validation_0-mlogloss:1.29377
[10]	validation_0-mlogloss:1.28675
[11]	validation_0-mlogloss:1.28000
[12]	validation_0-mlogloss:1.27365
[13]	validation_0-mlogloss:1.26760
[14]	validation_0-mlogloss:1.26165
[15]	validation_0-mlogloss:1.25602
[16]	validation_0-mlogloss:1.25055
[17]	validation_0-mlogloss:1.24541
[18]	validation_0-mlogloss:1.24056
[19]	validation_0-mlogloss:1.23581
[20]	validation_0-mlogloss:1.23105
[21]	validation_0-mlogloss:1.22681
[22]	validation_0-mlogloss:1.22260
[23]	validation_0-mlogloss:1.21868
[24]	validation_0-mlogloss:1.21488
[25]	validation_0-mlogloss:1.21112
[26]	validation_0-mlogloss:1.20767
[27]	validation_0-mlogloss:1.20439
[28]	validation_0-mlogloss:1.2

 20%|██        | 1/5 [00:14<00:56, 14.02s/it]

[0]	validation_0-mlogloss:1.37460
[1]	validation_0-mlogloss:1.36371
[2]	validation_0-mlogloss:1.35322
[3]	validation_0-mlogloss:1.34329
[4]	validation_0-mlogloss:1.33375
[5]	validation_0-mlogloss:1.32485
[6]	validation_0-mlogloss:1.31618
[7]	validation_0-mlogloss:1.30787
[8]	validation_0-mlogloss:1.30004
[9]	validation_0-mlogloss:1.29250
[10]	validation_0-mlogloss:1.28528
[11]	validation_0-mlogloss:1.27838
[12]	validation_0-mlogloss:1.27197
[13]	validation_0-mlogloss:1.26564
[14]	validation_0-mlogloss:1.25951
[15]	validation_0-mlogloss:1.25371
[16]	validation_0-mlogloss:1.24813
[17]	validation_0-mlogloss:1.24287
[18]	validation_0-mlogloss:1.23789
[19]	validation_0-mlogloss:1.23304
[20]	validation_0-mlogloss:1.22837
[21]	validation_0-mlogloss:1.22383
[22]	validation_0-mlogloss:1.21952
[23]	validation_0-mlogloss:1.21542
[24]	validation_0-mlogloss:1.21159
[25]	validation_0-mlogloss:1.20769
[26]	validation_0-mlogloss:1.20404
[27]	validation_0-mlogloss:1.20040
[28]	validation_0-mlogloss:1.1

 40%|████      | 2/5 [00:27<00:40, 13.43s/it]

[0]	validation_0-mlogloss:1.37509
[1]	validation_0-mlogloss:1.36448
[2]	validation_0-mlogloss:1.35442
[3]	validation_0-mlogloss:1.34471
[4]	validation_0-mlogloss:1.33550
[5]	validation_0-mlogloss:1.32696
[6]	validation_0-mlogloss:1.31864
[7]	validation_0-mlogloss:1.31044
[8]	validation_0-mlogloss:1.30286
[9]	validation_0-mlogloss:1.29553
[10]	validation_0-mlogloss:1.28859
[11]	validation_0-mlogloss:1.28192
[12]	validation_0-mlogloss:1.27561
[13]	validation_0-mlogloss:1.26950
[14]	validation_0-mlogloss:1.26362
[15]	validation_0-mlogloss:1.25814
[16]	validation_0-mlogloss:1.25289
[17]	validation_0-mlogloss:1.24779
[18]	validation_0-mlogloss:1.24280
[19]	validation_0-mlogloss:1.23816
[20]	validation_0-mlogloss:1.23370
[21]	validation_0-mlogloss:1.22951
[22]	validation_0-mlogloss:1.22544
[23]	validation_0-mlogloss:1.22143
[24]	validation_0-mlogloss:1.21763
[25]	validation_0-mlogloss:1.21395
[26]	validation_0-mlogloss:1.21053
[27]	validation_0-mlogloss:1.20705
[28]	validation_0-mlogloss:1.2

 60%|██████    | 3/5 [00:42<00:28, 14.19s/it]

[0]	validation_0-mlogloss:1.37505
[1]	validation_0-mlogloss:1.36461
[2]	validation_0-mlogloss:1.35429
[3]	validation_0-mlogloss:1.34453
[4]	validation_0-mlogloss:1.33547
[5]	validation_0-mlogloss:1.32669
[6]	validation_0-mlogloss:1.31817
[7]	validation_0-mlogloss:1.31035
[8]	validation_0-mlogloss:1.30268
[9]	validation_0-mlogloss:1.29544
[10]	validation_0-mlogloss:1.28848
[11]	validation_0-mlogloss:1.28180
[12]	validation_0-mlogloss:1.27543
[13]	validation_0-mlogloss:1.26919
[14]	validation_0-mlogloss:1.26332
[15]	validation_0-mlogloss:1.25772
[16]	validation_0-mlogloss:1.25235
[17]	validation_0-mlogloss:1.24741
[18]	validation_0-mlogloss:1.24244
[19]	validation_0-mlogloss:1.23767
[20]	validation_0-mlogloss:1.23331
[21]	validation_0-mlogloss:1.22906
[22]	validation_0-mlogloss:1.22491
[23]	validation_0-mlogloss:1.22090
[24]	validation_0-mlogloss:1.21701
[25]	validation_0-mlogloss:1.21343
[26]	validation_0-mlogloss:1.20989
[27]	validation_0-mlogloss:1.20644
[28]	validation_0-mlogloss:1.2

 80%|████████  | 4/5 [00:54<00:13, 13.47s/it]

[0]	validation_0-mlogloss:1.37476
[1]	validation_0-mlogloss:1.36386
[2]	validation_0-mlogloss:1.35348
[3]	validation_0-mlogloss:1.34360
[4]	validation_0-mlogloss:1.33424
[5]	validation_0-mlogloss:1.32523
[6]	validation_0-mlogloss:1.31651
[7]	validation_0-mlogloss:1.30831
[8]	validation_0-mlogloss:1.30057
[9]	validation_0-mlogloss:1.29303
[10]	validation_0-mlogloss:1.28579
[11]	validation_0-mlogloss:1.27907
[12]	validation_0-mlogloss:1.27237
[13]	validation_0-mlogloss:1.26606
[14]	validation_0-mlogloss:1.26008
[15]	validation_0-mlogloss:1.25424
[16]	validation_0-mlogloss:1.24873
[17]	validation_0-mlogloss:1.24349
[18]	validation_0-mlogloss:1.23846
[19]	validation_0-mlogloss:1.23368
[20]	validation_0-mlogloss:1.22896
[21]	validation_0-mlogloss:1.22457
[22]	validation_0-mlogloss:1.22037
[23]	validation_0-mlogloss:1.21613
[24]	validation_0-mlogloss:1.21221
[25]	validation_0-mlogloss:1.20850
[26]	validation_0-mlogloss:1.20497
[27]	validation_0-mlogloss:1.20157
[28]	validation_0-mlogloss:1.1

100%|██████████| 5/5 [01:06<00:00, 13.26s/it]


In [78]:
print(f"- cv acc: {np.mean(acc_lst_cat)}, auroc: {np.mean(auroc_lst_cat)}")

- cv acc: 0.44965753424657534, auroc: 0.6286540347800857


### 3. Catboost
- cv accuracy = 0.4474

In [79]:
cat_params = {
    "iterations": 120,  # 원래 1000이었지만, 최적화 결과(120)를 고려하여 중간값으로 설정
    "learning_rate": 0.1284644554526709,  # 최적화 결과 사용
    "depth": 4,  # 최적화 결과 사용
    "loss_function": "MultiClass",  # 원래 설정 유지
    "eval_metric": "AUC",  # 원래 설정 유지
    "random_seed": 42,  # 원래 설정 유지
    "verbose": 100,  # 원래 설정 유지
    "l2_leaf_reg": 5.234014807063696,  # 최적화 결과 사용
    "bagging_temperature": 1,  # 원래 설정 유지
    "cat_features": [],  # 원래 설정 유지
    "early_stopping_rounds": 50  # 새로 추가, 조기 종료를 위해
}

In [80]:
from tqdm import tqdm
acc_lst_cat = []
auroc_lst_cat = []
for i in tqdm(range(5)):
    x_train, x_valid, y_train, y_valid = x_train_cv[i], x_valid_cv[i], y_train_cv[i], y_valid_cv[i]
    train_data, valid_data = _Dataset('CatBoost', x_train, x_valid, y_train, y_valid)
    catboost_model, y_valid_pred, accuracy, auroc = train_model('CatBoost', cat_params, x_train, x_valid, y_train, y_valid, train_data, valid_data)
    acc_lst_cat.append(accuracy)
    auroc_lst_cat.append(auroc)

100%|██████████| 5/5 [01:11<00:00, 14.35s/it]


In [81]:
print(f"- cv acc: {np.mean(acc_lst_cat)}, auroc: {np.mean(auroc_lst_cat)}")

- cv acc: 0.4474885844748859, auroc: 0.6434526726067679


# Inference

In [83]:
x_train_full = train_df.drop(["target", "ID", "target_direction"], axis = 1)
y_train_full = train_df["target"].astype(int)
train_data = xgb.DMatrix(x_train_full, label=y_train_full)
xgb_model = final_train_model("XGB", xgb_params, x_train_full, y_train_full, train_data)

In [85]:
submission_df = pd.read_csv('../data/sample_submission.csv')
submission_df = test('XGB', drop_colunm = ["target", "ID", "target_direction"], model = xgb_model, test_df = test_df, submission_df = submission_df)

In [86]:
submission_df.target.value_counts()

target
2    1762
1     924
0      73
3      33
Name: count, dtype: int64

In [140]:
submission_df.to_csv("output/0926_cat(dir_pred)_xgb__hypertune.csv", index=False)