해당 파일은 최종 제출 목록 중 Ensemble_V2에 대한 output을 출력하는 파일입니다.

## Library Import

In [1]:
import os
from typing import Any, List, Dict, Tuple
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
import lightgbm as lgb
import optuna
import seaborn as sns
from sklearn.ensemble import VotingRegressor
from data_preprocessing import *
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.linear_model import Lasso
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


# Data Load

In [2]:
# 파일 호출
data_path: str = "/data/ephemeral/home/BTC/data"
train_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "train.csv")).assign(_type="train") # train 에는 _type = train 
test_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv")).assign(_type="test") # test 에는 _type = test
submission_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv")) # ID, target 열만 가진 데이터 미리 호출
df: pd.DataFrame = pd.concat([train_df, test_df], axis=0)

In [3]:
# HOURLY_ 로 시작하는 .csv 파일 이름을 file_names 에 할딩
file_names: List[str] = [
    f for f in os.listdir(data_path) if f.startswith("HOURLY_") and f.endswith(".csv")
]

# 파일명 : 데이터프레임으로 딕셔너리 형태로 저장
file_dict: Dict[str, pd.DataFrame] = {
    f.replace(".csv", ""): pd.read_csv(os.path.join(data_path, f)) for f in file_names
}

for _file_name, _df in tqdm(file_dict.items()):
    # 열 이름 중복 방지를 위해 {_file_name.lower()}_{col.lower()}로 변경, datetime 열을 ID로 변경
    _rename_rule = {
        col: f"{_file_name.lower()}_{col.lower()}" if col != "datetime" else "ID"
        for col in _df.columns
    }
    _df = _df.rename(_rename_rule, axis=1)
    df = df.merge(_df, on="ID", how="left")


100%|██████████| 107/107 [00:03<00:00, 27.79it/s]


In [4]:
# EDA를 위한 2023년 훈련 데이터프레임 생성
eda_df = df.loc[df["_type"] == "train"]
eda_df.head()

Unnamed: 0,ID,target,_type,hourly_market-data_liquidations_huobi_global_btc_usdt_long_liquidations,hourly_market-data_liquidations_huobi_global_btc_usdt_short_liquidations,hourly_market-data_liquidations_huobi_global_btc_usdt_long_liquidations_usd,hourly_market-data_liquidations_huobi_global_btc_usdt_short_liquidations_usd,hourly_network-data_supply_supply_total,hourly_network-data_supply_supply_new,hourly_market-data_open-interest_binance_all_symbol_open_interest,...,hourly_market-data_funding-rates_binance_funding_rates,hourly_market-data_open-interest_htx_global_btc_usdt_open_interest,hourly_market-data_open-interest_bitfinex_open_interest,hourly_market-data_liquidations_binance_btc_busd_long_liquidations,hourly_market-data_liquidations_binance_btc_busd_short_liquidations,hourly_market-data_liquidations_binance_btc_busd_long_liquidations_usd,hourly_market-data_liquidations_binance_btc_busd_short_liquidations_usd,hourly_network-data_transactions-count_transactions_count_total,hourly_network-data_transactions-count_transactions_count_mean,hourly_market-data_open-interest_okx_btc_usdt_open_interest
0,2023-01-01 00:00:00,2.0,train,0.0,0.0,0.0,0.0,19248710.0,75.0,2411675000.0,...,0.01,67842880.0,,0.0,0.0,0.0,0.0,11457.0,954.75,540626700.0
1,2023-01-01 01:00:00,1.0,train,0.0,0.0,0.0,0.0,19248740.0,25.0,2415163000.0,...,0.01,67889410.0,,0.0,0.0,0.0,0.0,5832.0,1458.0,542386000.0
2,2023-01-01 02:00:00,1.0,train,0.0,0.0,0.0,0.0,19248790.0,50.0,2416327000.0,...,0.01,67816570.0,,0.0,0.0,0.0,0.0,5550.0,693.75,541087100.0
3,2023-01-01 03:00:00,1.0,train,0.0,0.0,0.0,0.0,19248820.0,31.25,2416266000.0,...,0.01,67981920.0,,0.0,0.0,0.0,0.0,5245.0,1049.0,540094200.0
4,2023-01-01 04:00:00,2.0,train,0.0,0.0,0.0,0.0,19248860.0,43.75,2417820000.0,...,0.01,68290020.0,,0.0,0.0,0.0,0.0,6942.0,991.714286,539998000.0


 EDA 부분은 Final_Ensemble_test.ipynb에 저장했습니다.
 해당 파일에서의 예측은 EDA 기반이 아닌 변수선택법을 이용한 예측이므로 바로 Feature Engineering 파트로 넘어갔습니다.

# Feature Engineering

In [24]:
# Ensemble_V2
cols_dict: Dict[str, str] = {
    "ID": "ID",
    "target": "target",
    "_type": "_type",

    "hourly_market-data_funding-rates_all_exchange_funding_rates": "funding_rates",
    "hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations": "long_liquidations",
    "hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations": "short_liquidations",
    "hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations_usd": "long_liquidations_usd",
    "hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations_usd": "short_liquidations_usd",
    "hourly_market-data_open-interest_all_exchange_all_symbol_open_interest": "open_interest",
    "hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close": "close",    
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_volume": "taker_buy_volume",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_volume": "taker_sell_volume",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_ratio": "taker_buy_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_ratio": "taker_sell_ratio",
    "hourly_network-data_block-bytes_block_bytes": "block_bytes",    
    "hourly_network-data_difficulty_difficulty": "difficulty",
    "hourly_network-data_fees_fees_block_mean": "fees_block_mean",
    "hourly_network-data_fees_fees_block_mean_usd": "fees_block_mean_usd",
    "hourly_network-data_fees-transaction_fees_transaction_mean_usd": "fees_transaction_mean_usd",
    "hourly_network-data_fees-transaction_fees_transaction_median": "fees_transaction_median",
    "hourly_network-data_fees-transaction_fees_transaction_median_usd": "fees_transaction_median_usd",
    "hourly_network-data_supply_supply_total": "supply_total",
    "hourly_network-data_tokens-transferred_tokens_transferred_median": "tokens_transferred_median",
    "hourly_network-data_utxo-count_utxo_count": "utxo_count",  
    "hourly_network-data_velocity_velocity_supply_total": "velocity_supply_total"
}
df = df[cols_dict.keys()].rename(cols_dict, axis=1)
df.shape

(11552, 25)

In [25]:
# continuous 열을 따로 할당해둠
conti_cols: List[str] = [
    "close",
    "open_interest",
    "difficulty",
    "supply_total",
    "utxo_count",
]

# 최대 24시간의 shift 피쳐를 계산
shift_list = shift_feature(
    df=df, conti_cols=conti_cols, intervals=[_ for _ in range(1, 6)]
)

# concat 하여 df 에 할당
df = pd.concat([df, pd.concat(shift_list, axis=1)], axis=1)

In [26]:
df.head()

Unnamed: 0,ID,target,_type,funding_rates,long_liquidations,short_liquidations,long_liquidations_usd,short_liquidations_usd,open_interest,close,...,supply_total_1,supply_total_2,supply_total_3,supply_total_4,supply_total_5,utxo_count_1,utxo_count_2,utxo_count_3,utxo_count_4,utxo_count_5
0,2023-01-01 00:00:00,2.0,train,0.005049,0.012,0.0,197.5161,0.0,6271344000.0,16536.747967,...,,,,,,,,,,
1,2023-01-01 01:00:00,1.0,train,0.005049,0.0,0.712,0.0,11833.56104,6288683000.0,16557.136536,...,19248710.0,,,,,83308092.0,,,,
2,2023-01-01 02:00:00,1.0,train,0.005049,0.0,0.0,0.0,0.0,6286796000.0,16548.149805,...,19248740.0,19248710.0,,,,83314883.0,83308092.0,,,
3,2023-01-01 03:00:00,1.0,train,0.005067,0.593,0.0,9754.76891,0.0,6284575000.0,16533.632875,...,19248790.0,19248740.0,19248710.0,,,83314090.0,83314883.0,83308092.0,,
4,2023-01-01 04:00:00,2.0,train,0.00621,0.361,0.0,5944.43714,0.0,6291582000.0,16524.712159,...,19248820.0,19248790.0,19248740.0,19248710.0,,83326258.0,83314090.0,83314883.0,83308092.0,


In [27]:
# _type에 따라 train, test 분리
train_df = df.loc[df["_type"]=="train"].drop(columns=["_type"])
test_df = df.loc[df["_type"]=="test"].drop(columns=["_type"])

# Model Train

In [28]:
X_train = train_df.drop(["ID", "target", "close"], axis=1)
y_train = train_df["close"]
target = train_df["target"]

In [29]:
def close_to_class(series: pd.Series) -> pd.Series:
    """close 변수를 target값으로 변환하는 함수입니다.

    Args:
        series (pd.Series): 변환을 원하는 close 변수

    Returns:
        pd.Series: 변환된 target 값
    """
    close = pd.DataFrame()
    close['close'] = series
    close['close_lag1'] = close['close'].shift(1)
    close['close_lag1_percent'] = (close['close'] - close['close_lag1']) / close['close_lag1']
    close['class'] = close['close']
    for i in range(close.shape[0]):
        if close.loc[i, 'close_lag1_percent'] < -0.005:
            close.loc[i, 'class'] = 0
        elif close.loc[i, 'close_lag1_percent'] < 0:
            close.loc[i, 'class'] = 1
        elif close.loc[i, 'close_lag1_percent'] < 0.005:
            close.loc[i, 'class'] = 2
        else:
            close.loc[i, 'class'] = 3
            
    return close["class"].shift(-1).fillna(method="ffill")

In [30]:
# 모델 평가
def evaluate(valid_target: pd.Series, 
             y_valid: pd.Series, 
             y_pred: np.ndarray, 
             metric: str
) -> float:
    """평가지표 metric을 반환하는 함수입니다.

    Args:
        valid_target: (pd.Series): k-fold로 분할한 target의 검증 데이터
        y_valid (pd.Series): k-fold로 분할한 close의 검증 데이터
        y_pred (np.ndarray): 모델을 사용하여 예측한 변수
        metric (str): 사용할 평가지표 metric 이름

    Returns:
        float: 사용할 평가지표 metric 값
    """
    if metric == "accuracy":
        y_pred_class = close_to_class(y_pred)
        return accuracy_score(valid_target, y_pred_class)
    elif metric == "mae":
        return mean_absolute_error(y_valid, y_pred)
    elif metric == "mse":
        return mean_squared_error(y_valid, y_pred)
    elif metric == "mape":
        return mean_absolute_percentage_error(y_valid, y_pred)

In [31]:
# 교차 검증
def model_train(model: Any, 
                X_train: pd.DataFrame, 
                y_train: pd.Series, 
                cv: int, 
                metric: str, 
) -> Tuple[Any, float]:
    """K-Fold로 데이터를 분할한 후 전처리를 거쳐 주어진 모델로 데이터를 학습 및 평가를 진행합니다.

    Args:
        model (Any): 사용하는 모델 객체
        X_train (pd.DataFrame): 설명변수로 이루어진 학습 데이터프레임
        y_train (pd.Seris): 예측변수로 이루어진 학습 시리즈
        cv (int): 교차검증시 분할할 폴드의 수
        metric (str): 사용할 평가지표 metric 이름

    Returns:
        Any, float: 폴드 내에서 가장 평가지표 값이 높은 모델 객체, 평가지표 metric 값
    """
    kfold = KFold(n_splits=cv)
    score_list = []
    fold_model = []
    
    # warm_start는 모델의 속성으로, 같은 모델을 반복 학습할 때 이전 학습에서 학습된 파라미터를 초기화하지 않고 이어서 학습을 진행하는 옵션
    if hasattr(model, "warm_start"):
        model.warm_start = True

    # K-Fold 교차 검증
    for train_index, valid_index in kfold.split(X_train):
        X_train_fold, y_train_fold = X_train.iloc[train_index], y_train.iloc[train_index]
        X_valid, y_valid = X_train.iloc[valid_index], y_train.iloc[valid_index]

        valid_target = target[valid_index]
        
        # 전처리
        fill_feature(X_train_fold, method="mean")
        fill_feature(X_valid, method="mean")        
        
        # 모델 학습
        model.fit(X_train_fold, y_train_fold)
        fold_model.append(model)

        y_pred = model.predict(X_valid)
        score = evaluate(valid_target, y_valid, y_pred, metric=metric)  # 평가지표 metric 반환
        score_list.append(score)
    
    return fold_model[np.argmax(score_list)], np.max(score_list)

## Model Ensemble

In [32]:
# 다음 두 하이퍼파라미터 조합은 각각의 모델링에서 optuna를 통해 최적의 하이퍼파라미터를 구한 결과입니다.

xgb_params = {
    'n_estimators': 159,
    'learning_rate': 0.044545369253400344,
    'max_depth': 7,
    'min_child_weight': 5,
    'colsample_bytree': 0.5878501242431816,
    'subsample': 0.7729401646786744,
    "booster": "gbtree",
    "device": "gpu",
    "random_state": 42
}

lgb_params = {
    "verbose" : -1,
    'num_leaves': 77,
    'learning_rate': 0.019576305246645095,
    'n_estimators': 294,
    'max_depth': 8,
    'min_child_weight': 1,
    'subsample': 0.6520270159934872,
    'colsample_bytree': 0.5000782016959966,
    "random_state" : 42
}

In [33]:
# 목적함수 정의
def objective(trial):
    weight_xgb = trial.suggest_float("weight_xgb", 0.1, 3.0)
    weight_lgb = trial.suggest_float("weight_lgb", 0.1, 3.0)
    weight_lr = trial.suggest_float("weight_lr", 0.1, 3.0)

    ensemble_model = VotingRegressor(
        estimators=[('xgb', XGBRegressor(**xgb_params)), 
                    ('lgb', lgb.LGBMRegressor(**lgb_params)), 
                    ('lr', Lasso(alpha=20, max_iter=2000, random_state=42))],
        weights=[weight_xgb, weight_lgb, weight_lr]
    )
    
    _, score = model_train(ensemble_model, X_train, y_train, cv=5, metric="accuracy")
    return score

In [34]:
# Optuna로 가중치 최적화
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

[I 2024-09-27 03:06:40,218] A new study created in memory with name: no-name-388a83ca-fe27-48bc-8f13-a3b5f3be3308
[I 2024-09-27 03:06:54,038] Trial 0 finished with value: 0.6615296803652968 and parameters: {'weight_xgb': 1.8432909639391837, 'weight_lgb': 0.6592584793464374, 'weight_lr': 2.8633778362249105}. Best is trial 0 with value: 0.6615296803652968.
[I 2024-09-27 03:07:07,606] Trial 1 finished with value: 0.6649543378995434 and parameters: {'weight_xgb': 1.3712779489063889, 'weight_lgb': 1.497347630590279, 'weight_lr': 2.4040060337175833}. Best is trial 1 with value: 0.6649543378995434.
[I 2024-09-27 03:07:21,086] Trial 2 finished with value: 0.6541095890410958 and parameters: {'weight_xgb': 2.540060352627125, 'weight_lgb': 2.21042925288701, 'weight_lr': 2.766646795061544}. Best is trial 1 with value: 0.6649543378995434.
[I 2024-09-27 03:07:35,444] Trial 3 finished with value: 0.6678082191780822 and parameters: {'weight_xgb': 0.34643713360148465, 'weight_lgb': 0.24984577620997656,

In [35]:
# 최적의 가중치를 사용한 최종 모델
best_weights = [
    study.best_params["weight_xgb"],
    study.best_params["weight_lgb"],
    study.best_params["weight_lr"]
]
ensemble_model = VotingRegressor(
    estimators=[('xgb', XGBRegressor(**xgb_params)), 
                ('lgb', lgb.LGBMRegressor(**lgb_params)), 
                ('lr', Lasso(alpha=20, max_iter=2000, random_state=42))],
    weights=best_weights
)

In [36]:
# 학습
Best_ensemble_model, model_accuracy = model_train(ensemble_model, X_train, y_train, cv=5, metric="accuracy")
print(f"Ensemble model accuracy: {model_accuracy}")

Ensemble model accuracy: 0.672945205479452


## Inference

In [37]:
# Best_ensemble_model이 학습 데이터 전체를 학습할 수 있도록 결측치 처리
X_train = fill_feature(X_train, method="mean")
X_test = test_df.drop(["ID", "target", "close"], axis=1)
X_test = fill_feature(X_test, method="mean")

In [38]:
Best_ensemble_model.fit(X_train, y_train)
y_test_pred = Best_ensemble_model.predict(X_test)
y_test_pred_class = close_to_class(y_test_pred)

In [39]:
# output file
submission_df = submission_df.assign(target = y_test_pred_class)
submission_df["target"] = submission_df["target"].astype(np.int8)
submission_df.to_csv("output.csv", index=False)

In [40]:
submission_df['target'].value_counts()

target
1    1106
2    1091
3     299
0     296
Name: count, dtype: int64