### Library Import

In [84]:
import os
from typing import List, Dict
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import lightgbm as lgb
from plotly.subplots import make_subplots
import plotly.graph_objects as go


### Data Load

In [85]:
# 파일 호출
data_path: str = "/data/ephemeral/home/BTC/data"
train_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "train.csv")).assign(_type="train") # train 에는 _type = train 
test_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv")).assign(_type="test") # test 에는 _type = test
submission_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv")) # ID, target 열만 가진 데이터 미리 호출
df: pd.DataFrame = pd.concat([train_df, test_df], axis=0)

In [86]:
# HOURLY_ 로 시작하는 .csv 파일 이름을 file_names 에 할딩
file_names: List[str] = [
    f for f in os.listdir(data_path) if f.startswith("HOURLY_") and f.endswith(".csv")
]

# 파일명 : 데이터프레임으로 딕셔너리 형태로 저장
file_dict: Dict[str, pd.DataFrame] = {
    f.replace(".csv", ""): pd.read_csv(os.path.join(data_path, f)) for f in file_names
}

for _file_name, _df in tqdm(file_dict.items()):
    # 열 이름 중복 방지를 위해 {_file_name.lower()}_{col.lower()}로 변경, datetime 열을 ID로 변경
    _rename_rule = {
        col: f"{_file_name.lower()}_{col.lower()}" if col != "datetime" else "ID"
        for col in _df.columns
    }
    _df = _df.rename(_rename_rule, axis=1)
    df = df.merge(_df, on="ID", how="left")


100%|██████████| 107/107 [00:02<00:00, 38.25it/s]


### EDA (Explanatory Data Analysis)

### Feature engineering

In [87]:
from data_preprocessing import *

In [88]:
# 모델에 사용할 컬럼, 컬럼의 rename rule을 미리 할당함
cols_dict: Dict[str, str] = {
    "ID": "ID",
    "target": "target",
    "_type": "_type",
    "hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close": "close",
    "hourly_market-data_open-interest_all_exchange_all_symbol_open_interest": "open_interest",
    "hourly_network-data_difficulty_difficulty": "difficulty",
    "hourly_network-data_supply_supply_total": "supply_total",
    "hourly_network-data_utxo-count_utxo_count": "utxo_count"
}
df = df[cols_dict.keys()].rename(cols_dict, axis=1)
df.shape

(11552, 8)

In [89]:
# continuous 열을 따로 할당해둠
conti_cols: List[str] = [
    "close",
    "open_interest",
    "difficulty",
    "supply_total",
    "utxo_count"
]

# 최대 24시간의 shift 피쳐를 계산
shift_list = shift_feature(
    df=df, conti_cols=conti_cols, intervals=[_ for _ in range(1, 24)]
)

# concat 하여 df 에 할당
df = pd.concat([df, pd.concat(shift_list, axis=1)], axis=1)

In [90]:
# _type에 따라 train, test 분리
train_df = df.loc[df["_type"]=="train"].drop(columns=["_type"])
test_df = df.loc[df["_type"]=="test"].drop(columns=["_type"])

## Model Training
### LightGBM + GridsearchCV

In [91]:
from sklearn.model_selection import KFold
import optuna

X_train = train_df.drop(["ID", "target", "close"], axis=1)
y_train = train_df["close"]
X_test = test_df.drop(["ID", "target", "close"], axis=1)
y_test = test_df["close"]
target = train_df["target"]

In [92]:
def close_to_class(series: pd.Series) -> pd.Series:
    """close 변수를 target값으로 변환하는 함수입니다.

    Args:
        series (pd.Series): 변환을 원하는 close 변수

    Returns:
        pd.Series: 변환된 target 값
    """
    close = pd.DataFrame()
    close['close'] = series
    close['close_lag1'] = close['close'].shift(1)
    close['close_lag1_percent'] = (close['close'] - close['close_lag1']) / close['close_lag1']
    close['class'] = close['close']
    for i in range(close.shape[0]):
        if close.loc[i, 'close_lag1_percent'] < -0.005:
            close.loc[i, 'class'] = 0
        elif close.loc[i, 'close_lag1_percent'] < 0:
            close.loc[i, 'class'] = 1
        elif close.loc[i, 'close_lag1_percent'] < 0.005:
            close.loc[i, 'class'] = 2
        else:
            close.loc[i, 'class'] = 3
            
    return close['class'].shift(-1).fillna(method='ffill')

In [93]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score
import warnings
warnings.filterwarnings("ignore")

def evaluate(valid_target, y_valid, y_pred, metric):
    if metric == 'accuracy':
        classes_pred = close_to_class(y_pred)
        return accuracy_score(valid_target, classes_pred)
    if metric == 'mae':
        return mean_absolute_error(y_valid, y_pred)
    if metric == "mse":
        return mean_squared_error(y_valid, y_pred)
    if metric == "mape":
        mae = mean_absolute_percentage_error(y_valid, y_pred)
        return mae / np.mean(y_valid)

In [94]:
def model_train(model, X_train, y_train, cv, metric):
    #kfold = KFold(n_splits=cv, shuffle=True, random_state=42)
    kfold = KFold(n_splits=cv)
    score_list = []
    for train_index, valid_index in kfold.split(X_train):
        X_train_fold, y_train_fold = X_train.iloc[train_index], y_train.iloc[train_index]
        X_valid, y_valid = X_train.iloc[valid_index], y_train.iloc[valid_index]
        
        vaild_target = target.iloc[valid_index]

        # preprocessing
        X_train_fold.fillna(X_train_fold.mean(), inplace=True)
        y_train_fold.fillna(y_train_fold.mean(), inplace=True) 
        X_valid.fillna(X_valid.mean(), inplace=True)
        y_valid.fillna(y_valid.mean(), inplace=True)  # 이 부분을 mice와 같은 방법으로 조정할 예정. feature selection 등도 여기에서.

        #model.fit(X_train_fold.drop("target", axis=1), y_train_fold)
        model.fit(X_train_fold, y_train_fold)
        #y_pred = model.predict(X_valid.drop("target", axis=1))
        y_pred = model.predict(X_valid)
        score = evaluate(vaild_target, y_valid, y_pred, metric=metric)
        score_list.append(score)

    return np.mean(score_list)

In [95]:
def objective(trial):
    # 하이퍼파라미터 설정
    params = {
        "boosting_type" : "gbdt",
        "metric" : "mean_squared_error",
        "verbose" : -1,
        "num_leaves" : trial.suggest_int("num_leaves", 20, 100),
        "learning_rate" : trial.suggest_loguniform("learning_rate", 1e-3, 1e-1),
        "n_estimators": trial.suggest_int("n_estimators", 10, 100),
        "max_depth" : trial.suggest_int("max_depth", 3, 10), # 과적합 방지
        "min_child_weight" : trial.suggest_int("min_child_weight", 1, 10), # 과소적합 방지
        "subsample" : trial.suggest_uniform("subsample", 0.5, 1.0), # 데이터 샘플링 비율, 과적합 방지
        "colsample_bytree" : trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
        # "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-4, 10.0),
        # "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-4, 10.0),
        "random_state" : 42,
        "force_col_wise": True,
        #"device" : "gpu", 
    }

    lgb_model = lgb.LGBMRegressor(**params)
    acc = model_train(lgb_model, X_train, y_train, cv=5, metric="accuracy")

    return acc


In [96]:
# optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

# best params
print("Best Hyperparameters: ", study.best_params)

[I 2024-09-24 18:26:14,476] A new study created in memory with name: no-name-afb137f1-01fd-4a0f-80e8-850180f20516
[I 2024-09-24 18:26:19,228] Trial 0 finished with value: 0.38789954337899546 and parameters: {'num_leaves': 73, 'learning_rate': 0.01115435422614097, 'n_estimators': 73, 'max_depth': 10, 'min_child_weight': 1, 'subsample': 0.9052745886939833, 'colsample_bytree': 0.8747326112554548}. Best is trial 0 with value: 0.38789954337899546.
[I 2024-09-24 18:26:22,003] Trial 1 finished with value: 0.4094748858447489 and parameters: {'num_leaves': 30, 'learning_rate': 0.0031428113742668355, 'n_estimators': 43, 'max_depth': 4, 'min_child_weight': 10, 'subsample': 0.7395567432021037, 'colsample_bytree': 0.5334275566344353}. Best is trial 1 with value: 0.4094748858447489.
[I 2024-09-24 18:26:25,055] Trial 2 finished with value: 0.38515981735159815 and parameters: {'num_leaves': 42, 'learning_rate': 0.0584399351689488, 'n_estimators': 27, 'max_depth': 9, 'min_child_weight': 4, 'subsample':

Best Hyperparameters:  {'num_leaves': 40, 'learning_rate': 0.001241478711853387, 'n_estimators': 23, 'max_depth': 3, 'min_child_weight': 3, 'subsample': 0.7555628202792165, 'colsample_bytree': 0.6112268450129043}


In [97]:
# 최적의 모델
best_params = study.best_params
best_lgb_model = lgb.LGBMRegressor(**best_params, random_state=42)

In [98]:
avg = model_train(best_lgb_model, X_train, y_train, cv=5, metric="accuracy")

print("LGB model accuracy:", avg)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016974 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24801
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 119
[LightGBM] [Info] Start training from score 30562.564983
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002435 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24849
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 119
[LightGBM] [Info] Start training from score 29000.900340
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002475 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24849
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 119
[LightGBM] [Info

### Inference

In [99]:
X_test = test_df.drop(["ID", "target", "close"], axis=1)
y_test = test_df["close"]

# 결측치 처리
X_test.fillna(X_test.mean(), inplace=True)
y_test.fillna(y_test.mean(), inplace=True)

In [100]:
# lgb predict
y_test_pred = best_lgb_model.predict(X_test)
y_test_pred_class = close_to_class(y_test_pred) # 예측 결과를 범주형 변수로 변환

### Output File Save

In [101]:
# output file 할당후 save 
submission_df = submission_df.assign(target = y_test_pred_class)
submission_df["target"] = submission_df["target"].astype(np.int8)
submission_df.to_csv("output.csv", index=False)