### Library Import

In [1]:
import os
from typing import List, Dict
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import lightgbm as lgb
from plotly.subplots import make_subplots
import plotly.graph_objects as go


### Data Load

In [2]:
# Load the raw competition files.
data_path: str = "/data/ephemeral/home/BTC/data"
train_path = os.path.join(data_path, "train.csv")
test_path = os.path.join(data_path, "test.csv")

# Tag every row with its split (`_type`) so train and test can be separated again later.
train_df: pd.DataFrame = pd.read_csv(train_path).assign(_type="train")
test_df: pd.DataFrame = pd.read_csv(test_path).assign(_type="test")

# Untouched copy of test.csv kept as the submission template
# (per the original note it holds only the ID and target columns).
submission_df: pd.DataFrame = pd.read_csv(test_path)

# Stack train on top of test so the feature steps run on both at once.
df: pd.DataFrame = pd.concat([train_df, test_df], axis=0)

In [3]:
# Collect the names of the .csv files whose name starts with HOURLY_.
file_names: List[str] = [
    name
    for name in os.listdir(data_path)
    if name.startswith("HOURLY_") and name.endswith(".csv")
]

# Map each file name (with ".csv" removed) to the DataFrame read from that file.
file_dict: Dict[str, pd.DataFrame] = {
    name.replace(".csv", ""): pd.read_csv(os.path.join(data_path, name))
    for name in file_names
}

for _file_name, _df in tqdm(file_dict.items()):
    # Prefix every column with the lower-cased file name to avoid name clashes;
    # the datetime column becomes "ID" so it can serve as the join key.
    _prefix = _file_name.lower()
    _rename_rule = {
        col: "ID" if col == "datetime" else f"{_prefix}_{col.lower()}"
        for col in _df.columns
    }
    _df = _df.rename(columns=_rename_rule)
    # Left join keeps every row of df.
    df = df.merge(_df, on="ID", how="left")


100%|██████████| 107/107 [00:03<00:00, 28.56it/s]


### EDA (Exploratory Data Analysis)

### Feature engineering

In [4]:
from data_preprocessing import *



In [5]:
# Columns used by the model, mapped from their merged (file-prefixed) names to short names.
# Anything not listed here is dropped by the selection at the bottom of the cell.
cols_dict: Dict[str, str] = {
    # Key, label and split marker keep their names
    "ID": "ID",
    "target": "target",
    "_type": "_type",

    # Market data
    "hourly_market-data_funding-rates_all_exchange_funding_rates": "funding_rates",

    "hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations": "long_liquidations",
    "hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations": "short_liquidations",
    "hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations_usd": "long_liquidations_usd",
    "hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations_usd": "short_liquidations_usd",
    
    "hourly_market-data_open-interest_all_exchange_all_symbol_open_interest": "open_interest",

    "hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close": "close",
    
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_volume": "taker_buy_volume",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_volume": "taker_sell_volume",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_ratio": "taker_buy_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_ratio": "taker_sell_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_sell_ratio": "taker_buy_sell_ratio",

    # Network data
    "hourly_network-data_addresses-count_addresses_count_active": "addresses_count_active",
    "hourly_network-data_addresses-count_addresses_count_sender": "addresses_count_sender",
    "hourly_network-data_addresses-count_addresses_count_receiver": "addresses_count_receiver",

    "hourly_network-data_block-bytes_block_bytes": "block_bytes",

    "hourly_network-data_block-count_block_count": "block_count",
    
    "hourly_network-data_block-interval_block_interval": "block_interval",

    "hourly_network-data_blockreward_blockreward": "blockreward",
    "hourly_network-data_blockreward_blockreward_usd": "blockreward_usd",
    
    "hourly_network-data_difficulty_difficulty": "difficulty",

    "hourly_network-data_fees_fees_block_mean": "fees_block_mean",
    "hourly_network-data_fees_fees_block_mean_usd": "fees_block_mean_usd",
    "hourly_network-data_fees_fees_total": "fees_total",
    "hourly_network-data_fees_fees_total_usd": "fees_total_usd",
    "hourly_network-data_fees_fees_reward_percent": "fees_reward_percent",

    "hourly_network-data_fees-transaction_fees_transaction_mean": "fees_transaction_mean",
    "hourly_network-data_fees-transaction_fees_transaction_mean_usd": "fees_transaction_mean_usd",
    "hourly_network-data_fees-transaction_fees_transaction_median": "fees_transaction_median",
    "hourly_network-data_fees-transaction_fees_transaction_median_usd": "fees_transaction_median_usd",

    "hourly_network-data_hashrate_hashrate": "hashrate",

    "hourly_network-data_supply_supply_total": "supply_total",
    "hourly_network-data_supply_supply_new": "supply_new",

    "hourly_network-data_tokens-transferred_tokens_transferred_total": "tokens_transferred_total",
    "hourly_network-data_tokens-transferred_tokens_transferred_mean": "tokens_transferred_mean",
    "hourly_network-data_tokens-transferred_tokens_transferred_median": "tokens_transferred_median",

    "hourly_network-data_transactions-count_transactions_count_total": "transactions_count_total",
    "hourly_network-data_transactions-count_transactions_count_mean": "transactions_count_mean",

    "hourly_network-data_utxo-count_utxo_count": "utxo_count",
    
    "hourly_network-data_velocity_velocity_supply_total": "velocity_supply_total"
}
# Keep only the listed columns (in dict order) and apply the short names.
# NOTE: this overwrites df, so the cell cannot be re-run without re-running the merge cell first.
df = df[cols_dict.keys()].rename(cols_dict, axis=1)
df.shape

(11552, 43)

In [6]:
# Continuous columns that get lagged copies.
# "difficulty" and "supply_total" are intentionally left out of this list.
conti_cols: List[str] = ["close", "open_interest", "utxo_count"]

# Build shift features for intervals 1 through 5 (rows are hourly, so 1-5 hours back).
shift_list = shift_feature(
    df=df,
    conti_cols=conti_cols,
    intervals=list(range(1, 6)),
)

# Join the shifted columns side by side, then append them to df.
_shifted = pd.concat(shift_list, axis=1)
df = pd.concat([df, _shifted], axis=1)

In [7]:
# Display the frame to check the newly appended shift columns (<column>_<interval>).
df

Unnamed: 0,ID,target,_type,funding_rates,long_liquidations,short_liquidations,long_liquidations_usd,short_liquidations_usd,open_interest,close,...,open_interest_1,open_interest_2,open_interest_3,open_interest_4,open_interest_5,utxo_count_1,utxo_count_2,utxo_count_3,utxo_count_4,utxo_count_5
0,2023-01-01 00:00:00,2.0,train,0.005049,0.012000,0.000000,197.51610,0.00000,6.271344e+09,16536.747967,...,,,,,,,,,,
1,2023-01-01 01:00:00,1.0,train,0.005049,0.000000,0.712000,0.00000,11833.56104,6.288683e+09,16557.136536,...,6.271344e+09,,,,,83308092.0,,,,
2,2023-01-01 02:00:00,1.0,train,0.005049,0.000000,0.000000,0.00000,0.00000,6.286796e+09,16548.149805,...,6.288683e+09,6.271344e+09,,,,83314883.0,83308092.0,,,
3,2023-01-01 03:00:00,1.0,train,0.005067,0.593000,0.000000,9754.76891,0.00000,6.284575e+09,16533.632875,...,6.286796e+09,6.288683e+09,6.271344e+09,,,83314090.0,83314883.0,83308092.0,,
4,2023-01-01 04:00:00,2.0,train,0.006210,0.361000,0.000000,5944.43714,0.00000,6.291582e+09,16524.712159,...,6.284575e+09,6.286796e+09,6.288683e+09,6.271344e+09,,83326258.0,83314090.0,83314883.0,83308092.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11547,2024-04-26 03:00:00,,test,,0.710000,0.243500,45484.20433,15682.76464,1.486836e+10,,...,1.489030e+10,1.481352e+10,1.475586e+10,1.477917e+10,1.478950e+10,179811932.0,179793126.0,179753959.0,179728506.0,179718697.0
11548,2024-04-26 04:00:00,,test,,6.577208,0.146000,420718.03779,9419.65430,,,...,1.486836e+10,1.489030e+10,1.481352e+10,1.475586e+10,1.477917e+10,179820708.0,179811932.0,179793126.0,179753959.0,179728506.0
11549,2024-04-26 05:00:00,,test,,1.797163,5.216490,114902.59095,337367.12807,,,...,,1.486836e+10,1.489030e+10,1.481352e+10,1.475586e+10,179833897.0,179820708.0,179811932.0,179793126.0,179753959.0
11550,2024-04-26 06:00:00,,test,,0.803000,1.656000,51434.51531,106931.54104,,,...,,,1.486836e+10,1.489030e+10,1.481352e+10,179851249.0,179833897.0,179820708.0,179811932.0,179793126.0


In [8]:
# Features whose weight came out as 0 in the initial Lasso run (per the original author's note).
# This cell removes them from df; skip it to keep them in the experiment.
omit_list = [
    "funding_rates",
    "taker_buy_ratio",
    "taker_sell_ratio",
    "taker_buy_sell_ratio",
    "block_count",
    "fees_block_mean",
    "fees_total",
    "fees_reward_percent",
    "fees_transaction_mean",
    "fees_transaction_mean_usd",
    "fees_transaction_median",
    "fees_transaction_median_usd",
    "supply_new",
    "tokens_transferred_median",
]
df.drop(columns=omit_list, inplace=True)

In [9]:
# Experimental cell: features measured in USD are judged redundant, so they are removed here.
# fees_transaction_mean_usd and fees_transaction_median_usd are not listed
# because they already appear in omit_list.
usd_list = [
    "long_liquidations_usd",
    "short_liquidations_usd",
    "fees_block_mean_usd",
    "fees_total_usd",
    "blockreward_usd",
]
df.drop(columns=usd_list, inplace=True)

In [10]:
# Per-transaction mean features removed from df.
mean_list = ["tokens_transferred_mean", "transactions_count_mean"]
df.drop(columns=mean_list, inplace=True)

In [11]:
# Remove the difficulty column as well.
df.drop(columns="difficulty", inplace=True)

In [12]:
# Split df back into train and test by the _type marker, then discard the marker column.
train_df, test_df = (
    df.loc[df["_type"] == split].drop(columns=["_type"])
    for split in ("train", "test")
)

### Model Training

In [13]:
def close_to_class(series: pd.Series) -> pd.Series:
    """Convert a sequence of close prices into target classes.

    Each position gets the class of the relative change from its own close
    to the NEXT close:

        change <  -0.5%        -> 0
        -0.5% <= change < 0    -> 1
        0     <= change < 0.5% -> 2
        otherwise              -> 3  (also used when the change is NaN)

    The last position has no next close, so it repeats the class of the
    position before it (forward fill). A single-element input therefore
    yields NaN, and an empty input yields an empty Series.

    Args:
        series (pd.Series): close prices in time order. A 1-D numpy array
            (e.g. the output of ``model.predict``) or a list is accepted too.

    Returns:
        pd.Series: float Series named "class" holding 0.0-3.0. It carries the
        index of ``series`` when that is a Series, otherwise a 0..n-1 index.
    """
    close = pd.Series(series, dtype="float64")

    # Relative change versus the previous row. Computed explicitly so that
    # missing closes stay NaN instead of being padded.
    prev_close = close.shift(1)
    change = ((close - prev_close) / prev_close).to_numpy()

    # The first matching condition wins. NaN fails every comparison and so
    # falls through to the default class 3.
    labels = np.select(
        [change < -0.005, change < 0, change < 0.005],
        [0.0, 1.0, 2.0],
        default=3.0,
    )

    # Built positionally, so any index works (not just 0..n-1).
    classes = pd.Series(labels, index=close.index, name="class")

    # Move every class one step earlier so row t describes the move t -> t+1,
    # then fill the now-empty last row from its predecessor.
    return classes.shift(-1).ffill()

In [14]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings("ignore")

def model_train(
    model,
    X_train,
    y_train,
    cv: int = 5,
    metric: str = "accuracy"
) -> tuple[object, float]:
    """Cross-validate a model and return the best fold's model and score.

    For every fold an independent copy of ``model`` is fitted on the training
    part and scored on the validation part through ``evaluate``. The estimator
    passed in is never fitted itself.

    Args:
        model (Any): estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train (pd.DataFrame): training features. Must contain a "target"
            column; it is used as the true class for accuracy scoring and is
            removed before fitting and predicting.
        y_train (pd.Series): training regression target.
        cv (int): number of KFold splits.
        metric (str): one of "accuracy", "mae", "mse", "mape".

    Returns:
        tuple: (model fitted on the best fold, score of that fold). "Best" is
        the highest score for "accuracy" and the lowest for the error metrics
        "mae", "mse" and "mape".
    """
    from copy import deepcopy

    # Error metrics improve as they get smaller; accuracy improves as it grows.
    lower_is_better = metric in ("mae", "mse", "mape")

    kfold = KFold(n_splits=cv)
    score_list = []
    fold_model = []
    for train_index, valid_index in kfold.split(X_train):
        X_train_fold = X_train.iloc[train_index].drop(columns="target")
        y_train_fold = y_train.iloc[train_index]
        X_valid_full = X_train.iloc[valid_index]
        y_valid = y_train.iloc[valid_index]

        # Keep the true class labels for accuracy scoring before dropping them.
        valid_target = X_valid_full["target"]
        X_valid = X_valid_full.drop(columns="target")

        # preprocessing: each part is imputed with its own column means.
        X_train_fold = X_train_fold.fillna(X_train_fold.mean())
        y_train_fold = y_train_fold.fillna(y_train_fold.mean())
        X_valid = X_valid.fillna(X_valid.mean())
        y_valid = y_valid.fillna(y_valid.mean())

        # Fit a fresh copy per fold. Appending `model` itself would store the
        # same object every time, so every entry would end up being the model
        # fitted on the last fold.
        fold_estimator = deepcopy(model)
        fold_estimator.fit(X_train_fold, y_train_fold)
        fold_model.append(fold_estimator)

        y_pred = fold_estimator.predict(X_valid)
        score = evaluate(valid_target, y_valid, y_pred, metric=metric)
        score_list.append(score)

    best = int(np.argmin(score_list) if lower_is_better else np.argmax(score_list))
    return fold_model[best], score_list[best]

def evaluate(valid_target, y_valid, y_pred, metric):
    """Score one set of predictions with the requested metric.

    Args:
        valid_target: true class labels; only used when metric is "accuracy".
        y_valid: true regression values; used by "mae", "mse" and "mape".
        y_pred: predicted regression values.
        metric (str): one of "accuracy", "mae", "mse", "mape".

    Returns:
        The score computed by the matching sklearn metric function.

    Raises:
        ValueError: if ``metric`` is not one of the supported names
            (previously this case silently returned None).
    """
    if metric == "accuracy":
        # The predictions are continuous, so convert them to classes
        # before comparing with the true labels.
        classes = close_to_class(y_pred)
        return accuracy_score(valid_target, classes)
    if metric == "mae":
        return mean_absolute_error(y_valid, y_pred)
    if metric == "mse":
        return mean_squared_error(y_valid, y_pred)
    if metric == "mape":
        return mean_absolute_percentage_error(y_valid, y_pred)
    raise ValueError(
        f"Unknown metric {metric!r}; expected one of 'accuracy', 'mae', 'mse', 'mape'."
    )

In [15]:
# Features and regression target (close) for each split.
# The "target" class column stays in X: model_train reads it for accuracy scoring
# and drops it before fitting.
X_train, y_train = train_df.drop(columns=["ID", "close"]), train_df["close"]
X_test, y_test = test_df.drop(columns=["ID", "close"]), test_df["close"]

# Impute the test split with its own means.
X_test.fillna(value=X_test.mean(), inplace=True)
y_test.fillna(value=y_test.mean(), inplace=True)

# Candidate linear models.
lr_reg = LinearRegression()
ridge_reg = Ridge(alpha=20, random_state=42)
lasso_reg = Lasso(alpha=20, max_iter=2000, random_state=42)

# Same cross-validation settings for every candidate.
_cv_args = dict(cv=5, metric="accuracy")
lr_model, lr_score = model_train(lr_reg, X_train, y_train, **_cv_args)
ridge_model, ridge_score = model_train(ridge_reg, X_train, y_train, **_cv_args)
lasso_model, lasso_score = model_train(lasso_reg, X_train, y_train, **_cv_args)

for _reg, _score in [(lr_reg, lr_score), (ridge_reg, ridge_score), (lasso_reg, lasso_score)]:
    print(f"{_reg}: {_score}")

LinearRegression(): 0.6421232876712328
Ridge(alpha=20, random_state=42): 0.6421232876712328
Lasso(alpha=20, max_iter=2000, random_state=42): 0.6626712328767124


### Inference (Lasso Regression)

In [16]:
# Refit the Lasso model on the whole training set, then predict the test set.
_train_features = X_train.drop(columns="target")
X_train_drop = fill_feature(_train_features, method="mean")
lasso_model.fit(X_train_drop, y_train)

_test_features = X_test.drop(columns="target")
y_test_pred = lasso_model.predict(_test_features)

# Turn the predicted close values into class labels.
y_test_pred_class = close_to_class(y_test_pred)

# Save the raw predicted values.
# NOTE(review): the file name says "xgb" but these are Lasso predictions - confirm before renaming.
pd.DataFrame(y_test_pred).to_csv("predicted_values_xgb.csv", index=False)

In [17]:
# Write the submission: attach the predicted classes, store them as int8, save to CSV.
submission_df = (
    submission_df
    .assign(target=y_test_pred_class)
    .astype({"target": np.int8})
)
submission_df.to_csv("output.csv", index=False)