### Library Import

In [1]:
import os
from typing import List, Dict
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import lightgbm as lgb

from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

import warnings
warnings.filterwarnings('ignore')


### Data Load

In [2]:
# Load the raw competition files from the data directory.
data_path: str = "../data"
_train_path = os.path.join(data_path, "train.csv")
_test_path = os.path.join(data_path, "test.csv")

# Tag every row with its origin (`_type`) so the two splits can be separated
# again after feature engineering on the combined frame.
train_df: pd.DataFrame = pd.read_csv(_train_path).assign(_type="train")
test_df: pd.DataFrame = pd.read_csv(_test_path).assign(_type="test")

# The submission template is a second, untagged read of test.csv.
# NOTE(review): the original author's comment says this file holds only the
# ID and target columns -- confirm against the data.
submission_df: pd.DataFrame = pd.read_csv(_test_path)

# Stack train on top of test (row-wise) so features are built on both at once.
df: pd.DataFrame = pd.concat([train_df, test_df], axis=0)

In [3]:
# Collect the names of every .csv file in the data directory that starts with "HOURLY_".
file_names: List[str] = [
    name
    for name in os.listdir(data_path)
    if name.startswith("HOURLY_") and name.endswith(".csv")
]

# Map "file name without .csv" -> DataFrame read from that file.
file_dict: Dict[str, pd.DataFrame] = {
    name.replace(".csv", ""): pd.read_csv(os.path.join(data_path, name))
    for name in file_names
}

for _file_name, _df in tqdm(file_dict.items()):
    # Prefix every column with the lower-cased file name to avoid collisions
    # between files; the `datetime` column becomes `ID`, the merge key.
    _prefix = _file_name.lower()
    _rename_rule = {
        col: ("ID" if col == "datetime" else f"{_prefix}_{col.lower()}")
        for col in _df.columns
    }
    _df = _df.rename(columns=_rename_rule)
    # Left join keeps every row of the train/test frame.
    df = df.merge(_df, on="ID", how="left")


100%|██████████| 107/107 [00:01<00:00, 54.21it/s]


### Feature engineering

In [4]:
# Columns used by the model, mapped from their merged (file-prefixed) name to a short name.
cols_dict: Dict[str, str] = {
    # identifiers / bookkeeping (names unchanged)
    "ID": "ID",
    "target": "target",
    "_type": "_type",
    # market data
    "hourly_market-data_coinbase-premium-index_coinbase_premium_gap": "coinbase_premium_gap",
    "hourly_market-data_coinbase-premium-index_coinbase_premium_index": "coinbase_premium_index",
    "hourly_market-data_funding-rates_all_exchange_funding_rates": "funding_rates",
    "hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations": "long_liquidations",
    "hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations_usd": "long_liquidations_usd",
    "hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations": "short_liquidations",
    "hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations_usd": "short_liquidations_usd",
    "hourly_market-data_open-interest_all_exchange_all_symbol_open_interest": "open_interest",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_ratio": "buy_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_sell_ratio": "buy_sell_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_volume": "buy_volume",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_ratio": "sell_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_volume": "sell_volume",
    # network data
    "hourly_network-data_addresses-count_addresses_count_active": "active_count",
    "hourly_network-data_addresses-count_addresses_count_receiver": "receiver_count",
    "hourly_network-data_addresses-count_addresses_count_sender": "sender_count",
}

# Keep only the selected columns (in dictionary order), then apply the short names.
df = df[list(cols_dict)].rename(columns=cols_dict)
df.shape

(11552, 19)

In [5]:
# Features motivated by the EDA: long/short and buy/sell differences, plus the
# sign (negative / zero / positive) of each difference.
_liq_diff = df["long_liquidations"] - df["short_liquidations"]
_liq_usd_diff = df["long_liquidations_usd"] - df["short_liquidations_usd"]
_vol_diff = df["buy_volume"] - df["sell_volume"]

df = df.assign(
    liquidation_diff=_liq_diff,
    liquidation_usd_diff=_liq_usd_diff,
    volume_diff=_vol_diff,
    liquidation_diffg=np.sign(_liq_diff),
    liquidation_usd_diffg=np.sign(_liq_usd_diff),
    volume_diffg=np.sign(_vol_diff),
    # presumably the +1 guards against a zero sell volume -- confirm
    buy_sell_volume_ratio=df["buy_volume"] / (df["sell_volume"] + 1),
    # relative change of the buy/sell ratio versus the previous row
    buy_sell_ratio_change=df["buy_sell_ratio"].pct_change(1),
)

# Keep the categorical (sign) and continuous column names in separate lists.
category_cols: List[str] = ["liquidation_diffg", "liquidation_usd_diffg", "volume_diffg"]

_meta_cols = ("ID", "target", "_type")
conti_cols: List[str] = [name for name in cols_dict.values() if name not in _meta_cols]
conti_cols.extend(
    [
        "buy_sell_volume_ratio",
        "liquidation_diff",
        "liquidation_usd_diff",
        "volume_diff",
        "buy_sell_ratio_change",
    ]
)

In [6]:
def shift_feature(
    df: pd.DataFrame,
    conti_cols: List[str],
    intervals: List[int],
) -> List[pd.Series]:
    """
    Build lagged (shifted) copies of continuous columns.

    For every column in ``conti_cols`` and every value in ``intervals`` the
    column is shifted by that many rows and renamed to ``"{column}_{interval}"``.

    Args:
        df (pd.DataFrame): frame holding the source columns
        conti_cols (List[str]): names of the continuous columns to shift
        intervals (List[int]): shift sizes passed to ``Series.shift``
    Return:
        List[pd.Series]: one Series per (column, interval) pair, ordered by
        column first and interval second
    """
    shifted: List[pd.Series] = []
    for col_name in conti_cols:
        for lag in intervals:
            lagged = df[col_name].shift(lag)
            shifted.append(lagged.rename(f"{col_name}_{lag}"))
    return shifted

# Lagged copies of every continuous column for shifts of 1 through 23 rows
# (range(1, 24) excludes 24).
shift_list = shift_feature(
    df=df,
    conti_cols=conti_cols,
    intervals=list(range(1, 24)),
)

In [7]:
# Append all shifted columns to df (column-wise concat).
_shift_df = pd.concat(shift_list, axis=1)
df = pd.concat([df, _shift_df], axis=1)

# Forward-fill every column, replace whatever is still missing with -999, and
# then restore the original target column so the target is not imputed.
_target = df["target"]
df = (
    df.ffill()
    .fillna(-999)
    .assign(target=_target)
)

# Split back into train / test using the `_type` marker, which is then dropped.
_is_train = df["_type"] == "train"
_is_test = df["_type"] == "test"
train_df = df.loc[_is_train].drop(columns=["_type"])
test_df = df.loc[_is_test].drop(columns=["_type"])

### Model Training

In [9]:
# Display the engineered training frame as the cell output.
train_df

Unnamed: 0,ID,target,coinbase_premium_gap,coinbase_premium_index,funding_rates,long_liquidations,long_liquidations_usd,short_liquidations,short_liquidations_usd,open_interest,...,buy_sell_ratio_change_14,buy_sell_ratio_change_15,buy_sell_ratio_change_16,buy_sell_ratio_change_17,buy_sell_ratio_change_18,buy_sell_ratio_change_19,buy_sell_ratio_change_20,buy_sell_ratio_change_21,buy_sell_ratio_change_22,buy_sell_ratio_change_23
0,2023-01-01 00:00:00,2.0,-9.86,-0.059650,0.005049,0.012000,1.975161e+02,0.0000,0.000000e+00,6.271344e+09,...,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000
1,2023-01-01 01:00:00,1.0,-8.78,-0.053047,0.005049,0.000000,0.000000e+00,0.7120,1.183356e+04,6.288683e+09,...,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000
2,2023-01-01 02:00:00,1.0,-9.59,-0.057952,0.005049,0.000000,0.000000e+00,0.0000,0.000000e+00,6.286796e+09,...,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000
3,2023-01-01 03:00:00,1.0,-9.74,-0.058912,0.005067,0.593000,9.754769e+03,0.0000,0.000000e+00,6.284575e+09,...,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000
4,2023-01-01 04:00:00,2.0,-10.14,-0.061373,0.006210,0.361000,5.944437e+03,0.0000,0.000000e+00,6.291582e+09,...,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2023-12-31 19:00:00,1.0,-27.10,-0.063587,0.036371,0.163000,6.924677e+03,0.1040,4.437410e+03,1.054230e+10,...,-0.129171,-0.174591,1.019229,0.055571,-0.524463,0.409076,0.713464,-0.411591,0.427440,-0.214961
8756,2023-12-31 20:00:00,1.0,-24.73,-0.058109,0.037233,29.698896,1.263031e+06,43.8396,1.870481e+06,1.051484e+10,...,0.117505,-0.129171,-0.174591,1.019229,0.055571,-0.524463,0.409076,0.713464,-0.411591,0.427440
8757,2023-12-31 21:00:00,0.0,-28.48,-0.066979,0.037761,0.325000,1.385218e+04,1.6790,7.179552e+04,1.048598e+10,...,-0.047627,0.117505,-0.129171,-0.174591,1.019229,0.055571,-0.524463,0.409076,0.713464,-0.411591
8758,2023-12-31 22:00:00,2.0,-9.08,-0.021487,0.038020,90.293123,3.815777e+06,7.7816,3.310213e+05,1.032844e+10,...,-0.254134,-0.047627,0.117505,-0.129171,-0.174591,1.019229,0.055571,-0.524463,0.409076,0.713464


In [11]:
# LightGBM hyper-parameters.
params = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "metric": "multi_logloss",
    "num_class": 4,
    "num_leaves": 50,
    "learning_rate": 0.05,
    "n_estimators": 30,
    "random_state": 42,
    "verbose": -1,
}

# TimeSeriesSplit with 5 folds.
tscv = TimeSeriesSplit(n_splits=5)

# Build the feature matrix and integer labels once instead of re-dropping the
# columns inside every fold.
cv_features = train_df.drop(['ID', 'target'], axis=1)
cv_labels = train_df['target'].astype(int)

for fold, (train_idx, valid_idx) in enumerate(tscv.split(train_df)):
    print(f"Fold {fold+1}")
    # `split` yields positional row indices, so select with .iloc; label-based
    # .loc is only correct while train_df happens to carry a 0..n-1 index.
    X_train, X_valid = cv_features.iloc[train_idx], cv_features.iloc[valid_idx]
    y_train, y_valid = cv_labels.iloc[train_idx], cv_labels.iloc[valid_idx]

    # Standardise: fit on the training fold only, then apply to the validation fold.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_valid = scaler.transform(X_valid)

    # Train the model.
    # NOTE(review): with n_estimators=30 a patience of 100 rounds can never
    # trigger early stopping (the logs say "Did not meet early stopping").
    model = lgb.LGBMClassifier(**params)
    model.fit(X_train, y_train,
              eval_set=[(X_valid, y_valid)],
              callbacks=[lgb.early_stopping(100)])

    # Class probabilities and arg-max class for both folds.
    y_train_pred = model.predict_proba(X_train)
    y_train_pred_class = np.argmax(y_train_pred, axis=1)
    y_valid_pred = model.predict_proba(X_valid)
    y_valid_pred_class = np.argmax(y_valid_pred, axis=1)

    # Accuracy and one-vs-rest AUC on train and validation.
    train_acc = accuracy_score(y_train, y_train_pred_class)
    train_auc = roc_auc_score(y_train, y_train_pred, multi_class='ovr')
    valid_acc = accuracy_score(y_valid, y_valid_pred_class)
    valid_auc = roc_auc_score(y_valid, y_valid_pred, multi_class='ovr')

    print(f'Train Acc: {train_acc}, Train AUC: {train_auc} | Valid Acc: {valid_acc}, Valid AUC: {valid_auc}')

Fold 1
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[4]	valid_0's multi_logloss: 1.20453
Train Acc: 0.7876712328767124, Train AUC: 0.9815266252382092 | Valid Acc: 0.3780821917808219, Valid AUC: 0.5401548479070304
Fold 2
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[4]	valid_0's multi_logloss: 1.20194
Train Acc: 0.7102739726027397, Train AUC: 0.9566369736123996 | Valid Acc: 0.4095890410958904, Valid AUC: 0.5364500115559951
Fold 3
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[5]	valid_0's multi_logloss: 1.21187
Train Acc: 0.6785388127853881, Train AUC: 0.931914523443497 | Valid Acc: 0.4089041095890411, Valid AUC: 0.5394308491671082
Fold 4
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[28]	valid_0's multi_logloss: 1.0351
Train Acc: 0.

In [12]:
# Hold-out validation: 80 % train / 20 % validation via train_test_split.
# NOTE(review): train_test_split shuffles rows by default; with lagged features
# on hourly data the shuffled split may be optimistic compared with the
# TimeSeriesSplit scores -- confirm which estimate should be reported.
_holdout_x = train_df.drop(["target", "ID"], axis = 1)
_holdout_y = train_df["target"].astype(int)
x_train, x_valid, y_train, y_valid = train_test_split(
    _holdout_x,
    _holdout_y,
    test_size=0.2,
    random_state=42,
)

# Standardise with statistics taken from the training part only.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_valid = scaler.transform(x_valid)

# Wrap both parts as LightGBM datasets; the validation set references the
# training set.
train_data = lgb.Dataset(x_train, label=y_train)
valid_data = lgb.Dataset(x_valid, label=y_valid, reference=train_data)

# Train with the native LightGBM API.
lgb_model = lgb.train(
    params=params,
    train_set=train_data,
    valid_sets=valid_data,
)

# Predicted class probabilities and the arg-max class on the validation part.
y_valid_pred = lgb_model.predict(x_valid)
y_valid_pred_class = np.argmax(y_valid_pred, axis = 1)

# Accuracy and one-vs-rest AUROC.
accuracy = accuracy_score(y_valid, y_valid_pred_class)
auroc = roc_auc_score(y_valid, y_valid_pred, multi_class="ovr")

print(f"acc: {accuracy}, auroc: {auroc}")

acc: 0.4149543378995434, auroc: 0.6081580523345377


In [13]:
# After checking performance, retrain on the complete training data.
# No scaler is applied in this cell, unlike the validation cells.
x_train = train_df.drop(["target", "ID"], axis = 1)
y_train = train_df["target"].astype(int)

train_data = lgb.Dataset(x_train, label=y_train)
lgb_model = lgb.train(params=params, train_set=train_data)

### Inference

In [14]:
# Predict class probabilities for the test rows, then take the arg-max class.
_test_features = test_df.drop(["target", "ID"], axis = 1)
y_test_pred = lgb_model.predict(_test_features)
y_test_pred_class = np.argmax(y_test_pred, axis = 1)

### Output File Save

In [15]:
# Put the predicted classes into the submission frame and write it to disk
# without the index column.
_output_path = "../output.csv"
submission_df = submission_df.assign(target = y_test_pred_class)
submission_df.to_csv(_output_path, index=False)