In [1]:
import os
from typing import List, Dict
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import lightgbm as lgb
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive

# Mount Google Drive at /content/drive so files stored there can be read and written
drive.mount('/content/drive')

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



Mounted at /content/drive


In [2]:
# Load the raw train / test files from the mounted Drive folder
data_path: str = "/content/drive/MyDrive/AITech/W6_7_ML_Basic_PJ/data"
_train_csv = os.path.join(data_path, "train.csv")
_test_csv = os.path.join(data_path, "test.csv")

# Tag every row with its origin so the combined frame can be split again later
train_df: pd.DataFrame = pd.read_csv(_train_csv).assign(_type="train")
test_df: pd.DataFrame = pd.read_csv(_test_csv).assign(_type="test")

# Separate, untagged read of test.csv; this frame later receives the predicted target
submission_df: pd.DataFrame = pd.read_csv(_test_csv)

# Stack train on top of test (row-wise) so features are built once on a single frame
df: pd.DataFrame = pd.concat([train_df, test_df])

In [3]:
# Collect the names of every CSV in data_path whose name starts with "HOURLY_"
file_names: List[str] = []
for entry in os.listdir(data_path):
    if entry.startswith("HOURLY_") and entry.endswith(".csv"):
        file_names.append(entry)

# Map "file name without .csv" -> DataFrame read from that file
file_dict: Dict[str, pd.DataFrame] = {}
for csv_name in file_names:
    file_dict[csv_name.replace(".csv", "")] = pd.read_csv(os.path.join(data_path, csv_name))

# NOTE(review): `df` is merged into itself, so re-running this cell without re-running the
# load cell merges the hourly columns a second time.
for _file_name, _df in tqdm(file_dict.items()):
    # Prefix each column with the lower-cased file name to avoid name clashes between files;
    # the datetime column becomes "ID" so it can serve as the merge key.
    _prefix = _file_name.lower()
    _rename_rule = {}
    for col in _df.columns:
        _rename_rule[col] = "ID" if col == "datetime" else f"{_prefix}_{col.lower()}"
    _df = _df.rename(columns=_rename_rule)
    df = df.merge(_df, on="ID", how="left")

100%|██████████| 107/107 [00:03<00:00, 29.51it/s]


In [8]:
# Split the merged frame back into its train / test rows using the `_type` tag
_is_train = df["_type"] == "train"
_is_test = df["_type"] == "test"
train_df = df[_is_train]
test_df = df[_is_test]

# Number of missing entries in each column of the train rows
missing_values = train_df.isna().sum()

# The same counts expressed as a percentage of the number of train rows
missing_percentage = missing_values.div(len(train_df)).mul(100)

# Columns with the largest share of missing values come first
sorted_missing_percentage = missing_percentage.sort_values(ascending=False)
sorted_missing_percentage

Unnamed: 0,0
hourly_market-data_liquidations_okex_all_symbol_short_liquidations,100.0
hourly_market-data_liquidations_okex_btc_usdt_short_liquidations_usd,100.0
hourly_market-data_liquidations_okex_btc_usdt_short_liquidations,100.0
hourly_market-data_liquidations_okex_btc_usdt_long_liquidations,100.0
hourly_market-data_funding-rates_okex_funding_rates,100.0
...,...
target,0.0
hourly_market-data_open-interest_htx_global_all_symbol_open_interest,0.0
hourly_market-data_liquidations_htx_global_btc_usd_short_liquidations_usd,0.0
hourly_market-data_liquidations_htx_global_btc_usd_long_liquidations_usd,0.0


# Baseline - LightGBM

In [9]:
# Columns fed to the model, declared up front together with their short names
# (key: merged column name, value: new name used from here on)
cols_dict: Dict[str, str] = {
    "ID": "ID",
    "target": "target",
    "_type": "_type",
    "hourly_market-data_coinbase-premium-index_coinbase_premium_gap": "coinbase_premium_gap",
    "hourly_market-data_coinbase-premium-index_coinbase_premium_index": "coinbase_premium_index",
    "hourly_market-data_funding-rates_all_exchange_funding_rates": "funding_rates",
    "hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations": "long_liquidations",
    "hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations_usd": "long_liquidations_usd",
    "hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations": "short_liquidations",
    "hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations_usd": "short_liquidations_usd",
    "hourly_market-data_open-interest_all_exchange_all_symbol_open_interest": "open_interest",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_ratio": "buy_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_sell_ratio": "buy_sell_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_volume": "buy_volume",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_ratio": "sell_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_volume": "sell_volume",
    "hourly_network-data_addresses-count_addresses_count_active": "active_count",
    "hourly_network-data_addresses-count_addresses_count_receiver": "receiver_count",
    "hourly_network-data_addresses-count_addresses_count_sender": "sender_count",
}

# Keep only the selected columns (in dict order) and apply the short names
_selected_cols = list(cols_dict)
df = df[_selected_cols].rename(columns=cols_dict)
df.shape

(11552, 19)

In [10]:
# Features derived from the EDA: long/short and buy/sell gaps, the sign of each gap,
# and a buy-to-sell volume ratio (the +1 in the denominator presumably avoids dividing by zero)
_liq_gap = df["long_liquidations"] - df["short_liquidations"]
_liq_usd_gap = df["long_liquidations_usd"] - df["short_liquidations_usd"]
_vol_gap = df["buy_volume"] - df["sell_volume"]

df = df.assign(
    liquidation_diff=_liq_gap,
    liquidation_usd_diff=_liq_usd_gap,
    volume_diff=_vol_gap,
    liquidation_diffg=np.sign(_liq_gap),
    liquidation_usd_diffg=np.sign(_liq_usd_gap),
    volume_diffg=np.sign(_vol_gap),
    buy_sell_volume_ratio=df["buy_volume"] / (df["sell_volume"] + 1),
)

# Keep separate lists of the categorical (sign) columns and the continuous columns
category_cols: List[str] = ["liquidation_diffg", "liquidation_usd_diffg", "volume_diffg"]

_meta_cols = ["ID", "target", "_type"]
conti_cols: List[str] = [name for name in cols_dict.values() if name not in _meta_cols]
conti_cols.extend(
    [
        "buy_sell_volume_ratio",
        "liquidation_diff",
        "liquidation_usd_diff",
        "volume_diff",
    ]
)

In [11]:
def shift_feature(
    df: pd.DataFrame,
    conti_cols: List[str],
    intervals: List[int],
) -> List[pd.Series]:
    """
    Build shifted (lagged) copies of continuous columns.

    Args:
        df (pd.DataFrame): frame holding the columns to shift
        conti_cols (List[str]): names of the continuous columns
        intervals (List[int]): offsets passed to `Series.shift`
    Return:
        List[pd.Series]: one series per (column, interval) pair, named
        `{column}_{interval}`; all intervals of the first column come first,
        then all intervals of the second column, and so on
    """
    shifted: List[pd.Series] = []
    for col_name in conti_cols:
        for lag in intervals:
            lagged = df[col_name].shift(lag)
            shifted.append(lagged.rename(f"{col_name}_{lag}"))
    return shifted

# Shifted features for every continuous column at offsets 1 through 23 rows
# (range(1, 24) excludes 24, so the largest shift is 23)
shift_list = shift_feature(df=df, conti_cols=conti_cols, intervals=list(range(1, 24)))

In [12]:
# Append all shifted series to df as new columns
_shift_df = pd.concat(shift_list, axis=1)
df = pd.concat([df, _shift_df], axis=1)

# Forward-fill every column, then mark whatever is still missing with -999.
# The target is saved beforehand and written back afterwards so it is left unfilled.
_target = df["target"]
_filled = df.ffill().fillna(-999)
df = _filled.assign(target=_target)

# Split into train / test by `_type` and drop the tag column
train_df = df[df["_type"] == "train"].drop("_type", axis=1)
test_df = df[df["_type"] == "test"].drop("_type", axis=1)

### Model Training

In [13]:
# Hold out 20% of the train rows as a validation set.
# NOTE(review): train_test_split shuffles rows by default, while the features include shifted
# copies of neighbouring rows - confirm a random split is intended for this data.
_features = train_df.drop(columns=["target", "ID"])
_labels = train_df["target"].astype(int)
x_train, x_valid, y_train, y_valid = train_test_split(
    _features, _labels, test_size=0.2, random_state=42
)

# LightGBM datasets; the validation set is declared with the training set as its reference
train_data = lgb.Dataset(x_train, label=y_train)
valid_data = lgb.Dataset(x_valid, label=y_valid, reference=train_data)

# LightGBM parameters: 4-class classification evaluated with multi-class log loss
params = dict(
    boosting_type="gbdt",
    objective="multiclass",
    metric="multi_logloss",
    num_class=4,
    num_leaves=50,
    learning_rate=0.05,
    n_estimators=30,
    random_state=42,
    verbose=0,
)

# Train the booster, evaluating on the validation set
lgb_model = lgb.train(params=params, train_set=train_data, valid_sets=valid_data)

# Per-class scores for the validation rows; the predicted class is the highest-scoring column
y_valid_pred = lgb_model.predict(x_valid)
y_valid_pred_class = y_valid_pred.argmax(axis=1)

# Validation accuracy and one-vs-rest AUROC
accuracy = accuracy_score(y_valid, y_valid_pred_class)
auroc = roc_auc_score(y_valid, y_valid_pred, multi_class="ovr")

print(f"acc: {accuracy}, auroc: {auroc}")



acc: 0.4223744292237443, auroc: 0.6145133857950079


In [16]:
# Score the test rows with the trained LightGBM model
x_test = test_df.drop(columns=["target", "ID"])

y_test_pred = lgb_model.predict(x_test)
# Predicted class = column with the highest score in each row
y_test_pred_class = y_test_pred.argmax(axis=1)

In [19]:
# Put the predicted class labels into the submission frame's target column and show it
submission_df = submission_df.assign(target=y_test_pred_class)
print(submission_df)

                       ID  target
0     2024-01-01 00:00:00       1
1     2024-01-01 01:00:00       1
2     2024-01-01 02:00:00       2
3     2024-01-01 03:00:00       2
4     2024-01-01 04:00:00       1
...                   ...     ...
2787  2024-04-26 03:00:00       2
2788  2024-04-26 04:00:00       2
2789  2024-04-26 05:00:00       2
2790  2024-04-26 06:00:00       1
2791  2024-04-26 07:00:00       1

[2792 rows x 2 columns]


In [22]:
# Write the LightGBM submission file.
# index=False keeps the pandas row index out of the file, so it holds only the frame's own
# columns; the default (index=True) would add an extra unnamed leading column.
# NOTE(review): the file name has no ".csv" extension - confirm this is intended.
submission_df.to_csv(
    "/content/drive/MyDrive/AITech/W6_7_ML_Basic_PJ/sample_submission_1_lgbm",
    index=False,
)

# XGBoost

In [23]:
import xgboost as xgb

In [31]:
# Hold out 20% of the train rows as a validation set
x_train, x_valid, y_train, y_valid = train_test_split(
    train_df.drop(["target", "ID"], axis=1),  # features: every column except target and ID
    train_df["target"].astype(int),  # target cast to integer class labels
    test_size=0.2,  # 20% of the rows go to validation
    random_state=42,  # fixed seed so the split is reproducible
)

# Wrap both splits as XGBoost DMatrix objects
train_data = xgb.DMatrix(data=x_train, label=y_train)  # training data
valid_data = xgb.DMatrix(data=x_valid, label=y_valid)  # validation data

# XGBoost parameters.
# The number of boosting rounds is set by num_boost_round in xgb.train below; an
# "n_estimators" entry here is not read by the native training API (it only produced a
# 'Parameters: { "n_estimators" } are not used.' warning), so it is not included.
params = {
    "objective": "multi:softprob",  # multi-class objective that outputs per-class probabilities
    "eval_metric": "mlogloss",  # multi-class log loss
    "num_class": 4,  # number of classes
    "learning_rate": 0.05,  # learning rate
    "max_depth": 6,  # maximum tree depth
    "random_state": 42,  # random seed
    "verbosity": 1,  # library log level
}

# Train the booster
xgb_model = xgb.train(
    params=params,  # parameters defined above
    dtrain=train_data,  # training data
    num_boost_round=30,  # number of boosting rounds
    evals=[(valid_data, "Validation")],  # evaluated (and logged) every round
    early_stopping_rounds=10,  # stop if the validation metric has not improved for 10 rounds
    verbose_eval=True,  # print the evaluation metric while training
)

# Predicted class probabilities for the validation rows, then the most probable class
y_valid_pred = xgb_model.predict(valid_data)
y_valid_pred_class = np.argmax(y_valid_pred, axis=1)

# Validation accuracy and one-vs-rest AUROC
accuracy = accuracy_score(y_valid, y_valid_pred_class)
auroc = roc_auc_score(y_valid, y_valid_pred, multi_class="ovr")

print(f"acc: {accuracy}, auroc: {auroc}")  # print the scores


Parameters: { "n_estimators" } are not used.



[0]	Validation-mlogloss:1.37107
[1]	Validation-mlogloss:1.35713
[2]	Validation-mlogloss:1.34341
[3]	Validation-mlogloss:1.33067
[4]	Validation-mlogloss:1.31876
[5]	Validation-mlogloss:1.30775
[6]	Validation-mlogloss:1.29770
[7]	Validation-mlogloss:1.28861
[8]	Validation-mlogloss:1.28013
[9]	Validation-mlogloss:1.27223
[10]	Validation-mlogloss:1.26508
[11]	Validation-mlogloss:1.25781
[12]	Validation-mlogloss:1.25127
[13]	Validation-mlogloss:1.24448
[14]	Validation-mlogloss:1.23883
[15]	Validation-mlogloss:1.23346
[16]	Validation-mlogloss:1.22770
[17]	Validation-mlogloss:1.22194
[18]	Validation-mlogloss:1.21676
[19]	Validation-mlogloss:1.21172
[20]	Validation-mlogloss:1.20720
[21]	Validation-mlogloss:1.20306
[22]	Validation-mlogloss:1.19927
[23]	Validation-mlogloss:1.19546
[24]	Validation-mlogloss:1.19170
[25]	Validation-mlogloss:1.18860
[26]	Validation-mlogloss:1.18552
[27]	Validation-mlogloss:1.18273
[28]	Validation-mlogloss:1.18019
[29]	Validation-mlogloss:1.17716
acc: 0.4355022831050

In [33]:
# Score the test rows with the trained XGBoost model
# (test features are wrapped in a DMatrix because the booster's predict is called with one)
x_test = xgb.DMatrix(data=test_df.drop(columns=["target", "ID"]))

y_test_pred = xgb_model.predict(x_test)
# Predicted class = column with the highest probability in each row
y_test_pred_class = y_test_pred.argmax(axis=1)

In [34]:
# Display the predicted class index array for the test rows
y_test_pred_class

array([1, 1, 2, ..., 1, 2, 2])

In [35]:
# Put the predicted class labels into the submission frame's target column and show it
submission_df = submission_df.assign(target=y_test_pred_class)
print(submission_df)

                       ID  target
0     2024-01-01 00:00:00       1
1     2024-01-01 01:00:00       1
2     2024-01-01 02:00:00       2
3     2024-01-01 03:00:00       2
4     2024-01-01 04:00:00       1
...                   ...     ...
2787  2024-04-26 03:00:00       2
2788  2024-04-26 04:00:00       2
2789  2024-04-26 05:00:00       1
2790  2024-04-26 06:00:00       2
2791  2024-04-26 07:00:00       2

[2792 rows x 2 columns]


In [36]:
# Write the XGBoost submission file.
# index=False keeps the pandas row index out of the file, so it holds only the frame's own
# columns; the default (index=True) would add an extra unnamed leading column.
submission_df.to_csv(
    "/content/drive/MyDrive/AITech/W6_7_ML_Basic_PJ/submission/sample_submission_2_xgboost.csv",
    index=False,
)