# Base code

모든 feature를 사용함

특별한 엔지니어링을 적용하지 않음

lightBGM 모델을 사용함(초기 base 코드의 하이퍼파라미터를 그대로 사용함, random seed만 0으로 변경함)

python 3.11.9 사용

In [1]:
import os
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb

from glob import glob
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## 데이터 확인하기

In [2]:
dir_path = "./data"

train_df = pd.read_csv(f"{dir_path}/train.csv")
test_df = pd.read_csv(f"{dir_path}/test.csv")
print(train_df.shape)
print(test_df.shape)

(8760, 2)
(2792, 1)


In [3]:
file_names = glob(f"{dir_path}/HOURLY_*.csv")
print(len(file_names))    # data 개수

pd.read_csv(file_names[0]).head()    # 0번째 data 확인

107


Unnamed: 0,datetime,long_liquidations,short_liquidations,long_liquidations_usd,short_liquidations_usd
0,2024-04-25 02:00:00,0.259,0.0,16558.94034,0.0
1,2024-04-25 01:00:00,0.716,0.0,45922.12276,0.0
2,2024-04-25 00:00:00,0.0,2.6431,0.0,170655.56551
3,2024-04-24 23:00:00,3.0798,0.425,196683.20466,27302.37792
4,2024-04-24 22:00:00,0.1714,0.0,10974.68367,0.0


In [4]:
# 다른 파일에 동일한 칼럼명으로 된 경우가 있는것을 확인함
for i in range(5):
    print(file_names[i])
    print(pd.read_csv(file_names[i]).columns.tolist())

./data/HOURLY_MARKET-DATA_LIQUIDATIONS_GATE_IO_ALL_SYMBOL.csv
['datetime', 'long_liquidations', 'short_liquidations', 'long_liquidations_usd', 'short_liquidations_usd']
./data/HOURLY_MARKET-DATA_OPEN-INTEREST_FTX_BTC_USD.csv
['datetime', 'open_interest']
./data/HOURLY_MARKET-DATA_OPEN-INTEREST_BYBIT.csv
['datetime', 'open_interest']
./data/HOURLY_MARKET-DATA_FUNDING-RATES_BYBIT.csv
['datetime', 'funding_rates']
./data/HOURLY_MARKET-DATA_LIQUIDATIONS_HTX_GLOBAL_ALL_SYMBOL.csv
['datetime', 'long_liquidations', 'short_liquidations', 'long_liquidations_usd', 'short_liquidations_usd']


In [5]:
# ID 와 target 데이터프레임
df = pd.concat([train_df, test_df], axis=0)
df

Unnamed: 0,ID,target
0,2023-01-01 00:00:00,2.0
1,2023-01-01 01:00:00,1.0
2,2023-01-01 02:00:00,1.0
3,2023-01-01 03:00:00,1.0
4,2023-01-01 04:00:00,2.0
...,...,...
2787,2024-04-26 03:00:00,
2788,2024-04-26 04:00:00,
2789,2024-04-26 05:00:00,
2790,2024-04-26 06:00:00,


In [6]:
df = pd.concat([train_df, test_df], axis=0)

total_col_counts = 1    # 총 칼럼 수(+ ID)
for i in tqdm(range(len(file_names))):
    dff = pd.read_csv(file_names[i])

    # columns 변경
    fname = file_names[i].split('/')[-1].split('.')[0].lower()   # 소문자 file name
    new_col = dff.columns.tolist()[1:]
    total_col_counts += len(new_col)
    new_col = ['ID'] + [f"{fname}_{col}" for col in new_col]
    dff.columns = new_col

    df = df.merge(dff, on='ID', how='left')

df.reset_index(drop=True, inplace=True)

print("total col counts: ", total_col_counts)
print(df.shape)
df

100%|██████████| 107/107 [00:03<00:00, 34.44it/s]

total col counts:  253
(11552, 254)





Unnamed: 0,ID,target,hourly_market-data_liquidations_gate_io_all_symbol_long_liquidations,hourly_market-data_liquidations_gate_io_all_symbol_short_liquidations,hourly_market-data_liquidations_gate_io_all_symbol_long_liquidations_usd,hourly_market-data_liquidations_gate_io_all_symbol_short_liquidations_usd,hourly_market-data_open-interest_ftx_btc_usd_open_interest,hourly_market-data_open-interest_bybit_open_interest,hourly_market-data_funding-rates_bybit_funding_rates,hourly_market-data_liquidations_htx_global_all_symbol_long_liquidations,...,hourly_market-data_liquidations_ftx_btc_usd_short_liquidations_usd,hourly_market-data_liquidations_huobi_global_btc_usdt_long_liquidations,hourly_market-data_liquidations_huobi_global_btc_usdt_short_liquidations,hourly_market-data_liquidations_huobi_global_btc_usdt_long_liquidations_usd,hourly_market-data_liquidations_huobi_global_btc_usdt_short_liquidations_usd,hourly_market-data_liquidations_htx_global_btc_usdt_long_liquidations,hourly_market-data_liquidations_htx_global_btc_usdt_short_liquidations,hourly_market-data_liquidations_htx_global_btc_usdt_long_liquidations_usd,hourly_market-data_liquidations_htx_global_btc_usdt_short_liquidations_usd,hourly_market-data_open-interest_htx_global_btc_usdt_open_interest
0,2023-01-01 00:00:00,2.0,0.0,0.0,0.0,0.0,,,0.01,0.0,...,,0.0,0.0,0.0,0.0,0.000,0.000,0.0000,0.0000,6.784288e+07
1,2023-01-01 01:00:00,1.0,0.0,0.0,0.0,0.0,,,0.01,0.0,...,,0.0,0.0,0.0,0.0,0.000,0.000,0.0000,0.0000,6.788941e+07
2,2023-01-01 02:00:00,1.0,0.0,0.0,0.0,0.0,,,0.01,0.0,...,,0.0,0.0,0.0,0.0,0.000,0.000,0.0000,0.0000,6.781657e+07
3,2023-01-01 03:00:00,1.0,0.0,0.0,0.0,0.0,,,0.01,0.0,...,,0.0,0.0,0.0,0.0,0.000,0.000,0.0000,0.0000,6.798192e+07
4,2023-01-01 04:00:00,2.0,0.0,0.0,0.0,0.0,,,0.01,0.0,...,,0.0,0.0,0.0,0.0,0.000,0.000,0.0000,0.0000,6.829002e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11547,2024-04-26 03:00:00,,,,,,,,,,...,,,,,,0.138,0.034,8852.6173,2197.9436,9.916199e+07
11548,2024-04-26 04:00:00,,,,,,,,,,...,,,,,,1.375,0.106,88004.5038,6836.7439,9.890358e+07
11549,2024-04-26 05:00:00,,,,,,,,,,...,,,,,,0.378,1.400,24154.4771,90469.5577,9.913138e+07
11550,2024-04-26 06:00:00,,,,,,,,,,...,,,,,,0.276,0.888,17703.9134,57387.5878,9.933858e+07


In [7]:
df.columns

Index(['ID', 'target',
       'hourly_market-data_liquidations_gate_io_all_symbol_long_liquidations',
       'hourly_market-data_liquidations_gate_io_all_symbol_short_liquidations',
       'hourly_market-data_liquidations_gate_io_all_symbol_long_liquidations_usd',
       'hourly_market-data_liquidations_gate_io_all_symbol_short_liquidations_usd',
       'hourly_market-data_open-interest_ftx_btc_usd_open_interest',
       'hourly_market-data_open-interest_bybit_open_interest',
       'hourly_market-data_funding-rates_bybit_funding_rates',
       'hourly_market-data_liquidations_htx_global_all_symbol_long_liquidations',
       ...
       'hourly_market-data_liquidations_ftx_btc_usd_short_liquidations_usd',
       'hourly_market-data_liquidations_huobi_global_btc_usdt_long_liquidations',
       'hourly_market-data_liquidations_huobi_global_btc_usdt_short_liquidations',
       'hourly_market-data_liquidations_huobi_global_btc_usdt_long_liquidations_usd',
       'hourly_market-data_liquidat

# Train

In [8]:
train_df = df[df['ID'].isin(train_df['ID'])]    # df에서 train 만 추출
print("train df shape: ", train_df.shape)

train_y = train_df['target']    # fetures
train_x = train_df.drop(['ID', 'target'], axis=1)    # target

x_train, x_valid, y_train, y_valid = train_test_split(train_x, train_y, test_size=0.2, random_state=0)
print(x_train.shape, y_train.shape)
print(x_valid.shape, y_valid.shape)

# lgb dataset
train_data = lgb.Dataset(x_train, label=y_train)
valid_data = lgb.Dataset(x_valid, label=y_valid, reference=train_data)

params = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "metric": "multi_logloss",
    "num_class": 4,
    "num_leaves": 50,
    "learning_rate": 0.05,
    "n_estimators": 30,
    "random_state": 0,
    "verbose": 0,
}

# Train
model = lgb.train(
    params=params,
    train_set=train_data,
    valid_sets=valid_data,
)

# Val
y_valid_pred = model.predict(x_valid)
y_valid_pred_class = np.argmax(y_valid_pred, axis = 1)

accuracy = accuracy_score(y_valid, y_valid_pred_class)
print(f"acc: {accuracy}")



train df shape:  (8760, 254)
(7008, 252) (7008,)
(1752, 252) (1752,)
acc: 0.4469178082191781


## Predict

In [9]:
test_df = df[df['ID'].isin(test_df['ID'])]    # df에서 test 만 추출

y_test_pred = model.predict(test_df.drop(["target", "ID"], axis = 1))
y_test_pred_class = np.argmax(y_test_pred, axis = 1)

submission_df = pd.read_csv(f"{dir_path}/test.csv")
submission_df = submission_df.assign(target = y_test_pred_class)
submission_df.to_csv("output.csv", index=False)

submission_df

Unnamed: 0,ID,target
0,2024-01-01 00:00:00,1
1,2024-01-01 01:00:00,1
2,2024-01-01 02:00:00,1
3,2024-01-01 03:00:00,2
4,2024-01-01 04:00:00,1
...,...,...
2787,2024-04-26 03:00:00,0
2788,2024-04-26 04:00:00,2
2789,2024-04-26 05:00:00,0
2790,2024-04-26 06:00:00,2


In [10]:
submission_df.target.value_counts()

target
1    1576
2     709
3     326
0     181
Name: count, dtype: int64