In [None]:
# Unpack the dataset archive; per the listing printed by tar it extracts into ./data.
# NOTE(review): "/content/data.tar.gz" is an absolute, presumably Colab-specific path —
# adjust it when running this notebook elsewhere.
!tar -xzvf "/content/data.tar.gz"

._data
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.macl'
data/
data/HOURLY_MARKET-DATA_LIQUIDATIONS_GATE_IO_ALL_SYMBOL.csv
data/HOURLY_MARKET-DATA_OPEN-INTEREST_FTX_BTC_USD.csv
data/HOURLY_MARKET-DATA_OPEN-INTEREST_BYBIT.csv
data/HOURLY_MARKET-DATA_FUNDING-RATES_BYBIT.csv
data/HOURLY_MARKET-DATA_LIQUIDATIONS_HTX_GLOBAL_ALL_SYMBOL.csv
data/HOURLY_MARKET-DATA_LIQUIDATIONS_BINANCE_BTC_USDT.csv
data/HOURLY_MARKET-DATA_LIQUIDATIONS_BITMEX_BTC_USD.csv
data/HOURLY_NETWORK-DATA_BLOCK-BYTES.csv
data/HOURLY_MARKET-DATA_OPEN-INTEREST_BINANCE_BTC_BUSD.csv
data/HOURLY_MARKET-DATA_OPEN-INTEREST_KRAKEN.csv
data/HOURLY_MARKET-DATA_LIQUIDATIONS_GATE_IO_BTC_USD.csv
data/HOURLY_MARKET-DATA_OPEN-INTEREST_OKX.csv
data/HOURLY_NETWORK-DATA_BLOCK-COUNT.csv
data/HOURLY_MARKET-DATA_LIQUIDATIONS_OKEX_BTC_USD.csv
data/HOURLY_NETWORK-DATA_UTXO-COUNT.csv
data/HOURLY_MARKET-DATA_OPEN-INTEREST_DERIBIT.csv
data/HOURLY_MARKET-DATA_OPEN-INTEREST_DERIBIT_BTC_USD.csv
data/._.DS_Store
tar: Ign

In [None]:
import os
from typing import List, Dict
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score
import lightgbm as lgb
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, LSTM, GRU, Conv1D, Dense, Dropout, GlobalMaxPooling1D, concatenate
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf

import seaborn as sns

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
# Load the competition files
data_path: str = "data"

# Tag every row with its origin so the combined frame can be split again later
train_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "train.csv"))
train_df["_type"] = "train"
test_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv"))
test_df["_type"] = "test"

# Untouched copy of test.csv, loaded up front to serve as the submission template
submission_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv"))

# Stack train on top of test so features are engineered over one continuous frame
df: pd.DataFrame = pd.concat([train_df, test_df], axis=0)

In [None]:
# Collect the names of every .csv file in data_path that starts with "HOURLY_"
file_names: List[str] = [
    name
    for name in os.listdir(data_path)
    if name.startswith("HOURLY_") and name.endswith(".csv")
]

# Load each file into a dict keyed by the file name without its extension
file_dict: Dict[str, pd.DataFrame] = {
    name.replace(".csv", ""): pd.read_csv(os.path.join(data_path, name))
    for name in file_names
}

for _file_name, _df in tqdm(file_dict.items()):
    # Prefix each column with the lower-cased file name to avoid collisions between
    # files; the datetime column becomes "ID" so it can serve as the join key.
    _prefix = _file_name.lower()
    _rename_rule = {
        col: "ID" if col == "datetime" else f"{_prefix}_{col.lower()}"
        for col in _df.columns
    }
    _df = _df.rename(columns=_rename_rule)
    # Left join keeps every train/test row even when a source has no value for that hour
    df = df.merge(_df, on="ID", how="left")

100%|██████████| 107/107 [00:02<00:00, 36.45it/s]


In [None]:
# Mapping from the verbose merged column names to short feature names.
# Only the columns listed here are kept; everything else from the merge is dropped.
cols_dict: Dict[str, str] = {
    # bookkeeping columns
    "ID": "ID",
    "target": "target",
    "_type": "_type",
    # market data
    "hourly_market-data_coinbase-premium-index_coinbase_premium_gap": "coinbase_premium_gap",
    "hourly_market-data_coinbase-premium-index_coinbase_premium_index": "coinbase_premium_index",
    "hourly_market-data_funding-rates_all_exchange_funding_rates": "funding_rates",
    "hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations": "long_liquidations",
    "hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations_usd": "long_liquidations_usd",
    "hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations": "short_liquidations",
    "hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations_usd": "short_liquidations_usd",
    "hourly_market-data_open-interest_all_exchange_all_symbol_open_interest": "open_interest",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_ratio": "buy_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_sell_ratio": "buy_sell_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_volume": "buy_volume",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_ratio": "sell_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_volume": "sell_volume",
    # network data
    "hourly_network-data_addresses-count_addresses_count_active": "active_count",
    "hourly_network-data_addresses-count_addresses_count_receiver": "receiver_count",
    "hourly_network-data_addresses-count_addresses_count_sender": "sender_count",
    "hourly_network-data_hashrate_hashrate": "hashrate_value",
    "hourly_network-data_transactions-count_transactions_count_total": "transaction_count",
    "hourly_network-data_velocity_velocity_supply_total": "velocity_count",
}

# Select the mapped columns (in mapping order) and apply the short names
df = df.loc[:, list(cols_dict)].rename(columns=cols_dict)
df.shape


(11552, 22)

In [None]:
def _pct_change_padded(series: pd.Series, periods: int) -> pd.Series:
    """Percentage change over `periods` rows, computed on the forward-filled series.

    Calling `pct_change` without `fill_method` relies on an implicit forward fill
    that pandas has deprecated (it emits a FutureWarning when the column has gaps).
    Forward-filling explicitly and passing `fill_method=None` yields the same values
    without depending on the deprecated default.
    """
    return series.ffill().pct_change(periods=periods, fill_method=None)


# Features found during EDA: long/short and buy/sell differences, plus the sign of each
# difference (-1, 0 or 1).
df = df.assign(
    liquidation_diff=df["long_liquidations"] - df["short_liquidations"],
    liquidation_usd_diff=df["long_liquidations_usd"] - df["short_liquidations_usd"],
    volume_diff=df["buy_volume"] - df["sell_volume"],
    liquidation_diffg=np.sign(df["long_liquidations"] - df["short_liquidations"]),
    liquidation_usd_diffg=np.sign(df["long_liquidations_usd"] - df["short_liquidations_usd"]),
    volume_diffg=np.sign(df["buy_volume"] - df["sell_volume"]),
    # +1 in the denominator avoids division by zero when sell_volume is 0
    buy_sell_volume_ratio=df["buy_volume"] / (df["sell_volume"] + 1),
)

# 3-row rolling mean / standard deviation (the first two rows of each are NaN)
df["buy_volume_ma_3h"] = df["buy_volume"].rolling(window=3).mean()
df["buy_volume_std_3h"] = df["buy_volume"].rolling(window=3).std()

df["funding_rates_ma_3h"] = df["funding_rates"].rolling(window=3).mean()
df["funding_rates_std_3h"] = df["funding_rates"].rolling(window=3).std()

# NOTE(review): these "price_close" features are computed from coinbase_premium_gap,
# not from a closing price — the column names are kept as-is for compatibility.
df["price_close_pct_change_1h"] = _pct_change_padded(df["coinbase_premium_gap"], periods=1)
df["price_close_pct_change_3h"] = _pct_change_padded(df["coinbase_premium_gap"], periods=3)

df["buy_volume_pct_change_1h"] = _pct_change_padded(df["buy_volume"], periods=1)
df["buy_volume_pct_change_3h"] = _pct_change_padded(df["buy_volume"], periods=3)

# A zero in the earlier row makes the percentage change +/-inf; consumers of these
# columns have to handle non-finite values.
df["liquidation_diff_pct_change_3h"] = _pct_change_padded(df["liquidation_diff"], periods=3)

df["buy_sell_ratio_pct_change_3h"] = _pct_change_padded(df["buy_sell_ratio"], periods=3)
df["is_buy_dominant"] = (df["buy_sell_ratio"] > 1.0).astype(int)


# Keep the categorical and continuous column names in separate lists
category_cols: List[str] = ["liquidation_diffg", "liquidation_usd_diffg", "volume_diffg"]
conti_cols: List[str] = [_ for _ in cols_dict.values() if _ not in ["ID", "target", "_type"]] + [
    "buy_sell_volume_ratio", "liquidation_diff", "liquidation_usd_diff", "volume_diff",
    "buy_volume_ma_3h", "buy_volume_std_3h", "funding_rates_ma_3h", "funding_rates_std_3h",
    "price_close_pct_change_1h", "price_close_pct_change_3h", "buy_volume_pct_change_1h", "buy_volume_pct_change_3h",
    "liquidation_diff_pct_change_3h", "buy_sell_ratio_pct_change_3h", "is_buy_dominant",
]

  df['price_close_pct_change_1h'] = df['coinbase_premium_gap'].pct_change(periods=1)
  df['price_close_pct_change_3h'] = df['coinbase_premium_gap'].pct_change(periods=3)
  df['buy_volume_pct_change_1h'] = df['buy_volume'].pct_change(periods=1)
  df['buy_volume_pct_change_3h'] = df['buy_volume'].pct_change(periods=3)
  df['buy_sell_ratio_pct_change_3h'] = df['buy_sell_ratio'].pct_change(periods=3)


In [None]:
def shift_feature(
    df: pd.DataFrame,
    conti_cols: List[str],
    intervals: List[int],
) -> List[pd.Series]:
    """
    Build lagged copies of continuous columns.

    Args:
        df (pd.DataFrame): frame holding the columns to lag
        conti_cols (List[str]): names of the continuous columns
        intervals (List[int]): number of rows to shift by, one series per value
    Return:
        List[pd.Series]: one series per (column, interval) pair, named
        "{column}_{interval}", ordered column by column and, within a column,
        in the order of `intervals`
    """
    shifted_series: List[pd.Series] = []
    for column_name in conti_cols:
        source = df[column_name]
        for lag in intervals:
            shifted_series.append(source.shift(lag).rename(f"{column_name}_{lag}"))
    return shifted_series

# Lag every continuous feature by 1 to 23 rows (range(1, 24) excludes 24)
shift_list = shift_feature(
    df=df,
    conti_cols=conti_cols,
    intervals=list(range(1, 24)),
)

In [None]:
# Attach all lag features to df with a single column-wise concat
_shifted_df = pd.concat(shift_list, axis=1)
df = pd.concat([df, _shifted_df], axis=1)

# Forward-fill every column, then mark whatever is still missing with -999.
# The original target is put back afterwards so that labels are never filled in.
_target = df["target"]
df = (
    df.ffill()
    .fillna(-999)
    .assign(target=_target)
)

# Split back into train and test using the _type marker, which is then dropped
_row_type = df["_type"]
train_df = df[_row_type == "train"].drop(columns="_type")
test_df = df[_row_type == "test"].drop(columns="_type")


In [None]:
# Plain-text preview of the first five training rows
print(train_df.head())

                    ID  target  coinbase_premium_gap  coinbase_premium_index  \
0  2023-01-01 00:00:00     2.0                 -9.86               -0.059650   
1  2023-01-01 01:00:00     1.0                 -8.78               -0.053047   
2  2023-01-01 02:00:00     1.0                 -9.59               -0.057952   
3  2023-01-01 03:00:00     1.0                 -9.74               -0.058912   
4  2023-01-01 04:00:00     2.0                -10.14               -0.061373   

   funding_rates  long_liquidations  long_liquidations_usd  \
0       0.005049              0.012              197.51610   
1       0.005049              0.000                0.00000   
2       0.005049              0.000                0.00000   
3       0.005067              0.593             9754.76891   
4       0.006210              0.361             5944.43714   

   short_liquidations  short_liquidations_usd  open_interest  ...  \
0               0.000                 0.00000   6.271344e+09  ...   
1       

In [None]:
def window_slicing(df: pd.DataFrame, window_size: int, step_size: int) -> pd.DataFrame:
    """Stack fixed-size, possibly overlapping row windows of `df` into one frame.

    Windows start at rows 0, step_size, 2 * step_size, ... and each spans
    `window_size` consecutive rows. Only complete windows are taken, so trailing
    rows that do not fill a whole window are left out. When `step_size` is smaller
    than `window_size` the windows overlap and the shared rows appear more than once.

    Args:
        df: source frame; it is not modified.
        window_size: number of rows per window.
        step_size: row offset between consecutive window starts; must be positive.

    Returns:
        The windows concatenated in order, with a fresh 0..n-1 index. If `df` is
        too short to hold a single window, an empty frame with the same columns.

    Raises:
        ValueError: if `step_size` is not positive.
    """
    if step_size <= 0:
        raise ValueError(f"step_size must be a positive integer, got {step_size}")

    window_starts = range(0, len(df) - window_size + 1, step_size)
    # pd.concat copies the data, so the slices do not need to be copied individually
    windows = [df.iloc[start:start + window_size] for start in window_starts]

    if not windows:
        # pd.concat raises on an empty list; return an empty frame with the same schema
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(windows, axis=0).reset_index(drop=True)

# Window length and stride in rows; the stride is half the window, so consecutive
# windows share 12 rows and those rows are repeated in the result.
window_size = 24
step_size = 12

# Separate the rows labelled 0 or 3 from all remaining rows
_is_0_or_3 = train_df["target"].isin([0, 3])
train_0_3 = train_df[_is_0_or_3]
train_1_2 = train_df[~_is_0_or_3]

# Window-slice only the 0/3 rows
augm_0_3 = window_slicing(train_0_3, window_size=window_size, step_size=step_size)

#train_df_aug_window = pd.concat([train_1_2, augm_0_3], axis=0)

In [None]:
def noise_injection(df: pd.DataFrame, noise_level: float = 0.01, decay: float = 0.99, seed:int = 42) -> pd.DataFrame:
    """Return a copy of `df` with decaying Gaussian noise added to every numeric column.

    For each numeric column the noise standard deviation is `noise_level` times the
    column's standard deviation, and the noise added to row i is further scaled by
    `decay ** i`, so later rows receive less noise. Non-numeric columns are copied
    unchanged and `df` itself is not modified.

    Args:
        df: input frame.
        noise_level: noise standard deviation as a fraction of each column's std.
        decay: per-row multiplicative decay applied to the noise.
        seed: seed of the private random generator, for reproducible noise.

    Returns:
        A new DataFrame with the same columns and index as `df`.
    """
    # A private RandomState yields the same stream as np.random.seed(seed) followed by
    # np.random.normal, but leaves the global NumPy random state untouched.
    rng = np.random.RandomState(seed)
    df_noisy = df.copy()
    n_rows = len(df_noisy)
    # The decay profile is the same for every column, so compute it once
    decays = decay ** np.arange(n_rows)

    for col in df_noisy.select_dtypes(include=[np.number]).columns:
        # Always draw, so the random numbers seen by later columns do not depend on
        # whether this column ends up being skipped.
        unit_noise = rng.normal(0.0, 1.0, size=n_rows)

        # Ignore +/-inf when measuring the spread: with them the std is NaN and the
        # whole column would be turned into NaN.
        scale = noise_level * df_noisy[col].replace([np.inf, -np.inf], np.nan).std()
        if not np.isfinite(scale):
            # No usable spread (e.g. fewer than two finite values): leave the column as is
            continue

        df_noisy[col] = df_noisy[col] + scale * unit_noise * decays

    return df_noisy

#train03 = train_df[train_df['target'].isin([0, 3])]

# Perturb only the feature columns: target and ID are set aside before adding noise
# and re-attached (by position) afterwards.
augm03 = augm_0_3.reset_index(drop=True)
_features_0_3 = augm_0_3.drop(columns=["target", "ID"])
df_noisy = (
    noise_injection(_features_0_3)
    .reset_index(drop=True)
    .assign(
        target=augm03["target"].values,
        ID=augm03["ID"].values,
    )
)

# Augmented training set: the untouched remaining rows plus the noisy 0/3 windows
train_df_augm = pd.concat([train_1_2, df_noisy], axis=0)


  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  sqr = _ensure_numeric((avg - values) ** 2)


In [None]:
print(len(df_noisy))  # check the number of rows in df_noisy
# print(len(train03))

3048


In [None]:
# Summary of the combined frame (row count, column count, dtypes, memory)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11552 entries, 0 to 11551
Columns: 512 entries, ID to receiver_count_std_68
dtypes: float64(507), int64(3), object(2)
memory usage: 45.1+ MB


In [None]:
# NOTE(review): this cell repeats the fill-and-split statements that already appear
# in an earlier cell. It rebuilds train_df / test_df from df, and the modelling
# cells further down read train_df rather than train_df_augm — confirm that the
# augmented set is meant to be unused.
_target = df["target"]
df = df.ffill().fillna(-999).assign(target = _target)

# Split into train and test according to _type
train_df = df.loc[df["_type"]=="train"].drop(columns=["_type"])
test_df = df.loc[df["_type"]=="test"].drop(columns=["_type"])

In [None]:
# Summary of the training frame after the split
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8760 entries, 0 to 8759
Columns: 511 entries, ID to receiver_count_std_68
dtypes: float64(507), int64(3), object(1)
memory usage: 34.2+ MB


In [None]:
# Display the training frame (the notebook shows a truncated view)
train_df

Unnamed: 0,ID,target,coinbase_premium_gap,coinbase_premium_index,funding_rates,long_liquidations,long_liquidations_usd,short_liquidations,short_liquidations_usd,open_interest,...,buy_ratio_mean_68,buy_ratio_std_68,sell_ratio_mean_68,sell_ratio_std_68,active_count_mean_68,active_count_std_68,sender_count_mean_68,sender_count_std_68,receiver_count_mean_68,receiver_count_std_68
0,2023-01-01 00:00:00,2.0,-9.86,-0.059650,0.005049,0.012000,1.975161e+02,0.0000,0.000000e+00,6.271344e+09,...,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000
1,2023-01-01 01:00:00,1.0,-8.78,-0.053047,0.005049,0.000000,0.000000e+00,0.7120,1.183356e+04,6.288683e+09,...,0.475003,-999.000000,0.524997,-999.000000,67987.000000,-999.000000,37307.000000,-999.000000,37752.000000,-999.000000
2,2023-01-01 02:00:00,1.0,-9.59,-0.057952,0.005049,0.000000,0.000000e+00,0.0000,0.000000e+00,6.286796e+09,...,0.549229,0.104971,0.450771,0.104971,49290.000000,26441.550976,24824.500000,17652.920792,29143.000000,12174.964558
3,2023-01-01 03:00:00,1.0,-9.74,-0.058912,0.005067,0.593000,9.754769e+03,0.0000,0.000000e+00,6.284575e+09,...,0.535081,0.078166,0.464919,0.078166,44159.000000,20701.673652,22462.000000,13136.096262,25885.000000,10293.618071
4,2023-01-01 04:00:00,2.0,-10.14,-0.061373,0.006210,0.361000,5.944437e+03,0.0000,0.000000e+00,6.291582e+09,...,0.517940,0.072447,0.482060,0.072447,41298.500000,17844.776145,19701.750000,12062.915772,25363.500000,8469.173376
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2023-12-31 19:00:00,1.0,-27.10,-0.063587,0.036371,0.163000,6.924677e+03,0.1040,4.437410e+03,1.054230e+10,...,0.494227,0.051687,0.505773,0.051687,54661.279412,18325.810803,37880.191176,14646.296444,19507.852941,6723.192854
8756,2023-12-31 20:00:00,1.0,-24.73,-0.058109,0.037233,29.698896,1.263031e+06,43.8396,1.870481e+06,1.051484e+10,...,0.494465,0.051799,0.505535,0.051799,55017.764706,18459.927189,38216.250000,14729.725433,19536.117647,6737.071317
8757,2023-12-31 21:00:00,0.0,-28.48,-0.066979,0.037761,0.325000,1.385218e+04,1.6790,7.179552e+04,1.048598e+10,...,0.495453,0.051650,0.504547,0.051650,55185.132353,18231.532472,38413.794118,14447.962001,19504.838235,6763.046489
8758,2023-12-31 22:00:00,2.0,-9.08,-0.021487,0.038020,90.293123,3.815777e+06,7.7816,3.310213e+05,1.032844e+10,...,0.496182,0.051569,0.503818,0.051569,54544.264706,18342.476238,37963.308824,14565.333685,19270.764706,6744.871874


In [None]:
# Feature matrices: ID is an identifier and target is the label, so both are dropped
X_train = train_df.drop(["target", "ID"], axis=1).values
y_train = train_df["target"].values

X_test = test_df.drop(["target", "ID"], axis=1).values

# Feature scaling.
# Percentage-change features can hold +/-inf (division by zero). np.nan_to_num would
# by default turn these into the largest finite floats, which makes MinMaxScaler
# overflow when it computes data_max - data_min. They are mapped instead to -999,
# the sentinel this notebook already uses for missing values. NaN still becomes 0.0.
scaler = MinMaxScaler()
X_train = np.nan_to_num(X_train, posinf=-999, neginf=-999)
X_train_scaled = scaler.fit_transform(X_train)
X_test = np.nan_to_num(X_test, posinf=-999, neginf=-999)
# The test set is transformed with the statistics fitted on the training set
X_test_scaled = scaler.transform(X_test)

# Reshape the time series into the (samples, timesteps, features) input the
# recurrent/convolutional layers expect. The sample for row i holds the
# `timesteps` rows *before* i (rows i-timesteps .. i-1) and is labelled y_train[i].
timesteps = 10
X_train_seq, y_train_seq = [], []

for i in range(timesteps, len(X_train_scaled)):
    X_train_seq.append(X_train_scaled[i - timesteps:i])
    y_train_seq.append(y_train[i])

X_train_seq, y_train_seq = np.array(X_train_seq), np.array(y_train_seq)

# Train / validation split.
# Consecutive windows share timesteps-1 rows, so a shuffled split would place
# near-duplicates of training samples in the validation set and inflate the
# validation score. The split is chronological: the last 20% of windows are held out.
X_train_seq, X_valid_seq, y_train_seq, y_valid_seq = train_test_split(
    X_train_seq, y_train_seq, test_size=0.2, shuffle=False
)

def create_lstm_model(input_shape, units: int = 50, dropout_rate: float = 0.2):
    """Build the LSTM feature-extractor branch.

    Two stacked LSTM layers, each followed by dropout. The first returns the full
    sequence so the second can consume it; the second returns only its final
    output, so the branch output has `units` features per sample.

    Args:
        input_shape: shape of one sample, passed straight to `Input`.
        units: number of units in each LSTM layer.
        dropout_rate: dropout rate applied after each LSTM layer.

    Returns:
        An uncompiled Keras `Model` with no classification head.
    """
    inputs = Input(shape=input_shape)
    x = LSTM(units=units, return_sequences=True)(inputs)
    x = Dropout(dropout_rate)(x)
    x = LSTM(units=units, return_sequences=False)(x)
    x = Dropout(dropout_rate)(x)
    return Model(inputs=inputs, outputs=x)

# GRU model definition
def create_gru_model(input_shape, units: int = 50, dropout_rate: float = 0.2):
    """Build the GRU feature-extractor branch.

    Two stacked GRU layers, each followed by dropout. The first returns the full
    sequence so the second can consume it; the second returns only its final
    output, so the branch output has `units` features per sample.

    Args:
        input_shape: shape of one sample, passed straight to `Input`.
        units: number of units in each GRU layer.
        dropout_rate: dropout rate applied after each GRU layer.

    Returns:
        An uncompiled Keras `Model` with no classification head.
    """
    inputs = Input(shape=input_shape)
    x = GRU(units=units, return_sequences=True)(inputs)
    x = Dropout(dropout_rate)(x)
    x = GRU(units=units, return_sequences=False)(x)
    x = Dropout(dropout_rate)(x)
    return Model(inputs=inputs, outputs=x)

# 1D CNN model definition
def create_cnn_model(input_shape, filters: int = 64, kernel_size: int = 3, dropout_rate: float = 0.2):
    """Build the 1D-convolutional feature-extractor branch.

    Two ReLU Conv1D layers, each followed by dropout, then a global max pool over
    the time axis, so the branch output has `filters` features per sample.

    Args:
        input_shape: shape of one sample, passed straight to `Input`.
        filters: number of filters in each Conv1D layer.
        kernel_size: width of each convolution kernel.
        dropout_rate: dropout rate applied after each Conv1D layer.

    Returns:
        An uncompiled Keras `Model` with no classification head.
    """
    inputs = Input(shape=input_shape)
    # No padding argument is passed, so the Keras default applies; presumably each
    # convolution shortens the sequence by kernel_size - 1 steps — confirm the
    # sequence length is large enough for two of them.
    x = Conv1D(filters=filters, kernel_size=kernel_size, activation='relu')(inputs)
    x = Dropout(dropout_rate)(x)
    x = Conv1D(filters=filters, kernel_size=kernel_size, activation='relu')(x)
    x = Dropout(dropout_rate)(x)
    x = GlobalMaxPooling1D()(x)
    return Model(inputs=inputs, outputs=x)

# Build the ensemble: every branch receives samples of the same shape
input_shape = tuple(X_train_seq.shape[1:3])

lstm_model = create_lstm_model(input_shape)
gru_model = create_gru_model(input_shape)
cnn_model = create_cnn_model(input_shape)
_branches = [lstm_model, gru_model, cnn_model]

# Join the feature vectors produced by the three branches
combined = concatenate([branch.output for branch in _branches])

# Final classification layer: softmax over 4 classes
output = Dense(units=4, activation="softmax")(combined)

# The ensemble has one input per branch and a single shared output
ensemble_model = Model(inputs=[branch.input for branch in _branches], outputs=output)

# Compile; the sparse loss takes integer class labels rather than one-hot vectors
ensemble_model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# Train; the same sequence tensor is fed to each of the three inputs
ensemble_model.fit(
    x=[X_train_seq] * len(_branches),
    y=y_train_seq,
    validation_data=([X_valid_seq] * len(_branches), y_valid_seq),
    batch_size=32,
    epochs=20,
)




  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  data_range = data_max - data_min
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


Epoch 1/20
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 18ms/step - accuracy: 0.4078 - loss: 1.2573 - val_accuracy: 0.4171 - val_loss: 1.1653
Epoch 2/20
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.4173 - loss: 1.1813 - val_accuracy: 0.4171 - val_loss: 1.1698
Epoch 3/20
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 17ms/step - accuracy: 0.4099 - loss: 1.1755 - val_accuracy: 0.4171 - val_loss: 1.1612
Epoch 4/20
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 16ms/step - accuracy: 0.4265 - loss: 1.1447 - val_accuracy: 0.4080 - val_loss: 1.1503
Epoch 5/20
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.4137 - loss: 1.1527 - val_accuracy: 0.4171 - val_loss: 1.1525
Epoch 6/20
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.4277 - loss: 1.1539 - val_accuracy: 0.4177 - val_loss: 1.1438
Epoch 7/20
[1m219/21

<keras.src.callbacks.history.History at 0x7dca12ff8f70>

In [None]:
# Build the test windows the same way as the training windows: the sample for
# test row i holds rows i-timesteps .. i-1, so the first `timesteps` rows get no sample.
X_test_seq = np.array(
    [X_test_scaled[end - timesteps:end] for end in range(timesteps, len(X_test_scaled))]
)

# Predict on the test windows; the same tensor feeds each of the three inputs
y_test_pred = ensemble_model.predict([X_test_seq] * 3)

# Convert the class probabilities into class labels
y_test_pred_class = y_test_pred.argmax(axis=1).astype(int)

[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step


In [None]:
# Number of predictions produced so far
len(y_test_pred_class)

2782

In [None]:
# The leading test rows have no complete look-back window inside the test set, so
# they have no prediction yet. Instead of filling them with training *labels*,
# predict them with the model, borrowing the last `timesteps` scaled training rows
# as look-back context.
# NOTE(review): this assumes the training rows immediately precede the test rows in
# time (the displayed frames show train ending 2023-12-31 and test starting
# 2024-01-01) — confirm.
# The guard makes the cell safe to re-run: once every test row has a prediction,
# nothing more is prepended.
_n_missing = len(X_test_scaled) - len(y_test_pred_class)
if _n_missing > 0:
    _context = np.concatenate([X_train_scaled[-timesteps:], X_test_scaled[:_n_missing]])
    # Test row j sits at position timesteps + j in _context, so the `timesteps` rows
    # before it are _context[j : j + timesteps]
    _head_seq = np.array([_context[j:j + timesteps] for j in range(_n_missing)])
    _head_pred = ensemble_model.predict([_head_seq, _head_seq, _head_seq])
    _head_pred_class = np.argmax(_head_pred, axis=1).astype(int)
    y_test_pred_class = np.concatenate((_head_pred_class, y_test_pred_class))

In [None]:
# Store the predicted classes as integers in the submission frame and write it to disk
submission_df = submission_df.assign(target=y_test_pred_class).astype({"target": int})
submission_df.to_csv("lstm_output.csv", index=False)


In [None]:
# Check the dtype of the submitted target column
print(submission_df["target"].dtype)

int64


In [None]:
# Read the written submission file back to inspect its contents
pd.read_csv("lstm_output.csv")

Unnamed: 0,ID,target
0,2024-01-01 00:00:00,2
1,2024-01-01 01:00:00,1
2,2024-01-01 02:00:00,1
3,2024-01-01 03:00:00,1
4,2024-01-01 04:00:00,2
...,...,...
2787,2024-04-26 03:00:00,1
2788,2024-04-26 04:00:00,1
2789,2024-04-26 05:00:00,1
2790,2024-04-26 06:00:00,1
