In [1]:
import os
from typing import List, Dict
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import lightgbm as lgb
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt

import seaborn as sns



In [2]:
# 파일 호출
data_path: str = "data"
train_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "train.csv")).assign(_type="train") # train 에는 _type = train 
test_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv")).assign(_type="test") # test 에는 _type = test
submission_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv")) # ID, target 열만 가진 데이터 미리 호출
df: pd.DataFrame = pd.concat([train_df, test_df], axis=0)
df


Unnamed: 0,ID,target,_type
0,2023-01-01 00:00:00,2.0,train
1,2023-01-01 01:00:00,1.0,train
2,2023-01-01 02:00:00,1.0,train
3,2023-01-01 03:00:00,1.0,train
4,2023-01-01 04:00:00,2.0,train
...,...,...,...
2787,2024-04-26 03:00:00,,test
2788,2024-04-26 04:00:00,,test
2789,2024-04-26 05:00:00,,test
2790,2024-04-26 06:00:00,,test


In [3]:
file_names: List[str] = [
    f for f in os.listdir(data_path) if f.startswith("HOURLY_") and f.endswith(".csv")
]
# 파일명 : 데이터프레임으로 딕셔너리 형태로 저장
file_dict: Dict[str, pd.DataFrame] = {
    f.replace(".csv", ""): pd.read_csv(os.path.join(data_path, f)) for f in file_names
}

for _file_name, _df in tqdm(file_dict.items()):
    # 열 이름 중복 방지를 위해 {_file_name.lower()}_{col.lower()}로 변경, datetime 열을 ID로 변경
    _rename_rule = {
        col: f"{_file_name.lower()}_{col.lower()}" if col != "datetime" else "ID"
        for col in _df.columns
    }
    _df = _df.rename(_rename_rule, axis=1)
    df = df.merge(_df, on="ID", how="left")

100%|██████████| 107/107 [00:01<00:00, 60.99it/s]


In [4]:
cols_dict: Dict[str, str] = {
    "ID": "ID",
    "target": "target",
    "_type": "_type",
    "hourly_market-data_coinbase-premium-index_coinbase_premium_gap": "coinbase_premium_gap",
    "hourly_market-data_coinbase-premium-index_coinbase_premium_index": "coinbase_premium_index",
    "hourly_market-data_funding-rates_all_exchange_funding_rates": "funding_rates",
    "hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations": "long_liquidations",
    "hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations_usd": "long_liquidations_usd",
    "hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations": "short_liquidations",
    "hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations_usd": "short_liquidations_usd",
    "hourly_market-data_open-interest_all_exchange_all_symbol_open_interest": "open_interest",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_ratio": "buy_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_sell_ratio": "buy_sell_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_volume": "buy_volume",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_ratio": "sell_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_volume": "sell_volume",
    "hourly_network-data_addresses-count_addresses_count_active": "active_count",
    "hourly_network-data_addresses-count_addresses_count_receiver": "receiver_count",
    "hourly_network-data_addresses-count_addresses_count_sender": "sender_count",
}
df = df[cols_dict.keys()].rename(cols_dict, axis=1)
df.shape


(11552, 19)

In [5]:
df = df.assign(
    liquidation_diff=df["long_liquidations"] - df["short_liquidations"],
    liquidation_usd_diff=df["long_liquidations_usd"] - df["short_liquidations_usd"],
    volume_diff=df["buy_volume"] - df["sell_volume"],
    liquidation_diffg=np.sign(df["long_liquidations"] - df["short_liquidations"]),
    liquidation_usd_diffg=np.sign(df["long_liquidations_usd"] - df["short_liquidations_usd"]),
    volume_diffg=np.sign(df["buy_volume"] - df["sell_volume"]),
    buy_sell_volume_ratio=df["buy_volume"] / (df["sell_volume"] + 1),
)
# category, continuous 열을 따로 할당해둠

In [6]:
exclude_cols: List[str] = ["liquidation_diffg", "liquidation_usd_diffg", "volume_diffg"]
conti_cols: List[str] = [_ for _ in cols_dict.values() if _ not in ["ID", "target", "_type"]+exclude_cols] +[
    "buy_sell_volume_ratio",
    "liquidation_diff",
    "liquidation_usd_diff",
    "volume_diff",
]



In [7]:
conti_cols

['coinbase_premium_gap',
 'coinbase_premium_index',
 'funding_rates',
 'long_liquidations',
 'long_liquidations_usd',
 'short_liquidations',
 'short_liquidations_usd',
 'open_interest',
 'buy_ratio',
 'buy_sell_ratio',
 'buy_volume',
 'sell_ratio',
 'sell_volume',
 'active_count',
 'receiver_count',
 'sender_count',
 'buy_sell_volume_ratio',
 'liquidation_diff',
 'liquidation_usd_diff',
 'volume_diff']

# check for interval using PACF



In [7]:
from statsmodels.graphics.tsaplots import plot_pacf
import matplotlib.pyplot as plt
eda=df[df['_type']=='train']

fig, axes = plt.subplots(2, 2, figsize=(12, 10))  # 2 rows, 2 columns of plots

plot_pacf(eda['buy_volume'], lags=50, ax=axes[0, 0])
axes[0, 0].set_title('PACF for Buy Volume')

plot_pacf(eda['sell_volume'], lags=50, ax=axes[0, 1])
axes[0, 1].set_title('PACF for Sell Volume')

plot_pacf(eda['sender_count'].dropna(), lags=50, ax=axes[1, 0])
axes[1, 0].set_title('PACF for longliquidations')

plot_pacf(eda['coinbase_premium_index'].dropna(), lags=50, ax=axes[1, 1])
axes[1, 1].set_title('PACF for shortliquidations')

plt.tight_layout()
plt.show()


ModuleNotFoundError: No module named 'statsmodels'

In [8]:
def shift_feature(
    df: pd.DataFrame,
    conti_cols: List[str],
    intervals: List[int],
) -> List[pd.Series]:
    """
    연속형 변수의 shift feature 생성
    Args:
        df (pd.DataFrame)
        conti_cols (List[str]): continuous colnames
        intervals (List[int]): shifted intervals
    Return:
        List[pd.Series]
    """
    df_shift_dict = [
        df[conti_col].shift(interval).rename(f"{conti_col}_{interval}")
        for conti_col in conti_cols
        for interval in intervals
    ]
    return df_shift_dict

# 최대 24시간의 shift 피쳐를 계산
shift_list = shift_feature(
    df=df, conti_cols=conti_cols, intervals=[ _ for _ in range(1,24)]
)
# concat 하여 df 에 할당
df = pd.concat([df, pd.concat(shift_list, axis=1)], axis=1)

In [9]:
from typing import List
import pandas as pd

def rolling_feature(
   df: pd.DataFrame,
   conti_cols: List[str],
   intervals: List[int],
   funcs: List[str],
   **params,
) -> pd.DataFrame:
   """
   Create rolling features
   Args:
       df (pd.DataFrame): Sorted dataframe
       conti_cols (List[str]): continuous colnames
       intervals (List[str]): rolling window widths
       funcs (List[str]): aggregation functions e.g. ["mean", "median", "max"]
       **params: more input for rolling
   Returns:
       pd.DataFrame
   """
   df_rolling_list = [
       df[conti_col]
       .rolling(interval, **params)
       .agg({f"{conti_col}": func})
       .rename({conti_col: f"{conti_col}_{func}_{interval}"}, axis=1)
       for conti_col in conti_cols
       for interval in intervals
       for func in funcs
   ]
   return pd.concat(df_rolling_list, axis = 1)


In [10]:
#'block-interval_block_interval'
conti_cols = ["coinbase_premium_gap", "coinbase_premium_index",'buy_volume','sell_volume','volume_diff','liquidation_diff','funding_rates',
              'open_interest','buy_ratio','sell_ratio','active_count','sender_count',"receiver_count"]
rolling_df = rolling_feature(
   df = df,
   conti_cols=conti_cols,
   intervals=[ i for i in [12,24,168]],
   funcs=["mean", 'std'],
   min_periods = 1,
   closed = "left",
)

df=pd.concat([df, rolling_df], axis = 1)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11552 entries, 0 to 11551
Columns: 486 entries, ID to volume_diff_23
dtypes: float64(481), int64(3), object(2)
memory usage: 42.8+ MB


In [10]:
_target = df["target"]
df = df.ffill().fillna(-999).assign(target = _target)

# _type에 따라 train, test 분리
train_df = df.loc[df["_type"]=="train"].drop(columns=["_type"])
test_df = df.loc[df["_type"]=="test"].drop(columns=["_type"])

In [11]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8760 entries, 0 to 8759
Columns: 485 entries, ID to volume_diff_23
dtypes: float64(481), int64(3), object(1)
memory usage: 32.5+ MB


In [12]:
# Step 1: Apply Window Slicing to Data Where Target is 0 or 3
df_filtered = train_df[(train_df['target'] == 0) | (train_df['target'] == 3)]
df_filtered
# Example window slicing function
# def window_slicing(time_series, window_size,step_size):
#     slices = []
#     for i in range(0,len(time_series) - window_size + 1,step_size):
#         window = time_series[i: i + window_size]
#         slices.append(window)
#     return np.array(slices)
def window_slicing(df: pd.DataFrame, window_size: int, step_size: int) -> pd.DataFrame:
    sliced_data = []
    
    for start in range(0, len(df) - window_size + 1, step_size):
        end = start + window_size
        window_df = df.iloc[start:end].copy()
        # window_df['window_start_index'] = start
        sliced_data.append(window_df)
    
    sliced_data_df = pd.concat(sliced_data, axis=0).reset_index(drop=True)
    return sliced_data_df

# Example window size (24 hours for 1 day window)
window_size = 24
step_size=12
sliced_data = window_slicing(df_filtered, window_size,step_size)
augmented_data=sliced_data
# augmented_data = pd.DataFrame()
# for i in sliced_data:
#     i_df = pd.DataFrame(i, columns=df_filtered.columns)
#     augmented_data = pd.concat([augmented_data, i_df], axis=0)
# augmented_data = augmented_data.astype(df_filtered.dtypes.to_dict())
# augmented_data.shape


In [13]:
# Step 2: Apply Noise Injection to the Sliced Data (excluding 'id' and 'target')
# def add_noise_to_data(df, noise_level=0.01):
#     df_noisy = df.copy()

#     # Exclude 'id' and 'target' columns from noise injectbion
#     feature_columns = df.columns.difference(['ID', 'target','_type'])
    
#     # Generate noise for the feature columns
#     noise = np.random.normal(0, noise_level, size=df[feature_columns].shape)
    
#     # Apply noise to feature columns
#     df_noisy[feature_columns] = df[feature_columns] + noise
    
#     return df_noisy
def noise_injection(df: pd.DataFrame, noise_level: float = 0.01, decay: float = 0.99, seed : int = 42) -> pd.DataFrame:
    df_noisy = df.copy()
    np.random.seed(seed)
    time_steps = np.arange(len(df_noisy))
    feature_columns = df.columns.difference(['ID', 'target','_type']) 
    for col in feature_columns:
        noise = np.random.normal(0, noise_level*df_noisy[col].std(), size=df_noisy[col].shape)
        decays = decay ** time_steps
        df_noisy[col] = df_noisy[col] + noise * decays
    
    return df_noisy

# Apply noise to the sliced data
augmented_noisy_data = noise_injection(augmented_data, noise_level=0.01,decay=0.99,seed= 42)

In [14]:
augmented_noisy_data.columns
train_1_2=train_df[(train_df['target'] != 0) & (train_df['target'] != 3)]
train_df_augmented=pd.concat([train_1_2,augmented_noisy_data])

In [15]:
train_df_augmented.shape

(11808, 485)

In [16]:

train_df_augmented['target'].value_counts()/len(train_df_augmented)

target
2.0    0.310891
1.0    0.300136
3.0    0.202405
0.0    0.186568
Name: count, dtype: float64

In [None]:
# 타겟 변수를 제외한 변수를 forwardfill, -999로 결측치 대체
# _target = df["target"]
# df = df.ffill().fillna(-999).assign(target = _target)

# # _type에 따라 train, test 분리
# train_df = df.loc[df["_type"]=="train"].drop(columns=["_type"])
# test_df = df.loc[df["_type"]=="test"].drop(columns=["_type"])

In [17]:
x_train, x_valid, y_train, y_valid = train_test_split(
    train_df_augmented.drop(["target", "ID"], axis=1), 
    train_df_augmented["target"].astype(int), 
    test_size=0.2,
    random_state=42,
    # stratify=train_df["target"].astype(int)
)
x_train.shape

(9446, 483)

In [21]:
def train_lgmClassifier(x_train,x_valid,y_train,y_valid):
    

# lgb dataset
    train_data = lgb.Dataset(x_train, label=y_train)
    valid_data = lgb.Dataset(x_valid, label=y_valid, reference=train_data)

# lgb params
    params = {
        "boosting_type": "gbdt",
        "objective": "multiclass",
        "metric": "multi_logloss",
        "num_class": 4,
        "num_leaves": 50,
        "learning_rate": 0.05,
        "n_estimators": 90,
        "random_state": 42,
        "verbose": 0,
    }

# lgb train
    lgb_model = lgb.train(
        params=params,
        train_set=train_data,
        valid_sets=valid_data,
    )

# lgb predict
    y_valid_pred = lgb_model.predict(x_valid)
    y_valid_pred_class = np.argmax(y_valid_pred, axis = 1)

# score check
    accuracy = accuracy_score(y_valid, y_valid_pred_class)
    auroc = roc_auc_score(y_valid, y_valid_pred, multi_class="ovr")

    print(f"acc: {accuracy}, auroc: {auroc}")
    return lgb_model,accuracy,params
lgb_model,accuracy,params=train_lgmClassifier(x_train,x_valid,y_train,y_valid)



acc: 0.6041490262489416, auroc: 0.8525288252833623


In [26]:
importance = lgb_model.feature_importance()
feature_names=train_df_augmented.drop(["target", "ID"], axis = 1).columns
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importance
})

# Sort by importance (optional)
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Display the DataFrame
print(feature_importance_df[:50])
# a=feature_importance_df[:100]
# train_df[a.loc[:,'Feature']]

                   Feature  Importance
21            volume_diffg         480
20   liquidation_usd_diffg         424
19       liquidation_diffg         345
12             sell_volume         116
376         sender_count_9         115
10              buy_volume         111
357      receiver_count_13         100
356      receiver_count_12          96
18             volume_diff          94
359      receiver_count_15          92
388        sender_count_21          85
363      receiver_count_19          84
462          volume_diff_3          79
339        active_count_18          77
336        active_count_15          76
352       receiver_count_8          76
364      receiver_count_20          75
480         volume_diff_21          73
299          sell_volume_1          72
465          volume_diff_6          71
361      receiver_count_17          71
367      receiver_count_23          71
354      receiver_count_10          71
334        active_count_13          71
377        sender_count_1

In [23]:
x_train = train_df_augmented.drop(["target", "ID"], axis = 1)
y_train = train_df_augmented["target"].astype(int)
train_data = lgb.Dataset(x_train, label=y_train)
lgb_model = lgb.train(
    params=params,
    train_set=train_data,
)



In [24]:
y_test_pred = lgb_model.predict(test_df.drop(["target", "ID"], axis = 1))
y_test_pred_class = np.argmax(y_test_pred, axis = 1)

In [25]:
submission_df = submission_df.assign(target = y_test_pred_class)
submission_df.to_csv("output.csv", index=False)