In [1]:
import os
from typing import List, Dict
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import lightgbm as lgb
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import optuna

import seaborn as sns

In [2]:
# 파일 호출
data_path: str = "data"
train_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "train.csv")).assign(_type="train") # train 에는 _type = train 
test_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv")).assign(_type="test") # test 에는 _type = test
submission_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv")) # ID, target 열만 가진 데이터 미리 호출
df: pd.DataFrame = pd.concat([train_df, test_df], axis=0)
df

Unnamed: 0,ID,target,_type
0,2023-01-01 00:00:00,2.0,train
1,2023-01-01 01:00:00,1.0,train
2,2023-01-01 02:00:00,1.0,train
3,2023-01-01 03:00:00,1.0,train
4,2023-01-01 04:00:00,2.0,train
...,...,...,...
2787,2024-04-26 03:00:00,,test
2788,2024-04-26 04:00:00,,test
2789,2024-04-26 05:00:00,,test
2790,2024-04-26 06:00:00,,test


In [3]:
file_names: List[str] = [
    f for f in os.listdir(data_path) if f.startswith("HOURLY_") and f.endswith(".csv")
]
# 파일명 : 데이터프레임으로 딕셔너리 형태로 저장
file_dict: Dict[str, pd.DataFrame] = {
    f.replace(".csv", ""): pd.read_csv(os.path.join(data_path, f)) for f in file_names
}

for _file_name, _df in tqdm(file_dict.items()):
    # 열 이름 중복 방지를 위해 {_file_name.lower()}_{col.lower()}로 변경, datetime 열을 ID로 변경
    _rename_rule = {
        col: f"{_file_name.lower()}_{col.lower()}" if col != "datetime" else "ID"
        for col in _df.columns
    }
    _df = _df.rename(_rename_rule, axis=1)
    df = df.merge(_df, on="ID", how="left")

100%|██████████| 107/107 [00:01<00:00, 65.32it/s]


In [7]:
cols_dict: Dict[str, str] = {
    "ID": "ID",
    "target": "target",
    "_type": "_type",
    "hourly_market-data_coinbase-premium-index_coinbase_premium_gap": "coinbase_premium_gap",
    "hourly_market-data_coinbase-premium-index_coinbase_premium_index": "coinbase_premium_index",
    "hourly_market-data_funding-rates_all_exchange_funding_rates": "funding_rates",
    "hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations": "long_liquidations",
    "hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations_usd": "long_liquidations_usd",
    "hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations": "short_liquidations",
    "hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations_usd": "short_liquidations_usd",
    "hourly_market-data_open-interest_all_exchange_all_symbol_open_interest": "open_interest",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_ratio": "buy_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_sell_ratio": "buy_sell_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_volume": "buy_volume",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_ratio": "sell_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_volume": "sell_volume",
    "hourly_network-data_addresses-count_addresses_count_active": "active_count",
    "hourly_network-data_addresses-count_addresses_count_receiver": "receiver_count",
    "hourly_network-data_addresses-count_addresses_count_sender": "sender_count",
    "hourly_network-data_velocity_velocity_supply_total" :'velocity',
    "hourly_network-data_transactions-count_transactions_count_total" : 'transaction_count',
    'hourly_network-data_hashrate_hashrate' : "hashrate"
}
# HOURLY_MARKET-DATA_PRICE-OHLCV_ALL_EXCHANGE_SPOT_BTC_USD

df = df[cols_dict.keys()].rename(cols_dict, axis=1)
df.shape

KeyError: "['hourly_market-data_coinbase-premium-index_coinbase_premium_gap', 'hourly_market-data_coinbase-premium-index_coinbase_premium_index', 'hourly_market-data_funding-rates_all_exchange_funding_rates', 'hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations', 'hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations_usd', 'hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations', 'hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations_usd', 'hourly_market-data_open-interest_all_exchange_all_symbol_open_interest', 'hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_ratio', 'hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_sell_ratio', 'hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_volume', 'hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_ratio', 'hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_volume', 'hourly_network-data_addresses-count_addresses_count_active', 'hourly_network-data_addresses-count_addresses_count_receiver', 'hourly_network-data_addresses-count_addresses_count_sender', 'hourly_network-data_velocity_velocity_supply_total', 'hourly_network-data_transactions-count_transactions_count_total', 'hourly_network-data_hashrate_hashrate'] not in index"

In [8]:
df = df.assign(
    liquidation_diff=df["long_liquidations"] - df["short_liquidations"],
    liquidation_usd_diff=df["long_liquidations_usd"] - df["short_liquidations_usd"],
    volume_diff=df["buy_volume"] - df["sell_volume"],
    liquidation_diffg=np.sign(df["long_liquidations"] - df["short_liquidations"]),
    liquidation_usd_diffg=np.sign(df["long_liquidations_usd"] - df["short_liquidations_usd"]),
    volume_diffg=np.sign(df["buy_volume"] - df["sell_volume"]),
    buy_sell_volume_ratio=df["buy_volume"] / (df["sell_volume"] + 1),
    
)
df['funding_open_interest']=df['funding_rates']*df['open_interest'] # mean 추가
df['volume_diff_pct_change_12h'] = df['volume_diff'].pct_change(periods=12)
df['volume_diff_pct_change_6h'] = df['volume_diff'].pct_change(periods=6)
df['volume_diff_pct_change_additive']=df['volume_diff_pct_change_6h']+df['volume_diff_pct_change_12h']


  df['volume_diff_pct_change_12h'] = df['volume_diff'].pct_change(periods=12)
  df['volume_diff_pct_change_6h'] = df['volume_diff'].pct_change(periods=6)


KeyError: 'Column not found: slope_7days'

In [None]:
df['mean_7days'] = df['volume_diff'].rolling(window=168).mean()  # 7 days if hourly data
df['mean_1day'] = df['volume_diff'].rolling(window=24).mean()    # 1 day if hourly data
df['mean_1month'] = df['volume_diff'].rolling(window=720).mean()  # 1 month if hourly data (30 days)
df['slope_7days'] = df['mean_7days'].diff()
df['slope_1day'] = df['mean_1day'].diff()
df['slope_1month'] = df['mean_1month'].diff()
# Calculate the differences in moving averages
df['mean_diff_7days_1day_volume'] = df['mean_7days'] - df['mean_1day']
df['mean_diff_1month_7days_volume'] = df['mean_1month'] - df['mean_7days']

df['mean_50'] = df['volume_diff'].rolling(window=50).mean()  # 50-period moving average
df['mean_100'] = df['volume_diff'].rolling(window=100).mean()

# Cross-signal generation
df['cross_volume_diff'] = np.where(df['mean_50'] > df['mean_100'], 1, 0)
df['cross_shifted_volume_diff'] = df['cross_volume_diff'].shift(1)
df['cross_signal_volume_diff'] = np.where(df['cross_volume_diff'] != df['cross_shifted_volume_diff'], 1, 0)

# # Rolling standard deviation for volatility
df['rolling_std_50_volume_diff'] = df['volume_diff'].rolling(window=50).std()
df['rolling_std_100_volume_diff'] = df['volume_diff'].rolling(window=100).std()

# # Drop intermediate moving averages
df.drop(columns=['mean_7days', 'mean_1day', 'mean_1month'], inplace=True)

df['mean_7days'] = df['liquidation_diff'].rolling(window=168).mean()  # 7 days if hourly data
df['mean_1day'] = df['liquidation_diff'].rolling(window=24).mean()    # 1 day if hourly data
df['mean_1month'] = df['liquidation_diff'].rolling(window=720).mean()  # 1 month if hourly data (30 days)

df['mean_diff_7days_1day_liquidation'] = df['mean_7days'] - df['mean_1day']
df['mean_diff_1month_7days_liquidation'] = df['mean_1month'] - df['mean_7days']

#Cross-signal generation
df['cross_liquidation_diff'] = np.where(df['mean_50'] > df['mean_100'], 1, 0)
df['cross_shifted_liquidation_diff'] = df['cross_liquidation_diff'].shift(1)
df['cross_signal_liquidation_diff'] = np.where(df['cross_liquidation_diff'] != df['cross_shifted_liquidation_diff'], 1, 0)

# # Rolling standard deviation for volatility
df['rolling_std_50_liquidation_diff'] = df['liquidation_diff'].rolling(window=50).std()
df['rolling_std_100_liquidation_diff'] = df['liquidation_diff'].rolling(window=100).std()

df.drop(columns=['mean_7days', 'mean_1day', 'mean_1month','mean_50','mean_100'], inplace=True)

In [54]:

df['transaction_rolling_mean_6h'] = df['transaction_count'].rolling(window=6).mean()
df['transaction_rolling_mean_12h'] = df['transaction_count'].rolling(window=12).mean()
df['transaction_rolling_mean_24h'] = df['transaction_count'].rolling(window=24).mean()
df['hashrate_mean_6h'] = df['hashrate'].rolling(window=6).mean() 
df['hashrate_mean_12h'] = df['hashrate'].rolling(window=12).mean() 
df['hashrate_mean_1d'] = df['hashrate'].rolling(window=24).mean() 
df['hashrate_mean_7d'] = df['hashrate'].rolling(window=168).mean()  # 168 hours = 7 days
df['transaction_rolling_mean_diff-12-6']=df['transaction_rolling_mean_12h']-df['transaction_rolling_mean_6h']
df['transaction_rolling_mean_diff-24-6']=df['transaction_rolling_mean_24h']-df['transaction_rolling_mean_6h']
df['hashrate_mean_diff-7d-1d']=df['hashrate_mean_7d']-df['hashrate_mean_1d']
df.drop(columns=['transaction_rolling_mean_6h','transaction_rolling_mean_12h','transaction_rolling_mean_24h',
                 'hashrate_mean_6h','hashrate_mean_12h','hashrate_mean_1d','hashrate_mean_7d'],inplace=True)


In [53]:
df.groupby('target')['velocity_rolling_mean_24h'].mean()-df.groupby('target')['velocity_rolling_mean_12h'].mean()



target
0.0    0.034351
1.0    0.000195
2.0   -0.033426
3.0    0.034741
dtype: float64

In [49]:
df.groupby('target')['transaction_per_active_address'].mean()

target
0.0    0.297981
1.0    0.312186
2.0    0.306367
3.0    0.302274
Name: transaction_per_active_address, dtype: float64

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Filter the dataframe for 'train' type
eda = df[df['_type'] == 'train'].drop(columns='_type')

# Convert 'ID' to datetime
eda['ID'] = pd.to_datetime(eda['ID'])

def assign_quarter(dt):
    month = dt.month
    if 1 <= month <= 3:
        return 'Q1'
    elif 4 <= month <= 6:
        return 'Q2'
    elif 7 <= month <= 9:
        return 'Q3'
    else:
        return 'Q4'
eda['Quarter']=eda['ID'].apply(assign_quarter)

eda['buy_volume_ma_3h'] = eda['buy_volume'].rolling(window=3).mean()
eda['buy_volume_std_3h'] = eda['buy_volume'].rolling(window=3).std()

eda['funding_rates_ma_3h'] = eda['funding_rates'].rolling(window=3).mean()
eda['funding_rates_std_3h'] = eda['funding_rates'].rolling(window=3).std()





eda_groupby=eda.groupby('target')

eda_groupby['price_close_pct_change_3h'].mean()


# Reset the index to bring 'target' back as a column for easier viewing

# Display the result




In [None]:



# # category, continuous 열을 따로 할당해둠
# df['cumulative_buy_volume'] = df['buy_volume'].cumsum()
# df['cumulative_sell_volume'] = df['sell_volume'].cumsum()
# df['buy_volume_rolling_mean'] = df['buy_volume'].rolling(window=5).mean()
# df['sell_volume_rolling_mean'] = df['sell_volume'].rolling(window=5).mean()
# df['buy_volume_vs_mean'] = df['buy_volume'] / df['buy_volume_rolling_mean']
# df['sell_volume_vs_mean'] = df['sell_volume'] / df['sell_volume_rolling_mean']
# df['buy_volume_momentum_1h'] = df['buy_volume'].pct_change(periods=1)
# df['sell_volume_momentum_1h'] = df['sell_volume'].pct_change(periods=1)

# df['buy_volume_momentum_6h'] = df['buy_volume'].pct_change(periods=6)
# df['sell_volume_momentum_6h'] = df['sell_volume'].pct_change(periods=6)
# df['buy_volume_shock'] = df['buy_volume'] > df['buy_volume_rolling_mean'] * 1.5
# df['sell_volume_shock'] = df['sell_volume'] > df['sell_volume_rolling_mean'] * 1.5
# df['net_buy_sell_volume'] = (df['buy_volume'] - df['sell_volume']).cumsum()
# df['buy_volume_short_term'] = df['buy_volume'].rolling(window=1).sum()
# df['buy_volume_long_term'] = df['buy_volume'].rolling(window=24).sum()

# df['sell_volume_short_term'] = df['sell_volume'].rolling(window=1).sum()
# df['sell_volume_long_term'] = df['sell_volume'].rolling(window=24).sum()

# # Difference in buy/sell pressure over short vs long term
# df['buy_volume_pressure_diff'] = df['buy_volume_short_term'] - df['buy_volume_long_term']
# df['sell_volume_pressure_diff'] = df['sell_volume_short_term'] - df['sell_volume_long_term']
# df['price_direction'] = (df['close'].shift(-1) > df['close']).astype(int)
# df['price_return'] = df['close'].pct_change()
# df['lag_1'] = df['close'].shift(1)  # 1-hour lag
# df['lag_6'] = df['close'].shift(6)  # 6-hour lag
# df['lag_24'] = df['close'].shift(24)  # 24-hour lag
# df['MA_5'] = df['close'].rolling(window=5).mean()
# df['MA_20'] = df['close'].rolling(window=20).mean()

# # Rolling standard deviation (volatility)
# df['volatility_5'] = df['close'].rolling(window=5).std()
# df['volatility_20'] = df['close'].rolling(window=20).std()
# window_length = 14

# delta = df['close'].diff()
# gain = (delta.where(delta > 0, 0)).rolling(window=window_length).mean()
# loss = (-delta.where(delta < 0, 0)).rolling(window=window_length).mean()

# rs = gain / loss
# df['RSI'] = 100 - (100 / (1 + rs))

# df['ROC_10'] = ((df['close'] - df['close'].shift(10)) / df['close'].shift(10)) * 100
# df['+DM'] = df['close'].diff().where(df['close'].diff() > 0, 0)
# df['-DM'] = -df['close'].diff().where(df['close'].diff() < 0, 0)

# # Calculate the directional index
# df['+DI'] = df['+DM'].rolling(window=14).mean()
# df['-DI'] = df['-DM'].rolling(window=14).mean()

# # ADX
# df['ADX'] = (df['+DI'] - df['-DI']).abs() / (df['+DI'] + df['-DI']) * 100