In [1]:
import os
from typing import List, Dict
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import lightgbm as lgb
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [2]:
# 파일 호출
data_path: str = "data"
train_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "train.csv")).assign(_type="train") # train 에는 _type = train 
test_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv")).assign(_type="test") # test 에는 _type = test
submission_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv")) # ID, target 열만 가진 데이터 미리 호출
df: pd.DataFrame = pd.concat([train_df, test_df], axis=0)
df


Unnamed: 0,ID,target,_type
0,2023-01-01 00:00:00,2.0,train
1,2023-01-01 01:00:00,1.0,train
2,2023-01-01 02:00:00,1.0,train
3,2023-01-01 03:00:00,1.0,train
4,2023-01-01 04:00:00,2.0,train
...,...,...,...
2787,2024-04-26 03:00:00,,test
2788,2024-04-26 04:00:00,,test
2789,2024-04-26 05:00:00,,test
2790,2024-04-26 06:00:00,,test


In [3]:
file_names : List[str]=[
    f for f in os.listdir(data_path) if f.startswith('HOURLY_') and f.endswith(".csv")
]

file_dict: Dict[str,pd.DataFrame]={
    f.replace(".csv",""): pd.read_csv(os.path.join(data_path,f)) for f in file_names
}

for _file_name, _df in tqdm(file_dict.items()):
    _rename_rule={
        col: f"{_file_name.lower()}_{col.lower()}" if col!="datetime" else "ID"
        for col in _df.columns
    }
    _df=_df.rename(_rename_rule,axis=1)
    df=df.merge(_df,on="ID",how="left")


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 107/107 [00:01<00:00, 80.81it/s]


In [None]:
df.shape

In [4]:
df
missing_values=df.isnull().sum()
missing_values
missing_percentage= (missing_values)/len(df) *100
missing_percentage

sorted_missing_percentage=missing_percentage.sort_values(ascending=False)

# null 100 percent인 feature들
name_null=[]
for col, val in sorted_missing_percentage.items():
    if val==100:
        name_null.append(col)        
len(name_null)
for i in name_null:
    df=df.drop(columns=i)
df.shape

eda_df=df.loc[df["_type"]=="train"]
eda_test=df.loc[df["_type"]=="test"]

In [9]:
eda_df

Unnamed: 0,ID,target,_type,hourly_market-data_liquidations_gate_io_all_symbol_long_liquidations,hourly_market-data_liquidations_gate_io_all_symbol_short_liquidations,hourly_market-data_liquidations_gate_io_all_symbol_long_liquidations_usd,hourly_market-data_liquidations_gate_io_all_symbol_short_liquidations_usd,hourly_market-data_funding-rates_bybit_funding_rates,hourly_market-data_liquidations_htx_global_all_symbol_long_liquidations,hourly_market-data_liquidations_htx_global_all_symbol_short_liquidations,...,hourly_market-data_taker-buy-sell-stats_binance_taker_buy_sell_ratio,hourly_market-data_liquidations_huobi_global_btc_usdt_long_liquidations,hourly_market-data_liquidations_huobi_global_btc_usdt_short_liquidations,hourly_market-data_liquidations_huobi_global_btc_usdt_long_liquidations_usd,hourly_market-data_liquidations_huobi_global_btc_usdt_short_liquidations_usd,hourly_market-data_liquidations_htx_global_btc_usdt_long_liquidations,hourly_market-data_liquidations_htx_global_btc_usdt_short_liquidations,hourly_market-data_liquidations_htx_global_btc_usdt_long_liquidations_usd,hourly_market-data_liquidations_htx_global_btc_usdt_short_liquidations_usd,hourly_market-data_open-interest_htx_global_btc_usdt_open_interest
0,2023-01-01 00:00:00,2.0,train,0.000000,0.0000,0.00000,0.00000,0.010000,0.000000,0.000,...,0.893584,0.000,0.000,0.0000,0.0000,0.000,0.000,0.0000,0.0000,6.784288e+07
1,2023-01-01 01:00:00,1.0,train,0.000000,0.0000,0.00000,0.00000,0.010000,0.000000,0.000,...,1.507117,0.000,0.000,0.0000,0.0000,0.000,0.000,0.0000,0.0000,6.788941e+07
2,2023-01-01 02:00:00,1.0,train,0.000000,0.0000,0.00000,0.00000,0.010000,0.000000,0.000,...,0.965079,0.000,0.000,0.0000,0.0000,0.000,0.000,0.0000,0.0000,6.781657e+07
3,2023-01-01 03:00:00,1.0,train,0.000000,0.0000,0.00000,0.00000,0.010000,0.000000,0.000,...,0.810051,0.000,0.000,0.0000,0.0000,0.000,0.000,0.0000,0.0000,6.798192e+07
4,2023-01-01 04:00:00,2.0,train,0.000000,0.0000,0.00000,0.00000,0.010000,0.000000,0.000,...,1.009602,0.000,0.000,0.0000,0.0000,0.000,0.000,0.0000,0.0000,6.829002e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2023-12-31 19:00:00,1.0,train,0.000000,0.0000,0.00000,0.00000,0.024847,0.000000,0.000,...,1.097161,0.000,0.000,0.0000,0.0000,0.000,0.000,0.0000,0.0000,8.510244e+07
8756,2023-12-31 20:00:00,1.0,train,0.120000,0.3536,5110.39200,15079.53570,0.023523,2.441000,2.394,...,1.090461,2.441,2.394,103766.3419,102175.4826,2.441,2.394,103766.3419,102175.4826,8.472883e+07
8757,2023-12-31 21:00:00,0.0,train,0.000000,0.0200,0.00000,853.17000,0.022368,0.003000,0.368,...,1.071186,0.003,0.368,127.3455,15757.4003,0.003,0.368,127.3455,15757.4003,8.469083e+07
8758,2023-12-31 22:00:00,2.0,train,6.830952,0.0106,288593.42160,450.50424,0.021547,6.266413,0.175,...,0.713756,5.702,0.175,241010.3290,7435.0124,5.702,0.175,241010.3290,7435.0124,8.278992e+07


In [None]:
# import seaborn as sns
# eda_df['hourly_network-data_block-bytes_block_bytes'].describe()

In [None]:
# sns.histplot(x='hourly_network-data_block-bytes_block_bytes',data=eda_df)
# min_val=eda_df['hourly_network-data_block-bytes_block_bytes'].quantile(0.05)
# max_val=eda_df['hourly_network-data_block-bytes_block_bytes'].quantile(0.95)

### 눌값 mean으로 채우기


In [10]:

# Calculate missing values and percentages
missing_values = eda_df.isnull().sum()
missing_percentage = (missing_values / len(eda_df)) * 100
sorted_missing_percentage = missing_percentage.sort_values(ascending=False)

# Identify columns with missing values
null = [col for col in eda_df.columns if missing_values[col] != 0]

# Fill missing values with the mean of each column
for col in null:
    eda_df.loc[:, col] = eda_df[col].fillna(eda_df[col].mean())

### 클립하기!

In [12]:

eda_df_before = eda_df.drop(columns=['ID', '_type', 'target'])

# Calculate 5th and 95th percentiles for each column
eda_df_5percent = eda_df_before.quantile(0.05)
eda_df_95percent = eda_df_before.quantile(0.95)

# Clip values between 1.5 * 5th and 1.5 * 95th percentiles
for col in eda_df.columns:
    if col not in ['ID', '_type', 'target']:
        # Use .loc to safely modify the DataFrame and avoid SettingWithCopyWarning
        eda_df.loc[:, col] = np.clip(eda_df[col], 1.5 * eda_df_5percent[col], 1.5 * eda_df_95percent[col])

# Store the modified DataFrame in eda_after
eda_after = eda_df
eda_after


Unnamed: 0,ID,target,_type,hourly_market-data_liquidations_gate_io_all_symbol_long_liquidations,hourly_market-data_liquidations_gate_io_all_symbol_short_liquidations,hourly_market-data_liquidations_gate_io_all_symbol_long_liquidations_usd,hourly_market-data_liquidations_gate_io_all_symbol_short_liquidations_usd,hourly_market-data_funding-rates_bybit_funding_rates,hourly_market-data_liquidations_htx_global_all_symbol_long_liquidations,hourly_market-data_liquidations_htx_global_all_symbol_short_liquidations,...,hourly_market-data_taker-buy-sell-stats_binance_taker_buy_sell_ratio,hourly_market-data_liquidations_huobi_global_btc_usdt_long_liquidations,hourly_market-data_liquidations_huobi_global_btc_usdt_short_liquidations,hourly_market-data_liquidations_huobi_global_btc_usdt_long_liquidations_usd,hourly_market-data_liquidations_huobi_global_btc_usdt_short_liquidations_usd,hourly_market-data_liquidations_htx_global_btc_usdt_long_liquidations,hourly_market-data_liquidations_htx_global_btc_usdt_short_liquidations,hourly_market-data_liquidations_htx_global_btc_usdt_long_liquidations_usd,hourly_market-data_liquidations_htx_global_btc_usdt_short_liquidations_usd,hourly_market-data_open-interest_htx_global_btc_usdt_open_interest
0,2023-01-01 00:00:00,2.0,train,0.000000,0.0000,0.00000,0.00000,0.010000,0.000000,0.000,...,1.666446,0.000,0.000,0.0000,0.0000,0.000,0.000,0.0000,0.0000,1.180298e+08
1,2023-01-01 01:00:00,1.0,train,0.000000,0.0000,0.00000,0.00000,0.010000,0.000000,0.000,...,1.666446,0.000,0.000,0.0000,0.0000,0.000,0.000,0.0000,0.0000,1.180298e+08
2,2023-01-01 02:00:00,1.0,train,0.000000,0.0000,0.00000,0.00000,0.010000,0.000000,0.000,...,1.666446,0.000,0.000,0.0000,0.0000,0.000,0.000,0.0000,0.0000,1.180298e+08
3,2023-01-01 03:00:00,1.0,train,0.000000,0.0000,0.00000,0.00000,0.010000,0.000000,0.000,...,1.666446,0.000,0.000,0.0000,0.0000,0.000,0.000,0.0000,0.0000,1.180298e+08
4,2023-01-01 04:00:00,2.0,train,0.000000,0.0000,0.00000,0.00000,0.010000,0.000000,0.000,...,1.666446,0.000,0.000,0.0000,0.0000,0.000,0.000,0.0000,0.0000,1.180298e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2023-12-31 19:00:00,1.0,train,0.000000,0.0000,0.00000,0.00000,0.021585,0.000000,0.000,...,1.666446,0.000,0.000,0.0000,0.0000,0.000,0.000,0.0000,0.0000,1.180298e+08
8756,2023-12-31 20:00:00,1.0,train,0.120000,0.3536,5110.39200,15079.53570,0.021585,2.441000,2.394,...,1.666446,2.441,2.394,103766.3419,102175.4826,2.441,2.394,103766.3419,102175.4826,1.180298e+08
8757,2023-12-31 21:00:00,0.0,train,0.000000,0.0200,0.00000,853.17000,0.021585,0.003000,0.368,...,1.666446,0.003,0.368,127.3455,15757.4003,0.003,0.368,127.3455,15757.4003,1.180298e+08
8758,2023-12-31 22:00:00,2.0,train,6.830952,0.0106,288593.42160,450.50424,0.021547,6.266413,0.175,...,1.666446,5.702,0.175,241010.3290,7435.0124,5.702,0.175,241010.3290,7435.0124,1.180298e+08


In [13]:
eda_after.info()
eda_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8760 entries, 0 to 8759
Columns: 215 entries, ID to hourly_market-data_open-interest_htx_global_btc_usdt_open_interest
dtypes: float64(213), object(2)
memory usage: 14.4+ MB
<class 'pandas.core.frame.DataFrame'>
Index: 2792 entries, 8760 to 11551
Columns: 215 entries, ID to hourly_market-data_open-interest_htx_global_btc_usdt_open_interest
dtypes: float64(210), int64(3), object(2)
memory usage: 4.6+ MB


In [None]:
# features for 
# cols_dict: Dict[str,str] = {
#     "ID" : "ID",
#     "target" : "target",
#     "hourly_market-data_coinbase-premium-index_coinbase_premium_gap": "coinbase_premium_gap",
#     "hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations": "long_liquidations",
#     "hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations": "short_liquidations",
#     "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_volume": "buy_volume",
#     "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_volume": "sell_volume",
#     "hourly_market-data_funding-rates_all_exchange_funding_rates": "funding_rates",
#     "hourly_market-data_open-interest_all_exchange_all_symbol_open_interest": "open_interest",
#     "hourly_network-data_addresses-count_addresses_count_active": "addresses_count_active",
#     "hourly_network-data_addresses-count_addresses_count_sender": "addresses_count_sender",
#     "hourly_network-data_addresses-count_addresses_count_receiver": "addresses_count_receiver",
#     "hourly_network-data_blockreward_blockreward": "blockreward",
#     "hourly_network-data_fees_fees_total" : "fees_total",
#     "hourly_network-data_hashrate_hashrate": "hashrate",
#     "hourly_network-data_supply_supply_new": "supply_new",
#     "hourly_network-data_velocity_velocity_supply_total": "velocity_supply_total ",
# }

# eda_df=eda_df[cols_dict.keys()].rename(cols_dict, axis=1)

# eda_df.shape

## raw 데이터 for EDA 하실분

In [None]:
group_eda=df.loc[df["_type"]=="train"].groupby("target")

In [None]:
# import seaborn as sns
# import matplotlib as mpl
# import matplotlib.pyplot as plt
# eda_df_market['ID']=pd.to_datetime(eda_df_market['ID'], format='%Y-%m-%d %H:%M:%S')
# eda_df_market.columns

In [None]:
# dic={}
# for target, dataframe in group_market:
#     dataframe['ID']=pd.to_datetime(dataframe['ID'], format='%Y-%m-%d %H:%M:%S')
#     dataframe_resample=dataframe.resample('30D', on='ID').mean()
#     dic[target]=dataframe_resample
# df1=dic[0]
# df2=dic[1]
# df3=dic[2]
# df4=dic[3]


# fig, ax = plt.subplots(figsize=(10, 6))

# # Plot each dataframe's 'long_liquidations' on the same plot
# sns.lineplot(data=df1, x='ID', y='long_liquidations', ax=ax, label='target = 0')
# sns.lineplot(data=df2, x='ID', y='long_liquidations', ax=ax, label='target = 1')
# sns.lineplot(data=df3, x='ID', y='long_liquidations', ax=ax, label='target = 2')
# sns.lineplot(data=df4, x='ID', y='long_liquidations', ax=ax, label='target = 3')

# # Add titles and labels
# plt.title("Comparison of Long Liquidations Across Multiple Dataframes")
# plt.xlabel('Date')  # Change to a meaningful label
# plt.ylabel('Long Liquidations')  # Label for the y-axis

# # Show the plot with a legend
# plt.legend()  # Displays the labels for each dataframe
# plt.show()

In [None]:
# sns.pairplot(data=eda_df_market, hue='target',palette="Set2")

In [None]:
# bar_market = group_market.agg({
#     "buy_volume": "mean",
#     "sell_volume": "mean",
#     "short_liquidations": "mean",
#     "long_liquidations": "mean",
#     "coinbase_premium_gap": "mean",
#     "funding_rates": "mean",
#     "open_interest": "mean"
# })
# bar_market

In [None]:
# fig,axes=plt.subplots(1,1,figsize=(10,5),sharex=True)
# #sns.barplot(x='target', y='open_interest',data=bar_market,
#             #order=sorted(eda_df_market['target'].unique()),ax=axes)
# sns.barplot(x='target', y='buy_volume',data=bar_market,
#             order=sorted(eda_df_market['target'].unique()),ax=axes,color="Blue")
# plt.show()

In [None]:
# sns.pairplot(data=eda_df_network, hue='target',palette="Set1")

In [None]:

# bar_network = group_network.agg({
#     "addresses_count_active" : "mean",
#     "addresses_count_sender": "mean",
#     "addresses_count_receiver": "mean",
#     "blockreward": "mean",
#     "fees_total": "mean",
#     "hashrate": "mean",
#     "supply_new": "mean",
#     "velocity_supply_total " : "mean"
# })
# bar_network



In [None]:
eda_after = eda_after.assign(
    liquidation_diff=eda_after["long_liquidations"] - eda_after["short_liquidations"],
    volume_diff=eda_after["buy_volume"] - eda_after["sell_volume"],
)
# bar_market_df=eda_df.groupby("target").agg({\
#     "liquidation_diff" : "mean",
#     "volume_diff" : "mean",
# }).reset_index()
# bar_market_df

In [None]:
# eda_df=df.loc[df["_type"]=="train"]
# eda_df
# missing_values=eda_df.isnull().sum()
# missing_values
# missing_percentage= (missing_values)/len(eda_df) *100
# missing_percentage

# sorted_missing_percentage=missing_percentage.sort_values(ascending=False)

missing_values2=eda_after.isnull().sum()
missing_percentage2= (missing_values2)/len(eda_after) *100
sorted_missing_percentage2=missing_percentage2.sort_values(ascending=False)
sorted_missing_percentage2

name_null=[]
#null 30 percent 이상 ( == null 100퍼센트 뺌 ) 
for col, val in sorted_missing_percentage2.items():
    if col=='target':
        continue
    else:
        if val>30:
            name_null.append(col)        
print(len(name_null))
name_null
for i in name_null:
  eda_after = eda_after.drop(columns=i)
eda_after.shape

In [14]:
eda_after.columns

Index(['ID', 'target', '_type',
       'hourly_market-data_liquidations_gate_io_all_symbol_long_liquidations',
       'hourly_market-data_liquidations_gate_io_all_symbol_short_liquidations',
       'hourly_market-data_liquidations_gate_io_all_symbol_long_liquidations_usd',
       'hourly_market-data_liquidations_gate_io_all_symbol_short_liquidations_usd',
       'hourly_market-data_funding-rates_bybit_funding_rates',
       'hourly_market-data_liquidations_htx_global_all_symbol_long_liquidations',
       'hourly_market-data_liquidations_htx_global_all_symbol_short_liquidations',
       ...
       'hourly_market-data_taker-buy-sell-stats_binance_taker_buy_sell_ratio',
       'hourly_market-data_liquidations_huobi_global_btc_usdt_long_liquidations',
       'hourly_market-data_liquidations_huobi_global_btc_usdt_short_liquidations',
       'hourly_market-data_liquidations_huobi_global_btc_usdt_long_liquidations_usd',
       'hourly_market-data_liquidations_huobi_global_btc_usdt_short_liqui

In [15]:
cols_dict : Dict[str,str] = {}
for i in eda_after.columns:
    
    parts=i.split("_")
    if len(parts)>=3:
        cols_dict[i]="_".join(parts[2:])
    else:
        cols_dict[i]=i

cols_dict2: Dict[str, str] = {
    "ID": "ID",
    "target": "target",
    "_type": "_type",
    "coinbase-premium-index_coinbase_premium_gap": "coinbase_premium_gap",
    "coinbase-premium-index_coinbase_premium_index": "coinbase_premium_index",
    "funding-rates_all_exchange_funding_rates": "funding_rates",
    "liquidations_all_exchange_all_symbol_long_liquidations": "long_liquidations",
    "liquidations_all_exchange_all_symbol_long_liquidations_usd": "long_liquidations_usd",
    "liquidations_all_exchange_all_symbol_short_liquidations": "short_liquidations",
    "liquidations_all_exchange_all_symbol_short_liquidations_usd": "short_liquidations_usd",
    "open-interest_all_exchange_all_symbol_open_interest": "open_interest",
    "taker-buy-sell-stats_all_exchange_taker_buy_ratio": "buy_ratio",
    "taker-buy-sell-stats_all_exchange_taker_buy_sell_ratio": "buy_sell_ratio",
    "taker-buy-sell-stats_all_exchange_taker_buy_volume": "buy_volume",
    "taker-buy-sell-stats_all_exchange_taker_sell_ratio": "sell_ratio",
    "taker-buy-sell-stats_all_exchange_taker_sell_volume": "sell_volume",
    "addresses-count_addresses_count_active": "active_count",
    "addresses-count_addresses_count_receiver": "receiver_count",
    "addresses-count_addresses_count_sender": "sender_count",
}
cols_dict
eda_after = eda_after[cols_dict.keys()].rename(cols_dict,axis=1)


In [16]:
eda_after.columns

Index(['ID', 'target', '_type',
       'liquidations_gate_io_all_symbol_long_liquidations',
       'liquidations_gate_io_all_symbol_short_liquidations',
       'liquidations_gate_io_all_symbol_long_liquidations_usd',
       'liquidations_gate_io_all_symbol_short_liquidations_usd',
       'funding-rates_bybit_funding_rates',
       'liquidations_htx_global_all_symbol_long_liquidations',
       'liquidations_htx_global_all_symbol_short_liquidations',
       ...
       'taker-buy-sell-stats_binance_taker_buy_sell_ratio',
       'liquidations_huobi_global_btc_usdt_long_liquidations',
       'liquidations_huobi_global_btc_usdt_short_liquidations',
       'liquidations_huobi_global_btc_usdt_long_liquidations_usd',
       'liquidations_huobi_global_btc_usdt_short_liquidations_usd',
       'liquidations_htx_global_btc_usdt_long_liquidations',
       'liquidations_htx_global_btc_usdt_short_liquidations',
       'liquidations_htx_global_btc_usdt_long_liquidations_usd',
       'liquidations_htx_g

In [17]:
eda_test=eda_test[cols_dict.keys()].rename(cols_dict,axis=1)


In [18]:
for i in cols_dict2.keys():
    eda_test = eda_test.rename(columns={i: cols_dict2[i]})


In [19]:
for i in cols_dict2.keys():
    eda_after = eda_after.rename(columns={i: cols_dict2[i]})


In [20]:
eda_after = eda_after.assign(
    liquidation_diff=eda_after["long_liquidations"]-eda_after["short_liquidations"],
    liquidation_usd_diff= eda_after['long_liquidations_usd']-eda_after['short_liquidations_usd'],
    volume_diff=eda_after["buy_volume"] - eda_after["sell_volume"],
    liquidation_diffg=np.sign(eda_after["long_liquidations"]-eda_after["short_liquidations"]),
    liquidation_usd_diffg=np.sign(eda_after['long_liquidations_usd']-eda_after['short_liquidations_usd']),
    volume_diffg=np.sign(eda_after["buy_volume"] - eda_after["sell_volume"]),
    buy_sell_volume_ratio=eda_after["buy_volume"] / (eda_after["sell_volume"] + 1)
)

In [None]:
category_cols : List[str] = ["liquidation_diffg", "liquidation_usd_diffg", "volume_diffg"]
conti_cols: List[str] = [ i for i in eda_after.columns if i not in ["ID","target","_type"]] + [
    "buy_sell_volume_ratio",
    "liquidation_diff",
    "liquidation_usd_diff",
    "volume_diff",
]
conti_cols


In [None]:
def shift_feature(df,conti_cols,intervals):
    df_shift_dict=[
        df[conti_col].shift(interval).rename(f'{conti_col}_{interval}')
        for conti_col in conti_cols
        for interval in intervals
    ]
    return df_shift_dict

shift_list= shift_feature(
    df=eda_after,conti_cols=conti_cols, intervals=[_ for _ in range(1,24)]
)
shift_list

In [None]:
df = pd.concat([df, pd.concat(shift_list,axis=1)],axis=1)

In [21]:
_target=eda_after["target"]
_target

0       2.0
1       1.0
2       1.0
3       1.0
4       2.0
       ... 
8755    1.0
8756    1.0
8757    0.0
8758    2.0
8759    2.0
Name: target, Length: 8760, dtype: float64

In [32]:
eda_after.columns

Index(['ID', 'target', '_type',
       'liquidations_gate_io_all_symbol_long_liquidations',
       'liquidations_gate_io_all_symbol_short_liquidations',
       'liquidations_gate_io_all_symbol_long_liquidations_usd',
       'liquidations_gate_io_all_symbol_short_liquidations_usd',
       'funding-rates_bybit_funding_rates',
       'liquidations_htx_global_all_symbol_long_liquidations',
       'liquidations_htx_global_all_symbol_short_liquidations',
       ...
       'liquidations_htx_global_btc_usdt_long_liquidations_usd',
       'liquidations_htx_global_btc_usdt_short_liquidations_usd',
       'open-interest_htx_global_btc_usdt_open_interest', 'liquidation_diff',
       'liquidation_usd_diff', 'volume_diff', 'liquidation_diffg',
       'liquidation_usd_diffg', 'volume_diffg', 'buy_sell_volume_ratio'],
      dtype='object', length=222)

In [22]:
train_df = eda_after.loc[df["_type"]=="train"].drop(columns=["_type"])
test_df = eda_test.loc[df["_type"]=="test"].drop(columns=["_type"])

ID                                                            0
target                                                     2792
liquidations_gate_io_all_symbol_long_liquidations            29
liquidations_gate_io_all_symbol_short_liquidations           29
liquidations_gate_io_all_symbol_long_liquidations_usd        29
                                                           ... 
liquidations_htx_global_btc_usdt_long_liquidations            0
liquidations_htx_global_btc_usdt_short_liquidations           0
liquidations_htx_global_btc_usdt_long_liquidations_usd        0
liquidations_htx_global_btc_usdt_short_liquidations_usd       0
open-interest_htx_global_btc_usdt_open_interest               0
Length: 214, dtype: int64

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf
plot_acf(train_df['funding-rates_okx_funding_rates'], lags=10)  # You can specify the number of lags to check, e.g., 30
plt.show()

In [23]:
x_train, x_valid, y_train, y_valid = train_test_split(
    train_df.drop(["target", "ID"], axis = 1), 
    train_df["target"].astype(int), 
    test_size=0.2,
    random_state=42,
)

In [24]:
train_data = lgb.Dataset(x_train, label=y_train)
valid_data = lgb.Dataset(x_valid, label=y_valid, reference=train_data)


In [34]:
params = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "metric": "multi_logloss",
    "num_class": 4,
    "num_leaves": 50,
    "learning_rate": 0.05,
    "n_estimators": 100,
    "random_state": 42,
    "verbose": 0,
}
lgb_model = lgb.train(
    params=params,
    train_set=train_data,
    valid_sets=valid_data,
)
y_valid_pred = lgb_model.predict(x_valid)
y_valid_pred_class = np.argmax(y_valid_pred, axis = 1)

accuracy = accuracy_score(y_valid, y_valid_pred_class)
auroc = roc_auc_score(y_valid, y_valid_pred, multi_class="ovr")

print(f"acc: {accuracy}, auroc: {auroc}")



acc: 0.4246575342465753, auroc: 0.6251516338225943


In [35]:
importance = lgb_model.feature_importance()
feature_names=train_df.drop(["target", "ID"], axis = 1).columns
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importance
})

# Sort by importance (optional)
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Display the DataFrame
print(feature_importance_df[:30])


                                               Feature  Importance
18        open-interest_binance_btc_busd_open_interest         451
171                    funding-rates_okx_funding_rates         449
193                funding-rates_deribit_funding_rates         428
36       open-interest_bitfinex_btc_usdt_open_interest         420
74          tokens-transferred_tokens_transferred_mean         389
42      taker-buy-sell-stats_deribit_taker_sell_volume         379
117                               coinbase_premium_gap         358
81    taker-buy-sell-stats_htx_global_taker_buy_volume         352
58        taker-buy-sell-stats_bybit_taker_sell_volume         343
175      taker-buy-sell-stats_bitmex_taker_sell_volume         340
94                                       funding_rates         329
218                              buy_sell_volume_ratio         328
73         tokens-transferred_tokens_transferred_total         326
118                             coinbase_premium_index        

In [None]:
# import pandas as pd
# import matplotlib.pyplot as plt
# from statsmodels.graphics.tsaplots import plot_acf
# plot_acf(train_df["tokens-transferred_tokens_transferred_total"], lags=100)  # You can specify the number of lags to check, e.g., 30
# plt.show()
# train_df["tokens-transferred_tokens_transferred_total"]

In [None]:
# experiment= train_df.groupby("target")
# dic={}
# for target, dataframe in experiment:
#     dataframe['ID']=pd.to_datetime(dataframe['ID'], format='%Y-%m-%d %H:%M:%S')
#     dataframe_resample=dataframe.resample('30D', on='ID').mean()
#     dic[target]=dataframe_resample
# df1=dic[0]
# df2=dic[1]
# df3=dic[2]
# df4=dic[3]


In [36]:
x_train = train_df.drop(["target", "ID"], axis = 1)
y_train = train_df["target"].astype(int)
train_data = lgb.Dataset(x_train, label=y_train)
lgb_model = lgb.train(
    params=params,
    train_set=train_data,
)



In [37]:
test_df.isnull().sum()

ID                                                            0
target                                                     2792
liquidations_gate_io_all_symbol_long_liquidations            29
liquidations_gate_io_all_symbol_short_liquidations           29
liquidations_gate_io_all_symbol_long_liquidations_usd        29
                                                           ... 
liquidations_htx_global_btc_usdt_long_liquidations            0
liquidations_htx_global_btc_usdt_short_liquidations           0
liquidations_htx_global_btc_usdt_long_liquidations_usd        0
liquidations_htx_global_btc_usdt_short_liquidations_usd       0
open-interest_htx_global_btc_usdt_open_interest               0
Length: 214, dtype: int64

In [38]:
missing_values=test_df.isnull().sum()
missing_percentage= (missing_values)/len(test_df) *100
missing_percentage

sorted_missing_percentage=missing_percentage.sort_values(ascending=False)
sorted_missing_percentage
null=[]
for i in test_df.columns:
    if missing_values[i]!=0:
        null.append(i)
for i in null:
    eda_df[i]=eda_df[i].fillna(eda_df[i].mean())

liquidations_binance_btc_busd_long_liquidations_usd     100.0
target                                                  100.0
liquidations_binance_btc_busd_short_liquidations_usd    100.0
price-ohlcv_all_exchange_spot_btc_usd_volume            100.0
price-ohlcv_all_exchange_spot_btc_usd_close             100.0
                                                        ...  
liquidations_htx_global_btc_usd_long_liquidations         0.0
fees_fees_total_usd                                       0.0
fees_fees_total                                           0.0
blockreward_blockreward_usd                               0.0
open-interest_htx_global_btc_usdt_open_interest           0.0
Length: 214, dtype: float64

In [None]:
# lgb predict
y_test_pred = lgb_model.predict(test_df.drop(["target", "ID"], axis = 1))
y_test_pred_class = np.argmax(y_test_pred, axis = 1)

In [None]:
submission_df = submission_df.assign(target = y_test_pred_class)
submission_df.to_csv("output.csv", index=False)