# 이상치 처리 + interpolation + differencing

### Library Import

In [1]:
import os
from typing import List, Dict
from tqdm import tqdm
import numpy as np
import pandas as pd
import lightgbm as lgb
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt
import networkx as nx

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.datasets import make_classification

import optuna
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

  from .autonotebook import tqdm as notebook_tqdm


### Data Load

In [2]:
# 파일 호출
root_path = '/data/ephemeral/home/level1-classificationinmachinelearning-recsys-01'
data_path: str = os.path.join(root_path, 'data')
train_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "train.csv")).assign(_type="train") # train 에는 _type = train 
test_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv")).assign(_type="test") # test 에는 _type = test
submission_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv")) # ID, target 열만 가진 데이터 미리 호출
df: pd.DataFrame = pd.concat([train_df, test_df], axis=0)

In [3]:
# HOURLY_ 로 시작하는 .csv 파일 이름을 file_names 에 할딩
file_names: List[str] = [
    f for f in os.listdir(data_path) if f.startswith("HOURLY_") and f.endswith(".csv")
]

# 파일명 : 데이터프레임으로 딕셔너리 형태로 저장
file_dict: Dict[str, pd.DataFrame] = {
    f.replace(".csv", ""): pd.read_csv(os.path.join(data_path, f)) for f in file_names
}
for _file_name, _df in tqdm(file_dict.items()):
    # 열 이름 중복 방지를 위해 {_file_name.lower()}_{col.lower()}로 변경, datetime 열을 ID로 변경
    _rename_rule = {
        col: f"{_file_name.lower()}_{col.lower()}" if col != "datetime" else "ID"
        for col in _df.columns
    }
    _df = _df.rename(_rename_rule, axis=1)
    df = df.merge(_df, on="ID", how="left")


100%|██████████| 107/107 [00:03<00:00, 30.37it/s]


# 데이터 전처리

## 결측 100% column 제거

In [4]:
# 각 열에서 누락된 값의 수를 계산
missing_values = df.isnull().sum()

# 누락된 값의 백분율 계산
missing_percentage = (missing_values / len(df)) * 100

null_columns = missing_percentage[missing_percentage != 100].keys()
df = df.loc[:,null_columns]
df

Unnamed: 0,ID,target,_type,hourly_network-data_block-count_block_count,hourly_market-data_funding-rates_deribit_funding_rates,hourly_market-data_liquidations_bitfinex_btc_usdt_long_liquidations,hourly_market-data_liquidations_bitfinex_btc_usdt_short_liquidations,hourly_market-data_liquidations_bitfinex_btc_usdt_long_liquidations_usd,hourly_market-data_liquidations_bitfinex_btc_usdt_short_liquidations_usd,hourly_market-data_open-interest_bybit_btc_usd_open_interest,...,hourly_market-data_liquidations_gate_io_btc_usdt_long_liquidations_usd,hourly_market-data_liquidations_gate_io_btc_usdt_short_liquidations_usd,hourly_market-data_taker-buy-sell-stats_htx_global_taker_buy_volume,hourly_market-data_taker-buy-sell-stats_htx_global_taker_sell_volume,hourly_market-data_taker-buy-sell-stats_htx_global_taker_buy_ratio,hourly_market-data_taker-buy-sell-stats_htx_global_taker_sell_ratio,hourly_market-data_taker-buy-sell-stats_htx_global_taker_buy_sell_ratio,hourly_network-data_addresses-count_addresses_count_active,hourly_network-data_addresses-count_addresses_count_sender,hourly_network-data_addresses-count_addresses_count_receiver
0,2023-01-01 00:00:00,2.0,train,12.0,0.000571,0.0,0.0,0.0,0.0,379138258.0,...,0.0,0.0,415200.0,102600.0,0.801854,0.198146,4.046784,67987,37307,37752
1,2023-01-01 01:00:00,1.0,train,4.0,0.000570,0.0,0.0,0.0,0.0,382072537.0,...,0.0,0.0,1027600.0,71000.0,0.935372,0.064628,14.473239,30593,12342,20534
2,2023-01-01 02:00:00,1.0,train,8.0,0.000566,0.0,0.0,0.0,0.0,381636197.0,...,0.0,0.0,406600.0,115200.0,0.779226,0.220774,3.529514,33897,17737,19369
3,2023-01-01 03:00:00,1.0,train,5.0,0.000557,0.0,0.0,0.0,0.0,382229253.0,...,0.0,0.0,922400.0,142400.0,0.866266,0.133734,6.477528,32717,11421,23799
4,2023-01-01 04:00:00,2.0,train,7.0,0.000536,0.0,0.0,0.0,0.0,385126773.0,...,0.0,0.0,73000.0,102600.0,0.415718,0.584282,0.711501,45176,17320,31712
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11547,2024-04-26 03:00:00,,test,3.0,,0.0,0.0,0.0,0.0,974276825.0,...,,,86000.0,203800.0,0.296756,0.703244,0.421982,29250,18154,13601
11548,2024-04-26 04:00:00,,test,,,0.0,0.0,0.0,0.0,970952780.0,...,,,382200.0,381000.0,0.500786,0.499214,1.003150,56580,31320,29096
11549,2024-04-26 05:00:00,,test,,,0.0,0.0,0.0,0.0,970067075.0,...,,,,,,,,51858,34083,22094
11550,2024-04-26 06:00:00,,test,,,0.0,0.0,0.0,0.0,972346702.0,...,,,,,,,,36270,26186,12668


## 이상치 처리 - IQR

In [5]:
def remove_outliers_as_nan(df):
    for col in df.columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        # 이상치 부분을 NaN으로 대체
        df[col] = np.where((df[col] < lower_bound) | (df[col] > upper_bound), np.nan, df[col])
    return df


In [6]:
fix_df = df.iloc[:,:3]
filtered_df = remove_outliers_as_nan(df.iloc[:,3:])
df = pd.concat([fix_df, filtered_df], axis = 1)
df

Unnamed: 0,ID,target,_type,hourly_network-data_block-count_block_count,hourly_market-data_funding-rates_deribit_funding_rates,hourly_market-data_liquidations_bitfinex_btc_usdt_long_liquidations,hourly_market-data_liquidations_bitfinex_btc_usdt_short_liquidations,hourly_market-data_liquidations_bitfinex_btc_usdt_long_liquidations_usd,hourly_market-data_liquidations_bitfinex_btc_usdt_short_liquidations_usd,hourly_market-data_open-interest_bybit_btc_usd_open_interest,...,hourly_market-data_liquidations_gate_io_btc_usdt_long_liquidations_usd,hourly_market-data_liquidations_gate_io_btc_usdt_short_liquidations_usd,hourly_market-data_taker-buy-sell-stats_htx_global_taker_buy_volume,hourly_market-data_taker-buy-sell-stats_htx_global_taker_sell_volume,hourly_market-data_taker-buy-sell-stats_htx_global_taker_buy_ratio,hourly_market-data_taker-buy-sell-stats_htx_global_taker_sell_ratio,hourly_market-data_taker-buy-sell-stats_htx_global_taker_buy_sell_ratio,hourly_network-data_addresses-count_addresses_count_active,hourly_network-data_addresses-count_addresses_count_sender,hourly_network-data_addresses-count_addresses_count_receiver
0,2023-01-01 00:00:00,2.0,train,12.0,0.000571,0.0,0.0,0.0,0.0,379138258.0,...,0.0,0.0,415200.0,102600.0,0.801854,0.198146,,67987.0,37307.0,37752.0
1,2023-01-01 01:00:00,1.0,train,4.0,0.000570,0.0,0.0,0.0,0.0,382072537.0,...,0.0,0.0,1027600.0,71000.0,0.935372,0.064628,,30593.0,12342.0,20534.0
2,2023-01-01 02:00:00,1.0,train,8.0,0.000566,0.0,0.0,0.0,0.0,381636197.0,...,0.0,0.0,406600.0,115200.0,0.779226,0.220774,,33897.0,17737.0,19369.0
3,2023-01-01 03:00:00,1.0,train,5.0,0.000557,0.0,0.0,0.0,0.0,382229253.0,...,0.0,0.0,922400.0,142400.0,0.866266,0.133734,,32717.0,11421.0,23799.0
4,2023-01-01 04:00:00,2.0,train,7.0,0.000536,0.0,0.0,0.0,0.0,385126773.0,...,0.0,0.0,73000.0,102600.0,0.415718,0.584282,0.711501,45176.0,17320.0,31712.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11547,2024-04-26 03:00:00,,test,3.0,,0.0,0.0,0.0,0.0,,...,,,86000.0,203800.0,0.296756,0.703244,0.421982,29250.0,18154.0,13601.0
11548,2024-04-26 04:00:00,,test,,,0.0,0.0,0.0,0.0,,...,,,382200.0,381000.0,0.500786,0.499214,1.003150,56580.0,31320.0,29096.0
11549,2024-04-26 05:00:00,,test,,,0.0,0.0,0.0,0.0,,...,,,,,,,,51858.0,34083.0,22094.0
11550,2024-04-26 06:00:00,,test,,,0.0,0.0,0.0,0.0,,...,,,,,,,,36270.0,26186.0,12668.0


## 결측치 처리 - interpolation

In [7]:
original_dict = dict(df.isnull().sum())
filtered_dict = {key: value for key, value in original_dict.items() if value != 0}
filtered_dict

{'target': 2792,
 'hourly_network-data_block-count_block_count': 24,
 'hourly_market-data_funding-rates_deribit_funding_rates': 907,
 'hourly_market-data_liquidations_bitfinex_btc_usdt_long_liquidations': 1056,
 'hourly_market-data_liquidations_bitfinex_btc_usdt_short_liquidations': 871,
 'hourly_market-data_liquidations_bitfinex_btc_usdt_long_liquidations_usd': 1056,
 'hourly_market-data_liquidations_bitfinex_btc_usdt_short_liquidations_usd': 871,
 'hourly_market-data_open-interest_bybit_btc_usd_open_interest': 1220,
 'hourly_network-data_fees-transaction_fees_transaction_mean': 1095,
 'hourly_network-data_fees-transaction_fees_transaction_mean_usd': 1021,
 'hourly_network-data_fees-transaction_fees_transaction_median': 1229,
 'hourly_network-data_fees-transaction_fees_transaction_median_usd': 1170,
 'hourly_market-data_liquidations_bitmex_btc_usd_long_liquidations': 1986,
 'hourly_market-data_liquidations_bitmex_btc_usd_short_liquidations': 1395,
 'hourly_market-data_liquidations_bit

In [8]:
# # 타겟 변수를 제외한 변수를 forwardfill, -999로 결측치 대체
_target = df["target"]
df = df.interpolate(method='linear').assign(target = _target)
df = df.bfill()

  df = df.interpolate(method='linear').assign(target = _target)


## 모든 행이 0인 열 삭제

In [9]:
df = df.loc[:, (df != 0).any(axis=0)]
df

Unnamed: 0,ID,target,_type,hourly_network-data_block-count_block_count,hourly_market-data_funding-rates_deribit_funding_rates,hourly_market-data_open-interest_bybit_btc_usd_open_interest,hourly_network-data_fees-transaction_fees_transaction_mean,hourly_network-data_fees-transaction_fees_transaction_mean_usd,hourly_network-data_fees-transaction_fees_transaction_median,hourly_network-data_fees-transaction_fees_transaction_median_usd,...,hourly_market-data_liquidations_gate_io_btc_usdt_long_liquidations_usd,hourly_market-data_liquidations_gate_io_btc_usdt_short_liquidations_usd,hourly_market-data_taker-buy-sell-stats_htx_global_taker_buy_volume,hourly_market-data_taker-buy-sell-stats_htx_global_taker_sell_volume,hourly_market-data_taker-buy-sell-stats_htx_global_taker_buy_ratio,hourly_market-data_taker-buy-sell-stats_htx_global_taker_sell_ratio,hourly_market-data_taker-buy-sell-stats_htx_global_taker_buy_sell_ratio,hourly_network-data_addresses-count_addresses_count_active,hourly_network-data_addresses-count_addresses_count_sender,hourly_network-data_addresses-count_addresses_count_receiver
0,2023-01-01 00:00:00,2.0,train,12.0,0.000571,379138258.0,0.000049,0.809244,0.000014,0.235973,...,0.0,0.00000,415200.0,102600.0,0.801854,0.198146,0.711501,67987.0,37307.0,37752.0
1,2023-01-01 01:00:00,1.0,train,4.0,0.000570,382072537.0,0.000044,0.726525,0.000014,0.237108,...,0.0,0.00000,1027600.0,71000.0,0.935372,0.064628,0.711501,30593.0,12342.0,20534.0
2,2023-01-01 02:00:00,1.0,train,8.0,0.000566,381636197.0,0.000056,0.932723,0.000014,0.234878,...,0.0,0.00000,406600.0,115200.0,0.779226,0.220774,0.711501,33897.0,17737.0,19369.0
3,2023-01-01 03:00:00,1.0,train,5.0,0.000557,382229253.0,0.000042,0.692505,0.000014,0.234608,...,0.0,0.00000,922400.0,142400.0,0.866266,0.133734,0.711501,32717.0,11421.0,23799.0
4,2023-01-01 04:00:00,2.0,train,7.0,0.000536,385126773.0,0.000050,0.820970,0.000014,0.237996,...,0.0,0.00000,73000.0,102600.0,0.415718,0.584282,0.711501,45176.0,17320.0,31712.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11547,2024-04-26 03:00:00,,test,3.0,-0.001519,842585772.0,0.000233,15.000864,0.000112,4.800155,...,0.0,27302.37792,86000.0,203800.0,0.296756,0.703244,0.421982,29250.0,18154.0,13601.0
11548,2024-04-26 04:00:00,,test,3.0,-0.001519,842585772.0,0.000253,13.181418,0.000093,6.000194,...,0.0,27302.37792,382200.0,381000.0,0.500786,0.499214,1.003150,56580.0,31320.0,29096.0
11549,2024-04-26 05:00:00,,test,3.0,-0.001519,842585772.0,0.000177,11.361971,0.000066,4.248020,...,0.0,27302.37792,382200.0,381000.0,0.500786,0.499214,1.003150,51858.0,34083.0,22094.0
11550,2024-04-26 06:00:00,,test,3.0,-0.001519,842585772.0,0.000144,9.286892,0.000055,3.565609,...,0.0,27302.37792,382200.0,381000.0,0.500786,0.499214,1.003150,36270.0,26186.0,12668.0


## 열의 plot이 상수인 열 삭제

In [10]:
same_value_columns = df.apply(lambda x: x.nunique() == 1)
same_value_columns = df.loc[:,same_value_columns].columns
df.drop(same_value_columns, axis=1, inplace = True)

## columns 이름 단순화

In [11]:
original_dic = {'ID' : 'ID', 'target' : 'target', '_type' : '_type'}
renamed_dic = {}
for column in df.columns[3:]:
    renamed_col = '_'.join(column.split('_')[3:])
    renamed_dic[column] = renamed_col
original_dic.update(renamed_dic)

In [12]:
df = df[original_dic.keys()].rename(original_dic, axis=1)
df

Unnamed: 0,ID,target,_type,block_count,deribit_funding_rates,bybit_btc_usd_open_interest,fees_transaction_mean,fees_transaction_mean_usd,fees_transaction_median,fees_transaction_median_usd,...,gate_io_btc_usdt_long_liquidations_usd,gate_io_btc_usdt_short_liquidations_usd,htx_global_taker_buy_volume,htx_global_taker_sell_volume,htx_global_taker_buy_ratio,htx_global_taker_sell_ratio,htx_global_taker_buy_sell_ratio,addresses_count_active,addresses_count_sender,addresses_count_receiver
0,2023-01-01 00:00:00,2.0,train,12.0,0.000571,379138258.0,0.000049,0.809244,0.000014,0.235973,...,0.0,0.00000,415200.0,102600.0,0.801854,0.198146,0.711501,67987.0,37307.0,37752.0
1,2023-01-01 01:00:00,1.0,train,4.0,0.000570,382072537.0,0.000044,0.726525,0.000014,0.237108,...,0.0,0.00000,1027600.0,71000.0,0.935372,0.064628,0.711501,30593.0,12342.0,20534.0
2,2023-01-01 02:00:00,1.0,train,8.0,0.000566,381636197.0,0.000056,0.932723,0.000014,0.234878,...,0.0,0.00000,406600.0,115200.0,0.779226,0.220774,0.711501,33897.0,17737.0,19369.0
3,2023-01-01 03:00:00,1.0,train,5.0,0.000557,382229253.0,0.000042,0.692505,0.000014,0.234608,...,0.0,0.00000,922400.0,142400.0,0.866266,0.133734,0.711501,32717.0,11421.0,23799.0
4,2023-01-01 04:00:00,2.0,train,7.0,0.000536,385126773.0,0.000050,0.820970,0.000014,0.237996,...,0.0,0.00000,73000.0,102600.0,0.415718,0.584282,0.711501,45176.0,17320.0,31712.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11547,2024-04-26 03:00:00,,test,3.0,-0.001519,842585772.0,0.000233,15.000864,0.000112,4.800155,...,0.0,27302.37792,86000.0,203800.0,0.296756,0.703244,0.421982,29250.0,18154.0,13601.0
11548,2024-04-26 04:00:00,,test,3.0,-0.001519,842585772.0,0.000253,13.181418,0.000093,6.000194,...,0.0,27302.37792,382200.0,381000.0,0.500786,0.499214,1.003150,56580.0,31320.0,29096.0
11549,2024-04-26 05:00:00,,test,3.0,-0.001519,842585772.0,0.000177,11.361971,0.000066,4.248020,...,0.0,27302.37792,382200.0,381000.0,0.500786,0.499214,1.003150,51858.0,34083.0,22094.0
11550,2024-04-26 06:00:00,,test,3.0,-0.001519,842585772.0,0.000144,9.286892,0.000055,3.565609,...,0.0,27302.37792,382200.0,381000.0,0.500786,0.499214,1.003150,36270.0,26186.0,12668.0


## statationary using differencing

In [13]:
from statsmodels.tsa.stattools import adfuller

# ADF 테스트 함수
def adf_test(series):
    result = adfuller(series)
    #print(f'ADF Statistic: {result[0]}')
    #print(f'p-value: {result[1]}')
    if result[1] < 0.05:
        #print("The series is stationary.")
        return True
    else:
        #print("The series is non-stationary.")
        return False

# 각 열에 대해 ADF 테스트 수행
non_stationary_col = []
for column in tqdm(df.columns[3:]):
    #print(f"Column: {column}")
    if not adf_test(df[column]):
        non_stationary_col.append(column)
    #print("\n")


  0%|          | 0/166 [00:00<?, ?it/s]

100%|██████████| 166/166 [01:57<00:00,  1.41it/s]


In [14]:
non_stationary_col

['bybit_btc_usd_open_interest',
 'supply_total',
 'bybit_all_symbol_open_interest',
 'bitfinex_all_symbol_open_interest',
 'bybit_btc_usdt_open_interest',
 'okx_btc_usd_open_interest',
 'binance_btc_busd_open_interest',
 'deribit_all_symbol_open_interest',
 'difficulty',
 'kraken_btc_usd_open_interest',
 'kraken_all_symbol_open_interest',
 'binance_all_symbol_open_interest',
 'utxo_count',
 'deribit_btc_usd_open_interest',
 'okx_btc_usdt_open_interest',
 'all_exchange_spot_btc_usd_close',
 'all_exchange_all_symbol_open_interest',
 'bitfinex_btc_usdt_open_interest',
 'binance_btc_usd_open_interest',
 'okx_all_symbol_open_interest',
 'gate_io_btc_usd_open_interest',
 'binance_btc_usdt_open_interest']

In [15]:
df[non_stationary_col] = df[non_stationary_col].diff().dropna()

In [16]:
df = df.bfill()
df

Unnamed: 0,ID,target,_type,block_count,deribit_funding_rates,bybit_btc_usd_open_interest,fees_transaction_mean,fees_transaction_mean_usd,fees_transaction_median,fees_transaction_median_usd,...,gate_io_btc_usdt_long_liquidations_usd,gate_io_btc_usdt_short_liquidations_usd,htx_global_taker_buy_volume,htx_global_taker_sell_volume,htx_global_taker_buy_ratio,htx_global_taker_sell_ratio,htx_global_taker_buy_sell_ratio,addresses_count_active,addresses_count_sender,addresses_count_receiver
0,2023-01-01 00:00:00,2.0,train,12.0,0.000571,2934279.0,0.000049,0.809244,0.000014,0.235973,...,0.0,0.00000,415200.0,102600.0,0.801854,0.198146,0.711501,67987.0,37307.0,37752.0
1,2023-01-01 01:00:00,1.0,train,4.0,0.000570,2934279.0,0.000044,0.726525,0.000014,0.237108,...,0.0,0.00000,1027600.0,71000.0,0.935372,0.064628,0.711501,30593.0,12342.0,20534.0
2,2023-01-01 02:00:00,1.0,train,8.0,0.000566,-436340.0,0.000056,0.932723,0.000014,0.234878,...,0.0,0.00000,406600.0,115200.0,0.779226,0.220774,0.711501,33897.0,17737.0,19369.0
3,2023-01-01 03:00:00,1.0,train,5.0,0.000557,593056.0,0.000042,0.692505,0.000014,0.234608,...,0.0,0.00000,922400.0,142400.0,0.866266,0.133734,0.711501,32717.0,11421.0,23799.0
4,2023-01-01 04:00:00,2.0,train,7.0,0.000536,2897520.0,0.000050,0.820970,0.000014,0.237996,...,0.0,0.00000,73000.0,102600.0,0.415718,0.584282,0.711501,45176.0,17320.0,31712.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11547,2024-04-26 03:00:00,,test,3.0,-0.001519,0.0,0.000233,15.000864,0.000112,4.800155,...,0.0,27302.37792,86000.0,203800.0,0.296756,0.703244,0.421982,29250.0,18154.0,13601.0
11548,2024-04-26 04:00:00,,test,3.0,-0.001519,0.0,0.000253,13.181418,0.000093,6.000194,...,0.0,27302.37792,382200.0,381000.0,0.500786,0.499214,1.003150,56580.0,31320.0,29096.0
11549,2024-04-26 05:00:00,,test,3.0,-0.001519,0.0,0.000177,11.361971,0.000066,4.248020,...,0.0,27302.37792,382200.0,381000.0,0.500786,0.499214,1.003150,51858.0,34083.0,22094.0
11550,2024-04-26 06:00:00,,test,3.0,-0.001519,0.0,0.000144,9.286892,0.000055,3.565609,...,0.0,27302.37792,382200.0,381000.0,0.500786,0.499214,1.003150,36270.0,26186.0,12668.0


# 상관관계 높은 것들끼리 묶어서 차원 축소

In [17]:
def find_high_corr_groups(correlation_matrix, threshold=0.8):
    G = nx.Graph()
    for col1 in correlation_matrix.columns:
        for col2 in correlation_matrix.columns:
            if col1 != col2 and abs(correlation_matrix.loc[col1, col2]) > threshold:
                G.add_edge(col1, col2)
    groups = list(nx.connected_components(G))
    return [list(group) for group in groups]

In [18]:
correlation_matrix = df.iloc[:,3:].corr()
groups = find_high_corr_groups(correlation_matrix, threshold=0.8)
groups

[['hashrate',
  'addresses_count_active',
  'block_count',
  'addresses_count_sender',
  'blockreward',
  'supply_total',
  'blockreward_usd',
  'supply_new'],
 ['fees_block_mean_usd',
  'fees_total',
  'fees_reward_percent',
  'fees_total_usd',
  'fees_transaction_mean_usd',
  'fees_transaction_median',
  'fees_block_mean',
  'fees_transaction_median_usd',
  'fees_transaction_mean'],
 ['bybit_taker_buy_volume',
  'all_exchange_taker_sell_volume',
  'deribit_taker_sell_volume',
  'okx_taker_sell_volume',
  'deribit_taker_buy_volume',
  'bybit_taker_sell_volume',
  'binance_taker_buy_volume',
  'binance_taker_sell_volume',
  'okx_taker_buy_volume',
  'all_exchange_taker_buy_volume'],
 ['deribit_taker_buy_ratio',
  'deribit_taker_sell_ratio',
  'deribit_taker_buy_sell_ratio'],
 ['binance_taker_sell_ratio',
  'all_exchange_taker_sell_ratio',
  'binance_taker_buy_sell_ratio',
  'all_exchange_taker_buy_ratio',
  'binance_taker_buy_ratio',
  'all_exchange_taker_buy_sell_ratio'],
 ['bybit_all

In [19]:
def apply_pca_to_groups(df, groups, n_components=1):
    grouped_list = [word for group in groups for word in group]
    non_grouped_list = list(set(df.columns) - set(grouped_list))
    
    X = df[non_grouped_list].drop(['ID', '_type', 'target'], axis=1)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    non_grouped_df = pd.DataFrame(X_scaled, columns = X.columns)

    pca_results = []
    for idx, group in enumerate(groups):
        if len(group) > 1:  # Apply PCA only if the group has more than one variable
            X = df[group]
            scaler = StandardScaler()
            X_scaled = scaler.fit_transform(X)
            pca = PCA(n_components=n_components)
            pca_result = pca.fit_transform(X_scaled)
            # Add PCA result to list
            pca_results.append(pd.DataFrame(pca_result, columns = [f"pca_{idx}"]))
    
    
    grouped_df = pd.concat(pca_results, axis=1)
    fixed_df = df[['ID', '_type', 'target']]

    concat_df = pd.concat([fixed_df, non_grouped_df, grouped_df], axis=1)
    return concat_df

In [20]:
df = apply_pca_to_groups(df, groups, n_components=1)

In [21]:
df

Unnamed: 0,ID,_type,target,binance_btc_usd_short_liquidations,difficulty,bybit_btc_usdt_short_liquidations_usd,bybit_btc_usdt_long_liquidations_usd,addresses_count_receiver,okx_funding_rates,binance_funding_rates,...,pca_28,pca_29,pca_30,pca_31,pca_32,pca_33,pca_34,pca_35,pca_36,pca_37
0,2023-01-01 00:00:00,train,2.0,-0.412814,-0.037913,-0.544052,-0.523896,1.177336,-0.685720,0.387120,...,-0.526221,-1.486780,-3.551595,-0.820779,-0.767823,-0.200801,1.407583,2.397854,-0.979811,-1.033926
1,2023-01-01 01:00:00,train,1.0,-0.412814,-0.037913,-0.544052,-0.523896,-0.700406,-0.685720,0.387120,...,1.392256,-1.585170,-5.139551,-0.820779,-0.767823,-0.200801,2.896871,2.356022,-0.981539,-0.935486
2,2023-01-01 02:00:00,train,1.0,-0.412814,-0.037913,-0.544052,-0.523896,-0.827458,-0.685720,0.387120,...,-0.553163,-1.447549,-3.282474,-0.820779,-0.767823,0.120891,-2.190444,2.351620,-0.981539,-1.033926
3,2023-01-01 03:00:00,train,1.0,-0.412814,-0.037913,-0.544052,-0.523896,-0.344335,-0.685720,0.387120,...,1.062694,-1.362859,-4.317658,-0.820779,-0.767823,0.119991,1.394724,2.349269,-0.896165,-1.033926
4,2023-01-01 04:00:00,train,2.0,-0.412814,-0.037913,-0.544052,-0.523896,0.518632,-0.685720,0.387120,...,-1.598238,-1.486780,1.040790,-0.820779,-0.767823,0.021359,-0.730809,2.336297,-0.929549,-1.033926
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11547,2024-04-26 03:00:00,test,,-0.412814,-0.037913,-0.544052,-0.523896,-1.456498,0.263581,-0.616240,...,-1.452080,-0.439767,3.117067,-1.694353,-0.767823,0.653492,-0.617014,-1.553145,-0.784462,-0.969490
11548,2024-04-26 04:00:00,test,,-0.412814,-0.037913,-0.544052,-0.431421,0.233339,0.263581,-0.361797,...,-0.988890,-0.164222,1.901754,-1.037307,-0.767823,0.178957,-1.948306,-1.645215,0.842338,-0.995248
11549,2024-04-26 05:00:00,test,,-0.412814,-0.037913,0.864820,-0.339453,-0.530278,0.263581,-0.361797,...,-0.988890,-0.164222,1.901754,-1.037307,-0.215736,-0.223907,-0.314871,-1.712975,-0.483335,0.350171
11550,2024-04-26 06:00:00,test,,-0.412814,-0.037913,-0.544052,-0.523896,-1.558248,0.263581,-0.361797,...,-0.988890,-0.164222,1.901754,-1.037307,-0.767823,0.052585,0.269667,-1.694205,-0.758669,-0.594980


In [22]:

# _type에 따라 train, test 분리
train_df = df.loc[df["_type"]=="train"].drop(columns=["_type"])
test_df = df.loc[df["_type"]=="test"].drop(columns=["_type"])

### Model Training

In [23]:
# train_test_split 으로 valid set, train set 분리
X_train, X_valid, y_train, y_valid = train_test_split(
    train_df.drop(["target", "ID"], axis = 1), 
    train_df["target"].astype(int), 
    test_size=0.2,
    random_state=42,
    shuffle=False
)

In [25]:
# 모델 최적화를 위한 함수
def optimize_model(trial, model_name):
    if model_name == 'RandomForest':
        # RandomForest는 GPU 가속을 기본적으로 지원하지 않음
        n_estimators = trial.suggest_int('n_estimators', 50, 300)
        max_depth = trial.suggest_int('max_depth', 3, 20)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 4)
        class_weight = trial.suggest_categorical('class_weight', [None, 'balanced'])
        
        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            class_weight=class_weight,
            random_state=42
        )
        
    elif model_name == 'XGBoost':
        # XGBoost에서 GPU 사용
        n_estimators = trial.suggest_int('n_estimators', 50, 300)
        max_depth = trial.suggest_int('max_depth', 3, 20)
        learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
        gamma = trial.suggest_float('gamma', 0, 5)
        
        model = XGBClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate,
            gamma=gamma,
            use_label_encoder=False,
            eval_metric='mlogloss',
            tree_method = "hist", 
            device = "cuda",
            random_state=42
        )

    elif model_name == 'LightGBM':
        # LightGBM에서 GPU 사용
        n_estimators = trial.suggest_int('n_estimators', 50, 300)
        max_depth = trial.suggest_int('max_depth', 3, 20)
        learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
        num_leaves = trial.suggest_int('num_leaves', 20, 150)
        min_child_samples = trial.suggest_int('min_child_samples', 10, 100)
        class_weight = trial.suggest_categorical('class_weight', [None, 'balanced'])
        
        model = LGBMClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate,
            num_leaves=num_leaves,
            min_child_samples=min_child_samples,
            class_weight=class_weight,
            device='gpu',  # GPU 사용 설정
            random_state=42
        )
    
    elif model_name == 'CatBoost':
        # CatBoost에서 GPU 사용
        n_estimators = trial.suggest_int('n_estimators', 50, 300)
        depth = trial.suggest_int('depth', 3, 10)
        learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
        l2_leaf_reg = trial.suggest_float('l2_leaf_reg', 1, 10)
        class_weights = trial.suggest_categorical('class_weights', [None, [1, 10, 5, 20]])
        
        model = CatBoostClassifier(
            n_estimators=n_estimators,
            depth=depth,
            learning_rate=learning_rate,
            l2_leaf_reg=l2_leaf_reg,
            class_weights=class_weights,
            task_type='GPU',  # GPU 사용 설정
            verbose=0,
            random_state=42
        )
    
    # 교차 검증으로 모델 성능 평가
    score = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy').mean()
    return score

# 모델별로 Optuna 스터디 생성 및 최적화
def optimize_each_model(model_name):
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: optimize_model(trial, model_name), n_trials=10)
    return study

# 모델 리스트
models = ['RandomForest', 'XGBoost', 'CatBoost']
#models = ['XGBoost', 'CatBoost']
best_params = {}

# 각 모델에 대해 최적화 실행
for model_name in models:
    print(f"Optimizing {model_name}...")
    study = optimize_each_model(model_name)
    best_params[model_name] = study.best_trial.params

# 각 모델의 최적의 하이퍼파라미터 출력
for model_name, params in best_params.items():
    print(f"\nBest hyperparameters for {model_name}:")
    print(params)



[I 2024-09-21 19:07:06,305] A new study created in memory with name: no-name-c3c7b8a1-5154-4c47-8e40-f2bc50036a7d


Optimizing RandomForest...


[I 2024-09-21 19:07:22,238] Trial 0 finished with value: 0.4297945205479452 and parameters: {'n_estimators': 241, 'max_depth': 7, 'min_samples_split': 3, 'min_samples_leaf': 3, 'class_weight': None}. Best is trial 0 with value: 0.4297945205479452.
[I 2024-09-21 19:07:28,427] Trial 1 finished with value: 0.3545947488584475 and parameters: {'n_estimators': 55, 'max_depth': 20, 'min_samples_split': 5, 'min_samples_leaf': 3, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.4297945205479452.
[I 2024-09-21 19:08:01,629] Trial 2 finished with value: 0.3697203196347032 and parameters: {'n_estimators': 295, 'max_depth': 18, 'min_samples_split': 7, 'min_samples_leaf': 3, 'class_weight': None}. Best is trial 0 with value: 0.4297945205479452.
[I 2024-09-21 19:08:03,759] Trial 3 finished with value: 0.4367865296803653 and parameters: {'n_estimators': 63, 'max_depth': 3, 'min_samples_split': 8, 'min_samples_leaf': 3, 'class_weight': None}. Best is trial 3 with value: 0.4367865296803653.
[I

Optimizing XGBoost...


[I 2024-09-21 19:09:04,827] Trial 0 finished with value: 0.3056506849315069 and parameters: {'n_estimators': 82, 'max_depth': 5, 'learning_rate': 0.2974076094584698, 'gamma': 4.248281245297094}. Best is trial 0 with value: 0.3056506849315069.
[I 2024-09-21 19:09:07,897] Trial 1 finished with value: 0.3030821917808219 and parameters: {'n_estimators': 137, 'max_depth': 16, 'learning_rate': 0.2615218329894748, 'gamma': 0.30807554832246453}. Best is trial 0 with value: 0.3056506849315069.
[I 2024-09-21 19:09:10,138] Trial 2 finished with value: 0.30664954337899547 and parameters: {'n_estimators': 125, 'max_depth': 13, 'learning_rate': 0.246611600945172, 'gamma': 0.7605332590943314}. Best is trial 2 with value: 0.30664954337899547.
[I 2024-09-21 19:09:17,268] Trial 3 finished with value: 0.30336757990867574 and parameters: {'n_estimators': 79, 'max_depth': 15, 'learning_rate': 0.013223612703536055, 'gamma': 0.7847592091152678}. Best is trial 2 with value: 0.30664954337899547.
[I 2024-09-21 

Optimizing CatBoost...


[I 2024-09-21 19:09:39,838] Trial 0 finished with value: 0.3924086757990868 and parameters: {'n_estimators': 198, 'depth': 10, 'learning_rate': 0.2671628923445708, 'l2_leaf_reg': 6.196370391556606, 'class_weights': None}. Best is trial 0 with value: 0.3924086757990868.
[I 2024-09-21 19:09:45,987] Trial 1 finished with value: 0.4073915525114155 and parameters: {'n_estimators': 54, 'depth': 10, 'learning_rate': 0.1747521329270909, 'l2_leaf_reg': 3.9506128186670333, 'class_weights': None}. Best is trial 1 with value: 0.4073915525114155.
[I 2024-09-21 19:09:55,683] Trial 2 finished with value: 0.37642694063926935 and parameters: {'n_estimators': 246, 'depth': 8, 'learning_rate': 0.24051027814542253, 'l2_leaf_reg': 3.696014526206549, 'class_weights': None}. Best is trial 1 with value: 0.4073915525114155.
[I 2024-09-21 19:10:02,185] Trial 3 finished with value: 0.3859874429223744 and parameters: {'n_estimators': 131, 'depth': 8, 'learning_rate': 0.23538087626568446, 'l2_leaf_reg': 3.82198973


Best hyperparameters for RandomForest:
{'n_estimators': 63, 'max_depth': 3, 'min_samples_split': 8, 'min_samples_leaf': 3, 'class_weight': None}

Best hyperparameters for XGBoost:
{'n_estimators': 82, 'max_depth': 17, 'learning_rate': 0.022048296942747566, 'gamma': 3.641654414667259}

Best hyperparameters for CatBoost:
{'n_estimators': 54, 'depth': 10, 'learning_rate': 0.1747521329270909, 'l2_leaf_reg': 3.9506128186670333, 'class_weights': None}


In [29]:
def soft_voting(predictions):
    voting_result = np.argmax(np.sum(predictions, axis=0), axis=1)
    return voting_result

In [33]:
# 모델 학습 및 예측 함수
def train_and_predict(model_name, best_params):
    model = None  # 모델을 None으로 초기화
    
    if model_name == 'RandomForest':
        model = RandomForestClassifier(
            **best_params,
            random_state=42
        )
    
    elif model_name == 'XGBoost':
        model = XGBClassifier(
            **best_params,
            use_label_encoder=False,
            eval_metric='mlogloss',
            tree_method='gpu_hist',  # GPU 사용 설정
            random_state=42
        )
    
    elif model_name == 'LightGBM':
        model = LGBMClassifier(
            **best_params,
            device='gpu',  # GPU 사용 설정
            random_state=42
        )
    
    elif model_name == 'CatBoost':
        model = CatBoostClassifier(
            **best_params,
            task_type='GPU',  # GPU 사용 설정
            verbose=0,
            random_state=42
        )
    
    if model is not None:  # 모델이 None이 아닐 때만 학습
        # 모델 학습
        model.fit(X_train, y_train)

        # 테스트 세트에 대한 예측
        y_pred_proba = model.predict_proba(X_valid)
        return y_pred_proba
    else:
        raise ValueError(f"Unsupported model name: {model_name}")



# 각 모델에 대해 학습 및 예측 수행
ensemble_list = []
for model_name, params in best_params.items():
    print(f"\nTraining and predicting with {model_name} using best parameters...")
    predictions = train_and_predict(model_name, params)
    ensemble_list.append(predictions)
    # # 예측 결과 출력
    # print(f"Predictions for {model_name}: {predictions[:10]}")  # 첫 10개 예측 결과 출력

ensemble_preds = soft_voting(ensemble_list)

# score check
accuracy = accuracy_score(y_valid, ensemble_preds)



Training and predicting with RandomForest using best parameters...

Training and predicting with XGBoost using best parameters...



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"




Training and predicting with CatBoost using best parameters...


In [35]:
accuracy

0.4457762557077626

In [26]:
# performance 체크후 전체 학습 데이터로 다시 재학습
x_train = train_df.drop(["target", "ID"], axis = 1)
y_train = train_df["target"].astype(int)
train_data = lgb.Dataset(x_train, label=y_train)
lgb_model = lgb.train(
    params=params,
    train_set=train_data,
)



### Inference

In [156]:
# lgb predict
y_test_pred = lgb_model.predict(test_df.drop(["target", "ID"], axis = 1))
y_test_pred_class = np.argmax(y_test_pred, axis = 1)

### Output File Save

In [157]:
# output file 할당후 save 
submission_df = submission_df.assign(target = y_test_pred_class)
submission_df.to_csv("output.csv", index=False)

In [26]:
pip install optuna


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
