### Library Import

In [237]:
import os
from typing import List, Dict
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import lightgbm as lgb
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold

import optuna
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

### Data Load

In [238]:
# Load the competition files
root_path = '/data/ephemeral/home/level1-classificationinmachinelearning-recsys-01'  # NOTE(review): machine-specific absolute path; adjust when running elsewhere
data_path: str = os.path.join(root_path, 'data')
train_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "train.csv")).assign(_type="train") # tag train rows with _type = "train"
test_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv")).assign(_type="test") # tag test rows with _type = "test"
submission_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv")) # second read of test.csv, kept as the submission template (original note: it holds only the ID and target columns)
# Stack train and test so the hourly feature files can be merged onto both at once
df: pd.DataFrame = pd.concat([train_df, test_df], axis=0)

In [239]:
# Collect the names of every HOURLY_*.csv file found in data_path
file_names: List[str] = [
    name
    for name in os.listdir(data_path)
    if name.startswith("HOURLY_") and name.endswith(".csv")
]

# Map each file stem (name without ".csv") to its loaded DataFrame
file_dict: Dict[str, pd.DataFrame] = {
    name.replace(".csv", ""): pd.read_csv(os.path.join(data_path, name))
    for name in file_names
}

for _file_name, _df in tqdm(file_dict.items()):
    # Prefix every column with the lower-cased file stem to avoid name clashes;
    # the datetime column becomes "ID" so it can serve as the merge key.
    _prefix = _file_name.lower()
    _rename_rule = {
        col: ("ID" if col == "datetime" else f"{_prefix}_{col.lower()}")
        for col in _df.columns
    }
    _df = _df.rename(columns=_rename_rule)
    # Left join keeps every train/test row even when a file lacks that hour
    df = df.merge(_df, on="ID", how="left")


100%|██████████| 107/107 [00:03<00:00, 34.95it/s]


### Data Preprocessing

#### 결측치 처리

In [240]:
# Missing-value check on the training rows only

train_df = df.loc[df["_type"] == "train"]

# Per-column count of missing values and the matching percentage of train rows
missing_values = train_df.isnull().sum()
missing_percentage = missing_values / len(train_df) * 100

# Columns ordered from most to least missing
sorted_missing_percentage = missing_percentage.sort_values(ascending=False)

# Keep every column that is not 100% missing, then take out the three
# bookkeeping columns so that only feature names remain.
_not_fully_missing = sorted_missing_percentage.loc[sorted_missing_percentage != 100.0]
non_missing_columns = _not_fully_missing.index.drop(['ID', 'target', '_type']).tolist()

In [241]:
# Training frame restricted to the bookkeeping columns plus the usable feature columns
new_data = train_df[['ID','target', '_type'] + non_missing_columns]


In [242]:
# Impute missing feature values in the training rows
new_df_stab = new_data[non_missing_columns]

# Step 1: fill each gap with the trailing rolling mean of that column
# (window of `window_size` rows; NaNs inside the window are skipped).
window_size = 3
new_df_stab = new_df_stab.apply(lambda col: col.fillna(col.rolling(window=window_size, min_periods=1).mean()))
# Step 2: gaps longer than the window are still NaN, so carry the last known
# value forward and then the first known value backward.
# `.ffill()/.bfill()` replace the deprecated `fillna(method=...)` spelling.
new_df_stab = new_df_stab.ffill().bfill()


# Re-attach the bookkeeping columns to the imputed features
new_train_df = pd.concat([new_data[['ID','target','_type']], new_df_stab], axis=1)

  new_df_stab = new_df_stab.fillna(method='ffill').fillna(method='bfill')


In [243]:
# Sanity check: print the first feature column that still holds NaN or inf
for col in non_missing_columns:
    _values = new_train_df[col]
    _has_nan = np.isnan(_values).any()
    _has_inf = np.isinf(_values).any()
    if _has_nan | _has_inf:
        print(col)
        break

In [244]:
# Apply the same imputation to the test rows, using the feature list chosen on train
test_df = df.loc[df["_type"] == "test"]
new_test_df = test_df[['ID','target','_type'] + non_missing_columns]

new_test_stab = new_test_df[non_missing_columns]
# Step 1: trailing rolling-mean fill; step 2: forward fill then backward fill.
# `.ffill()/.bfill()` replace the deprecated `fillna(method=...)` spelling.
window_size = 3
new_test_stab = new_test_stab.apply(lambda col: col.fillna(col.rolling(window=window_size, min_periods=1).mean()))
new_test_stab = new_test_stab.ffill().bfill()

# Re-attach the bookkeeping columns to the imputed features
new_test_df = pd.concat([new_test_df[['ID','target','_type']], new_test_stab], axis=1)

  new_test_stab = new_test_stab.fillna(method='ffill').fillna(method='bfill')


In [245]:
# Percentage of missing values per column in the imputed test frame
missing_percentage = new_test_df.isnull().mean() * 100

# Columns whose missing ratio is at least 50%.
# NOTE(review): the original comments and the variable name say "100% missing",
# but the threshold in the code is 50. After the ffill/bfill above a feature column
# should only keep NaNs when it has no value at all in test, so the two readings
# presumably select the same feature columns here - confirm.
columns_with_all_missing = missing_percentage[missing_percentage >= 50].index.tolist()

# Keep `target` and the spot close price out of the drop list even though they
# pass the threshold; the close price is re-created for test rows later on.
columns_with_all_missing = [col for col in columns_with_all_missing if col not in ['target', 'hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close']]
columns_with_all_missing

['hourly_market-data_open-interest_binance_btc_busd_open_interest',
 'hourly_market-data_liquidations_binance_btc_busd_long_liquidations',
 'hourly_market-data_liquidations_binance_btc_busd_short_liquidations',
 'hourly_market-data_liquidations_binance_btc_busd_long_liquidations_usd',
 'hourly_market-data_liquidations_binance_btc_busd_short_liquidations_usd',
 'hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_volume']

In [246]:
# Drop the columns listed in columns_with_all_missing from both the train and test frames
# (errors='ignore' tolerates names that are absent from a frame).
new_train_df = new_train_df.drop(columns=columns_with_all_missing, errors='ignore')
new_test_df = new_test_df.drop(columns=columns_with_all_missing, errors='ignore')

#### 이상치 처리

In [247]:
# 이동평균을 기반으로 이상치를 처리하는 함수
def replace_outlier(df, window=3, threshold=2, exclude=("target",)):
    """Replace rolling-window outliers in numeric columns with the rolling mean.

    For every numeric column (except those in ``exclude``) a trailing rolling
    mean and sample standard deviation are computed. A value is an outlier when
    it lies more than ``threshold`` rolling standard deviations away from the
    rolling mean of the window ending at that row; such values are overwritten
    with that rolling mean.

    Args:
        df: Input frame. It is not modified; a copy is returned.
        window: Rolling window length in rows (``min_periods=1``).
        threshold: Multiple of the rolling standard deviation beyond which a
            value counts as an outlier.
        exclude: Column names that must never be altered. Defaults to the
            ``target`` label column, which is numeric but is not a feature.

    Returns:
        A cleaned copy of ``df``.

    Note:
        Within a window of ``n`` values a single value can deviate from the
        window mean by at most ``(n - 1) / sqrt(n)`` sample standard deviations.
        For the default ``window=3`` that bound is about 1.155, so the default
        ``threshold=2`` can never be exceeded and the call returns the data
        unchanged. Use a larger window or a smaller threshold to actually
        replace values.
    """
    df_cleaned = df.copy()

    # Numeric columns in their original order, minus the protected ones
    numeric_columns = df_cleaned.select_dtypes(include=[np.number]).columns
    numeric_columns = numeric_columns.difference(list(exclude), sort=False)

    for column in numeric_columns:
        series = df_cleaned[column]

        # Trailing rolling statistics; the first row has std = NaN and is never flagged
        rolling = series.rolling(window=window, min_periods=1)
        rolling_mean = rolling.mean()
        rolling_std = rolling.std()

        # Outlier mask
        outliers = (series - rolling_mean).abs() > (threshold * rolling_std)

        # Overwrite the flagged values with the rolling mean at the same rows
        df_cleaned.loc[outliers, column] = rolling_mean[outliers]

    return df_cleaned

# Apply the rolling-mean outlier replacement to the train and test frames.
# NOTE(review): with the function defaults (window=3, threshold=2) a value can be at
# most 2/sqrt(3) ~= 1.155 rolling standard deviations from its window mean, so these
# calls appear to leave the data unchanged - confirm whether other settings were intended.
cleaned_train_df = replace_outlier(new_train_df)
cleaned_test_df = replace_outlier(new_test_df)

#### 정규화

In [248]:
# Standardization으로 정규화
from sklearn.preprocessing import StandardScaler

def standardization(train_df, test_df):
    """Standardize the feature columns of both frames with train statistics.

    Every column except ``ID``, ``target`` and ``_type`` is scaled. The scaler
    is fitted on ``train_df`` only and then applied to both frames, so the test
    data does not influence the scaling parameters.

    Args:
        train_df: Training frame; defines the feature list and the scaler fit.
        test_df: Test frame containing the same feature columns.

    Returns:
        Tuple ``(train_df_scaled, test_df_scaled)`` of scaled copies.
    """
    bookkeeping = ('ID', 'target', '_type')
    features_to_scale = [name for name in train_df.columns if name not in bookkeeping]

    # Fit on train only
    scaler = StandardScaler().fit(train_df[features_to_scale])

    # Work on copies so the caller's frames stay untouched
    train_df_scaled, test_df_scaled = train_df.copy(), test_df.copy()
    train_df_scaled[features_to_scale] = scaler.transform(train_df[features_to_scale])
    test_df_scaled[features_to_scale] = scaler.transform(test_df[features_to_scale])

    return train_df_scaled, test_df_scaled

# 함수 호출
std_train_df, std_test_df = standardization(cleaned_train_df, cleaned_test_df)

#### price 예측

In [249]:
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_squared_error

# # 타겟과 피처 설정
# y_train = std_train_df['hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close']
# X_train = std_train_df.drop(columns=['hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close', 'ID', 'target', '_type'], errors='ignore')

# # 훈련 데이터와 검증 데이터 나누기
# X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# # 모델 훈련
# model = RandomForestRegressor()
# model.fit(X_train_split, y_train_split)

# # 검증 데이터에서 예측
# y_val_pred = model.predict(X_val)

# # 성능 평가
# mse = mean_squared_error(y_val, y_val_pred)
# print("Mean Squared Error on Validation Set:", mse)

# # 모델 훈련
# model = RandomForestRegressor()
# model.fit(X_train, y_train)

# # test_df에서 예측
# X_test = std_test_df.drop(columns=['hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close', 'ID', 'target', '_type'], errors='ignore')
# y_pred = model.predict(X_test)

# # new_test_df에 y_pred 값을 추가
# std_test_df['hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close'] = y_pred

# # 결과 확인
# std_test_df.head()

In [250]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb

# Regression target (standardized spot close price) and features (all other columns)
y_train = std_train_df['hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close']
X_train = std_train_df.drop(columns=['hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close', 'ID', 'target', '_type'], errors='ignore')

# Hold out 20% of the training rows for validation.
# NOTE(review): this is a shuffled split of hourly data, so validation rows are
# interleaved in time with training rows; the reported MSE may be optimistic.
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# XGBoost regressor on the GPU. `tree_method='gpu_hist'` / `gpu_id` are deprecated;
# the supported spelling is tree_method='hist' together with device='cuda'.
model = xgb.XGBRegressor(tree_method='hist', device='cuda')

# Fit on the training part
model.fit(X_train_split, y_train_split)

# Predict the held-out part
y_val_pred = model.predict(X_val)

# Validation error (in standardized units)
mse = mean_squared_error(y_val, y_val_pred)
print("Mean Squared Error on Validation Set:", mse)

# Refit a fresh model on all training rows
model = xgb.XGBRegressor(tree_method='hist', device='cuda')
model.fit(X_train, y_train)

# Predict the close price for the test rows from the same feature set
X_test = std_test_df.drop(columns=['hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close', 'ID', 'target', '_type'], errors='ignore')
y_pred = model.predict(X_test)

# Store the predicted close price in the test frame
std_test_df['hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close'] = y_pred

# Inspect the result
std_test_df.head()


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



Mean Squared Error on Validation Set: 0.000915254356828649



    E.g. tree_method = "hist", device = "cuda"



Unnamed: 0,ID,target,_type,hourly_market-data_funding-rates_bybit_funding_rates,hourly_market-data_taker-buy-sell-stats_bybit_taker_sell_volume,hourly_market-data_taker-buy-sell-stats_bybit_taker_buy_ratio,hourly_market-data_taker-buy-sell-stats_bybit_taker_sell_ratio,hourly_market-data_taker-buy-sell-stats_bybit_taker_buy_sell_ratio,hourly_market-data_taker-buy-sell-stats_bybit_taker_buy_volume,hourly_network-data_block-interval_block_interval,...,hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_ratio,hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_sell_ratio,hourly_market-data_open-interest_binance_all_symbol_open_interest,hourly_market-data_open-interest_htx_global_all_symbol_open_interest,hourly_market-data_funding-rates_binance_funding_rates,hourly_network-data_fees-transaction_fees_transaction_median,hourly_network-data_fees-transaction_fees_transaction_median_usd,hourly_network-data_addresses-count_addresses_count_active,hourly_network-data_addresses-count_addresses_count_sender,hourly_network-data_addresses-count_addresses_count_receiver
8760,2024-01-01 00:00:00,,test,0.862267,-0.368791,1.123435,-1.106685,0.628514,0.047456,0.999757,...,-1.385256,1.425892,0.854505,1.264561,4.297192,2.646857,2.993445,-0.741689,-0.333541,-1.275562
8761,2024-01-01 01:00:00,,test,1.129189,-0.359145,1.203588,-1.186934,0.709193,0.106802,-0.966605,...,-0.715146,0.640422,0.883924,1.370697,4.114355,2.044553,2.346136,1.686148,2.749899,-0.856214
8762,2024-01-01 02:00:00,,test,1.435859,-0.671636,-0.004335,0.022434,-0.1733,-0.671125,-0.474909,...,0.588947,-0.624704,0.866097,1.359895,3.984288,1.901846,2.198961,0.309516,0.872912,-0.958328
8763,2024-01-01 03:00:00,,test,1.686091,0.141968,-1.494569,1.51445,-0.737805,-0.41543,-0.632608,...,1.615492,-1.434604,0.813484,1.330135,3.890104,1.786898,2.070217,-0.389254,0.519154,-1.844483
8764,2024-01-01 04:00:00,,test,1.702782,-0.606351,2.824104,-2.809388,4.499324,0.773042,-0.265336,...,-0.435374,0.342354,0.844352,1.359704,3.678164,1.907124,2.187542,0.489017,1.39411,-1.329516


In [251]:
# Stack the standardized train and test frames again (train rows first) with a fresh 0..n-1 index
df = pd.concat([std_train_df, std_test_df], ignore_index=True)
df.head()

Unnamed: 0,ID,target,_type,hourly_market-data_funding-rates_bybit_funding_rates,hourly_market-data_taker-buy-sell-stats_bybit_taker_sell_volume,hourly_market-data_taker-buy-sell-stats_bybit_taker_buy_ratio,hourly_market-data_taker-buy-sell-stats_bybit_taker_sell_ratio,hourly_market-data_taker-buy-sell-stats_bybit_taker_buy_sell_ratio,hourly_market-data_taker-buy-sell-stats_bybit_taker_buy_volume,hourly_network-data_block-interval_block_interval,...,hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_ratio,hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_sell_ratio,hourly_market-data_open-interest_binance_all_symbol_open_interest,hourly_market-data_open-interest_htx_global_all_symbol_open_interest,hourly_market-data_funding-rates_binance_funding_rates,hourly_network-data_fees-transaction_fees_transaction_median,hourly_network-data_fees-transaction_fees_transaction_median_usd,hourly_network-data_addresses-count_addresses_count_active,hourly_network-data_addresses-count_addresses_count_sender,hourly_network-data_addresses-count_addresses_count_receiver
0,2023-01-01 00:00:00,2.0,train,0.242277,-0.790786,0.677313,-0.66003,0.248331,-0.739005,-0.669584,...,0.500339,-0.547936,-2.297635,-0.680018,0.379556,-0.527308,-0.508875,0.648656,0.364113,0.946969
1,2023-01-01 01:00:00,1.0,train,0.242277,-0.8129,2.230195,-2.214769,2.351257,-0.555248,0.215773,...,-2.71339,3.364286,-2.290755,-0.649471,0.379556,-0.526721,-0.508594,-1.300892,-1.394462,-0.773384
2,2023-01-01 02:00:00,1.0,train,0.242277,-0.783052,-1.185413,1.204924,-0.64682,-0.844533,-0.824657,...,-0.18768,0.091498,-2.288459,-0.656938,0.379556,-0.52809,-0.509145,-1.128637,-1.014429,-0.889786
3,2023-01-01 03:00:00,1.0,train,0.242277,-0.841254,0.671624,-0.654333,0.244119,-0.809874,-0.07015,...,0.684034,-0.705774,-2.28858,-0.654821,0.379556,-0.52809,-0.509212,-1.190157,-1.459339,-0.447158
4,2023-01-01 04:00:00,2.0,train,0.242277,-0.660023,-0.369732,0.388268,-0.346476,-0.701877,0.017596,...,0.141937,-0.224818,-2.285513,-0.635397,0.379556,-0.526135,-0.508374,-0.540603,-1.043804,0.343477


### Feature engineering

In [252]:
# # 모델에 사용할 컬럼, 컬럼의 rename rule을 미리 할당함
# cols_dict: Dict[str, str] = {
#     "ID": "ID",
#     "target": "target",
#     "_type": "_type",
#     "hourly_market-data_coinbase-premium-index_coinbase_premium_gap": "coinbase_premium_gap",
#     "hourly_market-data_coinbase-premium-index_coinbase_premium_index": "coinbase_premium_index",
#     "hourly_market-data_funding-rates_all_exchange_funding_rates": "funding_rates",
#     "hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations": "long_liquidations",
#     "hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations_usd": "long_liquidations_usd",
#     "hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations": "short_liquidations",
#     "hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations_usd": "short_liquidations_usd",
#     "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_ratio": "buy_ratio",
#     "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_sell_ratio": "buy_sell_ratio",
#     "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_volume": "buy_volume",
#     "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_ratio": "sell_ratio",
#     "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_volume": "sell_volume",
#     "hourly_network-data_addresses-count_addresses_count_active": "active_count",
#     "hourly_network-data_addresses-count_addresses_count_receiver": "receiver_count",
#     "hourly_network-data_addresses-count_addresses_count_sender": "sender_count",
#     "hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close" : "close",
# }
# df = df[cols_dict.keys()].rename(cols_dict, axis=1)
# df.shape

In [253]:
# Columns used by the model, mapped to the short names they are renamed to
cols_dict: Dict[str, str] = {
    "ID": "ID",
    "target": "target",
    "_type": "_type",
    "hourly_market-data_coinbase-premium-index_coinbase_premium_gap": "coinbase_premium_gap",
    "hourly_market-data_coinbase-premium-index_coinbase_premium_index": "coinbase_premium_index",
    "hourly_market-data_funding-rates_all_exchange_funding_rates": "funding_rates",
    "hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations": "long_liquidations",
    "hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations": "short_liquidations",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_ratio": "buy_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_sell_ratio": "buy_sell_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_volume": "buy_volume",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_ratio": "sell_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_volume": "sell_volume",
    "hourly_network-data_addresses-count_addresses_count_active": "active_count",
    "hourly_network-data_addresses-count_addresses_count_receiver": "receiver_count",
    "hourly_network-data_addresses-count_addresses_count_sender": "sender_count",
    "hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close" : "close",
}
# Keep only the selected columns (in dictionary order) and apply the short names
df = df[cols_dict.keys()].rename(cols_dict, axis=1)
df.shape

(11552, 17)

In [254]:
# # eda 에서 파악한 차이와 차이의 음수, 양수 여부를 새로운 피쳐로 생성
# df = df.assign(
#     liquidation_diff=df["long_liquidations"] - df["short_liquidations"],
#     liquidation_usd_diff=df["long_liquidations_usd"] - df["short_liquidations_usd"],
#     volume_diff=df["buy_volume"] - df["sell_volume"],
#     liquidation_diffg=np.sign(df["long_liquidations"] - df["short_liquidations"]),
#     liquidation_usd_diffg=np.sign(df["long_liquidations_usd"] - df["short_liquidations_usd"]),
#     volume_diffg=np.sign(df["buy_volume"] - df["sell_volume"]),
#     buy_sell_volume_ratio=df["buy_volume"] / (df["sell_volume"] + 1),
#     close_diff = df['close'].diff().fillna(0),
#     close_diffg = np.sign(df['close'].diff().fillna(0))
# )
# # category, continuous 열을 따로 할당해둠
# category_cols: List[str] = ["liquidation_diffg", "liquidation_usd_diffg", "volume_diffg", "close_diffg"]
# conti_cols: List[str] = [_ for _ in cols_dict.values() if _ not in ["ID", "target", "_type"]] + [
#     "buy_sell_volume_ratio",
#     "liquidation_diff",
#     "liquidation_usd_diff",
#     "volume_diff",
#     "close_diff"
# ]

In [255]:
# Derived features suggested by the EDA: differences and the sign of each difference
_liquidation_gap = df["long_liquidations"] - df["short_liquidations"]
_volume_gap = df["buy_volume"] - df["sell_volume"]
_close_step = df['close'].diff().fillna(0)  # first row has no predecessor -> 0

df = df.assign(
    liquidation_diff=_liquidation_gap,
    volume_diff=_volume_gap,
    liquidation_diffg=np.sign(_liquidation_gap),
    volume_diffg=np.sign(_volume_gap),
    # NOTE(review): the inputs were standardized earlier, so sell_volume + 1 can be
    # close to zero or negative - confirm this ratio behaves as intended.
    buy_sell_volume_ratio=df["buy_volume"] / (df["sell_volume"] + 1),
    close_diff=_close_step,
    close_diffg=np.sign(_close_step),
)

# Names of the categorical (sign) features and of the continuous features
category_cols: List[str] = ["liquidation_diffg", "volume_diffg", "close_diffg"]
_base_conti_cols = [
    short_name
    for short_name in cols_dict.values()
    if short_name not in ["ID", "target", "_type"]
]
conti_cols: List[str] = _base_conti_cols + [
    "buy_sell_volume_ratio",
    "liquidation_diff",
    "volume_diff",
    "close_diff",
]

In [256]:
def shift_feature(
    df: pd.DataFrame,
    conti_cols: List[str],
    intervals: List[int],
) -> List[pd.Series]:
    """
    Build lagged (shifted) copies of continuous columns.

    Args:
        df (pd.DataFrame): source frame
        conti_cols (List[str]): names of the continuous columns to shift
        intervals (List[int]): shift sizes in rows
    Return:
        List[pd.Series]: one series per (column, interval) pair, ordered by
        column and then by interval, each named "{column}_{interval}"
    """
    shifted_series: List[pd.Series] = []
    for conti_col in conti_cols:
        source = df[conti_col]
        for interval in intervals:
            lagged = source.shift(interval)
            shifted_series.append(lagged.rename(f"{conti_col}_{interval}"))
    return shifted_series

# Build shift features for lags of 1 through 23 rows (hours).
# NOTE(review): the original comment says "up to 24 hours", but range(1, 24) stops at 23.
shift_list = shift_feature(
    df=df, conti_cols=conti_cols, intervals=[_ for _ in range(1, 24)]
)

In [257]:
# Append all shifted series as new columns in a single concat
df = pd.concat([df, pd.concat(shift_list, axis=1)], axis=1)
df

Unnamed: 0,ID,target,_type,coinbase_premium_gap,coinbase_premium_index,funding_rates,long_liquidations,short_liquidations,buy_ratio,buy_sell_ratio,...,close_diff_14,close_diff_15,close_diff_16,close_diff_17,close_diff_18,close_diff_19,close_diff_20,close_diff_21,close_diff_22,close_diff_23
0,2023-01-01 00:00:00,2.0,train,-0.661992,-0.822218,-0.197707,-0.266362,-0.258088,-0.500339,-0.547936,...,,,,,,,,,,
1,2023-01-01 01:00:00,1.0,train,-0.620076,-0.758944,-0.197719,-0.266509,-0.249062,2.713390,3.364286,...,,,,,,,,,,
2,2023-01-01 02:00:00,1.0,train,-0.651513,-0.805945,-0.197788,-0.266509,-0.258088,0.187680,0.091498,...,,,,,,,,,,
3,2023-01-01 03:00:00,1.0,train,-0.657335,-0.815147,-0.196080,-0.259268,-0.258088,-0.684034,-0.705774,...,,,,,,,,,,
4,2023-01-01 04:00:00,2.0,train,-0.672860,-0.838727,-0.089212,-0.262101,-0.258088,-0.141937,-0.224818,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11547,2024-04-26 03:00:00,,test,-0.338690,-0.273439,0.209904,-0.257839,-0.255001,1.260415,1.271252,...,-0.017466,-0.007959,0.081678,-0.073049,-0.013958,0.083300,-0.042936,0.002144,-0.029473,0.036839
11548,2024-04-26 04:00:00,,test,-0.734571,-0.425706,0.209904,-0.186195,-0.256237,1.260415,1.271252,...,0.016902,-0.017466,-0.007959,0.081678,-0.073049,-0.013958,0.083300,-0.042936,0.002144,-0.029473
11549,2024-04-26 05:00:00,,test,-0.207506,-0.223210,0.209904,-0.244564,-0.191959,1.260415,1.271252,...,-0.027555,0.016902,-0.017466,-0.007959,0.081678,-0.073049,-0.013958,0.083300,-0.042936,0.002144
11550,2024-04-26 06:00:00,,test,-0.358872,-0.281180,0.209904,-0.256703,-0.237095,1.260415,1.271252,...,-0.018767,-0.027555,0.016902,-0.017466,-0.007959,0.081678,-0.073049,-0.013958,0.083300,-0.042936


In [258]:
# Fill the gaps left by the shift features in every numeric column EXCEPT the target:
# linear interpolation first, then backward fill for the leading rows.
# The target is left untouched so that the unlabeled test rows keep target = NaN
# instead of receiving interpolated labels. Only numeric columns are passed to
# interpolate(), because interpolating object columns (ID, _type) is deprecated.
_target = df["target"]
_numeric_features = df.select_dtypes(include=[np.number]).columns.difference(["target"], sort=False)
df = df.copy()
df[_numeric_features] = df[_numeric_features].interpolate(method='linear').bfill()

  df = df.interpolate(method='linear').bfill()


In [259]:
# Labeled (train) rows of the feature-engineered frame
train_df = df[df['_type'] == 'train']
train_df

Unnamed: 0,ID,target,_type,coinbase_premium_gap,coinbase_premium_index,funding_rates,long_liquidations,short_liquidations,buy_ratio,buy_sell_ratio,...,close_diff_14,close_diff_15,close_diff_16,close_diff_17,close_diff_18,close_diff_19,close_diff_20,close_diff_21,close_diff_22,close_diff_23
0,2023-01-01 00:00:00,2.0,train,-0.661992,-0.822218,-0.197707,-0.266362,-0.258088,-0.500339,-0.547936,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,2023-01-01 01:00:00,1.0,train,-0.620076,-0.758944,-0.197719,-0.266509,-0.249062,2.713390,3.364286,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,2023-01-01 02:00:00,1.0,train,-0.651513,-0.805945,-0.197788,-0.266509,-0.258088,0.187680,0.091498,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,2023-01-01 03:00:00,1.0,train,-0.657335,-0.815147,-0.196080,-0.259268,-0.258088,-0.684034,-0.705774,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,2023-01-01 04:00:00,2.0,train,-0.672860,-0.838727,-0.089212,-0.262101,-0.258088,-0.141937,-0.224818,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2023-12-31 19:00:00,1.0,train,-1.331108,-0.859934,2.729701,-0.264518,-0.256769,0.615049,0.531909,...,-0.003977,0.009115,0.013945,-0.014627,-0.031400,0.035933,0.000929,-0.027905,0.003878,-0.012733
8756,2023-12-31 20:00:00,1.0,train,-1.239124,-0.807449,2.810331,0.096144,0.297665,0.492640,0.402056,...,0.008150,-0.003977,0.009115,0.013945,-0.014627,-0.031400,0.035933,0.000929,-0.027905,0.003878
8757,2023-12-31 21:00:00,0.0,train,-1.384668,-0.892440,2.859677,-0.262540,-0.236803,0.363656,0.268503,...,0.049748,0.008150,-0.003977,0.009115,0.013945,-0.014627,-0.031400,0.035933,0.000929,-0.027905
8758,2023-12-31 22:00:00,2.0,train,-0.631719,-0.456551,2.883867,0.836058,-0.159441,-2.094913,-1.768893,...,-0.011899,0.049748,0.008150,-0.003977,0.009115,0.013945,-0.014627,-0.031400,0.035933,0.000929


In [260]:
# First split: 60% of the labeled rows for training, 40% held back (x_tmp / y_tmp).
# Features are every column except target, ID and _type; labels are cast to int.
# NOTE(review): rows are shuffled although the features contain lagged values of
# neighbouring hours, so held-out rows are not independent of training rows.
x_train, x_tmp, y_train, y_tmp = train_test_split(
    train_df.drop(["target", "ID", '_type'], axis = 1), 
    train_df["target"].astype(int), 
    test_size=0.4,
    random_state=42,
)

In [261]:
# Second split of the held-back 40%: 80% of it becomes the validation set and 20% the
# local test set, i.e. roughly 32% and 8% of all labeled rows.
x_valid, x_test, y_valid, y_test = train_test_split(
    x_tmp, 
    y_tmp, 
    test_size=0.2,
    random_state=42,
)

In [263]:
from imblearn.over_sampling import SMOTE
# Class distribution of the training labels before oversampling
print("원본 train 데이터 클래스 분포:")
print(y_train.value_counts(normalize=True))

# Oversample the training split with SMOTE.
# NOTE(review): X_train_resampled / y_train_resampled are not referenced by any of the
# later cells visible in this notebook; the models below are fitted on other variables.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

# Class distribution after oversampling
print("\nSMOTE 적용 후 train 데이터 클래스 분포:")
print(y_train_resampled.value_counts(normalize=True))


원본 train 데이터 클래스 분포:
target
2    0.417428
1    0.411530
3    0.088661
0    0.082382
Name: proportion, dtype: float64

SMOTE 적용 후 train 데이터 클래스 분포:
target
1    0.25
3    0.25
2    0.25
0    0.25
Name: proportion, dtype: float64


### Model Training

In [264]:
# Baseline LightGBM model on the train / validation split created above

# lgb dataset
train_data = lgb.Dataset(x_train, label=y_train)
valid_data = lgb.Dataset(x_valid, label=y_valid, reference=train_data)

# lgb params (4-class classification, scored with multi-class log loss)
params = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "metric": "multi_logloss",
    "num_class": 4,
    "num_leaves": 50,
    "learning_rate": 0.05,
    "n_estimators": 30,
    "random_state": 42,
    "verbose": 0,
}

# lgb train
lgb_model = lgb.train(
    params=params,
    train_set=train_data,
    valid_sets=valid_data,
)

# lgb predict: per-class probabilities, then the most probable class per row
y_valid_pred = lgb_model.predict(x_valid)
y_valid_pred_class = np.argmax(y_valid_pred, axis = 1)

# score check: accuracy on the predicted classes, one-vs-rest AUROC on the probabilities
accuracy = accuracy_score(y_valid, y_valid_pred_class)
auroc = roc_auc_score(y_valid, y_valid_pred, multi_class="ovr")

print(f"acc: {accuracy}, auroc: {auroc}")



acc: 0.45272921869425614, auroc: 0.6426399101387823


In [61]:
# 모델 최적화를 위한 함수
def optimize_model(trial, model_name):
    """Optuna objective: sample hyperparameters and return the CV accuracy.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        model_name: One of 'RandomForest', 'XGBoost', 'LightGBM', 'CatBoost'.

    Returns:
        Mean accuracy of a 3-fold cross-validation on the classification
        training split (``x_train`` / ``y_train``).

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    if model_name == 'RandomForest':
        # RandomForest has no built-in GPU acceleration
        n_estimators = trial.suggest_int('n_estimators', 50, 300)
        max_depth = trial.suggest_int('max_depth', 3, 20)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 4)
        class_weight = trial.suggest_categorical('class_weight', [None, 'balanced'])

        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            class_weight=class_weight,
            random_state=42
        )

    elif model_name == 'XGBoost':
        # XGBoost on the GPU
        n_estimators = trial.suggest_int('n_estimators', 50, 300)
        max_depth = trial.suggest_int('max_depth', 3, 20)
        learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
        gamma = trial.suggest_float('gamma', 0, 5)

        model = XGBClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate,
            gamma=gamma,
            use_label_encoder=False,
            eval_metric='mlogloss',
            tree_method = "hist", 
            device = "cuda",
            random_state=42
        )

    elif model_name == 'LightGBM':
        # LightGBM on the GPU
        n_estimators = trial.suggest_int('n_estimators', 50, 300)
        max_depth = trial.suggest_int('max_depth', 3, 20)
        learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
        num_leaves = trial.suggest_int('num_leaves', 20, 150)
        min_child_samples = trial.suggest_int('min_child_samples', 10, 100)
        class_weight = trial.suggest_categorical('class_weight', [None, 'balanced'])

        model = LGBMClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate,
            num_leaves=num_leaves,
            min_child_samples=min_child_samples,
            class_weight=class_weight,
            device='gpu',  # train on the GPU
            random_state=42
        )

    elif model_name == 'CatBoost':
        # CatBoost on the GPU
        n_estimators = trial.suggest_int('n_estimators', 50, 300)
        depth = trial.suggest_int('depth', 3, 10)
        learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
        l2_leaf_reg = trial.suggest_float('l2_leaf_reg', 1, 10)
        class_weights = trial.suggest_categorical('class_weights', [None, [1, 10, 5, 20]])

        model = CatBoostClassifier(
            n_estimators=n_estimators,
            depth=depth,
            learning_rate=learning_rate,
            l2_leaf_reg=l2_leaf_reg,
            class_weights=class_weights,
            task_type='GPU',  # train on the GPU
            verbose=0,
            random_state=42
        )

    else:
        # Fail with a clear message instead of an unbound `model` further down
        raise ValueError(f"Unsupported model name: {model_name}")

    # Score with 3-fold cross-validation on the classification training split.
    # x_train / y_train come from the same train_test_split call, so their rows
    # are aligned; the upper-case X_train in this notebook is the feature matrix
    # of the close-price regression and does not match y_train.
    score = cross_val_score(model, x_train, y_train, cv=3, scoring='accuracy').mean()
    return score

# 모델별로 Optuna 스터디 생성 및 최적화
def optimize_each_model(model_name):
    """Run a 100-trial Optuna study that maximizes ``optimize_model`` for one model.

    Args:
        model_name: Model identifier forwarded to ``optimize_model``.

    Returns:
        The finished ``optuna`` study.
    """
    def objective(trial):
        # Bind the model name so Optuna only has to supply the trial
        return optimize_model(trial, model_name)

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=100)
    return study

# 모델 리스트
models = ['CatBoost']
#models = ['XGBoost', 'CatBoost']
best_params = {}

# 각 모델에 대해 최적화 실행
for model_name in models:
    print(f"Optimizing {model_name}...")
    study = optimize_each_model(model_name)
    best_params[model_name] = study.best_trial.params

# 각 모델의 최적의 하이퍼파라미터 출력
for model_name, params in best_params.items():
    print(f"\nBest hyperparameters for {model_name}:")
    print(params)



[I 2024-09-25 14:42:49,663] A new study created in memory with name: no-name-171a2c66-9c91-437f-a4e1-59495e742e60


Optimizing CatBoost...


[I 2024-09-25 14:42:52,986] Trial 0 finished with value: 0.413527397260274 and parameters: {'n_estimators': 70, 'depth': 5, 'learning_rate': 0.05887913100506977, 'l2_leaf_reg': 5.035898076463189, 'class_weights': [1, 10, 5, 20]}. Best is trial 0 with value: 0.413527397260274.
[I 2024-09-25 14:43:20,703] Trial 1 finished with value: 0.4293664383561644 and parameters: {'n_estimators': 192, 'depth': 10, 'learning_rate': 0.04432006446069444, 'l2_leaf_reg': 8.75051856809919, 'class_weights': [1, 10, 5, 20]}. Best is trial 1 with value: 0.4293664383561644.
[I 2024-09-25 14:43:25,412] Trial 2 finished with value: 0.4479166666666667 and parameters: {'n_estimators': 60, 'depth': 6, 'learning_rate': 0.206411963111386, 'l2_leaf_reg': 5.071079368115389, 'class_weights': None}. Best is trial 2 with value: 0.4479166666666667.
[I 2024-09-25 14:43:34,308] Trial 3 finished with value: 0.4492009132420091 and parameters: {'n_estimators': 203, 'depth': 7, 'learning_rate': 0.22754315741246145, 'l2_leaf_reg

In [None]:
# Refit CatBoost with the tuned hyperparameters and score it on the validation split.
# All tuned values are forwarded (including `class_weights`), so the fitted model is
# the same configuration that the study evaluated.
cat_params = best_params['CatBoost']
cat_model = CatBoostClassifier(
    **cat_params,
    task_type='GPU',  # train on the GPU
    verbose=0,
    random_state=42
)

# x_train / x_valid are the classification splits whose rows match y_train / y_valid
cat_model.fit(x_train, y_train)
y_pred = cat_model.predict(x_valid)
accuracy_score(y_valid, y_pred)

In [27]:
def soft_voting(predictions):
    """Combine per-model class probabilities by soft voting.

    Args:
        predictions: Sequence of equally shaped arrays, one per model, each
            indexed as [sample, class].

    Returns:
        1-D array with, for every sample, the index of the class whose
        probabilities summed over all models is largest.
    """
    # Sum over the model axis, then pick the best class per sample
    summed_scores = np.sum(predictions, axis=0)
    return np.argmax(summed_scores, axis=1)

In [28]:
# Model training and prediction function
def train_and_predict(model_name, best_params):
    """Fit the named model with its tuned hyperparameters and predict on the validation data.

    Trains on the notebook-level ``X_train`` / ``y_train`` and predicts on ``X_valid``.

    Args:
        model_name: One of 'RandomForest', 'XGBoost', 'LightGBM' or 'CatBoost'.
        best_params: Dict of keyword arguments unpacked into the model constructor.

    Returns:
        The result of ``model.predict_proba(X_valid)``.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    model = None  # stays None when the name is not recognised

    if model_name == 'RandomForest':
        model = RandomForestClassifier(
            **best_params,
            random_state=42
        )

    elif model_name == 'XGBoost':
        model = XGBClassifier(
            **best_params,
            use_label_encoder=False,
            eval_metric='mlogloss',
            # 'gpu_hist' is deprecated in XGBoost 2.x and emits a warning on every
            # fit/predict; the replacement is tree_method='hist' with device='cuda'.
            tree_method='hist',
            device='cuda',  # run on GPU
            random_state=42
        )

    elif model_name == 'LightGBM':
        model = LGBMClassifier(
            **best_params,
            device='gpu',  # run on GPU
            random_state=42
        )

    elif model_name == 'CatBoost':
        model = CatBoostClassifier(
            **best_params,
            task_type='GPU',  # run on GPU
            verbose=0,
            random_state=42
        )

    # Fail fast on an unknown name before touching any data.
    if model is None:
        raise ValueError(f"Unsupported model name: {model_name}")

    # Fit the model
    model.fit(X_train, y_train)

    # Class probabilities for the validation set
    y_pred_proba = model.predict_proba(X_valid)
    return y_pred_proba



# Train each tuned model and collect its predicted probabilities
ensemble_list = []
for model_name, params in best_params.items():
    print(f"\nTraining and predicting with {model_name} using best parameters...")
    predictions = train_and_predict(model_name, params)
    ensemble_list.append(predictions)
    # # Print the prediction results
    # print(f"Predictions for {model_name}: {predictions[:10]}")  # first 10 predictions

# Soft-vote across the collected per-model probability arrays
ensemble_preds = soft_voting(ensemble_list)

# score check: accuracy of the ensemble predictions against y_valid
accuracy = accuracy_score(y_valid, ensemble_preds)



Training and predicting with RandomForest using best parameters...

Training and predicting with XGBoost using best parameters...



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"




Training and predicting with CatBoost using best parameters...


In [29]:
# Display the ensemble accuracy stored in `accuracy` by the ensemble cell
accuracy

0.4452054794520548

In [30]:
rf_params = best_params['RandomForest']

# Only these tuned hyperparameters are forwarded to the classifier
rf_keys = (
    'n_estimators',
    'max_depth',
    'min_samples_split',
    'min_samples_leaf',
    'class_weight',
)

# Build the model
rf_model = RandomForestClassifier(
    **{key: rf_params[key] for key in rf_keys},
    random_state=42,  # fixed seed for reproducibility
)

# Fit on the training data, then predict labels for X_valid
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_valid)
y_pred

array([1, 2, 1, ..., 2, 1, 1])

In [31]:
# Accuracy of the current y_pred against y_valid
# NOTE(review): y_pred is presumably the RandomForest prediction from the preceding cell — depends on execution order
accuracy_score(y_valid, y_pred)

0.4509132420091324

In [32]:
xg_params = best_params['XGBoost']

# Build the model
# NOTE(review): `model` is rebound together with `xg_model`; kept as-is to preserve
# notebook state, but it looks like a copy-paste leftover — confirm before removing.
xg_model = model = XGBClassifier(
    n_estimators=xg_params['n_estimators'],
    max_depth=xg_params['max_depth'],
    learning_rate=xg_params['learning_rate'],
    gamma=xg_params['gamma'],
    use_label_encoder=False,
    eval_metric='mlogloss',
    # 'gpu_hist' is deprecated in XGBoost 2.x and emits a warning on every
    # fit/predict; the replacement is tree_method='hist' with device='cuda'.
    tree_method='hist',
    device='cuda',  # run on GPU
    random_state=42
)

# Fit on the training data, then predict labels for X_valid
xg_model.fit(X_train, y_train)
y_pred = xg_model.predict(X_valid)
y_pred


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



array([2, 1, 1, ..., 1, 2, 2])

In [33]:
# Accuracy of the current y_pred against y_valid
# NOTE(review): y_pred is presumably the XGBoost prediction from the preceding cell — depends on execution order
accuracy_score(y_valid, y_pred)

0.4337899543378995

In [34]:
# Refit CatBoost with the tuned hyperparameters and predict labels for X_valid.
cat_params = best_params['CatBoost']
cat_model = CatBoostClassifier(
    n_estimators=cat_params['n_estimators'],
    depth=cat_params['depth'],
    learning_rate=cat_params['learning_rate'],
    l2_leaf_reg=cat_params['l2_leaf_reg'],
    # The CatBoost study also tunes `class_weights` (it appears in the trial parameters);
    # forward it so this refit uses the full tuned configuration.
    # .get() falls back to None, i.e. no explicit class weighting, if the key is absent.
    class_weights=cat_params.get('class_weights'),
    task_type='GPU',  # train on GPU
    verbose=0,
    random_state=42
)

cat_model.fit(X_train, y_train)
y_pred = cat_model.predict(X_valid)
y_pred

array([[2],
       [1],
       [1],
       ...,
       [1],
       [1],
       [1]])

In [35]:
# y_pred is flattened to 1-D before scoring against y_valid
# (the CatBoost prediction displayed in this notebook is a column array, e.g. [[2], [1], ...])
accuracy_score(y_valid, y_pred.flatten())

0.4680365296803653

In [230]:
# Build feature matrices from all of train_df and test_df.
# NOTE: this rebinds X_train / y_train, which the validation cells also read.
non_feature_cols = ["target", "ID"]

X_train = train_df.drop(columns=non_feature_cols)
y_train = train_df["target"].astype(int)

X_test = test_df.drop(columns=non_feature_cols)

In [120]:
rf_params = best_params['RandomForest']

# Only these tuned hyperparameters are forwarded to the classifier
rf_keys = (
    'n_estimators',
    'max_depth',
    'min_samples_split',
    'min_samples_leaf',
    'class_weight',
)

# Build the model
rf_model = RandomForestClassifier(
    **{key: rf_params[key] for key in rf_keys},
    random_state=42,  # fixed seed for reproducibility
)

# Fit on the training data, then predict labels for X_test
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
y_pred

array([1, 1, 2, ..., 2, 2, 2])

In [231]:
# Refit CatBoost with the tuned hyperparameters and predict labels for X_test.
cat_params = best_params['CatBoost']
cat_model = CatBoostClassifier(
    n_estimators=cat_params['n_estimators'],
    depth=cat_params['depth'],
    learning_rate=cat_params['learning_rate'],
    l2_leaf_reg=cat_params['l2_leaf_reg'],
    # The CatBoost study also tunes `class_weights` (it appears in the trial parameters);
    # forward it so this refit uses the full tuned configuration.
    # .get() falls back to None, i.e. no explicit class weighting, if the key is absent.
    class_weights=cat_params.get('class_weights'),
    task_type='GPU',  # train on GPU
    verbose=0,
    random_state=42
)

cat_model.fit(X_train, y_train)
y_pred = cat_model.predict(X_test)
y_pred

array([[1],
       [2],
       [2],
       ...,
       [1],
       [2],
       [1]])

In [232]:
# Assign the predictions as the target column, then save the submission file.
# y_pred can be a column array of shape (n, 1) (CatBoost's predict output is displayed
# that way in this notebook), so flatten it to 1-D explicitly instead of relying on
# pandas to squeeze it.
submission_df = submission_df.assign(target=np.ravel(y_pred))
submission_df.to_csv("output5.csv", index=False)

In [98]:
# Model training and prediction function
def train_and_predict(model_name, best_params):
    """Fit the named model with its tuned hyperparameters and predict on the test data.

    Trains on the notebook-level ``X_train`` / ``y_train`` and predicts on ``X_test``.

    Args:
        model_name: One of 'RandomForest', 'XGBoost', 'LightGBM' or 'CatBoost'.
        best_params: Dict of keyword arguments unpacked into the model constructor.

    Returns:
        The result of ``model.predict_proba(X_test)``.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    model = None  # stays None when the name is not recognised

    if model_name == 'RandomForest':
        model = RandomForestClassifier(
            **best_params,
            random_state=42
        )

    elif model_name == 'XGBoost':
        model = XGBClassifier(
            **best_params,
            use_label_encoder=False,
            eval_metric='mlogloss',
            # 'gpu_hist' is deprecated in XGBoost 2.x and emits a warning on every
            # fit/predict; the replacement is tree_method='hist' with device='cuda'.
            tree_method='hist',
            device='cuda',  # run on GPU
            random_state=42
        )

    elif model_name == 'LightGBM':
        model = LGBMClassifier(
            **best_params,
            device='gpu',  # run on GPU
            random_state=42
        )

    elif model_name == 'CatBoost':
        model = CatBoostClassifier(
            **best_params,
            task_type='GPU',  # run on GPU
            verbose=0,
            random_state=42
        )

    # Fail fast on an unknown name before touching any data.
    if model is None:
        raise ValueError(f"Unsupported model name: {model_name}")

    # Fit the model
    model.fit(X_train, y_train)

    # Class probabilities for the test set
    y_pred_proba = model.predict_proba(X_test)
    return y_pred_proba

# Train each tuned model and collect its predicted probabilities
ensemble_list = []
for model_name, params in best_params.items():
    print(f"\nTraining and predicting with {model_name} using best parameters...")
    predictions = train_and_predict(model_name, params)
    ensemble_list.append(predictions)
    # # Print the prediction results
    # print(f"Predictions for {model_name}: {predictions[:10]}")  # first 10 predictions

# Soft-vote across the collected per-model probability arrays
ensemble_preds = soft_voting(ensemble_list)



Training and predicting with RandomForest using best parameters...

Training and predicting with XGBoost using best parameters...



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"




Training and predicting with CatBoost using best parameters...


### Inference

### Output File Save

In [99]:
# Assign the ensemble predictions as the target column, then save the submission file
submission_df = submission_df.assign(target = ensemble_preds)
submission_df.to_csv("output2.csv", index=False)