#### library

In [67]:
import os
from typing import List, Dict
from tqdm import tqdm
import numpy as np

# visualization
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# warning
import warnings
warnings.filterwarnings('ignore')

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import add_dummy_feature
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# statsmodel
import statsmodels.api as sm
from scipy import stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tsa.seasonal import seasonal_decompose
from pmdarima.arima import auto_arima
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf 
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.api import VAR
from statsmodels.tsa.arima_model import ARIMAResults
from statsmodels.multivariate.manova import MANOVA


# model
import lightgbm as lgb
import optuna
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, random_split
import torch.optim as optim


print(torch.__version__)

2.4.1+cu121


#### data load

In [182]:
# 파일 호출
data_path: str = "../../../data"
train_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "train.csv")).assign(_type="train") # train 에는 _type = train 
test_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv")).assign(_type="test") # test 에는 _type = test
submission_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv")) # ID, target 열만 가진 데이터 미리 호출
df: pd.DataFrame = pd.concat([train_df, test_df], axis=0)

In [183]:
# HOURLY_ 로 시작하는 .csv 파일 이름을 file_names 에 할딩
file_names: List[str] = [
    f for f in os.listdir(data_path) if f.startswith("HOURLY_") and f.endswith(".csv")
]

# 파일명 : 데이터프레임으로 딕셔너리 형태로 저장
file_dict: Dict[str, pd.DataFrame] = {
    f.replace(".csv", ""): pd.read_csv(os.path.join(data_path, f)) for f in file_names
}

for _file_name, _df in tqdm(file_dict.items()):
    # 열 이름 중복 방지를 위해 {_file_name.lower()}_{col.lower()}로 변경, datetime 열을 ID로 변경
    _rename_rule = {
        col: f"{_file_name.lower()}_{col.lower()}" if col != "datetime" else "ID"
        for col in _df.columns
    }
    _df = _df.rename(_rename_rule, axis=1)
    df = df.merge(_df, on="ID", how="left")

100%|██████████| 107/107 [00:03<00:00, 35.58it/s]


#### trial

##### common

공통 사항 : train_df에서 missing value의 ratio가 100%인 column은 제거  
이유 : train data의 값이 존재하지 않으면 결국 test에서도 그 영향을 아예 미치지 않을 것이라 판단

In [184]:
# missing value check
train_missing_df = df.loc[df["_type"] == "train"]
test_missing_df = df.loc[df["_type"] == "test"]

# 각 열에서 누락된 값의 수 & 백분율 계산
# train
missing_train_values = train_missing_df.isnull().sum()
missing_train_percentage = (missing_train_values / len(train_missing_df)) * 100

# test
missing_test_values = test_missing_df.isnull().sum()
missing_test_percentage = (missing_test_values / len(test_missing_df)) * 100

# 누락된 값 비율을 기준으로 열 정렬
sorted_missing_train_percentage = missing_train_percentage.sort_values(ascending=False)
sorted_missing_test_percentage = missing_test_percentage.sort_values(ascending=False)


# missing_value의 비율이 100%가 아닌 column만 추출
# train
missing_train_columns = sorted_missing_train_percentage[sorted_missing_train_percentage == 100.0].index.tolist()
non_missing_train_columns = sorted_missing_train_percentage[sorted_missing_train_percentage != 100.0].index.tolist()
non_missing_train_columns.remove('ID')
non_missing_train_columns.remove('target')
non_missing_train_columns.remove('_type')

print(f"train data에서 모두 missing value인 data의 수 : {len(missing_train_columns)}")
print(f"train data에서 모두 missing value인 data의 수 : {len(non_missing_train_columns)}")


# test
missing_test_columns = sorted_missing_test_percentage[sorted_missing_test_percentage == 100.0].index.tolist()
non_missing_test_columns = sorted_missing_test_percentage[sorted_missing_test_percentage != 100.0].index.tolist()
non_missing_test_columns.remove('ID')
missing_test_columns.remove('target') # test에서는 target이 모두 NaN 값이기 때문에 
non_missing_test_columns.remove('_type')

print(f"test data에서 모두 missing value인 data의 수 : {len(missing_test_columns)}")
print(f"test data에서 모두 missing value인 data의 수 : {len(non_missing_test_columns)}")

# train과 test에 따른 missing column 확인
# train과 test 모두 100%인 경우
def find_common_elements(list1, list2):
    # 리스트를 set으로 변환 후 교집합 구하기
    common_elements = set(list1).intersection(set(list2))
    return list(common_elements)

common_columns = find_common_elements(missing_train_columns, missing_test_columns)
print("-"*60)
print(f"두 data에서 모두 missing value인 data의 수 : {len(common_columns)}") # 해당 값이 40으로 결국 train data에서 모두 NaN인 column이 test에서도 모두 NaN임

train data에서 모두 missing value인 data의 수 : 40
train data에서 모두 missing value인 data의 수 : 212
test data에서 모두 missing value인 data의 수 : 47
test data에서 모두 missing value인 data의 수 : 205
------------------------------------------------------------
두 data에서 모두 missing value인 data의 수 : 40


In [185]:
unique_test_columns = set(missing_test_columns) - set(missing_train_columns)
unique_test_columns = list(unique_test_columns)
print(f"tesst data에서만 모두 missing value인 data의 수 : {len(unique_test_columns)}")
# 총 7개의 column에서만 test data에서 모두 NaN 값을 보임
# 특히, close와 같은 종가 데이터는 실제로 target과 깊은 연관성이 있는 데이터라고 볼 수 있음

tesst data에서만 모두 missing value인 data의 수 : 7


In [186]:
# df에서 train data에서 missing ratio가 100%인 데이터 삭제시킨 new_df 생성
new_df = df[['ID','target', '_type'] + non_missing_train_columns]
new_df.head(3)

Unnamed: 0,ID,target,_type,hourly_market-data_open-interest_binance_btc_busd_open_interest,hourly_market-data_liquidations_binance_btc_busd_short_liquidations_usd,hourly_market-data_liquidations_binance_btc_busd_long_liquidations,hourly_market-data_liquidations_binance_btc_busd_short_liquidations,hourly_market-data_liquidations_binance_btc_busd_long_liquidations_usd,hourly_market-data_funding-rates_bybit_funding_rates,hourly_market-data_taker-buy-sell-stats_bybit_taker_buy_volume,...,hourly_market-data_liquidations_binance_all_symbol_long_liquidations_usd,hourly_market-data_liquidations_binance_all_symbol_short_liquidations_usd,hourly_market-data_open-interest_htx_global_btc_usd_open_interest,hourly_network-data_addresses-count_addresses_count_receiver,hourly_network-data_fees_fees_total,hourly_network-data_fees_fees_total_usd,hourly_market-data_liquidations_htx_global_all_symbol_long_liquidations,hourly_market-data_liquidations_htx_global_all_symbol_short_liquidations,hourly_market-data_liquidations_htx_global_all_symbol_long_liquidations_usd,hourly_market-data_liquidations_htx_global_all_symbol_short_liquidations_usd
0,2023-01-01 00:00:00,2.0,train,318037100.0,0.0,0.0,0.0,0.0,0.01,2015805.0,...,197.5161,0.0,34125400.0,37752,0.561037,9271.503609,0.0,0.0,0.0,0.0
1,2023-01-01 01:00:00,1.0,train,318328100.0,0.0,0.0,0.0,0.0,0.01,4032601.0,...,0.0,11833.56104,34567500.0,20534,0.256248,4237.095391,0.0,0.0,0.0,0.0
2,2023-01-01 02:00:00,1.0,train,317972900.0,0.0,0.0,0.0,0.0,0.01,857595.0,...,0.0,0.0,34520900.0,19369,0.312978,5176.614029,0.0,0.0,0.0,0.0


또한 new data에 대한 결측치는 이동평균값으로 계산하고자 함.  
아무래도 시계열 데이터이다보니 주변의 value와 연관이 있을 확률이 높다고 생각됨.  
data leakage를 방지하기 위해 train과 test를 나누어 동일한 이동평균 + forward fill + backward fill 진행  
추가로 이동평균 기반의 이상치는 처리해주기로 함  
대신, test data에 대해서 모든 값이 NaN에 대해서는 우선 남겨놓고 진행함  

In [187]:
# 이동평균을 기반으로 이상치를 처리하는 함수
def replace_outlier(df, window=6, threshold=2):
    df_cleaned = df.copy()
    
    # 숫자형 컬럼들에 대해 처리
    for column in df_cleaned.select_dtypes(include=[np.number]).columns:
        # 이동평균과 표준편차 계산
        rolling_mean = df_cleaned[column].rolling(window=window, min_periods=1).mean()
        rolling_std = df_cleaned[column].rolling(window=window, min_periods=1).std()

        # 이상치 기준 설정
        outliers = np.abs(df_cleaned[column] - rolling_mean) > (threshold * rolling_std)

        # 이상치를 이동평균으로 대체
        df_cleaned.loc[outliers, column] = rolling_mean[outliers]
    
    return df_cleaned

In [188]:
# new train과 new test 분류
new_train_df = new_df.loc[df["_type"]=="train"]
new_test_df = new_df.loc[df["_type"]=="test"]

# non missing column만 추출
new_train_df_col = new_train_df[non_missing_train_columns]
new_test_df_col = new_test_df[non_missing_test_columns] # non_missing_train_columns에는 test에서의 7개 nan변수가 들어있기 때문

# 이동평균으로 결측치 대체(window size = 6으로 6시간 간격 설정)
window_size = 6
new_train_df_col_stab = new_train_df_col.apply(lambda col: col.fillna(col.rolling(window=window_size, min_periods=1).mean()))
new_train_df_col_stab = new_train_df_col_stab.fillna(method='ffill').fillna(method='bfill')

new_test_df_col_stab = new_test_df_col.apply(lambda col: col.fillna(col.rolling(window=window_size, min_periods=1).mean()))
new_test_df_col_stab = new_test_df_col_stab.fillna(method='ffill').fillna(method='bfill')

# 이상치 처리
cleaned_new_train_df_col_stab = replace_outlier(new_train_df_col_stab)
cleaned_new_test_df_col_stab = replace_outlier(new_test_df_col_stab)

# 결측치를 처리한 new_df 정의
final_train_df = pd.concat([new_train_df[['ID','target','_type']], cleaned_new_train_df_col_stab], axis = 1)
final_test_df = pd.concat([new_test_df[['ID','target','_type'] + unique_test_columns], cleaned_new_test_df_col_stab], axis=1)
print(f" train data에서 모두 null값인 개수 : {sum([1 for col in final_train_df.isnull().sum().values if col != 0])}")
print(f" test data에서 모두 null값인 개수 : {sum([1 for col in final_test_df.isnull().sum().values if col != 0])}")

 train data에서 모두 null값인 개수 : 0
 test data에서 모두 null값인 개수 : 8


In [189]:
# 7개의 변수에 대해 정규화 진행 후 예측된 결과값 test에 넣기
unique_test_columns
def standardization(train_df, test_df, column, column_list = unique_test_columns):
    features_to_scale = [col for col in train_df.columns if col not in ['ID', 'target', '_type'] + column_list]

    # x와 y 설정
    x = train_df[features_to_scale]
    y = train_df[column]
    # test data 세팅
    test = test_df[features_to_scale]
    # 정규화
    scaler = StandardScaler()

    x_train_scaled = scaler.fit_transform(x)
    x_test_scaled = scaler.transform(test)
   
    lgbm = lgb.LGBMRegressor()
    lgbm.fit(x_train_scaled, y)

    test_pred = lgbm.predict(x_test_scaled)
    return test_pred

In [190]:
final_test_df_changed = final_test_df.copy()
for uniq_col in unique_test_columns:
    new_test_value = standardization(final_train_df, final_test_df, uniq_col)
    final_test_df_changed[uniq_col] = new_test_value

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007102 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50767
[LightGBM] [Info] Number of data points in the train set: 8760, number of used features: 205
[LightGBM] [Info] Start training from score 4932.568977
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007286 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50767
[LightGBM] [Info] Number of data points in the train set: 8760, number of used features: 205
[LightGBM] [Info] Start training from score 5923.875445
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005905 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50767
[LightGBM] [Info] Number of data points in the train set: 8760, number of used features: 205
[LightGBM] [Info] 

In [191]:
df = pd.concat([final_train_df, final_test_df_changed], ignore_index=True)
df.head(3)

Unnamed: 0,ID,target,_type,hourly_market-data_open-interest_binance_btc_busd_open_interest,hourly_market-data_liquidations_binance_btc_busd_short_liquidations_usd,hourly_market-data_liquidations_binance_btc_busd_long_liquidations,hourly_market-data_liquidations_binance_btc_busd_short_liquidations,hourly_market-data_liquidations_binance_btc_busd_long_liquidations_usd,hourly_market-data_funding-rates_bybit_funding_rates,hourly_market-data_taker-buy-sell-stats_bybit_taker_buy_volume,...,hourly_market-data_liquidations_binance_all_symbol_long_liquidations_usd,hourly_market-data_liquidations_binance_all_symbol_short_liquidations_usd,hourly_market-data_open-interest_htx_global_btc_usd_open_interest,hourly_network-data_addresses-count_addresses_count_receiver,hourly_network-data_fees_fees_total,hourly_network-data_fees_fees_total_usd,hourly_market-data_liquidations_htx_global_all_symbol_long_liquidations,hourly_market-data_liquidations_htx_global_all_symbol_short_liquidations,hourly_market-data_liquidations_htx_global_all_symbol_long_liquidations_usd,hourly_market-data_liquidations_htx_global_all_symbol_short_liquidations_usd
0,2023-01-01 00:00:00,2.0,train,318037100.0,0.0,0.0,0.0,0.0,0.01,2015805.0,...,197.5161,0.0,34125400.0,37752.0,0.561037,9271.503609,0.0,0.0,0.0,0.0
1,2023-01-01 01:00:00,1.0,train,318328100.0,0.0,0.0,0.0,0.0,0.01,4032601.0,...,0.0,11833.56104,34567500.0,20534.0,0.256248,4237.095391,0.0,0.0,0.0,0.0
2,2023-01-01 02:00:00,1.0,train,317972900.0,0.0,0.0,0.0,0.0,0.01,857595.0,...,0.0,0.0,34520900.0,19369.0,0.312978,5176.614029,0.0,0.0,0.0,0.0


##### trial 1  

형성한 df에서 동일하게 진행

In [192]:
final_data = df.copy()

In [159]:
# 모델에 사용할 컬럼, 컬럼의 rename rule을 미리 할당함
cols_dict: Dict[str, str] = {
    "ID": "ID",
    "target": "target",
    "_type": "_type",
    "hourly_market-data_coinbase-premium-index_coinbase_premium_gap": "coinbase_premium_gap",
    "hourly_market-data_coinbase-premium-index_coinbase_premium_index": "coinbase_premium_index",
    "hourly_market-data_funding-rates_all_exchange_funding_rates": "funding_rates",
    "hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations": "long_liquidations",
    "hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations_usd": "long_liquidations_usd",
    "hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations": "short_liquidations",
    "hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations_usd": "short_liquidations_usd",
    "hourly_market-data_open-interest_all_exchange_all_symbol_open_interest": "open_interest",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_ratio": "buy_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_sell_ratio": "buy_sell_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_volume": "buy_volume",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_ratio": "sell_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_volume": "sell_volume",
    "hourly_network-data_addresses-count_addresses_count_active": "active_count",
    "hourly_network-data_addresses-count_addresses_count_receiver": "receiver_count",
    "hourly_network-data_addresses-count_addresses_count_sender": "sender_count",
    "hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close" : "close",
    "hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_volume" : "volume",
    "hourly_market-data_open-interest_binance_btc_busd_open_interest" : "new1",
}
df = final_data[cols_dict.keys()].rename(cols_dict, axis=1)
df.shape

(11552, 22)

In [160]:
# eda 에서 파악한 차이와 차이의 음수, 양수 여부를 새로운 피쳐로 생성
df = df.assign(
    liquidation_diff=df["long_liquidations"] - df["short_liquidations"],
    liquidation_usd_diff=df["long_liquidations_usd"] - df["short_liquidations_usd"],
    volume_diff=df["buy_volume"] - df["sell_volume"],
    liquidation_diffg=np.sign(df["long_liquidations"] - df["short_liquidations"]),
    liquidation_usd_diffg=np.sign(df["long_liquidations_usd"] - df["short_liquidations_usd"]),
    volume_diffg=np.sign(df["buy_volume"] - df["sell_volume"]),
    buy_sell_volume_ratio=df["buy_volume"] / (df["sell_volume"] + 1),
    close_diff = df['close'].diff().fillna(0),
    close_diffg = np.sign(df['close'].diff().fillna(0)),
    volume_price_diff = df['volume'].diff().fillna(0),
    volume_price_diffg = np.sign(df['volume'].diff().fillna(0)),
    interest_diff = df['new1'].diff().fillna(0),
    interest_diffg = np.sign(df['new1'].diff().fillna(0))
)
df.shape

(11552, 35)

In [161]:
new_col_list = list(cols_dict.values())
# category, continuous 열을 따로 할당해둠
category_cols: List[str] = ["liquidation_diffg", "volume_diffg", "close_diffg", "volume_price_diffg","interest_diffg"]
conti_cols: List[str] = [_ for _ in new_col_list if _ not in ["ID", "target", "_type"]] + [
    "volume_price_diff",
    "close_diff",
    "interest_diff"
]

In [163]:
def shift_feature(
    df: pd.DataFrame,
    conti_cols: List[str],
    intervals: List[int],
) -> List[pd.Series]:
    """
    연속형 변수의 shift feature 생성
    Args:
        df (pd.DataFrame)
        conti_cols (List[str]): continuous colnames
        intervals (List[int]): shifted intervals
    Return:
        List[pd.Series]
    """
    df_shift_dict = [
        df[conti_col].shift(interval).rename(f"{conti_col}_{interval}")
        for conti_col in conti_cols
        for interval in intervals
    ]
    return df_shift_dict

# 최대 24시간의 shift 피쳐를 계산
shift_list = shift_feature(
    df=df, conti_cols=conti_cols, intervals=[_ for _ in range(1, 24)]
)

In [165]:
# concat 하여 df 에 할당
df = pd.concat([df, pd.concat(shift_list, axis=1)], axis=1)

# 타겟 변수를 제외한 변수를 forwardfill, -999로 결측치 대체
_target = df["target"]
df = df.ffill().fillna(-999).assign(target = _target)

# _type에 따라 train, test 분리
train_df = df.loc[df["_type"]=="train"].drop(columns=["_type"])
test_df = df.loc[df["_type"]=="test"].drop(columns=["_type"])

In [174]:
print(train_df.shape, test_df.shape)

(8760, 540) (2792, 540)


In [175]:
# train_test_split 으로 valid set, train set 분리
X_train, X_valid, y_train, y_valid = train_test_split(
    train_df.drop(["target", "ID"], axis = 1), 
    train_df["target"].astype(int),
    test_size=0.2,
    random_state=42
)

In [176]:
print(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)

(7008, 538) (1752, 538) (7008,) (1752,)


In [178]:
# lgb dataset
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

# lgb params
params = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "metric": "multi_logloss",
    "num_class": 4,
    "num_leaves": 50,
    "learning_rate": 0.05,
    "n_estimators": 30,
    "random_state": 42,
    "verbose": 0,
}

# lgb train
lgb_model = lgb.train(
    params=params,
    train_set=train_data,
    valid_sets=valid_data,
)

# lgb predict
y_valid_pred = lgb_model.predict(X_valid)
y_valid_pred_class = np.argmax(y_valid_pred, axis = 1)

# score check
accuracy = accuracy_score(y_valid, y_valid_pred_class)
auroc = roc_auc_score(y_valid, y_valid_pred, multi_class="ovr")

print(f"acc: {accuracy}, auroc: {auroc}")

acc: 0.4463470319634703, auroc: 0.6406346379532144


In [179]:
# performance 체크후 전체 학습 데이터로 다시 재학습
x_train = train_df.drop(["target", "ID"], axis = 1)
y_train = train_df["target"].astype(int)
train_data = lgb.Dataset(x_train, label=y_train)
lgb_model = lgb.train(
    params=params,
    train_set=train_data,
)

In [180]:
# lgb predict
y_test_pred = lgb_model.predict(test_df.drop(["target", "ID"], axis = 1))
y_test_pred_class = np.argmax(y_test_pred, axis = 1)

In [181]:
# output file 할당후 save 
# submission_df = submission_df.assign(target = y_test_pred_class)
# submission_df.to_csv("output_0925.csv", index=False)

##### trial 2 - 정규화 진행

In [193]:
# 모델에 사용할 컬럼, 컬럼의 rename rule을 미리 할당함
cols_dict: Dict[str, str] = {
    "ID": "ID",
    "target": "target",
    "_type": "_type",
    "hourly_market-data_coinbase-premium-index_coinbase_premium_gap": "coinbase_premium_gap",
    "hourly_market-data_coinbase-premium-index_coinbase_premium_index": "coinbase_premium_index",
    "hourly_market-data_funding-rates_all_exchange_funding_rates": "funding_rates",
    "hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations": "long_liquidations",
    "hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations_usd": "long_liquidations_usd",
    "hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations": "short_liquidations",
    "hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations_usd": "short_liquidations_usd",
    "hourly_market-data_open-interest_all_exchange_all_symbol_open_interest": "open_interest",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_ratio": "buy_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_sell_ratio": "buy_sell_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_volume": "buy_volume",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_ratio": "sell_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_volume": "sell_volume",
    "hourly_network-data_addresses-count_addresses_count_active": "active_count",
    "hourly_network-data_addresses-count_addresses_count_receiver": "receiver_count",
    "hourly_network-data_addresses-count_addresses_count_sender": "sender_count",
    "hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close" : "close",
    "hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_volume" : "volume",
    "hourly_market-data_open-interest_binance_btc_busd_open_interest" : "new1",
}
df = final_data[cols_dict.keys()].rename(cols_dict, axis=1)
df.shape

(11552, 22)

In [194]:
# eda 에서 파악한 차이와 차이의 음수, 양수 여부를 새로운 피쳐로 생성
df = df.assign(
    liquidation_diff=df["long_liquidations"] - df["short_liquidations"],
    liquidation_usd_diff=df["long_liquidations_usd"] - df["short_liquidations_usd"],
    volume_diff=df["buy_volume"] - df["sell_volume"],
    liquidation_diffg=np.sign(df["long_liquidations"] - df["short_liquidations"]),
    liquidation_usd_diffg=np.sign(df["long_liquidations_usd"] - df["short_liquidations_usd"]),
    volume_diffg=np.sign(df["buy_volume"] - df["sell_volume"]),
    buy_sell_volume_ratio=df["buy_volume"] / (df["sell_volume"] + 1),
    close_diff = df['close'].diff().fillna(0),
    close_diffg = np.sign(df['close'].diff().fillna(0)),
    volume_price_diff = df['volume'].diff().fillna(0),
    volume_price_diffg = np.sign(df['volume'].diff().fillna(0)),
    interest_diff = df['new1'].diff().fillna(0),
    interest_diffg = np.sign(df['new1'].diff().fillna(0))
)
df.shape

(11552, 35)

In [195]:
new_col_list = list(cols_dict.values())
# category, continuous 열을 따로 할당해둠
category_cols: List[str] = ["liquidation_diffg", "volume_diffg", "close_diffg", "volume_price_diffg","interest_diffg"]
conti_cols: List[str] = [_ for _ in new_col_list if _ not in ["ID", "target", "_type"]] + [
    "volume_price_diff",
    "close_diff",
    "interest_diff"
]

In [196]:
def shift_feature(
    df: pd.DataFrame,
    conti_cols: List[str],
    intervals: List[int],
) -> List[pd.Series]:
    """
    연속형 변수의 shift feature 생성
    Args:
        df (pd.DataFrame)
        conti_cols (List[str]): continuous colnames
        intervals (List[int]): shifted intervals
    Return:
        List[pd.Series]
    """
    df_shift_dict = [
        df[conti_col].shift(interval).rename(f"{conti_col}_{interval}")
        for conti_col in conti_cols
        for interval in intervals
    ]
    return df_shift_dict

# 최대 24시간의 shift 피쳐를 계산
shift_list = shift_feature(
    df=df, conti_cols=conti_cols, intervals=[_ for _ in range(1, 24)]
)

In [197]:
# concat 하여 df 에 할당
df = pd.concat([df, pd.concat(shift_list, axis=1)], axis=1)

# 타겟 변수를 제외한 변수를 forwardfill, -999로 결측치 대체
_target = df["target"]
df = df.ffill().fillna(-999).assign(target = _target)

# _type에 따라 train, test 분리
train_df = df.loc[df["_type"]=="train"].drop(columns=["_type"])
test_df = df.loc[df["_type"]=="test"].drop(columns=["_type"])

In [198]:
# train_test_split 으로 valid set, train set 분리
X_train, X_valid, y_train, y_valid = train_test_split(
    train_df.drop(["target", "ID"], axis = 1), 
    train_df["target"].astype(int),
    test_size=0.2,
    random_state=42
)

In [199]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

In [200]:
# lgb dataset
train_data = lgb.Dataset(X_train_scaled, label=y_train)
valid_data = lgb.Dataset(X_valid_scaled, label=y_valid, reference=train_data)

# lgb params
params = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "metric": "multi_logloss",
    "num_class": 4,
    "num_leaves": 50,
    "learning_rate": 0.05,
    "n_estimators": 30,
    "random_state": 42,
    "verbose": 0,
}

# lgb train
lgb_model = lgb.train(
    params=params,
    train_set=train_data,
    valid_sets=valid_data,
)

# lgb predict
y_valid_pred = lgb_model.predict(X_valid_scaled)
y_valid_pred_class = np.argmax(y_valid_pred, axis = 1)

# score check
accuracy = accuracy_score(y_valid, y_valid_pred_class)
auroc = roc_auc_score(y_valid, y_valid_pred, multi_class="ovr")

print(f"acc: {accuracy}, auroc: {auroc}")

acc: 0.4492009132420091, auroc: 0.6418026262499433


In [201]:
def objective(trial):
    # 하이퍼파라미터 검색 공간 정의
    params = {
        "boosting_type": trial.suggest_categorical("boosting_type", ["gbdt", "dart"]),
        "objective": "multiclass",
        "metric": "multi_logloss",
        "num_class": 4,
        "num_leaves": trial.suggest_int("num_leaves", 20, 50),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-3, 1e-1),
        "n_estimators": trial.suggest_int("n_estimators", 20, 100),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 50),
        "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
        "random_state": 42
    }
    
    # LightGBM 데이터셋 생성
    train_data = lgb.Dataset(X_train_scaled, label=y_train)
    valid_data = lgb.Dataset(X_valid_scaled, label=y_valid, reference=train_data)
    
    # 모델 학습
    model = lgb.train(
        params=params,
        train_set=train_data,
        valid_sets=valid_data,
    )
    
    # 예측 및 평가
    y_valid_pred = model.predict(X_valid_scaled)
    y_valid_pred_class = np.argmax(y_valid_pred, axis=1)
    accuracy = accuracy_score(y_valid, y_valid_pred_class)
    
    return accuracy

# Optuna study 생성 및 최적화 수행
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

# 최적의 하이퍼파라미터 및 성능 출력
print("Best parameters:", study.best_params)
print("Best score:", study.best_value)

[I 2024-09-25 16:15:47,974] A new study created in memory with name: no-name-28b9670b-20f1-4047-a231-29971a7b0a01


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012348 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:15:54,503] Trial 0 finished with value: 0.4589041095890411 and parameters: {'boosting_type': 'dart', 'num_leaves': 34, 'learning_rate': 0.0015854257306123134, 'n_estimators': 98, 'max_depth': 12, 'min_child_samples': 23, 'subsample': 0.6739645564569534, 'colsample_bytree': 0.8261703208246023}. Best is trial 0 with value: 0.4589041095890411.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022836 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:15:56,622] Trial 1 finished with value: 0.4383561643835616 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 33, 'learning_rate': 0.020245932648409324, 'n_estimators': 45, 'max_depth': 8, 'min_child_samples': 16, 'subsample': 0.5449135093803403, 'colsample_bytree': 0.5492058080658329}. Best is trial 0 with value: 0.4589041095890411.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012491 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:15:58,049] Trial 2 finished with value: 0.4480593607305936 and parameters: {'boosting_type': 'dart', 'num_leaves': 36, 'learning_rate': 0.056371072563512176, 'n_estimators': 20, 'max_depth': 11, 'min_child_samples': 21, 'subsample': 0.8622976016831381, 'colsample_bytree': 0.8190722285524646}. Best is trial 0 with value: 0.4589041095890411.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029916 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:16:02,358] Trial 3 finished with value: 0.4503424657534247 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 25, 'learning_rate': 0.022061567971436945, 'n_estimators': 81, 'max_depth': 12, 'min_child_samples': 30, 'subsample': 0.6337022089922644, 'colsample_bytree': 0.6650302866052065}. Best is trial 0 with value: 0.4589041095890411.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012934 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:16:05,210] Trial 4 finished with value: 0.4440639269406393 and parameters: {'boosting_type': 'dart', 'num_leaves': 38, 'learning_rate': 0.0128048031728178, 'n_estimators': 51, 'max_depth': 10, 'min_child_samples': 27, 'subsample': 0.5323824024221233, 'colsample_bytree': 0.6711536903691271}. Best is trial 0 with value: 0.4589041095890411.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.027342 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:16:07,523] Trial 5 finished with value: 0.4514840182648402 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 37, 'learning_rate': 0.009612772108020277, 'n_estimators': 42, 'max_depth': 7, 'min_child_samples': 48, 'subsample': 0.7196325529937204, 'colsample_bytree': 0.6839581922532108}. Best is trial 0 with value: 0.4589041095890411.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009935 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:16:12,538] Trial 6 finished with value: 0.4514840182648402 and parameters: {'boosting_type': 'dart', 'num_leaves': 38, 'learning_rate': 0.010511188343056388, 'n_estimators': 81, 'max_depth': 12, 'min_child_samples': 22, 'subsample': 0.8649381160186792, 'colsample_bytree': 0.5899409708131373}. Best is trial 0 with value: 0.4589041095890411.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.032074 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:16:18,666] Trial 7 finished with value: 0.4474885844748858 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 33, 'learning_rate': 0.014666952762282267, 'n_estimators': 93, 'max_depth': 12, 'min_child_samples': 15, 'subsample': 0.8350047437966078, 'colsample_bytree': 0.8285096668105201}. Best is trial 0 with value: 0.4589041095890411.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012663 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:16:22,597] Trial 8 finished with value: 0.4474885844748858 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 41, 'learning_rate': 0.046209298401641216, 'n_estimators': 49, 'max_depth': 11, 'min_child_samples': 44, 'subsample': 0.8450115888109713, 'colsample_bytree': 0.852985299720717}. Best is trial 0 with value: 0.4589041095890411.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018959 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:16:27,270] Trial 9 finished with value: 0.4537671232876712 and parameters: {'boosting_type': 'dart', 'num_leaves': 31, 'learning_rate': 0.029052247169491847, 'n_estimators': 92, 'max_depth': 8, 'min_child_samples': 6, 'subsample': 0.9846588257893063, 'colsample_bytree': 0.6114016708245213}. Best is trial 0 with value: 0.4589041095890411.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012753 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:16:28,932] Trial 10 finished with value: 0.4577625570776256 and parameters: {'boosting_type': 'dart', 'num_leaves': 48, 'learning_rate': 0.001079559193434472, 'n_estimators': 70, 'max_depth': 3, 'min_child_samples': 37, 'subsample': 0.6993602315068185, 'colsample_bytree': 0.9994518911313786}. Best is trial 0 with value: 0.4589041095890411.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012553 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:16:30,156] Trial 11 finished with value: 0.45662100456621 and parameters: {'boosting_type': 'dart', 'num_leaves': 46, 'learning_rate': 0.001017949756166332, 'n_estimators': 69, 'max_depth': 3, 'min_child_samples': 37, 'subsample': 0.6857261064323298, 'colsample_bytree': 0.9952286098530264}. Best is trial 0 with value: 0.4589041095890411.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012415 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:16:32,629] Trial 12 finished with value: 0.4537671232876712 and parameters: {'boosting_type': 'dart', 'num_leaves': 50, 'learning_rate': 0.0010031793357808455, 'n_estimators': 100, 'max_depth': 3, 'min_child_samples': 37, 'subsample': 0.6203426727254606, 'colsample_bytree': 0.9833892480280159}. Best is trial 0 with value: 0.4589041095890411.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013109 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:16:34,863] Trial 13 finished with value: 0.4617579908675799 and parameters: {'boosting_type': 'dart', 'num_leaves': 26, 'learning_rate': 0.0026419114066661784, 'n_estimators': 65, 'max_depth': 5, 'min_child_samples': 35, 'subsample': 0.775295550632123, 'colsample_bytree': 0.9108436436488201}. Best is trial 13 with value: 0.4617579908675799.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013044 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:16:36,457] Trial 14 finished with value: 0.4434931506849315 and parameters: {'boosting_type': 'dart', 'num_leaves': 20, 'learning_rate': 0.0032211885997457286, 'n_estimators': 32, 'max_depth': 5, 'min_child_samples': 31, 'subsample': 0.7690704661669041, 'colsample_bytree': 0.9132828468607861}. Best is trial 13 with value: 0.4617579908675799.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012674 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:16:38,997] Trial 15 finished with value: 0.4600456621004566 and parameters: {'boosting_type': 'dart', 'num_leaves': 28, 'learning_rate': 0.003038789490699824, 'n_estimators': 61, 'max_depth': 6, 'min_child_samples': 6, 'subsample': 0.7661577013998769, 'colsample_bytree': 0.7581516147568945}. Best is trial 13 with value: 0.4617579908675799.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.024716 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:16:41,666] Trial 16 finished with value: 0.4623287671232877 and parameters: {'boosting_type': 'dart', 'num_leaves': 28, 'learning_rate': 0.004340844532162658, 'n_estimators': 60, 'max_depth': 5, 'min_child_samples': 6, 'subsample': 0.7756758453451169, 'colsample_bytree': 0.7512561089427627}. Best is trial 16 with value: 0.4623287671232877.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012814 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:16:44,056] Trial 17 finished with value: 0.464041095890411 and parameters: {'boosting_type': 'dart', 'num_leaves': 23, 'learning_rate': 0.0051612341654087065, 'n_estimators': 59, 'max_depth': 5, 'min_child_samples': 12, 'subsample': 0.9630952536526121, 'colsample_bytree': 0.7476879640670403}. Best is trial 17 with value: 0.464041095890411.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012490 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:16:45,802] Trial 18 finished with value: 0.4589041095890411 and parameters: {'boosting_type': 'dart', 'num_leaves': 20, 'learning_rate': 0.005602242654968558, 'n_estimators': 55, 'max_depth': 5, 'min_child_samples': 13, 'subsample': 0.9895524983128239, 'colsample_bytree': 0.7524442557462541}. Best is trial 17 with value: 0.464041095890411.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012412 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:16:47,951] Trial 19 finished with value: 0.4526255707762557 and parameters: {'boosting_type': 'dart', 'num_leaves': 23, 'learning_rate': 0.005118326788858378, 'n_estimators': 76, 'max_depth': 4, 'min_child_samples': 10, 'subsample': 0.9319495002924112, 'colsample_bytree': 0.736535856819451}. Best is trial 17 with value: 0.464041095890411.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014325 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:16:49,597] Trial 20 finished with value: 0.4708904109589041 and parameters: {'boosting_type': 'dart', 'num_leaves': 29, 'learning_rate': 0.09915304964258505, 'n_estimators': 34, 'max_depth': 7, 'min_child_samples': 9, 'subsample': 0.9280508185387524, 'colsample_bytree': 0.7235443528741826}. Best is trial 20 with value: 0.4708904109589041.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012433 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:16:52,021] Trial 21 finished with value: 0.4554794520547945 and parameters: {'boosting_type': 'dart', 'num_leaves': 29, 'learning_rate': 0.005675213967589036, 'n_estimators': 37, 'max_depth': 7, 'min_child_samples': 10, 'subsample': 0.9289989714304745, 'colsample_bytree': 0.7062144997214217}. Best is trial 20 with value: 0.4708904109589041.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016244 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:16:53,421] Trial 22 finished with value: 0.4549086757990868 and parameters: {'boosting_type': 'dart', 'num_leaves': 23, 'learning_rate': 0.006973365137765357, 'n_estimators': 33, 'max_depth': 6, 'min_child_samples': 5, 'subsample': 0.9158186516716899, 'colsample_bytree': 0.7719046113329092}. Best is trial 20 with value: 0.4708904109589041.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012470 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:16:56,280] Trial 23 finished with value: 0.4646118721461187 and parameters: {'boosting_type': 'dart', 'num_leaves': 28, 'learning_rate': 0.09864120701291693, 'n_estimators': 56, 'max_depth': 9, 'min_child_samples': 10, 'subsample': 0.8948221135809955, 'colsample_bytree': 0.6434145393699532}. Best is trial 20 with value: 0.4708904109589041.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012568 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:16:57,356] Trial 24 finished with value: 0.4583333333333333 and parameters: {'boosting_type': 'dart', 'num_leaves': 24, 'learning_rate': 0.09611621559902568, 'n_estimators': 23, 'max_depth': 10, 'min_child_samples': 19, 'subsample': 0.9575020590727789, 'colsample_bytree': 0.6297836464048057}. Best is trial 20 with value: 0.4708904109589041.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012382 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:16:59,172] Trial 25 finished with value: 0.4589041095890411 and parameters: {'boosting_type': 'dart', 'num_leaves': 30, 'learning_rate': 0.09463044520985144, 'n_estimators': 27, 'max_depth': 9, 'min_child_samples': 11, 'subsample': 0.8951218140618903, 'colsample_bytree': 0.6387031521656623}. Best is trial 20 with value: 0.4708904109589041.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009506 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:17:01,545] Trial 26 finished with value: 0.4526255707762557 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 27, 'learning_rate': 0.05844118490072501, 'n_estimators': 55, 'max_depth': 8, 'min_child_samples': 18, 'subsample': 0.9980168216664541, 'colsample_bytree': 0.5072948114701401}. Best is trial 20 with value: 0.4708904109589041.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.027913 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:17:03,365] Trial 27 finished with value: 0.45662100456621 and parameters: {'boosting_type': 'dart', 'num_leaves': 22, 'learning_rate': 0.06862291512646856, 'n_estimators': 40, 'max_depth': 9, 'min_child_samples': 13, 'subsample': 0.8185163526372344, 'colsample_bytree': 0.7134746288762126}. Best is trial 20 with value: 0.4708904109589041.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009831 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:17:05,229] Trial 28 finished with value: 0.4571917808219178 and parameters: {'boosting_type': 'dart', 'num_leaves': 32, 'learning_rate': 0.03575500513191793, 'n_estimators': 47, 'max_depth': 6, 'min_child_samples': 10, 'subsample': 0.8945610834784916, 'colsample_bytree': 0.5745826127294784}. Best is trial 20 with value: 0.4708904109589041.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026215 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:17:07,809] Trial 29 finished with value: 0.4617579908675799 and parameters: {'boosting_type': 'dart', 'num_leaves': 26, 'learning_rate': 0.0018745030154126172, 'n_estimators': 54, 'max_depth': 7, 'min_child_samples': 26, 'subsample': 0.9554073982330293, 'colsample_bytree': 0.7962343600780226}. Best is trial 20 with value: 0.4708904109589041.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012381 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:17:12,156] Trial 30 finished with value: 0.4486301369863014 and parameters: {'boosting_type': 'dart', 'num_leaves': 41, 'learning_rate': 0.039472756158548304, 'n_estimators': 60, 'max_depth': 9, 'min_child_samples': 24, 'subsample': 0.8125399935173241, 'colsample_bytree': 0.715112738708486}. Best is trial 20 with value: 0.4708904109589041.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013046 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:17:14,167] Trial 31 finished with value: 0.4469178082191781 and parameters: {'boosting_type': 'dart', 'num_leaves': 28, 'learning_rate': 0.007614387037738904, 'n_estimators': 63, 'max_depth': 4, 'min_child_samples': 8, 'subsample': 0.8889126616833478, 'colsample_bytree': 0.7974736622719891}. Best is trial 20 with value: 0.4708904109589041.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012344 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:17:15,750] Trial 32 finished with value: 0.4520547945205479 and parameters: {'boosting_type': 'dart', 'num_leaves': 30, 'learning_rate': 0.003935740054065049, 'n_estimators': 73, 'max_depth': 4, 'min_child_samples': 14, 'subsample': 0.9445590149508674, 'colsample_bytree': 0.6555995802394352}. Best is trial 20 with value: 0.4708904109589041.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012613 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:17:18,806] Trial 33 finished with value: 0.4469178082191781 and parameters: {'boosting_type': 'dart', 'num_leaves': 34, 'learning_rate': 0.07925949651548628, 'n_estimators': 60, 'max_depth': 6, 'min_child_samples': 17, 'subsample': 0.869222204240221, 'colsample_bytree': 0.7286496234508328}. Best is trial 20 with value: 0.4708904109589041.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013014 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:17:21,371] Trial 34 finished with value: 0.4537671232876712 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 25, 'learning_rate': 0.001837223558340524, 'n_estimators': 45, 'max_depth': 5, 'min_child_samples': 8, 'subsample': 0.8055266180973986, 'colsample_bytree': 0.8675743153257834}. Best is trial 20 with value: 0.4708904109589041.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012781 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:17:24,313] Trial 35 finished with value: 0.4560502283105023 and parameters: {'boosting_type': 'dart', 'num_leaves': 22, 'learning_rate': 0.019526815283438352, 'n_estimators': 67, 'max_depth': 8, 'min_child_samples': 8, 'subsample': 0.6266783852172211, 'colsample_bytree': 0.685124822475071}. Best is trial 20 with value: 0.4708904109589041.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012682 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:17:29,872] Trial 36 finished with value: 0.4429223744292237 and parameters: {'boosting_type': 'dart', 'num_leaves': 35, 'learning_rate': 0.004108086933683913, 'n_estimators': 81, 'max_depth': 10, 'min_child_samples': 5, 'subsample': 0.5644934566086667, 'colsample_bytree': 0.7904921966174664}. Best is trial 20 with value: 0.4708904109589041.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012396 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:17:33,045] Trial 37 finished with value: 0.4469178082191781 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 28, 'learning_rate': 0.008006762163006864, 'n_estimators': 57, 'max_depth': 7, 'min_child_samples': 12, 'subsample': 0.7285245762473309, 'colsample_bytree': 0.6906122097164137}. Best is trial 20 with value: 0.4708904109589041.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010060 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:17:34,154] Trial 38 finished with value: 0.45662100456621 and parameters: {'boosting_type': 'dart', 'num_leaves': 31, 'learning_rate': 0.016007104912622867, 'n_estimators': 51, 'max_depth': 4, 'min_child_samples': 20, 'subsample': 0.9630816565095061, 'colsample_bytree': 0.5389185449037447}. Best is trial 20 with value: 0.4708904109589041.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012347 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:17:36,161] Trial 39 finished with value: 0.4537671232876712 and parameters: {'boosting_type': 'dart', 'num_leaves': 25, 'learning_rate': 0.0113326527873676, 'n_estimators': 42, 'max_depth': 6, 'min_child_samples': 16, 'subsample': 0.9157290410406711, 'colsample_bytree': 0.6542821206980859}. Best is trial 20 with value: 0.4708904109589041.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012466 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:17:40,039] Trial 40 finished with value: 0.444634703196347 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 33, 'learning_rate': 0.057142594672770324, 'n_estimators': 76, 'max_depth': 9, 'min_child_samples': 8, 'subsample': 0.6614196753880881, 'colsample_bytree': 0.6079050324093944}. Best is trial 20 with value: 0.4708904109589041.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012590 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:17:42,893] Trial 41 finished with value: 0.4606164383561644 and parameters: {'boosting_type': 'dart', 'num_leaves': 25, 'learning_rate': 0.0022727626077509583, 'n_estimators': 67, 'max_depth': 5, 'min_child_samples': 32, 'subsample': 0.7778481930924244, 'colsample_bytree': 0.9050336826802312}. Best is trial 20 with value: 0.4708904109589041.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013659 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:17:45,041] Trial 42 finished with value: 0.4560502283105023 and parameters: {'boosting_type': 'dart', 'num_leaves': 27, 'learning_rate': 0.002662337515791535, 'n_estimators': 66, 'max_depth': 5, 'min_child_samples': 41, 'subsample': 0.7397483429037817, 'colsample_bytree': 0.8288253413987807}. Best is trial 20 with value: 0.4708904109589041.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014256 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:17:48,411] Trial 43 finished with value: 0.4526255707762557 and parameters: {'boosting_type': 'dart', 'num_leaves': 26, 'learning_rate': 0.0013916504506170943, 'n_estimators': 64, 'max_depth': 7, 'min_child_samples': 34, 'subsample': 0.8503816977303724, 'colsample_bytree': 0.9453397613119876}. Best is trial 20 with value: 0.4708904109589041.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012581 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:17:50,024] Trial 44 finished with value: 0.4531963470319635 and parameters: {'boosting_type': 'dart', 'num_leaves': 21, 'learning_rate': 0.0039429187040631595, 'n_estimators': 51, 'max_depth': 5, 'min_child_samples': 43, 'subsample': 0.7905467142319331, 'colsample_bytree': 0.8435392152816938}. Best is trial 20 with value: 0.4708904109589041.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.027621 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:17:52,512] Trial 45 finished with value: 0.4583333333333333 and parameters: {'boosting_type': 'dart', 'num_leaves': 29, 'learning_rate': 0.004717118413065295, 'n_estimators': 84, 'max_depth': 4, 'min_child_samples': 50, 'subsample': 0.8702968113825748, 'colsample_bytree': 0.8726026956164057}. Best is trial 20 with value: 0.4708904109589041.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012293 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:17:55,563] Trial 46 finished with value: 0.4606164383561644 and parameters: {'boosting_type': 'dart', 'num_leaves': 24, 'learning_rate': 0.002442914161783329, 'n_estimators': 71, 'max_depth': 6, 'min_child_samples': 15, 'subsample': 0.8297712990303155, 'colsample_bytree': 0.7725363000612185}. Best is trial 20 with value: 0.4708904109589041.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013168 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:17:57,133] Trial 47 finished with value: 0.4714611872146119 and parameters: {'boosting_type': 'dart', 'num_leaves': 31, 'learning_rate': 0.0253753151814534, 'n_estimators': 86, 'max_depth': 3, 'min_child_samples': 22, 'subsample': 0.7192535085479793, 'colsample_bytree': 0.9569294985342764}. Best is trial 47 with value: 0.4714611872146119.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013878 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:17:58,970] Trial 48 finished with value: 0.4680365296803653 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 36, 'learning_rate': 0.0235000806218451, 'n_estimators': 89, 'max_depth': 3, 'min_child_samples': 23, 'subsample': 0.7114131107488767, 'colsample_bytree': 0.9676610942100794}. Best is trial 47 with value: 0.4714611872146119.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017847 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135681
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 538
[LightGBM] [Info] Start training from score -2.472992
[LightGBM] [Info] Start training from score -0.894135
[LightGBM] [Info] Start training from score -0.873416
[LightGBM] [Info] Start training from score -2.417056


[I 2024-09-25 16:18:00,416] Trial 49 finished with value: 0.4646118721461187 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 39, 'learning_rate': 0.026710955942523557, 'n_estimators': 91, 'max_depth': 3, 'min_child_samples': 23, 'subsample': 0.6612905119945626, 'colsample_bytree': 0.9651667397301649}. Best is trial 47 with value: 0.4714611872146119.


Best parameters: {'boosting_type': 'dart', 'num_leaves': 31, 'learning_rate': 0.0253753151814534, 'n_estimators': 86, 'max_depth': 3, 'min_child_samples': 22, 'subsample': 0.7192535085479793, 'colsample_bytree': 0.9569294985342764}
Best score: 0.4714611872146119


In [203]:
best_params = study.best_params
best_params["objective"] = "multiclass"
best_params["metric"] = "multi_logloss"
best_params["num_class"] = 4,
best_params["random_state"] = 42

# performance 체크후 전체 학습 데이터로 다시 재학습
x_train_scaled = scaler.fit_transform(train_df.drop(["target", "ID"], axis = 1))
y_train = train_df["target"].astype(int)

train_data = lgb.Dataset(x_train_scaled, label=y_train)
lgb_model = lgb.train(
    params=best_params,
    train_set=train_data,
)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014183 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135682
[LightGBM] [Info] Number of data points in the train set: 8760, number of used features: 538
[LightGBM] [Info] Start training from score -2.471301
[LightGBM] [Info] Start training from score -0.904940
[LightGBM] [Info] Start training from score -0.869732
[LightGBM] [Info] Start training from score -2.387109


In [205]:
# lgb predict
X_test_scaled = scaler.transform(test_df.drop(["target", "ID"], axis=1))

y_test_pred = lgb_model.predict(X_test_scaled)
y_test_pred_scaled_class = np.argmax(y_test_pred, axis = 1)
y_test_pred_scaled_class

array([2, 2, 2, ..., 1, 1, 2])

In [208]:
count = 0
lst1 = list(y_test_pred_class)
lst2 = list(y_test_pred_scaled_class)

for a,b in zip(lst1, lst2):
    if a != b:
        count += 1
   
print(len(lst1))
print(count)

2792
593
