#### library

In [1]:
import os
from typing import List, Dict
from tqdm import tqdm
import numpy as np

# visualization
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# warning
import warnings
warnings.filterwarnings('ignore')

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import add_dummy_feature
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# statsmodel
import statsmodels.api as sm
from scipy import stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tsa.seasonal import seasonal_decompose
from pmdarima.arima import auto_arima
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf 
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.arima_model import ARIMAResults


# model
import lightgbm as lgb
import optuna
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, random_split
import torch.optim as optim
from autoformer_encdec import series_decomp, series_decomp_multi
from autoformer_encdec import series_decomp_fixed, series_decomp_fixed_multi


print(torch.__version__)

2.4.1+cu121


#### data load

In [2]:
# 파일 호출
data_path: str = "../../../data"
train_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "train.csv")).assign(_type="train") # train 에는 _type = train 
test_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv")).assign(_type="test") # test 에는 _type = test
submission_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv")) # ID, target 열만 가진 데이터 미리 호출
df: pd.DataFrame = pd.concat([train_df, test_df], axis=0)

In [3]:
# HOURLY_ 로 시작하는 .csv 파일 이름을 file_names 에 할딩
file_names: List[str] = [
    f for f in os.listdir(data_path) if f.startswith("HOURLY_") and f.endswith(".csv")
]

# 파일명 : 데이터프레임으로 딕셔너리 형태로 저장
file_dict: Dict[str, pd.DataFrame] = {
    f.replace(".csv", ""): pd.read_csv(os.path.join(data_path, f)) for f in file_names
}

for _file_name, _df in tqdm(file_dict.items()):
    # 열 이름 중복 방지를 위해 {_file_name.lower()}_{col.lower()}로 변경, datetime 열을 ID로 변경
    _rename_rule = {
        col: f"{_file_name.lower()}_{col.lower()}" if col != "datetime" else "ID"
        for col in _df.columns
    }
    _df = _df.rename(_rename_rule, axis=1)
    df = df.merge(_df, on="ID", how="left")

100%|██████████| 107/107 [00:03<00:00, 29.45it/s]


#### data proprocessing

In [4]:
# eda를 위한 data 추출
eda_df = df.loc[df["_type"] == "train"]

# missing value check

# 각 열에서 누락된 값의 수 & 백분율 계산
missing_values = eda_df.isnull().sum()
missing_percentage = (missing_values / len(eda_df)) * 100

# 누락된 값 비율을 기준으로 열 정렬
sorted_missing_percentage = missing_percentage.sort_values(ascending=False)

# missing_value의 비율이 100%가 아닌 column만 추출
non_missing_columns = sorted_missing_percentage[sorted_missing_percentage != 100.0].index.tolist()
non_missing_columns.remove('ID')
non_missing_columns.remove('target')
non_missing_columns.remove('_type')

new_data = eda_df[['ID','target','_type'] + non_missing_columns]

In [5]:
# 이동평균으로 결측치 대체
new_df_stab = new_data[non_missing_columns]

# train
window_size = 3
new_df_stab = new_df_stab.apply(lambda col: col.fillna(col.rolling(window=window_size, min_periods=1).mean()))
new_df_stab = new_df_stab.fillna(method='ffill').fillna(method='bfill')


# 결측치 처리한 new_df 정의
new_df = pd.concat([new_data[['ID','target','_type']], new_df_stab], axis=1)

In [6]:
# 각 데이터에 대한 정상성 확인
def kpss_test(timeseries, pvalue = .05, regression_option = 'ct'):
    kpsstest = kpss(timeseries, regression= regression_option)
    kpss_output = pd.Series(kpsstest[0:3], index=['Test Statistic','p-value','Lags Used'])
    for key,value in kpsstest[3].items():
        kpss_output['Critical Value (%s)'%key] = value
    return kpss_output['p-value']

In [7]:
non_stationality_column = []
stationality_column = []
for col in non_missing_columns:
    if kpss_test(new_df_stab[col]) < 0.05:
        non_stationality_column.append(col)
    else:
        stationality_column.append(col)

# 정상성을 만족하도록 변형(차분 or 변환)
non_station_df = new_df[['ID'] + non_stationality_column]

look-up table. The actual p-value is smaller than the p-value returned.

  kpsstest = kpss(timeseries, regression= regression_option)
look-up table. The actual p-value is greater than the p-value returned.

  kpsstest = kpss(timeseries, regression= regression_option)
look-up table. The actual p-value is greater than the p-value returned.

  kpsstest = kpss(timeseries, regression= regression_option)
look-up table. The actual p-value is smaller than the p-value returned.

  kpsstest = kpss(timeseries, regression= regression_option)
look-up table. The actual p-value is smaller than the p-value returned.

  kpsstest = kpss(timeseries, regression= regression_option)
look-up table. The actual p-value is smaller than the p-value returned.

  kpsstest = kpss(timeseries, regression= regression_option)
look-up table. The actual p-value is smaller than the p-value returned.

  kpsstest = kpss(timeseries, regression= regression_option)
look-up table. The actual p-value is smaller than the p-value 

In [8]:
# 모든 non-stationality column에 대해 변환 + 차분을 실행하여 정상성으로 변환
# 변환 -> yeo-johnson로 최적의 lambda를 찾은 뒤 실행
# 차분은 1~3차 차분을 실행한 뒤 가장 kpss 검정을 통한 p-value가 가장 낮은 차분으로 값 대체

def stationality_changing_funtion(df, col, time = 'ID'):
    # data 정의
    data = df[[time, col]]
    data = data.set_index(time)

    # box-cox 변환의 일반화인 yeo-johnson 변환 실시
    transformed_data, lambda_opt = stats.yeojohnson(data[col])
    data_boxcox = data.copy()
    data_boxcox[col]= transformed_data

    p_value_list = []

    # 3차 차분의 결과를 kpss 검정을 돌려 각 p-value 값 확인
    for i in range(1,4):
        diff_data = data_boxcox.diff(i).dropna()
        p_value = kpss_test(diff_data[col])       
        p_value_list.append((i, p_value))

    
    # 두 번째 값 중 가장 큰 값을 찾기
    max_value = max(item[1] for item in p_value_list)

    num = 0

    if max_value >= 0.05:
        num += 1

    # 두 번째 값이 max_value인 원소들 필터링
    result = [item for item in p_value_list if item[1] == max_value]

    p_value_list_sorted = sorted(result, key=lambda x: x[0])


    # 가장 p-value가 큰 차분을 선택(kpss의 귀무가설을 채택하여 정상성이 나타남)
    diff_opt = p_value_list_sorted[0][0]

    diff_data_opt = data_boxcox.diff(diff_opt)
    diff_data_opt.iloc[0] = data_boxcox.iloc[0]

    # num = 1이면 정상성 만족
    return num, diff_data_opt

In [9]:
# 모든 비정상 data의 column을 stationality function 결과로 대체
sta_df = new_df.copy()
count = 0

for column in non_stationality_column:
    n, result = stationality_changing_funtion(new_df, col)
    sta_df[col] = result.values
    if n == 1:
        count += 1
print(count)
final_sta_df = sta_df.copy()

look-up table. The actual p-value is greater than the p-value returned.

  kpsstest = kpss(timeseries, regression= regression_option)
look-up table. The actual p-value is greater than the p-value returned.

  kpsstest = kpss(timeseries, regression= regression_option)
look-up table. The actual p-value is greater than the p-value returned.

  kpsstest = kpss(timeseries, regression= regression_option)
look-up table. The actual p-value is greater than the p-value returned.

  kpsstest = kpss(timeseries, regression= regression_option)
look-up table. The actual p-value is greater than the p-value returned.

  kpsstest = kpss(timeseries, regression= regression_option)
look-up table. The actual p-value is greater than the p-value returned.

  kpsstest = kpss(timeseries, regression= regression_option)
look-up table. The actual p-value is greater than the p-value returned.

  kpsstest = kpss(timeseries, regression= regression_option)
look-up table. The actual p-value is greater than the p-value 

166


look-up table. The actual p-value is greater than the p-value returned.

  kpsstest = kpss(timeseries, regression= regression_option)
look-up table. The actual p-value is greater than the p-value returned.

  kpsstest = kpss(timeseries, regression= regression_option)
look-up table. The actual p-value is greater than the p-value returned.

  kpsstest = kpss(timeseries, regression= regression_option)
look-up table. The actual p-value is greater than the p-value returned.

  kpsstest = kpss(timeseries, regression= regression_option)
look-up table. The actual p-value is greater than the p-value returned.

  kpsstest = kpss(timeseries, regression= regression_option)
look-up table. The actual p-value is greater than the p-value returned.

  kpsstest = kpss(timeseries, regression= regression_option)
look-up table. The actual p-value is greater than the p-value returned.

  kpsstest = kpss(timeseries, regression= regression_option)
look-up table. The actual p-value is greater than the p-value 

In [10]:
# data correlcation and VIF 확인

vif_df = sta_df[non_missing_columns]

# VIF 계산 함수
def calculate_vif(df):
    vif_data = pd.DataFrame()
    vif_data["Feature"] = df.columns
    
    # VIF 계산
    vif_data["VIF"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
    return vif_data

# VIF 값 계산
vif_data = calculate_vif(vif_df)
vif_data.sort_values(by="VIF", ascending=False, inplace=True)

# VIF 10을 초과하는 컬럼 필터링
high_vif = vif_data[vif_data["VIF"] > 10]

# 상관관계 계산
correlations = vif_df.corr()

# VIF 10 이상인 컬럼 간 상관관계 및 VIF 값으로 새로운 데이터프레임 생성
results = []

# VIF 10 이상인 각 컬럼에 대해 상관관계와 VIF 값 추출
for i, col1 in enumerate(high_vif['Feature']):
    for col2 in high_vif['Feature'][i+1:]:
        corr_value = correlations.loc[col1, col2]
        vif_value1 = high_vif[high_vif['Feature'] == col1]['VIF'].values[0]
        vif_value2 = high_vif[high_vif['Feature'] == col2]['VIF'].values[0]
        results.append({'Column1': f'{col1}', 'Column2' : f'{col2}', 'Correlation': corr_value, 'VIF': max(vif_value1, vif_value2)})

# 결과를 데이터프레임으로 변환
result_df = pd.DataFrame(results)

In [11]:
# 변수 중복 제거를 위한 세트
to_remove = set()

# 높은 상관관계를 가진 변수 쌍 중 하나만 남기기
for col1, col2 in zip(result_df['Column1'], result_df['Column2']):
    if col1 not in to_remove and col2 not in to_remove:
        to_remove.add(col2)  # col2를 제거 목록에 추가 (col1은 남김)

# 결과 데이터프레임 생성
removed_vars = list(to_remove)
len(removed_vars)

188

In [12]:
non_missing_columns
vif_columns = set(list(result_df['Column1']) + list(result_df['Column2']))

# vif가 10이 넘는 변수 쌍이 아예 존재하지 않는 변수들
non_vif_columns = [col for col in non_missing_columns if col not in vif_columns]

# vif가 10이 넘는 변수들 중에서 중복된 값들 제거
new_1 = [col for col in non_missing_columns if ( (col in set(list(result_df['Column2']))) & (col in set(list(result_df['Column1']))) ) == False]

# 최종 변수 생성
final_vif_df = sta_df[['target']+ new_1]
# final_test_df = sta_test_df[['target'] + new_1]
final_vif_df.head(3)

Unnamed: 0,target,hourly_market-data_funding-rates_bybit_funding_rates,hourly_market-data_taker-buy-sell-stats_bybit_taker_buy_ratio,hourly_market-data_taker-buy-sell-stats_bybit_taker_sell_ratio,hourly_market-data_taker-buy-sell-stats_bybit_taker_buy_sell_ratio,hourly_network-data_block-interval_block_interval,hourly_network-data_block-bytes_block_bytes,hourly_network-data_tokens-transferred_tokens_transferred_mean,hourly_market-data_funding-rates_huobi_global_funding_rates,hourly_market-data_taker-buy-sell-stats_okx_taker_buy_sell_ratio,...,hourly_market-data_liquidations_bitmex_all_symbol_short_liquidations_usd,hourly_market-data_taker-buy-sell-stats_bitmex_taker_buy_sell_ratio,hourly_market-data_funding-rates_all_exchange_funding_rates,hourly_network-data_supply_supply_total,hourly_network-data_fees-transaction_fees_transaction_median_usd,hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_volume,hourly_network-data_tokens-transferred_tokens_transferred_total,hourly_network-data_tokens-transferred_tokens_transferred_median,hourly_market-data_funding-rates_binance_funding_rates,hourly_market-data_liquidations_htx_global_all_symbol_short_liquidations_usd
0,2.0,0.01,0.588133,0.411867,1.42797,427.333333,783554.416667,2.885312,0.01,0.475845,...,0.0,0.905398,0.005049,19248710.0,0.235973,5516.420322,33057.024011,0.020125,0.01,0.0
1,1.0,0.01,0.774509,0.225491,3.434766,782.5,890760.5,2.217758,0.01,0.823286,...,0.0,3.714817,0.005049,19248740.0,0.237108,4513.341881,12933.965951,0.021293,0.01,0.0
2,1.0,0.01,0.36457,0.63543,0.573738,365.125,558209.75,4.857703,0.01,4.375441,...,0.0,1.672801,0.005049,19248790.0,0.234878,4310.904314,26960.250177,0.02298,0.01,0.0


In [50]:
X = final_sta_df.iloc[:,1:]
y = final_sta_df['target']

vif_X = final_vif_df.iloc[:,1:]
vif_y = final_vif_df['target']

#### model

##### LTSF-Dlinear model

In [14]:
class Model(nn.Module):

    def __init__(self, configs, individual=False):
        super(Model, self).__init__()
        self.seq_len = configs['seq_len']
        self.pred_len = configs['seq_len']
        # Series decomposition block from Autoformer
        self.decompsition = series_decomp(configs['moving_avg'])
        self.individual = individual
        self.channels = configs['enc_in']

        if self.individual:
            self.Linear_Seasonal = nn.ModuleList()
            self.Linear_Trend = nn.ModuleList()

            for i in range(self.channels):
                self.Linear_Seasonal.append(
                    nn.Linear(self.seq_len, self.pred_len))
                self.Linear_Trend.append(
                    nn.Linear(self.seq_len, self.pred_len))

                self.Linear_Seasonal[i].weight = nn.Parameter(
                    (1 / self.seq_len) * torch.ones([self.pred_len, self.seq_len]))
                self.Linear_Trend[i].weight = nn.Parameter(
                    (1 / self.seq_len) * torch.ones([self.pred_len, self.seq_len]))
        else:
            self.Linear_Seasonal = nn.Linear(self.seq_len, self.pred_len)
            self.Linear_Trend = nn.Linear(self.seq_len, self.pred_len)

            self.Linear_Seasonal.weight = nn.Parameter(
                (1 / self.seq_len) * torch.ones([self.pred_len, self.seq_len]))
            self.Linear_Trend.weight = nn.Parameter(
                (1 / self.seq_len) * torch.ones([self.pred_len, self.seq_len]))

        self.projection = nn.Linear(
            configs['enc_in'] * configs['seq_len'], configs['num_class'])

    def encoder(self, x):
        seasonal_init, trend_init = self.decompsition(x)
        seasonal_init, trend_init = seasonal_init.permute(
            0, 2, 1), trend_init.permute(0, 2, 1)
        if self.individual:
            seasonal_output = torch.zeros([seasonal_init.size(0), seasonal_init.size(1), self.pred_len],
                                          dtype=seasonal_init.dtype).to(seasonal_init.device)
            trend_output = torch.zeros([trend_init.size(0), trend_init.size(1), self.pred_len],
                                       dtype=trend_init.dtype).to(trend_init.device)
            for i in range(self.channels):
                seasonal_output[:, i, :] = self.Linear_Seasonal[i](
                    seasonal_init[:, i, :])
                trend_output[:, i, :] = self.Linear_Trend[i](
                    trend_init[:, i, :])
        else:
            seasonal_output = self.Linear_Seasonal(seasonal_init)
            trend_output = self.Linear_Trend(trend_init)
        x = seasonal_output + trend_output
        return x.permute(0, 2, 1)

    def classification(self, x_enc):
        # Encoder
        enc_out = self.encoder(x_enc)
        # Output
        # (batch_size, seq_length * d_model)
        output = enc_out.reshape(enc_out.shape[0], -1)
        # (batch_size, num_classes)
        output = self.projection(output)
        return output

    def forward(self, x_enc, mask=None):
        dec_out = self.classification(x_enc)
        return dec_out  # [B, N]

In [15]:
class Multi_Kernel_Model(nn.Module):

    def __init__(self, configs, individual=False):
        super(Multi_Kernel_Model, self).__init__()
        self.seq_len = configs['seq_len']
        self.pred_len = configs['seq_len']
        # Series decomposition block from Autoformer
        self.decompsition = series_decomp_fixed_multi(configs['moving_avg'])
        self.individual = individual
        self.channels = configs['enc_in']

        if self.individual:
            self.Linear_Seasonal = nn.ModuleList()
            self.Linear_Trend = nn.ModuleList()

            for i in range(self.channels):
                self.Linear_Seasonal.append(
                    nn.Linear(self.seq_len, self.pred_len))
                self.Linear_Trend.append(
                    nn.Linear(self.seq_len, self.pred_len))

                self.Linear_Seasonal[i].weight = nn.Parameter(
                    (1 / self.seq_len) * torch.ones([self.pred_len, self.seq_len]))
                self.Linear_Trend[i].weight = nn.Parameter(
                    (1 / self.seq_len) * torch.ones([self.pred_len, self.seq_len]))
        else:
            self.Linear_Seasonal = nn.Linear(self.seq_len, self.pred_len)
            self.Linear_Trend = nn.Linear(self.seq_len, self.pred_len)

            self.Linear_Seasonal.weight = nn.Parameter(
                (1 / self.seq_len) * torch.ones([self.pred_len, self.seq_len]))
            self.Linear_Trend.weight = nn.Parameter(
                (1 / self.seq_len) * torch.ones([self.pred_len, self.seq_len]))

        self.projection = nn.Linear(
            configs['enc_in'] * configs['seq_len'], configs['num_class'])

    def encoder(self, x):
        seasonal_init, trend_init = self.decompsition(x)
        seasonal_init, trend_init = seasonal_init.permute(
            0, 2, 1), trend_init.permute(0, 2, 1)
        if self.individual:
            seasonal_output = torch.zeros([seasonal_init.size(0), seasonal_init.size(1), self.pred_len],
                                          dtype=seasonal_init.dtype).to(seasonal_init.device)
            trend_output = torch.zeros([trend_init.size(0), trend_init.size(1), self.pred_len],
                                       dtype=trend_init.dtype).to(trend_init.device)
            for i in range(self.channels):
                seasonal_output[:, i, :] = self.Linear_Seasonal[i](
                    seasonal_init[:, i, :])
                trend_output[:, i, :] = self.Linear_Trend[i](
                    trend_init[:, i, :])
        else:
            seasonal_output = self.Linear_Seasonal(seasonal_init)
            trend_output = self.Linear_Trend(trend_init)
        x = seasonal_output + trend_output
        return x.permute(0, 2, 1)

    def classification(self, x_enc):
        # Encoder
        enc_out = self.encoder(x_enc)
        # Output
        # (batch_size, seq_length * d_model)
        output = enc_out.reshape(enc_out.shape[0], -1)
        # (batch_size, num_classes)
        output = self.projection(output)
        return output

    def forward(self, x_enc, mask=None):
        dec_out = self.classification(x_enc)
        return dec_out  # [B, N]

###### training

In [16]:
# X, y split
train_X = X.iloc[:,2:]
train_y = y

In [17]:
# 2. Train/Validation Split
train_X, valid_X, train_y, valid_y = train_test_split(train_X, train_y, test_size=0.2, random_state=42)

# 3. 데이터 텐서 변환
train_X_tensor = torch.tensor(train_X.values, dtype=torch.float32)
train_y_tensor = torch.tensor(train_y.values.squeeze(), dtype=torch.long)
valid_X_tensor = torch.tensor(valid_X.values, dtype=torch.float32)
valid_y_tensor = torch.tensor(valid_y.values.squeeze(), dtype=torch.long)

# 4. 데이터셋과 DataLoader 준비
# seq_len 설정
seq_len = 24
batch_size = 32

# 5. 시계열 데이터에 맞게 3차원 텐서로 변환
def create_sequences(X, y, seq_len):
    sequences = []
    labels = []
    for i in range(len(X) - seq_len):
        seq = X[i:i + seq_len]
        label = y[i + seq_len]  # 다음 시간 스텝의 레이블
        sequences.append(seq)
        labels.append(label)
    return torch.stack(sequences), torch.tensor(labels)

train_X_seq, train_y_seq = create_sequences(train_X_tensor, train_y_tensor, seq_len)
valid_X_seq, valid_y_seq = create_sequences(valid_X_tensor, valid_y_tensor, seq_len)

# DataLoader 설정
train_dataset = TensorDataset(train_X_seq, train_y_seq)
valid_dataset = TensorDataset(valid_X_seq, valid_y_seq)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

In [131]:
# 7. 모델 학습 및 검증
configs = {
    'seq_len': seq_len,
    'moving_avg': 3, 
    'enc_in': train_X.shape[1],  # 입력 특성의 수
    'num_class': 4  # 클래스 수
}

# 모델 생성
model = Model(configs=configs, individual=False)

In [132]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    for x_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = model(x_batch)  # [B, num_classes]
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()

    # Validation Loop
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for x_batch, y_batch in valid_loader:
            output = model(x_batch)
            _, predicted = torch.max(output.data, 1)
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()

    accuracy = 100 * correct / total
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Validation Accuracy: {accuracy:.2f}%')

Epoch [10/100], Validation Accuracy: 8.85%
Epoch [20/100], Validation Accuracy: 33.68%
Epoch [30/100], Validation Accuracy: 40.45%
Epoch [40/100], Validation Accuracy: 35.30%
Epoch [50/100], Validation Accuracy: 38.54%
Epoch [60/100], Validation Accuracy: 38.31%
Epoch [70/100], Validation Accuracy: 26.68%
Epoch [80/100], Validation Accuracy: 38.66%
Epoch [90/100], Validation Accuracy: 22.86%
Epoch [100/100], Validation Accuracy: 39.12%


In [20]:
# 7. multi 모델 학습 및 검증
configs = {
    'seq_len': seq_len,
    'moving_avg': [3,7,30], 
    'enc_in': train_X.shape[1],  # 입력 특성의 수
    'num_class': 4  # 클래스 수
}

# 모델 생성
model = Multi_Kernel_Model(configs=configs, individual=False)

TypeError: super(type, obj): obj must be an instance or subtype of type

In [19]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    for x_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = model(x_batch)  # [B, num_classes]
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()

    # Validation Loop
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for x_batch, y_batch in valid_loader:
            output = model(x_batch)
            _, predicted = torch.max(output.data, 1)
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()

    accuracy = 100 * correct / total
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Validation Accuracy: {accuracy:.2f}%')

NameError: name 'model' is not defined

##### RandomForest

In [52]:
from sklearn.ensemble import RandomForestClassifier


class BlockingTimeSeriesSplit():
    def __init__(self, n_splits):
        self.n_splits = n_splits

    def get_n_splits(self, groups):
        return self.n_splits

    def split(self, X, y=None, groups=None):
        n_samples = len(X)
        k_fold_size = n_samples // self.n_splits
        indices = np.arange(n_samples)

        margin = 0
        for i in range(self.n_splits):
            start = i * k_fold_size
            stop = start + k_fold_size
            mid = int(0.8 * (stop - start)) + start
            yield indices[start: mid], indices[mid + margin: stop]

# 5개의 교차검증 실시
btscv = BlockingTimeSeriesSplit(n_splits=5)

In [53]:
# 성능 평가를 위한 변수 초기화
all_valid_scores = []

# 교차 검증 수행
for fold, (train_index, test_index) in enumerate(btscv.split(vif_X)):
    X_train, X_valid = vif_X.iloc[train_index], vif_X.iloc[test_index]
    y_train, y_valid = vif_y.iloc[train_index], vif_y.iloc[test_index]
    
    # lgb dataset
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

    rf_model = RandomForestClassifier(
        n_estimators=100,  # 결정 트리의 개수
        min_samples_split=5,
        min_samples_leaf=3,
        max_features=None,
        max_depth=None,     # 트리의 최대 깊이
        random_state=42,    # 랜덤 시드
        n_jobs=-1           # 모든 CPU 코어 사용
    )

    # 모델 학습
    rf_model.fit(X_train, y_train)

    # 예측
    y_valid_pred = rf_model.predict_proba(X_valid)
    y_valid_pred_class = np.argmax(y_valid_pred, axis=1)

    # 성능 지표 계산
    accuracy = accuracy_score(y_valid, y_valid_pred_class)
    auroc = roc_auc_score(y_valid, y_valid_pred, multi_class="ovr")
    
    all_valid_scores.append((accuracy, auroc))
    
    print(f"Fold {fold + 1}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"AUROC: {auroc:.4f}\n")

# 성능 지표 평균 계산
avg_accuracy = np.mean([score[0] for score in all_valid_scores])
avg_auroc = np.mean([score[1] for score in all_valid_scores])

print(f"Average Accuracy: {avg_accuracy:.4f}")
print(f"Average AUROC: {avg_auroc:.4f}")

# 전체 데이터로 최종 모델 학습
final_rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)

Fold 1
Accuracy: 0.4017
AUROC: 0.6240

Fold 2
Accuracy: 0.5071
AUROC: 0.5755

Fold 3
Accuracy: 0.4387
AUROC: 0.4814

Fold 4
Accuracy: 0.4387
AUROC: 0.5954

Fold 5
Accuracy: 0.4131
AUROC: 0.5932

Average Accuracy: 0.4399
Average AUROC: 0.5739


In [54]:
final_rf_model.fit(vif_X, vif_y)

# 피처 중요도 계산 및 출력
importances = final_rf_model.feature_importances_
feature_names = vif_X.columns
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

In [55]:
# 중요도 100 이하 제거
filtered_columns = list(feature_importance_df[feature_importance_df['Importance'] >= 0.02]['Feature'])

# X 재설정 
vif_X = vif_X[filtered_columns]
vif_X.head(3)

Unnamed: 0,hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_volume,hourly_network-data_supply_supply_total,hourly_market-data_taker-buy-sell-stats_bitmex_taker_buy_sell_ratio,hourly_market-data_taker-buy-sell-stats_okx_taker_buy_sell_ratio,hourly_market-data_taker-buy-sell-stats_deribit_taker_buy_sell_ratio,hourly_network-data_tokens-transferred_tokens_transferred_total,hourly_market-data_coinbase-premium-index_coinbase_premium_gap,hourly_network-data_block-bytes_block_bytes,hourly_network-data_fees-transaction_fees_transaction_median_usd,hourly_network-data_tokens-transferred_tokens_transferred_mean,...,hourly_network-data_tokens-transferred_tokens_transferred_median,hourly_market-data_liquidations_htx_global_all_symbol_short_liquidations_usd,hourly_market-data_funding-rates_okx_funding_rates,hourly_market-data_funding-rates_all_exchange_funding_rates,hourly_market-data_taker-buy-sell-stats_bybit_taker_sell_ratio,hourly_market-data_taker-buy-sell-stats_bybit_taker_buy_sell_ratio,hourly_market-data_taker-buy-sell-stats_bybit_taker_buy_ratio,hourly_market-data_funding-rates_binance_funding_rates,hourly_market-data_funding-rates_bitmex_funding_rates,hourly_market-data_funding-rates_bybit_funding_rates
0,5516.420322,19248710.0,0.905398,0.475845,1.493922,33057.024011,-9.86,783554.416667,0.235973,2.885312,...,0.020125,0.0,-0.001675,0.005049,0.411867,1.42797,0.588133,0.01,0.0014,0.01
1,4513.341881,19248740.0,3.714817,0.823286,3.964774,12933.965951,-8.78,890760.5,0.237108,2.217758,...,0.021293,0.0,-0.001675,0.005049,0.225491,3.434766,0.774509,0.01,0.0014,0.01
2,4310.904314,19248790.0,1.672801,4.375441,0.286114,26960.250177,-9.59,558209.75,0.234878,4.857703,...,0.02298,0.0,-0.001675,0.005049,0.63543,0.573738,0.36457,0.01,0.0014,0.01


In [59]:
# BlockingTimeSeriesSplit 설정
btscv = BlockingTimeSeriesSplit(n_splits=1)

for fold, (train_index, test_index) in enumerate(btscv.split(vif_X)):
    X_train, X_valid = vif_X.iloc[train_index], vif_X.iloc[test_index]
    y_train, y_valid = vif_y.iloc[train_index], vif_y.iloc[test_index]


    def objective(trial):
        # 하이퍼파라미터 검색 공간 정의
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 200),  # 결정 트리의 개수
            "max_depth": trial.suggest_int("max_depth", 3, 20),  # 트리의 최대 깊이
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),  # 노드를 분할하기 위한 최소 샘플 수
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 5),  # 리프 노드에 필요한 최소 샘플 수
            "max_features": trial.suggest_categorical("max_features", [None, "sqrt", "log2"]),  # 피처 선택 방법
            "random_state": 42,
            "n_jobs": -1  # 모든 CPU 코어 사용
        }


        # RandomForest 모델 생성
        rf_model = RandomForestClassifier(**params)

        # 모델 학습
        rf_model.fit(X_train, y_train)

        # 예측 및 평가
        y_valid_pred = rf_model.predict(X_valid)
        accuracy = accuracy_score(y_valid, y_valid_pred)
        
        # 마지막 fold의 accuracy를 반환
        return accuracy

# Optuna study 생성 및 최적화 수행
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

# 최적의 하이퍼파라미터 및 성능 출력
print("Best parameters:", study.best_params)
print("Best score:", study.best_value)

[I 2024-09-23 16:30:39,235] A new study created in memory with name: no-name-4f2201db-0712-436f-b982-3d20cafe1a42
[I 2024-09-23 16:30:40,006] Trial 0 finished with value: 0.4315068493150685 and parameters: {'n_estimators': 116, 'max_depth': 17, 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 0 with value: 0.4315068493150685.
[I 2024-09-23 16:30:41,171] Trial 1 finished with value: 0.4297945205479452 and parameters: {'n_estimators': 71, 'max_depth': 7, 'min_samples_split': 3, 'min_samples_leaf': 3, 'max_features': None}. Best is trial 0 with value: 0.4315068493150685.
[I 2024-09-23 16:30:41,558] Trial 2 finished with value: 0.4549086757990868 and parameters: {'n_estimators': 108, 'max_depth': 4, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'log2'}. Best is trial 2 with value: 0.4549086757990868.
[I 2024-09-23 16:30:41,842] Trial 3 finished with value: 0.4269406392694064 and parameters: {'n_estimators': 55, 'max_depth': 8, 'min_samp

Best parameters: {'n_estimators': 170, 'max_depth': 3, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_features': None}
Best score: 0.45662100456621


In [60]:
# 최적 하이퍼파라미터로 최종 모델 학습
best_params = study.best_params
final_model = RandomForestClassifier(**best_params)

# 모델 학습
final_model.fit(X_train, y_train)

# 최종 모델 예측 및 성능 평가
y_valid_pred = final_model.predict(X_valid)
accuracy = accuracy_score(y_valid, y_valid_pred)
print(f"Final model accuracy: {accuracy}")

Final model accuracy: 0.4594748858447489


In [61]:
test_df = df.loc[df["_type"] == "test"]
new_test_df = test_df[['ID','target','_type'] + non_missing_columns]

new_test_stab = new_test_df[non_missing_columns]
# test
window_size = 3
new_test_stab = new_test_stab.apply(lambda col: col.fillna(col.rolling(window=window_size, min_periods=1).mean()))
new_test_stab = new_test_stab.fillna(method='ffill').fillna(method='bfill')

new_test_df = pd.concat([new_test_df[['ID','target','_type']], new_test_stab], axis=1)

# 모든 비정상 data의 column을 stationality function 결과로 대체

sta_test_df = new_test_df.copy()
count = 0

for column in non_stationality_column:
    n, result = stationality_changing_funtion(new_test_df, col)
    sta_test_df[col] = result.values
    if n == 1:
        count += 1
count # 166이 나오므로 모두 정상성을 만족하게 됨

# 빈 딕셔너리 생성 -> 각 column의 이동평균 값의 평균을 저장
moving_avg_means = {}

# 각 컬럼에 대해 이동평균 값의 평균 구하기
for column in new_data.columns[3:]:
    new_data[f'{column}_moving_avg'] = new_data[column].rolling(window=3, min_periods=1).mean()
    # 이동평균 값들의 평균 계산
    moving_avg_means[column] = new_data[f'{column}_moving_avg'].mean()
    
# test 데이터의 NaN 값을 train 데이터에서 구한 이동평균 값의 평균으로 대체
for column in sta_test_df.columns[3:]:
    sta_test_df[column].fillna(moving_avg_means[column], inplace=True)
sta_test_df.head(3)

look-up table. The actual p-value is greater than the p-value returned.

  kpsstest = kpss(timeseries, regression= regression_option)
look-up table. The actual p-value is greater than the p-value returned.

  kpsstest = kpss(timeseries, regression= regression_option)
look-up table. The actual p-value is greater than the p-value returned.

  kpsstest = kpss(timeseries, regression= regression_option)
look-up table. The actual p-value is greater than the p-value returned.

  kpsstest = kpss(timeseries, regression= regression_option)
look-up table. The actual p-value is greater than the p-value returned.

  kpsstest = kpss(timeseries, regression= regression_option)
look-up table. The actual p-value is greater than the p-value returned.

  kpsstest = kpss(timeseries, regression= regression_option)
look-up table. The actual p-value is greater than the p-value returned.

  kpsstest = kpss(timeseries, regression= regression_option)
look-up table. The actual p-value is greater than the p-value 

Unnamed: 0,ID,target,_type,hourly_market-data_open-interest_binance_btc_busd_open_interest,hourly_market-data_liquidations_binance_btc_busd_short_liquidations_usd,hourly_market-data_liquidations_binance_btc_busd_long_liquidations,hourly_market-data_liquidations_binance_btc_busd_short_liquidations,hourly_market-data_liquidations_binance_btc_busd_long_liquidations_usd,hourly_market-data_funding-rates_bybit_funding_rates,hourly_market-data_taker-buy-sell-stats_bybit_taker_buy_volume,...,hourly_market-data_liquidations_binance_all_symbol_long_liquidations_usd,hourly_market-data_liquidations_binance_all_symbol_short_liquidations_usd,hourly_market-data_open-interest_htx_global_btc_usd_open_interest,hourly_network-data_addresses-count_addresses_count_receiver,hourly_network-data_fees_fees_total,hourly_network-data_fees_fees_total_usd,hourly_market-data_liquidations_htx_global_all_symbol_long_liquidations,hourly_market-data_liquidations_htx_global_all_symbol_short_liquidations,hourly_market-data_liquidations_htx_global_all_symbol_long_liquidations_usd,hourly_market-data_liquidations_htx_global_all_symbol_short_liquidations_usd
8760,2024-01-01 00:00:00,,test,231001600.0,13087.679359,0.432963,0.494107,11195.242768,0.017305,10647499.0,...,19507.00627,427431.4129,49578100.0,15508,11.281783,478682.39907,0.0,0.061,0.0,14.715219
8761,2024-01-01 01:00:00,,test,231001600.0,13087.679359,0.432963,0.494107,11195.242768,0.02045,11298841.0,...,19361.18115,820793.89171,49946400.0,19705,21.156934,900114.690178,0.028,0.931,1190.896,10.525615
8762,2024-01-01 02:00:00,,test,231001600.0,13087.679359,0.432963,0.494107,11195.242768,0.024063,2760806.0,...,94774.03311,64681.52556,50017900.0,18683,12.622917,537278.322466,4.362,0.001,185554.0644,-20.206319


In [74]:
final_test_df = sta_test_df[filtered_columns]

# randomforest predict
y_test_pred = final_model.predict(final_test_df)
y_test_pred = y_test_pred.astype(int)

In [75]:
# output file 할당후 save
submission_df = submission_df.assign(target = y_test_pred)
submission_df.to_csv("output_rf_0923.csv", index=False)