In [1]:
import os
from typing import List, Dict
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import lightgbm as lgb
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# 파일 호출
data_path: str = "../data"
train_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "train.csv")).assign(_type="train") # train 에는 _type = train 
test_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv")).assign(_type="test") # test 에는 _type = test
submission_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv")) # ID, target 열만 가진 데이터 미리 호출
df: pd.DataFrame = pd.concat([train_df, test_df], axis=0)

In [3]:
# HOURLY_ 로 시작하는 .csv 파일 이름을 file_names 에 할딩
file_names: List[str] = [
    f for f in os.listdir(data_path) if f.startswith("HOURLY_") and f.endswith(".csv")
]

# 파일명 : 데이터프레임으로 딕셔너리 형태로 저장
file_dict: Dict[str, pd.DataFrame] = {
    f.replace(".csv", ""): pd.read_csv(os.path.join(data_path, f)) for f in file_names
}

for _file_name, _df in tqdm(file_dict.items()):
    # 열 이름 중복 방지를 위해 {_file_name.lower()}_{col.lower()}로 변경, datetime 열을 ID로 변경
    _rename_rule = {
        col: f"{_file_name.lower()}_{col.lower()}" if col != "datetime" else "ID"
        for col in _df.columns
    }
    _df = _df.rename(_rename_rule, axis=1)
    df = df.merge(_df, on="ID", how="left")

100%|██████████| 107/107 [00:03<00:00, 31.48it/s]


In [4]:
# missing value check

train_df = df.loc[df["_type"] == "train"]
# 각 열에서 누락된 값의 수 & 백분율 계산
missing_values = train_df.isnull().sum()
missing_percentage = (missing_values / len(train_df)) * 100

# 누락된 값 비율을 기준으로 열 정렬
sorted_missing_percentage = missing_percentage.sort_values(ascending=False)

# missing_value의 비율이 100%가 아닌 column만 추출
non_missing_columns = sorted_missing_percentage[sorted_missing_percentage != 100.0].index.tolist()
non_missing_columns.remove('ID')
non_missing_columns.remove('target')
non_missing_columns.remove('_type')

In [5]:
new_data = train_df[['ID','target', '_type'] + non_missing_columns]

In [6]:
columns_with_100_missing = sorted_missing_percentage[sorted_missing_percentage == 100]
print(columns_with_100_missing)
print(len(columns_with_100_missing))

hourly_market-data_liquidations_ftx_all_symbol_long_liquidations          100.0
hourly_market-data_liquidations_okex_btc_usd_short_liquidations_usd       100.0
hourly_market-data_open-interest_ftx_open_interest                        100.0
hourly_market-data_open-interest_huobi_global_open_interest               100.0
hourly_market-data_funding-rates_okex_funding_rates                       100.0
hourly_market-data_open-interest_okx_open_interest                        100.0
hourly_market-data_liquidations_okex_btc_usd_long_liquidations            100.0
hourly_market-data_liquidations_okex_btc_usd_short_liquidations           100.0
hourly_market-data_open-interest_okex_open_interest                       100.0
hourly_market-data_open-interest_kraken_open_interest                     100.0
hourly_market-data_open-interest_deribit_open_interest                    100.0
hourly_market-data_open-interest_ftx_btc_usd_open_interest                100.0
hourly_market-data_open-interest_binance

In [7]:
# 이동평균으로 결측치 대체
new_df_stab = new_data[non_missing_columns]

# train
window_size = 3
new_df_stab = new_df_stab.apply(lambda col: col.fillna(col.rolling(window=window_size, min_periods=1).mean()))
new_df_stab = new_df_stab.fillna(method='ffill').fillna(method='bfill')


# 결측치 처리한 new_df 정의
new_train_df = pd.concat([new_data[['ID','target','_type']], new_df_stab], axis=1)

  new_df_stab = new_df_stab.fillna(method='ffill').fillna(method='bfill')


In [8]:
# nan or inf 데이터 여부 확인
for col in non_missing_columns:
    if (np.isnan(new_train_df[col]).any()) | (np.isinf(new_train_df[col]).any()):
        print(col)
        break

In [9]:
test_df = df.loc[df["_type"] == "test"]
new_test_df = test_df[['ID','target','_type'] + non_missing_columns]

new_test_stab = new_test_df[non_missing_columns]
# test
window_size = 3
new_test_stab = new_test_stab.apply(lambda col: col.fillna(col.rolling(window=window_size, min_periods=1).mean()))
new_test_stab = new_test_stab.fillna(method='ffill')

new_test_df = pd.concat([new_test_df[['ID','target','_type']], new_test_stab], axis=1)

  new_test_stab = new_test_stab.fillna(method='ffill')


In [10]:
# 결측치 비율을 계산
missing_percentage = new_test_df.isnull().mean() * 100

# 결측치 비율이 100%인 컬럼 이름만 출력
columns_with_all_missing = missing_percentage[missing_percentage >= 50].index.tolist()

# 100% 결측치가 있는 컬럼 출력
columns_with_all_missing = [col for col in columns_with_all_missing if col not in ['target', 'hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close']]

In [11]:
# train_df와 test_df에서 columns_with_all_missing에 있는 컬럼 삭제
new_train_df = new_train_df.drop(columns=columns_with_all_missing, errors='ignore')
new_test_df = new_test_df.drop(columns=columns_with_all_missing, errors='ignore')

In [12]:
# 이동평균을 기반으로 이상치를 처리하는 함수
def replace_outlier(df, window=3, threshold=2):
    df_cleaned = df.copy()
    
    # 숫자형 컬럼들에 대해 처리
    for column in df_cleaned.select_dtypes(include=[np.number]).columns:
        # 이동평균과 표준편차 계산
        rolling_mean = df_cleaned[column].rolling(window=window, min_periods=1).mean()
        rolling_std = df_cleaned[column].rolling(window=window, min_periods=1).std()

        # 이상치 기준 설정
        outliers = np.abs(df_cleaned[column] - rolling_mean) > (threshold * rolling_std)

        # 이상치를 이동평균으로 대체
        df_cleaned.loc[outliers, column] = rolling_mean[outliers]
    
    return df_cleaned

# 이동평균 기반 이상치 처리 적용
cleaned_train_df = replace_outlier(new_train_df)
cleaned_test_df = replace_outlier(new_test_df)

LGBM+Optuna

In [None]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# 타겟과 피처 설정
y_train = cleaned_train_df['hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close']
X_train = cleaned_train_df.drop(columns=['hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close', 'ID', 'target', '_type'], errors='ignore')

# 훈련 데이터와 검증 데이터 나누기
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

def objective(trial):
    # 하이퍼파라미터 샘플링
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 100),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.5, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.5, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10)
    }

    model = lgb.LGBMRegressor(**params)
    model.fit(X_train_split, y_train_split, eval_set=[(X_val, y_val)])

    # 검증 데이터에서 예측 및 평가
    y_val_pred = model.predict(X_val)
    mse = mean_squared_error(y_val, y_val_pred)
    
    return mse

# Optuna 스터디 생성 및 최적화 실행
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# 최적의 하이퍼파라미터 출력
print("Best parameters:", study.best_params)
print("Best MSE:", study.best_value)

# 최적의 파라미터로 전체 데이터 학습
best_params = study.best_params
model = lgb.LGBMRegressor(**best_params)
model.fit(X_train, y_train)

X_test = cleaned_test_df.drop(columns=['hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close', 'ID', 'target', '_type'], errors='ignore')
y_pred = model.predict(X_test)

# new_test_df에 y_pred 값을 추가
cleaned_test_df['hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close'] = y_pred

# 결과 확인
print(cleaned_test_df.head())

RF

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# 타겟과 피처 설정
y_train = cleaned_train_df['hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close']
X_train = cleaned_train_df.drop(columns=['hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close', 'ID', 'target', '_type'], errors='ignore')

# 훈련 데이터와 검증 데이터 나누기
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# 모델 훈련
model = RandomForestRegressor()
model.fit(X_train_split, y_train_split)

# 검증 데이터에서 예측
y_val_pred = model.predict(X_val)

# 성능 평가
mse = mean_squared_error(y_val, y_val_pred)
print("Mean Squared Error on Validation Set:", mse)

# 모델 훈련
model = RandomForestRegressor()
model.fit(X_train, y_train)

# test_df에서 예측
X_test = cleaned_test_df.drop(columns=['hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close', 'ID', 'target', '_type'], errors='ignore')
y_pred = model.predict(X_test)

# new_test_df에 y_pred 값을 추가
cleaned_test_df['hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close'] = y_pred

# 결과 확인
cleaned_test_df.head()

Mean Squared Error on Validation Set: 22483.918646254886


Unnamed: 0,ID,target,_type,hourly_market-data_funding-rates_bybit_funding_rates,hourly_market-data_taker-buy-sell-stats_bybit_taker_sell_ratio,hourly_market-data_taker-buy-sell-stats_bybit_taker_buy_sell_ratio,hourly_market-data_taker-buy-sell-stats_bybit_taker_buy_ratio,hourly_market-data_taker-buy-sell-stats_bybit_taker_sell_volume,hourly_market-data_taker-buy-sell-stats_bybit_taker_buy_volume,hourly_network-data_fees_fees_reward_percent,...,hourly_network-data_addresses-count_addresses_count_sender,hourly_network-data_addresses-count_addresses_count_receiver,hourly_network-data_block-count_block_count,hourly_network-data_fees_fees_total,hourly_network-data_fees_fees_total_usd,hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close,hourly_market-data_liquidations_htx_global_btc_usd_short_liquidations,hourly_market-data_liquidations_htx_global_btc_usd_long_liquidations_usd,hourly_market-data_liquidations_htx_global_btc_usd_short_liquidations_usd,hourly_network-data_utxo-count_utxo_count
8760,2024-01-01 00:00:00,,test,0.017305,0.358324,1.790774,0.641676,5945752.0,10647499.0,0.310949,...,27403,15508,4.0,11.281783,478682.39907,42301.686016,0.0,0.0,0.0,153173281.0
8761,2024-01-01 01:00:00,,test,0.02045,0.348704,1.867764,0.651296,6049393.0,11298841.0,0.23532,...,71176,19705,11.0,21.156934,900114.690178,42323.251942,0.0,0.0,0.0,153195963.0
8762,2024-01-01 02:00:00,,test,0.024063,0.493678,1.025613,0.506322,2691859.0,2760806.0,0.223918,...,44530,18683,7.0,12.622917,537278.322466,42287.437918,0.0,0.0,0.0,153221209.0
8763,2024-01-01 03:00:00,,test,0.027012,0.672534,0.486913,0.327466,11433572.0,5567153.0,0.21837,...,39508,9814,6.0,10.476673,445153.388402,42246.656269,0.0,0.0,0.0,153237929.0
8764,2024-01-01 04:00:00,,test,0.027208,0.154211,5.48464,0.845789,3393306.0,18611061.0,0.224068,...,51929,14968,8.0,14.438663,611220.212169,42301.357269,0.0,0.0,0.0,153261467.0


In [14]:
df = pd.concat([cleaned_train_df, cleaned_test_df], ignore_index=True)
df.tail()

Unnamed: 0,ID,target,_type,hourly_market-data_funding-rates_bybit_funding_rates,hourly_market-data_taker-buy-sell-stats_bybit_taker_sell_ratio,hourly_market-data_taker-buy-sell-stats_bybit_taker_buy_sell_ratio,hourly_market-data_taker-buy-sell-stats_bybit_taker_buy_ratio,hourly_market-data_taker-buy-sell-stats_bybit_taker_sell_volume,hourly_market-data_taker-buy-sell-stats_bybit_taker_buy_volume,hourly_network-data_fees_fees_reward_percent,...,hourly_network-data_addresses-count_addresses_count_sender,hourly_network-data_addresses-count_addresses_count_receiver,hourly_network-data_block-count_block_count,hourly_network-data_fees_fees_total,hourly_network-data_fees_fees_total_usd,hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close,hourly_market-data_liquidations_htx_global_btc_usd_short_liquidations,hourly_market-data_liquidations_htx_global_btc_usd_long_liquidations_usd,hourly_market-data_liquidations_htx_global_btc_usd_short_liquidations_usd,hourly_network-data_utxo-count_utxo_count
11547,2024-04-26 03:00:00,,test,0.01,0.530275,0.885815,0.469725,2581811.0,2287006.0,0.199738,...,18154,13601,3.0,2.339916,150728.680308,43614.146265,0.0,0.0,0.0,179820708.0
11548,2024-04-26 04:00:00,,test,0.01,0.618616,0.616512,0.381384,7870643.0,4852344.0,0.177466,...,31320,29096,3.5,4.045398,260219.961065,43601.260914,0.0,0.0,0.0,179833897.0
11549,2024-04-26 05:00:00,,test,0.01,0.511294,0.955823,0.488706,8684026.0,8300391.0,0.166744,...,34083,22094,3.0,3.752089,241282.824002,43614.302033,0.0,0.0,0.0,179851249.0
11550,2024-04-26 06:00:00,,test,0.01,0.475819,1.101638,0.524181,3425631.0,3773806.0,0.134465,...,26186,12668,3.0,1.941932,125150.157841,43614.929461,0.0,0.0,0.0,179852452.0
11551,2024-04-26 07:00:00,,test,0.01,0.424784,1.35414,0.575216,6854476.0,9281917.0,0.212873,...,18649,16054,3.0,2.535404,163293.843971,43612.269224,0.0,0.0,0.0,179851850.5


In [15]:
cleaned_train_df['close_diff2'] = cleaned_train_df['hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close'].shift(-1) - cleaned_train_df['hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close']
cleaned_test_df['close_diff2'] = np.nan

cleaned_train_df['close_diff2'].fillna(method='ffill', inplace=True)

df = pd.concat([cleaned_train_df, cleaned_test_df], ignore_index=True)
cleaned_train_df.tail()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cleaned_train_df['close_diff2'].fillna(method='ffill', inplace=True)
  cleaned_train_df['close_diff2'].fillna(method='ffill', inplace=True)


Unnamed: 0,ID,target,_type,hourly_market-data_funding-rates_bybit_funding_rates,hourly_market-data_taker-buy-sell-stats_bybit_taker_sell_ratio,hourly_market-data_taker-buy-sell-stats_bybit_taker_buy_sell_ratio,hourly_market-data_taker-buy-sell-stats_bybit_taker_buy_ratio,hourly_market-data_taker-buy-sell-stats_bybit_taker_sell_volume,hourly_market-data_taker-buy-sell-stats_bybit_taker_buy_volume,hourly_network-data_fees_fees_reward_percent,...,hourly_network-data_addresses-count_addresses_count_receiver,hourly_network-data_block-count_block_count,hourly_network-data_fees_fees_total,hourly_network-data_fees_fees_total_usd,hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close,hourly_market-data_liquidations_htx_global_btc_usd_short_liquidations,hourly_market-data_liquidations_htx_global_btc_usd_long_liquidations_usd,hourly_market-data_liquidations_htx_global_btc_usd_short_liquidations_usd,hourly_network-data_utxo-count_utxo_count,close_diff2
8755,2023-12-31 19:00:00,1.0,train,0.024847,0.604128,0.655279,0.395872,3032689.0,1987256.0,0.265839,...,23739,9.0,20.368081,868199.890699,42597.709522,0.0,0.0,0.0,153112663.0,-62.138735
8756,2023-12-31 20:00:00,1.0,train,0.023523,0.400193,1.498792,0.599807,4183763.0,6270592.0,0.254407,...,12934,4.0,8.53034,363206.281009,42535.570787,0.0,0.0,0.0,153128637.0,-38.047368
8757,2023-12-31 21:00:00,0.0,train,0.022368,0.560964,0.782644,0.439036,2905550.0,2274012.0,0.269433,...,11946,3.0,6.915014,294586.556191,42497.523419,0.0,0.0,0.0,153138595.0,-240.497201
8758,2023-12-31 22:00:00,2.0,train,0.021547,0.724267,0.380707,0.275733,28721507.0,10934468.0,0.296094,...,16113,2.0,5.258057,223202.409271,42257.026218,0.0,23800.0,0.0,153156171.0,28.773687
8759,2023-12-31 23:00:00,2.0,train,0.019143,0.511393,0.955444,0.488607,10112076.0,9661518.0,0.261773,...,12273,4.0,8.864901,374412.336322,42285.799905,0.0,0.0,0.0,153172052.0,28.773687


In [17]:
# hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close 열과 다른 열 간의 상관관계 계산
correlation_series = cleaned_train_df.drop(columns=['ID','_type','target']).corr(method='spearman')['close_diff2'].dropna()

# 상관관계 값을 기준으로 내림차순 정렬
sorted_correlation = correlation_series.sort_values(ascending=False)

# 상위 30개의 상관관계 출력
top_30_correlation = sorted_correlation.head(50)
print("양의 상관관계: ")
print(top_30_correlation)

# 하위 30개의 상관관계 출력
bottom_30_correlation = sorted_correlation.tail(30)
print("\n음의 상관관계: ")
print(bottom_30_correlation)

양의 상관관계: 
close_diff2                                                                      1.000000
hourly_market-data_taker-buy-sell-stats_binance_taker_sell_ratio                 0.073059
hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_ratio            0.070916
hourly_market-data_liquidations_binance_all_symbol_long_liquidations             0.051910
hourly_market-data_liquidations_binance_all_symbol_long_liquidations_usd         0.051883
hourly_market-data_liquidations_binance_btc_usdt_long_liquidations               0.050856
hourly_market-data_liquidations_binance_btc_usdt_long_liquidations_usd           0.050785
hourly_market-data_liquidations_binance_btc_usd_long_liquidations_usd            0.050373
hourly_market-data_liquidations_binance_btc_usd_long_liquidations                0.050280
hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations        0.048547
hourly_market-data_liquidations_bitfinex_btc_usdt_long_liquidations_usd          0.048542


In [184]:
# 모델에 사용할 컬럼, 컬럼의 rename rule을 미리 할당함
cols_dict: Dict[str, str] = {
    "ID": "ID",
    "target": "target",
    "_type": "_type",
    "hourly_market-data_funding-rates_all_exchange_funding_rates": "funding_rates",
    "hourly_network-data_difficulty_difficulty" : "difficulty",
    "hourly_market-data_open-interest_all_exchange_all_symbol_open_interest" : "open_interest",
    "hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations": "long_liquidations",
    "hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations": "short_liquidations",
    #"hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_ratio": "buy_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_volume": "buy_volume",
    #"hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_ratio": "sell_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_volume": "sell_volume",
    #"hourly_network-data_addresses-count_addresses_count_active": "active_count",
    #"hourly_network-data_addresses-count_addresses_count_receiver": "receiver_count",
    #"hourly_network-data_addresses-count_addresses_count_sender": "sender_count",
    "hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close" : "close",
    "close_diff2" : "close_diffa",
}
df = df[cols_dict.keys()].rename(cols_dict, axis=1)
df.shape

(11552, 12)

In [186]:
# eda 에서 파악한 차이와 차이의 음수, 양수 여부를 새로운 피쳐로 생성
df = df.assign(
    liquidation_diff=df["long_liquidations"] - df["short_liquidations"],
    volume_diff=df["buy_volume"] - df["sell_volume"],
    #buy_sell_volume_ratio=df["buy_volume"] / (df["sell_volume"] + 1),
    close_diff = df['close'].diff().fillna(0),
    volume = df["buy_volume"] + df["sell_volume"],
    open_diff = df["open_interest"].diff().fillna(0),
    SMA3 = df['close'].rolling(window=3).mean(),  # 3시간 이동 평균
    EMA3 = df['close'].ewm(span=3, adjust=False).mean(),
    SMA24 = df['close'].rolling(window=24).mean(),
    EMA24 = df['close'].ewm(span=24, adjust=False).mean()
)
# category, continuous 열을 따로 할당해둠
conti_cols: List[str] = [_ for _ in cols_dict.values() if _ not in ["ID", "target", "_type", "close_diffa"]] + [
    "liquidation_diff",
    "volume_diff",
    #"buy_sell_volume_ratio",
    "close_diff",
    "volume",
    "open_diff"
]


In [187]:
def shift_feature(
    df: pd.DataFrame,
    conti_cols: List[str],
    intervals: List[int],
) -> List[pd.Series]:
    """
    연속형 변수의 shift feature 생성
    Args:
        df (pd.DataFrame)
        conti_cols (List[str]): continuous colnames
        intervals (List[int]): shifted intervals
    Return:
        List[pd.Series]
    """
    df_shift_dict = [
        df[conti_col].shift(interval).rename(f"{conti_col}{interval}")
        for conti_col in conti_cols
        for interval in intervals
    ]
    return df_shift_dict

# 최대 24시간의 shift 피쳐를 계산
shift_list = shift_feature(
    df=df, conti_cols=conti_cols, intervals=[_ for _ in range(1, 72, 12)]
)

In [188]:
# concat 하여 df 에 할당
df = pd.concat([df, pd.concat(shift_list, axis=1)], axis=1)

# 타겟 변수를 제외한 변수를 forwardfill, -999로 결측치 대체
_target = df[["target", "close_diffa"]]
df_filled = df.drop(columns=["target", "close_diffa"]).ffill().fillna(-999)

df = pd.concat([df_filled, _target], axis=1)

# _type에 따라 train, test 분리
train_df = df.loc[df["_type"]=="train"].drop(columns=["_type"])
test_df = df.loc[df["_type"]=="test"].drop(columns=["_type"])

LGBM

In [164]:
# train_test_split으로 valid set, train set 분리
x_train, x_valid, y_train, y_valid = train_test_split(
    train_df.drop(["close_diffa", "ID", "target"], axis=1), 
    train_df["close_diffa"], 
    test_size=0.2,
    random_state=42,
    shuffle=False
)

train_data = lgb.Dataset(x_train, label=y_train)
valid_data = lgb.Dataset(x_valid, label=y_valid, reference=train_data)

# LightGBM 파라미터 설정 (회귀 문제)
# params = {
#     "boosting_type": "gbdt",
#     "objective": "regression",
#     "metric": "rmse",
#     "num_leaves": 50,
#     "learning_rate": 0.1,
#     "n_estimators": 30,
#     "random_state": 42,
#     "verbose": 0,
# }

# 모델 훈련
lgb_model = lgb.train(
    params=study.best_params,
    train_set=train_data,
    valid_sets=valid_data,
)

# 검증 데이터에서 예측
y_valid_pred = lgb_model.predict(x_valid)
mse = mean_squared_error(y_valid, y_valid_pred)
print(f"Validation MSE: {mse}")
rmse = np.sqrt(mse)
print(f"Validation RMSE: {rmse}")
print(y_valid_pred)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010298 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 104639
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 432
[LightGBM] [Info] Start training from score 1.726605
Validation MSE: 355774.81506408856
Validation RMSE: 596.468620351556
[   7.10845661   -4.22105863  -20.07119685 ... -427.81554156 -511.63114295
 -668.12178596]


In [165]:
# 전체 train_df로 모델 재훈련
full_train_data = lgb.Dataset(train_df.drop(["close_diffa", "ID","target"], axis=1), label=train_df["close_diffa"])
lgb_model_full = lgb.train(params=study.best_params, train_set=full_train_data)

# test_df에서 close_diff2 예측
x_test = test_df.drop(["close_diff2", "ID", "target"], axis=1)
predicted_close_diff2 = lgb_model_full.predict(x_test)

test_df.loc[:, 'close_diffa'] = predicted_close_diff2

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011421 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 104759
[LightGBM] [Info] Number of data points in the train set: 8760, number of used features: 432
[LightGBM] [Info] Start training from score 2.942674


RF

In [189]:
from sklearn.ensemble import RandomForestRegressor

# train_test_split으로 valid set, train set 분리
x_train, x_valid, y_train, y_valid = train_test_split(
    train_df.drop(["close_diffa", "ID", "target"], axis=1), 
    train_df["close_diffa"], 
    test_size=0.2,
    random_state=42,
    shuffle=False
)

model = RandomForestRegressor()
model.fit(x_train, y_train)

y_valid_pred = model.predict(x_valid)

mse = mean_squared_error(y_valid, y_valid_pred)
print("Mean Squared Error on Validation Set:", mse)

Mean Squared Error on Validation Set: 544847.6368637597


In [191]:
full_x_train = train_df.drop(["close_diffa", "ID", "target"], axis=1)
full_y_train = train_df["close_diffa"]
model = RandomForestRegressor()
model.fit(full_x_train, full_y_train)

full_x_test = test_df.drop(["close_diffa", "ID", "target"], axis=1)
predicted_close_diff2 = model.predict(full_x_test)

test_df.loc[:, 'close_diffa'] = predicted_close_diff2

In [192]:
test_df.tail()

Unnamed: 0,ID,funding_rates,difficulty,open_interest,long_liquidations,short_liquidations,buy_volume,sell_volume,close,liquidation_diff,...,volume49,volume61,open_diff1,open_diff13,open_diff25,open_diff37,open_diff49,open_diff61,target,close_diffa
11547,2024-04-26 03:00:00,0.009411,88104190000000.0,14868360000.0,0.71,0.2435,320161900.0,255319600.0,43497.277061,0.4665,...,354845800.0,1927152000.0,76780630.0,-60624330.0,38065130.0,-186834800.0,32483520.0,-31029890.0,,56.923784
11548,2024-04-26 04:00:00,0.009411,88104190000000.0,14879330000.0,6.577208,0.146,320161900.0,255319600.0,43504.103838,6.431208,...,226565300.0,937247300.0,-21937160.0,51337860.0,-74964710.0,17755000.0,26951180.0,-151852700.0,,59.672801
11549,2024-04-26 05:00:00,0.009411,88104190000000.0,14868360000.0,1.797163,5.21649,320161900.0,255319600.0,43492.61551,-3.419327,...,556783400.0,677764500.0,10968580.0,76831500.0,-3248390.0,40297880.0,48521000.0,37385960.0,,36.264647
11550,2024-04-26 06:00:00,0.009411,88104190000000.0,14868360000.0,0.803,1.656,320161900.0,255319600.0,43497.643337,-0.853,...,537939700.0,347386800.0,-10968580.0,-86875540.0,50895170.0,39114330.0,8381742.0,4496706.0,,30.068698
11551,2024-04-26 07:00:00,0.009411,88104190000000.0,14868360000.0,2.360383,3.930057,320161900.0,255319600.0,43519.565147,-1.569674,...,357725400.0,303094400.0,0.0,70578010.0,12921740.0,-207887700.0,9855959.0,22380740.0,,63.245314


In [193]:
# 등락률 계산 및 target 클래스 할당
def calculate_class(row):
    change_rate = (row['close_diffa'] / row['close']) * 100
    if change_rate < -0.5:
        return 0
    elif -0.5 <= change_rate < 0:
        return 1
    elif 0 <= change_rate < 0.5:
        return 2
    else:
        return 3

test_df['target'] = test_df.apply(calculate_class, axis=1)

target_counts = test_df['target'].value_counts()

# 결과 출력
print(target_counts)

target
2    1963
1     797
0      31
3       1
Name: count, dtype: int64


In [194]:
y_test_pred_class = test_df['target']
submission_df['target'] = y_test_pred_class.values
submission_df.to_csv("output.csv", index=False)