## Regression 모델 적용

### Library import

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

### Data Load

In [2]:
data_path: str = "../../data"


def _read_csv(file_name: str) -> pd.DataFrame:
    """Read a single CSV file located directly under `data_path`."""
    return pd.read_csv(os.path.join(data_path, file_name))


# Same four tables, loaded in the same order as before.
df: pd.DataFrame = _read_csv("raw.csv")
submission_df: pd.DataFrame = _read_csv("test.csv")
fe_df: pd.DataFrame = _read_csv("fe_df.csv")
train2: pd.DataFrame = _read_csv("train2.csv")

In [3]:
# Split the engineered table into train/test rows using its OWN `_type`
# column. The previous version built the boolean mask from the separately
# loaded `df`, which is only correct when `df` and `fe_df` share exactly the
# same index; masking `fe_df` with itself removes that hidden dependency.
is_train_row = fe_df["_type"] == "train"
is_test_row = fe_df["_type"] == "test"
train_df = fe_df.loc[is_train_row].drop(columns=["_type"])
test_df = fe_df.loc[is_test_row].drop(columns=["_type"])

# Feature columns fed to the models (the target column 'percent' is selected
# separately in each model cell).
features = [
    'scaled_log_hashrate', 'scaled_log_open_interest', 
    'coinbase_premium_index', 'scaled_log_taker_total_volume', 
    'scaled_log_taker_sell_volume', 'scaled_log_taker_buy_volume', 
    'moving_avg_scaled_log_taker_total_volume', 'funding_rates', 
    'scaled_liquidation_diff', 'scaled_log_total_liquidation_usd', 
    'long_liquidations', 'short_liquidations', 'weekday'
]

## Model training (Regression)

### XGBoost regression

In [3]:
# Disabled experiment: XGBoost regression on `features`, predicting 'percent'.
# The whole cell is commented out; it is kept for reference only.
# import xgboost as xgb

# X_train = train_df[features]
# y_train = train_df['percent']
# X_test = test_df[features]

# # Split the data (training and validation sets)
# X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# # Build the XGBoost DMatrix objects
# train_data = xgb.DMatrix(X_train_split, label=y_train_split)
# val_data = xgb.DMatrix(X_val_split, label=y_val_split)
# test_data = xgb.DMatrix(X_test)

# # Configure the XGBoost parameters
# params = {
#     'objective': 'reg:squarederror',
#     'eval_metric': 'rmse',
#     'learning_rate': 0.05,
#     'max_depth': 7,
#     'subsample': 0.9,
#     'colsample_bytree': 0.9
# }

# # Train the model (RMSE is reported on both entries of the watchlist)
# watchlist = [(train_data, 'train'), (val_data, 'eval')]
# model = xgb.train(params, train_data, evals=watchlist)


[0]	train-rmse:0.62749	eval-rmse:0.65286
[1]	train-rmse:0.62607	eval-rmse:0.65278
[2]	train-rmse:0.62427	eval-rmse:0.65310
[3]	train-rmse:0.62182	eval-rmse:0.65334
[4]	train-rmse:0.61956	eval-rmse:0.65303
[5]	train-rmse:0.61740	eval-rmse:0.65276
[6]	train-rmse:0.61511	eval-rmse:0.65279
[7]	train-rmse:0.61235	eval-rmse:0.65315
[8]	train-rmse:0.60967	eval-rmse:0.65296
[9]	train-rmse:0.60815	eval-rmse:0.65301


### GradientBoostingRegressor

In [3]:
from sklearn.ensemble import GradientBoostingRegressor

# Fill gaps in the feature columns by linear interpolation down the rows.
# With limit_direction='forward' only gaps that follow a valid value are
# filled; leading NaNs (if any) are left untouched.
X_train = train_df[features].interpolate(method='linear', limit_direction='forward', axis=0)
y_train = train_df['percent']

# Apply the SAME imputation to the test features. Previously X_test was left
# raw, so the model was trained on imputed data but would have been asked to
# predict on un-imputed data.
X_test = test_df[features].interpolate(method='linear', limit_direction='forward', axis=0)

# Split the data (training and validation sets): 80% / 20%, fixed seed.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

model = GradientBoostingRegressor(
    n_estimators=500, 
    learning_rate=0.01, 
    max_depth=7,
    subsample=0.9,
    max_features=None,
    random_state=42
)

# Fit on the training split only; the validation split is kept for scoring.
model.fit(X_train_split, y_train_split)




### RandomForestRegressor

In [4]:
# Disabled experiment: RandomForestRegressor on `features`, predicting 'percent'.
# The whole cell is commented out; it is kept for reference only.
# from sklearn.ensemble import RandomForestRegressor


# model = RandomForestRegressor(
#     n_estimators=300,
#     max_depth=10,
#     min_samples_split=2,
#     min_samples_leaf=1,
#     random_state=42,
#     n_jobs=-1
# )

# X_train = train_df[features]
# y_train = train_df['percent']
# X_test = test_df[features]

# # Split the data (training and validation sets)
# X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# model.fit(X_train_split, y_train_split)


### Histogram-based Gradient Boosting

In [17]:
from sklearn.ensemble import HistGradientBoostingRegressor

# Feature matrices and regression target taken straight from the split
# tables; no imputation is applied in this cell.
X_train, y_train = train_df[features], train_df['percent']
X_test = test_df[features]

# Hold out 20% of the training rows for validation (fixed seed).
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Hyper-parameters collected in one place, then passed to the estimator.
hgb_params = dict(
    learning_rate=0.05,
    max_iter=200,
    l2_regularization=0.01,
    early_stopping=True,
    random_state=42,
)
model = HistGradientBoostingRegressor(**hgb_params)

# Fit on the training split only; the validation split is kept for scoring.
model.fit(X_train_split, y_train_split)

#######################################################

In [5]:
def regression_to_class(preds, thresholds=(-0.5, 0.0, 0.5)):
    """
    Convert continuous regression values into ordinal class labels.

    With the default thresholds the mapping is:
        value <  -0.5          -> 0
        -0.5 <= value < 0      -> 1
        0    <= value < 0.5    -> 2
        value >= 0.5           -> 3

    Parameters
    ----------
    preds : array-like
        Regression values. Any array-like is accepted (NumPy array, pandas
        Series, plain list); it is converted with ``np.asarray``.
    thresholds : sequence of float, optional
        Lower bin edges. A value receives class ``k`` when it is greater than
        or equal to the ``k``-th smallest threshold (1-based) and below the
        next one. Defaults to ``(-0.5, 0.0, 0.5)``, i.e. the original
        four-class mapping.

    Returns
    -------
    numpy.ndarray
        Array with the same shape and dtype as ``np.asarray(preds)`` holding
        the class labels (so float input yields float labels; callers cast
        with ``.astype(int)`` where integers are needed).

    Notes
    -----
    NaN inputs compare False against every threshold and therefore end up in
    class 0, exactly as in the original implementation.
    """
    values = np.asarray(preds)
    classes = np.zeros_like(values)
    # Walk the edges in ascending order: every value at or above an edge is
    # promoted to that edge's class, so later (higher) edges overwrite earlier
    # ones and each value ends at the highest edge it reaches.
    for label, edge in enumerate(sorted(thresholds), start=1):
        classes[values >= edge] = label
    return classes

In [18]:
from sklearn.preprocessing import label_binarize
from sklearn.metrics import mean_squared_error

# Regression quality on the held-out validation split.
val_preds = model.predict(X_val_split)
mse = mean_squared_error(y_val_split, val_preds)

# Bucket both the predictions and the true targets into class labels.
val_preds_class = regression_to_class(val_preds)
y_val_class = regression_to_class(y_val_split.values)

# Classification-style scores computed on the bucketed labels.
accuracy = accuracy_score(y_val_class, val_preds_class)
f1 = f1_score(y_val_class, val_preds_class, average='weighted')

# NOTE(review): ROC-AUC is computed from binarized hard class labels rather
# than continuous scores, so it reflects label agreement only — confirm this
# is the intended metric.
class_labels = [0, 1, 2, 3]
y_val_onehot = label_binarize(y_val_class, classes=class_labels)
val_preds_onehot = label_binarize(val_preds_class, classes=class_labels)
roc_auc = roc_auc_score(
    y_val_onehot,
    val_preds_onehot,
    average='weighted',
    multi_class='ovr',
)

print(f"MSE: {mse}")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"ROC-AUC Score: {roc_auc}")

MSE: 0.426136868914292
Accuracy: 0.4115296803652968
F1 Score: 0.34011428493215906
ROC-AUC Score: 0.49635373613754


In [7]:
# Disabled cell: feature-importance bar chart.
# NOTE(review): the recorded output of this cell is an AttributeError saying
# 'RandomForestRegressor' has no attribute 'get_score'; presumably this code
# was written for the xgboost model trained with xgb.train — confirm before
# re-enabling.
# import plotly.express as px
# # Evaluate feature importance
# importance = model.get_score(importance_type='weight')
# importance_df = pd.DataFrame({
#     'Feature': [k for k in importance.keys()],
#     'Importance': importance.values()
# }).sort_values(by='Importance', ascending=False)

# # Visualize
# fig = px.bar(importance_df, x='Feature', y='Importance', title='Feature Importance')
# fig.show()

AttributeError: 'RandomForestRegressor' object has no attribute 'get_score'

In [20]:
# Predict on the test features with the currently fitted `model`.
# (Optional imputation step, currently disabled:)
#X_test = X_test.interpolate(method='linear', limit_direction='forward', axis=0)
test_preds = model.predict(X_test)

# Bucket the regression outputs into class labels.
test_preds_class = regression_to_class(test_preds)

# Store the integer class labels as the 'target' column of the submission table.
target_labels = test_preds_class.astype(int)
submission_df['target'] = target_labels
# (Writing the submission file is currently disabled:)
#submission_df.to_csv("fe_GradientBoostingRegressor_2.csv", index=False)

* fe_GradientBoostingRegressor_1: model = GradientBoostingRegressor(n_estimators=500, learning_rate=0.01, max_depth=7, subsample=0.9, max_features=None, random_state=42)를 사용하고, 결측치는 .mean()으로 대체
-> test 결과 0.3913
* fe_GradientBoostingRegressor_2는 동일한 조건에서 결측치 .interpolate(method='linear', limit_direction='forward', axis=0)로 대체
-> test 결과 0.3828
* fe_RandomForestRegressor_1: model = RandomForestRegressor(n_estimators=300, max_depth=10, min_samples_split=2, min_samples_leaf=1, random_state=42, n_jobs=-1)를 사용
-> test 결과 0.3793

In [21]:
# test_preds의 통계값 계산 및 출력
test_preds_stats = {
    'mean': np.mean(test_preds),
    'std': np.std(test_preds),
    'min': np.min(test_preds),
    '25%': np.percentile(test_preds, 25),
    '50%': np.median(test_preds),
    '75%': np.percentile(test_preds, 75),
    'max': np.max(test_preds)
}

print(test_preds_stats)

{'mean': -0.003947446958475869, 'std': 0.0695234990557819, 'min': -0.32764197021272096, '25%': -0.029555669358876272, '50%': 0.00769906211920716, '75%': 0.021257690478897962, 'max': 0.25828378738061675}


In [9]:
# fe_df의 percent의 통계값 계산 및 출력
fe_df_percent_stats = {
    'mean': np.mean(fe_df['percent']),
    'std': np.std(fe_df['percent']),
    'min': np.min(fe_df['percent']),
    '25%': np.percentile(fe_df['percent'], 25),
    '50%': np.median(fe_df['percent']),
    '75%': np.percentile(fe_df['percent'], 75),
    'max': np.max(fe_df['percent'])
}

print(fe_df_percent_stats)

{'mean': 0.00965947393359122, 'std': 0.5533592318686765, 'min': -6.213530715569782, '25%': nan, '50%': nan, '75%': nan, 'max': 5.624341632125107}


In [22]:
import plotly.express as px

# Histogram of the predicted class labels stored in submission_df['target'].
fig = px.histogram(
    data_frame=submission_df,
    x='target',
    title='Distribution of Target in Submission Data',
)
fig.show()