## Classificaion 모델 적용

### Library import

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

### Data Load

In [2]:
data_path: str = "../../data"
df: pd.DataFrame = pd.read_csv(os.path.join(data_path,"raw.csv"))
submission_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv"))
fe_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "fe_df.csv"))
train2: pd.DataFrame = pd.read_csv(os.path.join(data_path,"train2.csv"))

In [3]:
train_df = fe_df.loc[df["_type"]=="train"].drop(columns=["_type"])
test_df = fe_df.loc[df["_type"]=="test"].drop(columns=["_type"])

# 특징과 타겟 변수 정의
features = [
    'scaled_log_hashrate', 'scaled_log_open_interest', 
    'coinbase_premium_index', 'scaled_log_taker_total_volume', 
    'scaled_log_taker_sell_volume', 'scaled_log_taker_buy_volume', 
    'moving_avg_scaled_log_taker_total_volume', 'funding_rates', 
    'scaled_liquidation_diff', 'scaled_log_total_liquidation_usd', 
    'long_liquidations', 'short_liquidations', 'weekday'
]

## model training(classificaion)

### XGBoost

In [3]:
# import xgboost as xgb

# X_train = train_df[features]
# y_train = train_df['target']
# X_test = test_df[features]

# # 데이터 분할 (훈련 데이터와 검증 데이터)
# X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# # XGBoost DMatrix 생성
# train_data = xgb.DMatrix(X_train_split, label=y_train_split)
# val_data = xgb.DMatrix(X_val_split, label=y_val_split)
# test_data = xgb.DMatrix(X_test)

# # XGBoost 파라미터 설정
# params = {
#     'objective': 'multi:softprob',
#     'eval_metric': 'mlogloss',
#     'learning_rate': 0.05,
#     'num_class': 4,
#     'max_depth': 7,
#     'subsample': 0.9,
#     'colsample_bytree': 0.9
# }

# # 모델 훈련
# watchlist = [(train_data, 'train'), (val_data, 'eval')]
# model = xgb.train(params, train_data, evals=watchlist)

[0]	train-rmse:0.62749	eval-rmse:0.65286
[1]	train-rmse:0.62607	eval-rmse:0.65278
[2]	train-rmse:0.62427	eval-rmse:0.65310
[3]	train-rmse:0.62182	eval-rmse:0.65334
[4]	train-rmse:0.61956	eval-rmse:0.65303
[5]	train-rmse:0.61740	eval-rmse:0.65276
[6]	train-rmse:0.61511	eval-rmse:0.65279
[7]	train-rmse:0.61235	eval-rmse:0.65315
[8]	train-rmse:0.60967	eval-rmse:0.65296
[9]	train-rmse:0.60815	eval-rmse:0.65301


### Histogram-based Gradient Boosting

In [4]:
from sklearn.ensemble import HistGradientBoostingClassifier
from imblearn.over_sampling import BorderlineSMOTE

model = HistGradientBoostingClassifier(
    learning_rate=0.05,
    max_iter=200,
    l2_regularization=0.01,
    early_stopping=True,
    random_state=42
)

X_train = train_df[features]
y_train = train_df['target']
X_test = test_df[features]

# 데이터 분할 (훈련 데이터와 검증 데이터)
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# BorderlineSMOTE 적용
smote = BorderlineSMOTE(random_state=42, sampling_strategy={0: 1000, 1: 3544, 2: 3671, 3: 2000})
X_train_split = X_train_split.fillna(X_train_split.mean())
X_train_split, y_train_split = smote.fit_resample(X_train_split, y_train_split)

model.fit(X_train_split, y_train_split)

#######################################################

In [5]:
from sklearn.preprocessing import label_binarize

# 검증 데이터에 대한 예측
val_preds = model.predict(X_val_split)
val_preds_class = np.round(val_preds)


# 정확도, F1 점수, ROC-AUC 점수 계산
accuracy = accuracy_score(y_val_split, val_preds_class)
f1 = f1_score(y_val_split, val_preds_class, average='weighted')
roc_auc = roc_auc_score(label_binarize(y_val_split, classes=[0, 1, 2, 3]), label_binarize(val_preds_class, classes=[0, 1, 2, 3]), average='weighted', multi_class='ovr')

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"ROC-AUC Score: {roc_auc}")

Accuracy: 0.3926940639269406
F1 Score: 0.37719256631488884
ROC-AUC Score: 0.5146776311864492


In [28]:
# import plotly.express as px
# # Feature importance 평가
# importance = model.get_score(importance_type='weight')
# importance_df = pd.DataFrame({
#     'Feature': [k for k in importance.keys()],
#     'Importance': importance.values()
# }).sort_values(by='Importance', ascending=False)

# # 시각화
# fig = px.bar(importance_df, x='Feature', y='Importance', title='Feature Importance')
# fig.show()

AttributeError: 'HistGradientBoostingClassifier' object has no attribute 'get_score'

In [6]:
# 테스트 데이터에 대한 예측
X_test = X_test.fillna(X_test.mean())
test_preds = model.predict(X_test)

# 예측 결과를 submission_df에 추가
submission_df['target'] = test_preds.astype(int)
#submission_df.to_csv("fe_HistGradientBoostingClassifier_1.csv", index=False)

* fe_GradientBoostingRegressor_1은 model = GradientBoostingRegressor(n_estimators=500, learning_rate=0.01, max_depth=7,subsample=0.9, max_features=None, random_state=42)에 결측치 .mean()으로 대체
-> test 결과 0.3913
* fe_GradientBoostingRegressor_2는 동일한 조건에서 결측치 .interpolate(method='linear', limit_direction='forward', axis=0)로 대체
-> test 결과 0.3828
* fe_RandomForestRegressor_1은 model = RandomForestRegressor(n_estimators=300,max_depth=10,min_samples_split=2,min_samples_leaf=1,random_state=42,n_jobs=-1)
-> test 결과 0.3793
* fe_HistGradientBoostingClassifier_1은 model = HistGradientBoostingClassifier(learning_rate=0.05,max_iter=200,l2_regularization=0.01, early_stopping=True,random_state=42). BorderlineSMOTE(random_state=42, sampling_strategy={0: 1000, 1: 3544, 2: 3671, 3: 2000}). X_test.fillna(X_test.mean())
-> test 결과 0.3807

In [7]:
# test_preds의 통계값 계산 및 출력
test_preds_stats = {
    'mean': np.mean(test_preds),
    'std': np.std(test_preds),
    'min': np.min(test_preds),
    '25%': np.percentile(test_preds, 25),
    '50%': np.median(test_preds),
    '75%': np.percentile(test_preds, 75),
    'max': np.max(test_preds)
}

print(test_preds_stats)

{'mean': 1.5601719197707737, 'std': 0.6276494323945013, 'min': 0.0, '25%': 1.0, '50%': 2.0, '75%': 2.0, 'max': 3.0}


In [12]:
# fe_df의 percent의 통계값 계산 및 출력
fe_df_percent_stats = {
    'mean': np.mean(fe_df['percent']),
    'std': np.std(fe_df['percent']),
    'min': np.min(fe_df['percent']),
    '25%': np.percentile(fe_df['percent'], 25),
    '50%': np.median(fe_df['percent']),
    '75%': np.percentile(fe_df['percent'], 75),
    'max': np.max(fe_df['percent'])
}

print(fe_df_percent_stats)

{'mean': 0.00965947393359122, 'std': 0.5533592318686765, 'min': -6.213530715569782, '25%': nan, '50%': nan, '75%': nan, 'max': 5.624341632125107}


In [8]:
import plotly.express as px

# submission_df['target']의 분포 시각화
fig = px.histogram(submission_df, x='target', title='Distribution of Target in Submission Data')
fig.show()
fig = px.histogram(fe_df, x='target', title='Distribution of Target in original Data')
fig.show()