In [1]:
import os
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

## Loading Preprocessed Data

In [2]:
# Load the preprocessed train/test CSVs.
# The data directory is resolved relative to the parent of the current
# working directory: <parent of cwd>/data/preprocessed.

data_path: str = os.path.join(
    os.path.dirname(os.path.abspath('')),
    'data',
    'preprocessed',
)
train_csv_path = os.path.join(data_path, "train_df.csv")
test_csv_path = os.path.join(data_path, "test_df.csv")

# The first CSV column is used as the DataFrame index.
train_df: pd.DataFrame = pd.read_csv(train_csv_path, index_col=0)
test_df: pd.DataFrame = pd.read_csv(test_csv_path, index_col=0)

# Row-wise stack of train followed by test.
all_df: pd.DataFrame = pd.concat(
    [train_df, test_df],
    axis=0,
)

In [3]:
# Split the frames into model inputs and label.
# 'target' is the label; 'close_price' is removed from the inputs as well.
non_feature_cols = ['target', 'close_price']

train_X = train_df.drop(columns=non_feature_cols)
train_y = train_df['target']
test_X = test_df.drop(columns=non_feature_cols)

In [4]:
# Make the project root (parent of the current working directory) and its
# 'models' sub-directory importable, then import the project's model classes.
project_path: str = os.path.dirname(os.path.abspath(''))
model_path: str = os.path.join(project_path, 'models')

# Append each location once; skip it if it is already on sys.path so that
# re-running this cell does not add duplicates.
for extra_path in (project_path, model_path):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

from models.model import Model
from models.binary_ensemble_model import BinaryEnsembleModel

## Split into Train/Validation data

In [5]:
# Hold out 20% of the labelled data for validation. The split is stratified on
# the label and seeded so that it is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_X,
    train_y,
    test_size=0.2,
    random_state=42,
    stratify=train_y,
)

# Parameters handed to BinaryEnsembleModel as `model_params`.
model_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',       # binary classification objective
    'metric': 'binary_logloss',  # binary log loss
    'learning_rate': 0.005,
    'num_leaves': 31,
    'max_depth': -1,
    'min_data_in_leaf': 50,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    'random_state': 42,
}

ensemble_model = BinaryEnsembleModel(
    model_params=model_params,
    selected_features='all',
)

# Fit the model on the training split.
# NOTE(review): the third positional argument is passed as None; its meaning is
# defined by BinaryEnsembleModel.fit, which is not visible here.
ensemble_model.fit(X_train, y_train, None)

# Predict labels for the validation split.
y_valid_pred = ensemble_model.predict(X_valid)

[LightGBM] [Info] Number of positive: 592, number of negative: 6416
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026564 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 296107
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 1207
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.084475 -> initscore=-2.383044
[LightGBM] [Info] Start training from score -2.383044
[LightGBM] [Info] Number of positive: 2835, number of negative: 4173
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.027494 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 296107
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 1207
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.404538 -> initscore=-0.386593
[LightGBM] [Info] Start training from score -0.386593
[LightGBM] 

In [6]:
# Overall accuracy on the validation split.
accuracy = accuracy_score(y_valid, y_valid_pred)
print(f"Validation Accuracy: {accuracy:.4f}")

# Per-class precision, recall and F1-score.
report_text = classification_report(y_valid, y_valid_pred)
print("Classification Report:", report_text, sep="\n")

# Confusion matrix of true labels against predicted labels.
conf_mat = confusion_matrix(y_valid, y_valid_pred)
print("Confusion Matrix:", conf_mat, sep="\n")

Validation Accuracy: 0.4795
Classification Report:
              precision    recall  f1-score   support

         0.0       0.78      0.09      0.17       148
         1.0       0.48      0.51      0.49       709
         2.0       0.48      0.61      0.53       734
         3.0       0.50      0.09      0.15       161

    accuracy                           0.48      1752
   macro avg       0.56      0.33      0.34      1752
weighted avg       0.50      0.48      0.45      1752

Confusion Matrix:
[[ 14  60  72   2]
 [  2 365 338   4]
 [  1 278 447   8]
 [  1  64  82  14]]


## 결과 분석

위 결과의 confusion matrix, precision, recall을 참조하면 binary ensemble 모델은 여전히 class 불균형을 해결하지 못했음을 알 수 있습니다. 표본 수가 적은 클래스 0과 3의 recall이 각각 0.09에 그치기 때문입니다. 하지만 precision 자체는 0과 3에서 각각 0.78, 0.50으로 꽤 준수한 성능을 보입니다. 1과 2 중에서는 2, 즉 실제 상승세일 때 모델이 상승세로 예측하는 비율(recall 0.61)이 상승했습니다. 하지만 값이 내려갈 때는 모델이 잘 예측하지 못함을 알 수 있습니다. validation accuracy는 0.4795로 다른 모델에 비해 준수한 성능을 보이지만, 내재된 문제들은 여전히 지속됨을 알 수 있습니다.