In [4]:
import os
import sys
from typing import List, Dict

import lightgbm as lgb
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

# Append the directory two levels above the current working directory to the
# import search path (original note: "add the Code path"), then echo the entry
# that was just added.
notebook_dir = os.path.abspath("")
project_root = os.path.dirname(os.path.dirname(notebook_dir))
sys.path.append(project_root)
print(sys.path[-1])

c:\Users\com\Documents\GitHub\level1-classificationinmachinelearning-recsys-06


In [5]:
# Load the input files.
data_path: str = "../../data"
# Original note: run the following when raw.csv does not exist.
#   from Code.dataset.merge_all import merge_all
#   df = merge_all(data_path)
after_eda_csv = os.path.join(data_path, "after_eda.csv")
test_csv = os.path.join(data_path, "test.csv")

df: pd.DataFrame = pd.read_csv(after_eda_csv)
# test.csv carries only the ID and target columns; it is loaded up front so the
# predictions can be written into it later.
submission_df: pd.DataFrame = pd.read_csv(test_csv)


In [6]:
# Work on a copy of the loaded frame without the 'difference' and 'percent' columns.
dropped_columns = ['difference', 'percent']
eda_df = df.drop(dropped_columns, axis=1)

In [7]:
import yaml

config_path = '../../config-sample.yaml'

# Parse the YAML configuration with the safe loader.
# The encoding is pinned to UTF-8 so that reading the file does not depend on
# the platform's default locale encoding.
with open(config_path, 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

In [8]:
# open_interest of each row minus open_interest of the following row
# (the same quantity as Series.diff(-1)); the last row has no successor and is NaN.
# NOTE(review): this uses the *next* row's value and is computed over the whole
# frame before the train/test split — confirm this is not look-ahead leakage.
next_open_interest = eda_df['open_interest'].shift(-1)
eda_df['open_interest_diff'] = eda_df['open_interest'] - next_open_interest
eda_df['open_interest_diff'].head()

0   -1.733962e+07
1    1.886794e+06
2    2.221901e+06
3   -7.007341e+06
4   -5.299980e+06
Name: open_interest_diff, dtype: float64

In [9]:
# Mean of open_interest and open_interest_diff for each target value,
# with 'target' kept as a regular column.
mean_columns = ["open_interest", "open_interest_diff"]
bar_df = eda_df.groupby("target")[mean_columns].mean().reset_index()
bar_df

Unnamed: 0,target,open_interest,open_interest_diff
0,0.0,8810355000.0,82693060.0
1,1.0,8519933000.0,5123240.0
2,2.0,8500617000.0,-16468930.0
3,3.0,8698922000.0,-28529680.0


In [10]:
# Side-by-side bar charts of the per-target means, one panel per column.
panel_columns = ("open_interest", "open_interest_diff")

fig: go.Figure = make_subplots(
    rows=1,
    cols=len(panel_columns),
    shared_xaxes=True,
    subplot_titles=panel_columns,
)
for panel_index, column in enumerate(panel_columns, start=1):
    fig.add_trace(
        go.Bar(x=bar_df["target"], y=bar_df[column]),
        row=1,
        col=panel_index,
    )
fig.update_xaxes(title_text="Target")
fig.update_layout(title_text="Target statistics", showlegend=False)
fig.show()

In [11]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Split the rows into train / test according to the _type column.
train_df = eda_df.loc[eda_df["_type"] == "train"].drop(columns=["_type"])
test_df = eda_df.loc[eda_df["_type"] == "test"].drop(columns=["_type"])

# Hold out 20% of the training rows as a validation set.
# NOTE(review): train_test_split shuffles rows; the IDs look like hourly
# timestamps, so a chronological split may be more appropriate — confirm.
x_train, x_valid, y_train, y_valid = train_test_split(
    train_df.drop(["target", "ID"], axis=1),
    train_df["target"].astype(int),
    test_size=0.2,
    random_state=42,
)

# Convert to DMatrix for the native XGBoost API.
dtrain = xgb.DMatrix(x_train, label=y_train)
dvalid = xgb.DMatrix(x_valid, label=y_valid)

# Training parameters. Copy the config section so that re-running this cell
# (or any other consumer of `config`) never sees keys added or removed here.
params = dict(config["xgboost"])
# These config keys are not booster parameters; passing them through `params`
# only triggers XGBoost's "Parameters: {...} are not used" warning. The explicit
# early_stopping_rounds argument to xgb.train below stays authoritative.
for non_booster_key in ("early_stopping_rounds", "verbose"):
    params.pop(non_booster_key, None)
params["objective"] = "multi:softprob"  # multi-class classification, per-class probabilities
# Count classes over all training labels rather than the shuffled 80% split,
# so a rare class missing from the split cannot shrink num_class.
params["num_class"] = train_df["target"].nunique()
params["eval_metric"] = "mlogloss"  # multi-class logistic loss

# Report the metric on both the training and the validation set while training.
evals = [(dtrain, "train"), (dvalid, "eval")]

# Train the booster.
xgb_model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=100,  # maximum number of boosting rounds
    evals=evals,
    early_stopping_rounds=10,  # stop when the eval metric has not improved for 10 rounds
)

# Predictions on the training set: argmax over the per-class probabilities.
y_train_pred_prob = xgb_model.predict(dtrain)
y_train_pred = y_train_pred_prob.argmax(axis=1)

# Predictions on the validation set.
y_valid_pred_prob = xgb_model.predict(dvalid)
y_valid_pred = y_valid_pred_prob.argmax(axis=1)

# Metrics on the training set.
train_accuracy = accuracy_score(y_train, y_train_pred)
train_classification_rep = classification_report(y_train, y_train_pred)

# Metrics on the validation set.
valid_accuracy = accuracy_score(y_valid, y_valid_pred)
valid_classification_rep = classification_report(y_valid, y_valid_pred)

print(f"훈련 정확도: {train_accuracy}")
print(train_classification_rep)
print(f"검증 정확도: {valid_accuracy}")
print(valid_classification_rep)

[0]	train-mlogloss:1.35463	eval-mlogloss:1.36168
[1]	train-mlogloss:1.32399	eval-mlogloss:1.33817
[2]	train-mlogloss:1.30032	eval-mlogloss:1.32094
[3]	train-mlogloss:1.27307	eval-mlogloss:1.30038
[4]	train-mlogloss:1.24806	eval-mlogloss:1.28140
[5]	train-mlogloss:1.22456	eval-mlogloss:1.26377
[6]	train-mlogloss:1.20654	eval-mlogloss:1.25099
[7]	train-mlogloss:1.18435	eval-mlogloss:1.23509



Parameters: { "early_stopping_rounds", "verbose" } are not used.




[8]	train-mlogloss:1.16370	eval-mlogloss:1.21995
[9]	train-mlogloss:1.14523	eval-mlogloss:1.20704
[10]	train-mlogloss:1.12647	eval-mlogloss:1.19381
[11]	train-mlogloss:1.10864	eval-mlogloss:1.18103
[12]	train-mlogloss:1.09398	eval-mlogloss:1.17164
[13]	train-mlogloss:1.07734	eval-mlogloss:1.16087
[14]	train-mlogloss:1.06159	eval-mlogloss:1.15021
[15]	train-mlogloss:1.04645	eval-mlogloss:1.13983
[16]	train-mlogloss:1.03312	eval-mlogloss:1.13154
[17]	train-mlogloss:1.01979	eval-mlogloss:1.12294
[18]	train-mlogloss:1.00737	eval-mlogloss:1.11515
[19]	train-mlogloss:0.99561	eval-mlogloss:1.10766
[20]	train-mlogloss:0.98409	eval-mlogloss:1.10108
[21]	train-mlogloss:0.97423	eval-mlogloss:1.09519
[22]	train-mlogloss:0.96287	eval-mlogloss:1.08827
[23]	train-mlogloss:0.95229	eval-mlogloss:1.08202
[24]	train-mlogloss:0.94146	eval-mlogloss:1.07567
[25]	train-mlogloss:0.93210	eval-mlogloss:1.07066
[26]	train-mlogloss:0.92366	eval-mlogloss:1.06693
[27]	train-mlogloss:0.91429	eval-mlogloss:1.06191
[2

In [12]:
# Build a DMatrix from the test features (target and ID columns removed).
test_features = test_df.drop(columns=["target", "ID"])
dtest = xgb.DMatrix(test_features)

# Run the trained model on the test rows.
y_test_pred_prob = xgb_model.predict(dtest)

# For multi-class output, take the highest-probability class of each row
# as the predicted label.
y_test_pred = np.argmax(y_test_pred_prob, axis=1)

# Display the predicted labels.
y_test_pred


array([2, 1, 1, ..., 0, 0, 0], dtype=int64)

In [13]:
# Write the predicted classes into the submission frame.
# The array is assigned directly (positionally): pandas then raises on a length
# mismatch, whereas a DataFrame value would be index-aligned and any
# unmatched rows silently filled with NaN.
submission_df = submission_df.assign(target=y_test_pred)
submission_df

Unnamed: 0,ID,target
0,2024-01-01 00:00:00,2
1,2024-01-01 01:00:00,1
2,2024-01-01 02:00:00,1
3,2024-01-01 03:00:00,2
4,2024-01-01 04:00:00,1
...,...,...
2787,2024-04-26 03:00:00,0
2788,2024-04-26 04:00:00,0
2789,2024-04-26 05:00:00,0
2790,2024-04-26 06:00:00,0


In [14]:
# Distribution of predicted classes in the submission.
submission_df.loc[:, "target"].value_counts()

target
2    1163
1    1108
0     282
3     239
Name: count, dtype: int64

In [28]:
# Save the submission to the working directory without the index column.
submission_df.to_csv(
    path_or_buf="XGBoost_after_eda_open_interest2.csv",
    index=False,
)

In [33]:
# Load a saved submission file and show its class distribution.
# NOTE(review): this reads 'XGBoost_after_eda_open_interest.csv', while the
# save cell writes 'XGBoost_after_eda_open_interest2.csv' — confirm which file
# is meant; on a fresh run the file read here may be stale or missing.
test = pd.read_csv(filepath_or_buffer='XGBoost_after_eda_open_interest.csv')
test.loc[:, 'target'].value_counts()

target
2    1163
1    1108
0     282
3     239
Name: count, dtype: int64

In [27]:
import plotly.express as px

# Feature importance from the trained booster ('weight' importance type).
importance = xgb_model.get_score(importance_type='weight')
importance_df = pd.DataFrame(
    list(importance.items()),
    columns=['Feature', 'Importance'],
).sort_values(by='Importance', ascending=False)

# Bar chart of the scores, highest first.
fig = px.bar(
    importance_df,
    x='Feature',
    y='Importance',
    title='Feature Importance',
)
fig.show()