In [1]:
import os
import sys
from typing import List, Dict

import lightgbm as lgb
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

# Code 경로 추가
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(""))))
print(sys.path[-1])

c:\Users\com\Documents\GitHub\level1-classificationinmachinelearning-recsys-06


In [2]:
# 파일 호출
data_path: str = "../../data"
## raw.csv가 없는 경우 실행
# from Code.dataset.merge_all import merge_all
# df = merge_all(data_path)
df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "scaled_eda2.csv"))
submission_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv"))  # ID, target 열만 가진 데이터 미리 호출


In [3]:
import yaml

config_path ='../../config-sample.yaml'

with open(config_path,'r') as file:
    config = yaml.safe_load(file)

### 데이터 증강

In [4]:
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN, SVMSMOTE

def get_over_sampler(x_train, y_train, strategy='ADASYN', sampling_strategy='auto'):
    if strategy == 'SMOTE':
        sampler = SMOTE(sampling_strategy=sampling_strategy)
    elif strategy == 'BorderlineSMOTE':
        sampler = BorderlineSMOTE(sampling_strategy=sampling_strategy)
    elif strategy == 'ADASYN':
        sampler = ADASYN(sampling_strategy=sampling_strategy)
    elif strategy == 'SVMSMOTE':
        sampler = SVMSMOTE(sampling_strategy=sampling_strategy)
    else:
        raise ValueError(f"Unknown strategy: {strategy}")

    x_resampled, y_resampled = sampler.fit_resample(x_train, y_train)
    return x_resampled, y_resampled

In [5]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# _type에 따라 train, test 분리
train_df = df.loc[df["_type"] == "train"].drop(columns=["_type"])
train_df = train_df.ffill()
test_df = df.loc[df["_type"] == "test"].drop(columns=["_type"])

# train_test_split 으로 valid set, train set 분리
x_train, x_valid, y_train, y_valid = train_test_split(
    train_df.drop(["target", "ID"], axis=1),
    train_df["target"].astype(int),
    test_size=0.2,
    random_state=42,
)

sampling_strategy = {0: 1000, 1: 3544, 2: 3671, 3: 1500}
smote = SMOTE(sampling_strategy=sampling_strategy,random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

# XGBoost 모델을 위한 DMatrix로 변환
dtrain = xgb.DMatrix(x_train_resampled, label=y_train_resampled)
dvalid = xgb.DMatrix(x_valid, label=y_valid)

# XGBoost 모델 학습 파라미터
params = config["xgboost"]
params["objective"] = "multi:softprob"  # 다중 클래스 분류를 위한 설정
params["num_class"] = len(y_train_resampled.unique())  # 클래스 개수
params["eval_metric"] = "mlogloss"  # 손실 함수 (멀티클래스 로지스틱 손실)
params['max_depth'] = 7
params['learning_rate'] = 0.05
params['subsample'] = 0.8
params['colsample_bytree'] = 0.8

# 학습 시 평가 데이터로 검증 셋 사용
evals = [(dtrain, "train"), (dvalid, "eval")]

# XGBoost 모델 훈련
xgb_model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=200,  # 트리의 개수
    evals=evals,
    early_stopping_rounds=10,  # 조기 중단
)

# 예측 (훈련 데이터셋)
y_train_pred_prob = xgb_model.predict(dtrain)
y_train_pred = y_train_pred_prob.argmax(axis=1)

# 예측 (검증 데이터셋)
y_valid_pred_prob = xgb_model.predict(dvalid)
y_valid_pred = y_valid_pred_prob.argmax(axis=1)

# 성능 평가 (훈련 데이터셋)
train_accuracy = accuracy_score(y_train_resampled, y_train_pred)
train_classification_rep = classification_report(y_train_resampled, y_train_pred)

# 성능 평가 (검증 데이터셋)
valid_accuracy = accuracy_score(y_valid, y_valid_pred)
valid_classification_rep = classification_report(y_valid, y_valid_pred)

print(f"훈련 정확도: {train_accuracy}")
print(train_classification_rep)
print(f"검증 정확도: {valid_accuracy}")
print(valid_classification_rep)

[0]	train-mlogloss:1.36861	eval-mlogloss:1.37251
[1]	train-mlogloss:1.35180	eval-mlogloss:1.36009
[2]	train-mlogloss:1.33585	eval-mlogloss:1.34759
[3]	train-mlogloss:1.31973	eval-mlogloss:1.33592
[4]	train-mlogloss:1.30502	eval-mlogloss:1.32466


Parameters: { "early_stopping_rounds", "verbose" } are not used.



[5]	train-mlogloss:1.29096	eval-mlogloss:1.31506
[6]	train-mlogloss:1.27784	eval-mlogloss:1.30595
[7]	train-mlogloss:1.26435	eval-mlogloss:1.29753
[8]	train-mlogloss:1.25164	eval-mlogloss:1.28949
[9]	train-mlogloss:1.23899	eval-mlogloss:1.28150
[10]	train-mlogloss:1.22753	eval-mlogloss:1.27387
[11]	train-mlogloss:1.21586	eval-mlogloss:1.26633
[12]	train-mlogloss:1.20540	eval-mlogloss:1.25956
[13]	train-mlogloss:1.19487	eval-mlogloss:1.25306
[14]	train-mlogloss:1.18475	eval-mlogloss:1.24706
[15]	train-mlogloss:1.17457	eval-mlogloss:1.24046
[16]	train-mlogloss:1.16528	eval-mlogloss:1.23543
[17]	train-mlogloss:1.15601	eval-mlogloss:1.22986
[18]	train-mlogloss:1.14735	eval-mlogloss:1.22514
[19]	train-mlogloss:1.13876	eval-mlogloss:1.22060
[20]	train-mlogloss:1.13029	eval-mlogloss:1.21648
[21]	train-mlogloss:1.12147	eval-mlogloss:1.21239
[22]	train-mlogloss:1.11387	eval-mlogloss:1.20862
[23]	train-mlogloss:1.10606	eval-mlogloss:1.20487
[24]	train-mlogloss:1.09852	eval-mlogloss:1.20127
[25]	

In [6]:
# 테스트 데이터셋을 DMatrix로 변환
dtest = xgb.DMatrix(test_df.drop(["target", "ID"], axis=1))

# 테스트 데이터에 대한 예측 수행
y_test_pred_prob = xgb_model.predict(dtest)

# 다중 클래스 예측일 경우 가장 높은 확률을 가진 클래스를 예측값으로 변환
y_test_pred = y_test_pred_prob.argmax(axis=1)

# 예측 결과 출력
y_test_pred

array([1, 1, 2, ..., 1, 1, 1], dtype=int64)

In [7]:
submission_df = submission_df.assign(target=pd.DataFrame(y_test_pred))
submission_df['target'].value_counts()

target
2    1819
1     839
0      83
3      51
Name: count, dtype: int64

In [9]:
submission_df.to_csv("XGBoost_after_eda_scaled_smote_2.csv",index=False)

In [8]:
import plotly.express as px

# Feature importance 평가
importance = xgb_model.get_score(importance_type='weight')
importance_df = pd.DataFrame({
    'Feature': [k for k in importance.keys()],
    'Importance': importance.values()
}).sort_values(by='Importance', ascending=False)

# 시각화
fig = px.bar(importance_df, x='Feature', y='Importance', title='Feature Importance')
fig.show()

---------------------------------------------------------------------

In [10]:
# smote + cv
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# _type에 따라 train, test 분리
train_df = df.loc[df["_type"] == "train"].drop(columns=["_type"])
train_df = train_df.ffill()
test_df = df.loc[df["_type"] == "test"].drop(columns=["_type"])

# train_test_split 으로 valid set, train set 분리
x_train, x_valid, y_train, y_valid = train_test_split(
    train_df.drop(["target", "ID"], axis=1),
    train_df["target"].astype(int),
    test_size=0.2,
    random_state=42,
)

# SMOTE를 이용한 오버샘플링
sampling_strategy = {0: 1000, 1: 3544, 2: 3671, 3: 1500}
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

# XGBoost 모델을 위한 DMatrix로 변환
dtrain = xgb.DMatrix(x_train_resampled, label=y_train_resampled)
dvalid = xgb.DMatrix(x_valid, label=y_valid)

# XGBoost 모델 학습 파라미터
params = {
    "objective": "multi:softprob",  # 다중 클래스 분류
    "num_class": len(y_train_resampled.unique()),  # 클래스 개수
    "eval_metric": "mlogloss",  # 손실 함수 (멀티클래스 로지스틱 손실)
    "max_depth": 7,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "seed": 42
}

# 교차 검증 설정 (5-fold)
cv_results = xgb.cv(
    params=params,
    dtrain=dtrain,
    num_boost_round=200,
    nfold=5,
    early_stopping_rounds=10,
    metrics="mlogloss",
    as_pandas=True,
    seed=42
)

# 최적의 부스팅 라운드 수 확인
best_num_boost_rounds = len(cv_results)

# 교차 검증 결과 출력
print("최적 부스팅 라운드 수:", best_num_boost_rounds)
print(cv_results.tail())

# 최적의 부스팅 라운드로 모델 학습
xgb_model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=best_num_boost_rounds,
    evals=[(dtrain, "train"), (dvalid, "eval")],
    early_stopping_rounds=10
)

# 예측 (훈련 데이터셋)
y_train_pred_prob = xgb_model.predict(dtrain)
y_train_pred = y_train_pred_prob.argmax(axis=1)

# 예측 (검증 데이터셋)
y_valid_pred_prob = xgb_model.predict(dvalid)
y_valid_pred = y_valid_pred_prob.argmax(axis=1)

# 성능 평가 (훈련 데이터셋)
train_accuracy = accuracy_score(y_train_resampled, y_train_pred)
train_classification_rep = classification_report(y_train_resampled, y_train_pred)

# 성능 평가 (검증 데이터셋)
valid_accuracy = accuracy_score(y_valid, y_valid_pred)
valid_classification_rep = classification_report(y_valid, y_valid_pred)

# 결과 출력
print(f"훈련 정확도: {train_accuracy}")
print(train_classification_rep)
print(f"검증 정확도: {valid_accuracy}")
print(valid_classification_rep)


최적 부스팅 라운드 수: 200
     train-mlogloss-mean  train-mlogloss-std  test-mlogloss-mean  \
195             0.569713            0.008407            1.085083   
196             0.568109            0.008350            1.084817   
197             0.566234            0.007949            1.084401   
198             0.564624            0.007831            1.084133   
199             0.562921            0.007945            1.084049   

     test-mlogloss-std  
195           0.018169  
196           0.018254  
197           0.018472  
198           0.018644  
199           0.018703  
[0]	train-mlogloss:1.36950	eval-mlogloss:1.37238
[1]	train-mlogloss:1.35238	eval-mlogloss:1.35939
[2]	train-mlogloss:1.33640	eval-mlogloss:1.34713
[3]	train-mlogloss:1.32020	eval-mlogloss:1.33552
[4]	train-mlogloss:1.30483	eval-mlogloss:1.32431
[5]	train-mlogloss:1.29064	eval-mlogloss:1.31503
[6]	train-mlogloss:1.27667	eval-mlogloss:1.30519
[7]	train-mlogloss:1.26347	eval-mlogloss:1.29666
[8]	train-mlogloss:1.25018	eval

In [119]:
# open_interest_diff drop
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import label_binarize

# TimeSeriesSplit 객체 생성
tscv = TimeSeriesSplit(n_splits=6)  # 5개로 데이터를 나눠 교차 검증

# 모델 성능을 저장할 리스트
val_accuracies = []
f1_macros = []
f1_weighteds = []
roc_aucs = []

# _type에 따라 train, test 분리
train_df = df.loc[df["_type"] == "train"].drop(columns=["_type"])
train_df = train_df.ffill()
test_df = df.loc[df["_type"] == "test"].drop(columns=["_type"])

# train_test_split 으로 valid set, train set 분리
x_train, x_valid, y_train, y_valid = train_test_split(
    train_df.drop(["target", "ID"], axis=1),
    train_df["target"].astype(int),
    test_size=0.2,
    random_state=42,
)

# TimeSeriesSplit을 이용한 교차 검증
for train_index, val_index in tscv.split(x_train):
    X_train_split, X_val_split = x_train.iloc[train_index], x_train.iloc[val_index]
    y_train_split, y_val_split = y_train.iloc[train_index], y_train.iloc[val_index]


    sampling_strategy = {0: 1000, 1: 3544, 2: 3671, 3: 1500}
    smote = SMOTE(sampling_strategy=sampling_strategy,random_state=42)
    x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

    # XGBoost 모델을 위한 DMatrix로 변환
    dtrain = xgb.DMatrix(x_train_resampled, label=y_train_resampled)
    dvalid = xgb.DMatrix(x_valid, label=y_valid)

    # XGBoost 모델 학습 파라미터
    params = config["xgboost"]
    params["objective"] = "multi:softprob"  # 다중 클래스 분류를 위한 설정
    params["num_class"] = len(y_train_resampled.unique())  # 클래스 개수
    params["eval_metric"] = "mlogloss"  # 손실 함수 (멀티클래스 로지스틱 손실)
    params['max_depth'] = 7
    params['learning_rate'] = 0.05
    params['subsample'] = 0.8
    params['colsample_bytree'] = 0.8

    # 학습 시 평가 데이터로 검증 셋 사용
    evals = [(dtrain, "train"), (dvalid, "eval")]

    # XGBoost 모델 훈련
    xgb_model = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=200,  # 트리의 개수
        evals=evals,
        early_stopping_rounds=10,  # 조기 중단
    )

    # 예측 (훈련 데이터셋)
    y_train_pred_prob = xgb_model.predict(dtrain)
    y_train_pred = y_train_pred_prob.argmax(axis=1)

    # 예측 (검증 데이터셋)
    y_valid_pred_prob = xgb_model.predict(dvalid)
    y_valid_pred = y_valid_pred_prob.argmax(axis=1)

    # 성능 평가 (훈련 데이터셋)
    train_accuracy = accuracy_score(y_train_resampled, y_train_pred)
    train_classification_rep = classification_report(y_train_resampled, y_train_pred)

    # 성능 평가 (검증 데이터셋)
    valid_accuracy = accuracy_score(y_valid, y_valid_pred)
    val_accuracies.append(valid_accuracy)
    valid_classification_rep = classification_report(y_valid, y_valid_pred)

    # F1 Score 계산 (Macro와 Weighted)
    f1_macro = f1_score(y_valid, y_valid_pred, average='macro')
    f1_weighted = f1_score(y_valid, y_valid_pred, average='weighted')
    f1_macros.append(f1_macro)
    f1_weighteds.append(f1_weighted)

    # ROC-AUC 계산
    y_val_binarized = label_binarize(y_valid, classes=[0, 1, 2, 3])
    roc_auc = roc_auc_score(y_val_binarized, y_valid_pred_prob, average='macro', multi_class='ovr')
    roc_aucs.append(roc_auc)

'''
print(f"훈련 정확도: {train_accuracy}")
print(train_classification_rep)
print(f"검증 정확도: {valid_accuracy}")
print(valid_classification_rep)
'''

# 각 fold에 대한 평균 성능 출력
print(f"Average Validation Accuracy: {np.mean(val_accuracies):.4f}")
print(f"Average F1 Score (Macro): {np.mean(f1_macros):.4f}")
print(f"Average F1 Score (Weighted): {np.mean(f1_weighteds):.4f}")
print(f"Average ROC-AUC (Macro): {np.mean(roc_aucs):.4f}")

[0]	train-mlogloss:1.36861	eval-mlogloss:1.37251
[1]	train-mlogloss:1.35180	eval-mlogloss:1.36009
[2]	train-mlogloss:1.33585	eval-mlogloss:1.34759
[3]	train-mlogloss:1.31973	eval-mlogloss:1.33592
[4]	train-mlogloss:1.30502	eval-mlogloss:1.32466
[5]	train-mlogloss:1.29096	eval-mlogloss:1.31506
[6]	train-mlogloss:1.27784	eval-mlogloss:1.30595
[7]	train-mlogloss:1.26435	eval-mlogloss:1.29753
[8]	train-mlogloss:1.25164	eval-mlogloss:1.28949
[9]	train-mlogloss:1.23899	eval-mlogloss:1.28150
[10]	train-mlogloss:1.22753	eval-mlogloss:1.27387
[11]	train-mlogloss:1.21586	eval-mlogloss:1.26633
[12]	train-mlogloss:1.20540	eval-mlogloss:1.25956



Parameters: { "early_stopping_rounds", "verbose" } are not used.




[13]	train-mlogloss:1.19487	eval-mlogloss:1.25306
[14]	train-mlogloss:1.18475	eval-mlogloss:1.24706
[15]	train-mlogloss:1.17457	eval-mlogloss:1.24046
[16]	train-mlogloss:1.16528	eval-mlogloss:1.23543
[17]	train-mlogloss:1.15601	eval-mlogloss:1.22986
[18]	train-mlogloss:1.14735	eval-mlogloss:1.22514
[19]	train-mlogloss:1.13876	eval-mlogloss:1.22060
[20]	train-mlogloss:1.13029	eval-mlogloss:1.21648
[21]	train-mlogloss:1.12147	eval-mlogloss:1.21239
[22]	train-mlogloss:1.11387	eval-mlogloss:1.20862
[23]	train-mlogloss:1.10606	eval-mlogloss:1.20487
[24]	train-mlogloss:1.09852	eval-mlogloss:1.20127
[25]	train-mlogloss:1.09119	eval-mlogloss:1.19780
[26]	train-mlogloss:1.08425	eval-mlogloss:1.19465
[27]	train-mlogloss:1.07697	eval-mlogloss:1.19138
[28]	train-mlogloss:1.07004	eval-mlogloss:1.18871
[29]	train-mlogloss:1.06352	eval-mlogloss:1.18575
[30]	train-mlogloss:1.05717	eval-mlogloss:1.18327
[31]	train-mlogloss:1.05115	eval-mlogloss:1.18076
[32]	train-mlogloss:1.04554	eval-mlogloss:1.17772



Parameters: { "early_stopping_rounds", "verbose" } are not used.




[10]	train-mlogloss:1.22753	eval-mlogloss:1.27387
[11]	train-mlogloss:1.21586	eval-mlogloss:1.26633
[12]	train-mlogloss:1.20540	eval-mlogloss:1.25956
[13]	train-mlogloss:1.19487	eval-mlogloss:1.25306
[14]	train-mlogloss:1.18475	eval-mlogloss:1.24706
[15]	train-mlogloss:1.17457	eval-mlogloss:1.24046
[16]	train-mlogloss:1.16528	eval-mlogloss:1.23543
[17]	train-mlogloss:1.15601	eval-mlogloss:1.22986
[18]	train-mlogloss:1.14735	eval-mlogloss:1.22514
[19]	train-mlogloss:1.13876	eval-mlogloss:1.22060
[20]	train-mlogloss:1.13029	eval-mlogloss:1.21648
[21]	train-mlogloss:1.12147	eval-mlogloss:1.21239
[22]	train-mlogloss:1.11387	eval-mlogloss:1.20862
[23]	train-mlogloss:1.10606	eval-mlogloss:1.20487
[24]	train-mlogloss:1.09852	eval-mlogloss:1.20127
[25]	train-mlogloss:1.09119	eval-mlogloss:1.19780
[26]	train-mlogloss:1.08425	eval-mlogloss:1.19465
[27]	train-mlogloss:1.07697	eval-mlogloss:1.19138
[28]	train-mlogloss:1.07004	eval-mlogloss:1.18871
[29]	train-mlogloss:1.06352	eval-mlogloss:1.18575



Parameters: { "early_stopping_rounds", "verbose" } are not used.




[14]	train-mlogloss:1.18475	eval-mlogloss:1.24706
[15]	train-mlogloss:1.17457	eval-mlogloss:1.24046
[16]	train-mlogloss:1.16528	eval-mlogloss:1.23543
[17]	train-mlogloss:1.15601	eval-mlogloss:1.22986
[18]	train-mlogloss:1.14735	eval-mlogloss:1.22514
[19]	train-mlogloss:1.13876	eval-mlogloss:1.22060
[20]	train-mlogloss:1.13029	eval-mlogloss:1.21648
[21]	train-mlogloss:1.12147	eval-mlogloss:1.21239
[22]	train-mlogloss:1.11387	eval-mlogloss:1.20862
[23]	train-mlogloss:1.10606	eval-mlogloss:1.20487
[24]	train-mlogloss:1.09852	eval-mlogloss:1.20127
[25]	train-mlogloss:1.09119	eval-mlogloss:1.19780
[26]	train-mlogloss:1.08425	eval-mlogloss:1.19465
[27]	train-mlogloss:1.07697	eval-mlogloss:1.19138
[28]	train-mlogloss:1.07004	eval-mlogloss:1.18871
[29]	train-mlogloss:1.06352	eval-mlogloss:1.18575
[30]	train-mlogloss:1.05717	eval-mlogloss:1.18327
[31]	train-mlogloss:1.05115	eval-mlogloss:1.18076
[32]	train-mlogloss:1.04554	eval-mlogloss:1.17772
[33]	train-mlogloss:1.03927	eval-mlogloss:1.17532



Parameters: { "early_stopping_rounds", "verbose" } are not used.




[11]	train-mlogloss:1.21586	eval-mlogloss:1.26633
[12]	train-mlogloss:1.20540	eval-mlogloss:1.25956
[13]	train-mlogloss:1.19487	eval-mlogloss:1.25306
[14]	train-mlogloss:1.18475	eval-mlogloss:1.24706
[15]	train-mlogloss:1.17457	eval-mlogloss:1.24046
[16]	train-mlogloss:1.16528	eval-mlogloss:1.23543
[17]	train-mlogloss:1.15601	eval-mlogloss:1.22986
[18]	train-mlogloss:1.14735	eval-mlogloss:1.22514
[19]	train-mlogloss:1.13876	eval-mlogloss:1.22060
[20]	train-mlogloss:1.13029	eval-mlogloss:1.21648
[21]	train-mlogloss:1.12147	eval-mlogloss:1.21239
[22]	train-mlogloss:1.11387	eval-mlogloss:1.20862
[23]	train-mlogloss:1.10606	eval-mlogloss:1.20487
[24]	train-mlogloss:1.09852	eval-mlogloss:1.20127
[25]	train-mlogloss:1.09119	eval-mlogloss:1.19780
[26]	train-mlogloss:1.08425	eval-mlogloss:1.19465
[27]	train-mlogloss:1.07697	eval-mlogloss:1.19138
[28]	train-mlogloss:1.07004	eval-mlogloss:1.18871
[29]	train-mlogloss:1.06352	eval-mlogloss:1.18575
[30]	train-mlogloss:1.05717	eval-mlogloss:1.18327



Parameters: { "early_stopping_rounds", "verbose" } are not used.




[12]	train-mlogloss:1.20540	eval-mlogloss:1.25956
[13]	train-mlogloss:1.19487	eval-mlogloss:1.25306
[14]	train-mlogloss:1.18475	eval-mlogloss:1.24706
[15]	train-mlogloss:1.17457	eval-mlogloss:1.24046
[16]	train-mlogloss:1.16528	eval-mlogloss:1.23543
[17]	train-mlogloss:1.15601	eval-mlogloss:1.22986
[18]	train-mlogloss:1.14735	eval-mlogloss:1.22514
[19]	train-mlogloss:1.13876	eval-mlogloss:1.22060
[20]	train-mlogloss:1.13029	eval-mlogloss:1.21648
[21]	train-mlogloss:1.12147	eval-mlogloss:1.21239
[22]	train-mlogloss:1.11387	eval-mlogloss:1.20862
[23]	train-mlogloss:1.10606	eval-mlogloss:1.20487
[24]	train-mlogloss:1.09852	eval-mlogloss:1.20127
[25]	train-mlogloss:1.09119	eval-mlogloss:1.19780
[26]	train-mlogloss:1.08425	eval-mlogloss:1.19465
[27]	train-mlogloss:1.07697	eval-mlogloss:1.19138
[28]	train-mlogloss:1.07004	eval-mlogloss:1.18871
[29]	train-mlogloss:1.06352	eval-mlogloss:1.18575
[30]	train-mlogloss:1.05717	eval-mlogloss:1.18327
[31]	train-mlogloss:1.05115	eval-mlogloss:1.18076



Parameters: { "early_stopping_rounds", "verbose" } are not used.




[5]	train-mlogloss:1.29096	eval-mlogloss:1.31506
[6]	train-mlogloss:1.27784	eval-mlogloss:1.30595
[7]	train-mlogloss:1.26435	eval-mlogloss:1.29753
[8]	train-mlogloss:1.25164	eval-mlogloss:1.28949
[9]	train-mlogloss:1.23899	eval-mlogloss:1.28150
[10]	train-mlogloss:1.22753	eval-mlogloss:1.27387
[11]	train-mlogloss:1.21586	eval-mlogloss:1.26633
[12]	train-mlogloss:1.20540	eval-mlogloss:1.25956
[13]	train-mlogloss:1.19487	eval-mlogloss:1.25306
[14]	train-mlogloss:1.18475	eval-mlogloss:1.24706
[15]	train-mlogloss:1.17457	eval-mlogloss:1.24046
[16]	train-mlogloss:1.16528	eval-mlogloss:1.23543
[17]	train-mlogloss:1.15601	eval-mlogloss:1.22986
[18]	train-mlogloss:1.14735	eval-mlogloss:1.22514
[19]	train-mlogloss:1.13876	eval-mlogloss:1.22060
[20]	train-mlogloss:1.13029	eval-mlogloss:1.21648
[21]	train-mlogloss:1.12147	eval-mlogloss:1.21239
[22]	train-mlogloss:1.11387	eval-mlogloss:1.20862
[23]	train-mlogloss:1.10606	eval-mlogloss:1.20487
[24]	train-mlogloss:1.09852	eval-mlogloss:1.20127
[25]	

In [123]:
features = pd.read_csv('add_new_features.csv')
features['market_pressure']

0        0.904774
1        1.655721
2        1.027512
3        0.874477
4        0.966796
           ...   
11547         NaN
11548         NaN
11549         NaN
11550         NaN
11551         NaN
Name: market_pressure, Length: 11552, dtype: float64

In [124]:
concat_df = pd.concat([df, features['market_pressure']], axis=1)
concat_df.columns

Index(['ID', '_type', 'target', 'scaled_log_hashrate',
       'scaled_log_open_interest', 'scaled_log_coinbase_premium_index',
       'scaled_funding_rates', 'scaled_estimated_block_reward',
       'scaled_liquidation_diff', 'scaled_log_total_liquidation',
       'scaled_log_total_taker_volume', 'scaled_utxo_count',
       'scaled_total_transactions_count', 'taker_buy_sell_ratio',
       'moving_avg_scaled_log_total_volume', 'open_interest_diff',
       'market_pressure'],
      dtype='object')

In [120]:
# open_interest_diff drop
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import label_binarize

# TimeSeriesSplit 객체 생성
tscv = TimeSeriesSplit(n_splits=6)  # 5개로 데이터를 나눠 교차 검증

# 모델 성능을 저장할 리스트
val_accuracies = []
f1_macros = []
f1_weighteds = []
roc_aucs = []

# _type에 따라 train, test 분리
train_df = concat_df.loc[concat_df["_type"] == "train"].drop(columns=["_type"])
train_df = train_df.ffill()
test_df = concat_df.loc[concat_df["_type"] == "test"].drop(columns=["_type"])

# train_test_split 으로 valid set, train set 분리
x_train, x_valid, y_train, y_valid = train_test_split(
    train_df.drop(["target", "ID"], axis=1),
    train_df["target"].astype(int),
    test_size=0.2,
    random_state=42,
)

# TimeSeriesSplit을 이용한 교차 검증
for train_index, val_index in tscv.split(x_train):
    X_train_split, X_val_split = x_train.iloc[train_index], x_train.iloc[val_index]
    y_train_split, y_val_split = y_train.iloc[train_index], y_train.iloc[val_index]


    sampling_strategy = {0: 1000, 1: 3544, 2: 3671, 3: 1500}
    smote = SMOTE(sampling_strategy=sampling_strategy,random_state=42)
    x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

    # XGBoost 모델을 위한 DMatrix로 변환
    dtrain = xgb.DMatrix(x_train_resampled, label=y_train_resampled)
    dvalid = xgb.DMatrix(x_valid, label=y_valid)

    # XGBoost 모델 학습 파라미터
    params = config["xgboost"]
    params["objective"] = "multi:softprob"  # 다중 클래스 분류를 위한 설정
    params["num_class"] = len(y_train_resampled.unique())  # 클래스 개수
    params["eval_metric"] = "mlogloss"  # 손실 함수 (멀티클래스 로지스틱 손실)
    params['max_depth'] = 7
    params['learning_rate'] = 0.05
    params['subsample'] = 0.8
    params['colsample_bytree'] = 0.8

    # 학습 시 평가 데이터로 검증 셋 사용
    evals = [(dtrain, "train"), (dvalid, "eval")]

    # XGBoost 모델 훈련
    xgb_model = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=200,  # 트리의 개수
        evals=evals,
        early_stopping_rounds=10,  # 조기 중단
    )

    # 예측 (훈련 데이터셋)
    y_train_pred_prob = xgb_model.predict(dtrain)
    y_train_pred = y_train_pred_prob.argmax(axis=1)

    # 예측 (검증 데이터셋)
    y_valid_pred_prob = xgb_model.predict(dvalid)
    y_valid_pred = y_valid_pred_prob.argmax(axis=1)

    # 성능 평가 (훈련 데이터셋)
    train_accuracy = accuracy_score(y_train_resampled, y_train_pred)
    train_classification_rep = classification_report(y_train_resampled, y_train_pred)

    # 성능 평가 (검증 데이터셋)
    valid_accuracy = accuracy_score(y_valid, y_valid_pred)
    val_accuracies.append(valid_accuracy)
    valid_classification_rep = classification_report(y_valid, y_valid_pred)

    # F1 Score 계산 (Macro와 Weighted)
    f1_macro = f1_score(y_valid, y_valid_pred, average='macro')
    f1_weighted = f1_score(y_valid, y_valid_pred, average='weighted')
    f1_macros.append(f1_macro)
    f1_weighteds.append(f1_weighted)

    # ROC-AUC 계산
    y_val_binarized = label_binarize(y_valid, classes=[0, 1, 2, 3])
    roc_auc = roc_auc_score(y_val_binarized, y_valid_pred_prob, average='macro', multi_class='ovr')
    roc_aucs.append(roc_auc)

'''
print(f"훈련 정확도: {train_accuracy}")
print(train_classification_rep)
print(f"검증 정확도: {valid_accuracy}")
print(valid_classification_rep)
'''

# 각 fold에 대한 평균 성능 출력
print(f"Average Validation Accuracy: {np.mean(val_accuracies):.4f}")
print(f"Average F1 Score (Macro): {np.mean(f1_macros):.4f}")
print(f"Average F1 Score (Weighted): {np.mean(f1_weighteds):.4f}")
print(f"Average ROC-AUC (Macro): {np.mean(roc_aucs):.4f}")

[0]	train-mlogloss:1.36846	eval-mlogloss:1.37249
[1]	train-mlogloss:1.35196	eval-mlogloss:1.35991
[2]	train-mlogloss:1.33538	eval-mlogloss:1.34820
[3]	train-mlogloss:1.31976	eval-mlogloss:1.33691
[4]	train-mlogloss:1.30475	eval-mlogloss:1.32594
[5]	train-mlogloss:1.29038	eval-mlogloss:1.31575
[6]	train-mlogloss:1.27659	eval-mlogloss:1.30684



Parameters: { "early_stopping_rounds", "verbose" } are not used.




[7]	train-mlogloss:1.26318	eval-mlogloss:1.29792
[8]	train-mlogloss:1.25050	eval-mlogloss:1.28979
[9]	train-mlogloss:1.23863	eval-mlogloss:1.28161
[10]	train-mlogloss:1.22697	eval-mlogloss:1.27419
[11]	train-mlogloss:1.21569	eval-mlogloss:1.26718
[12]	train-mlogloss:1.20488	eval-mlogloss:1.26033
[13]	train-mlogloss:1.19406	eval-mlogloss:1.25394
[14]	train-mlogloss:1.18395	eval-mlogloss:1.24820
[15]	train-mlogloss:1.17411	eval-mlogloss:1.24236
[16]	train-mlogloss:1.16479	eval-mlogloss:1.23710
[17]	train-mlogloss:1.15581	eval-mlogloss:1.23143
[18]	train-mlogloss:1.14724	eval-mlogloss:1.22662
[19]	train-mlogloss:1.13832	eval-mlogloss:1.22186
[20]	train-mlogloss:1.12986	eval-mlogloss:1.21757
[21]	train-mlogloss:1.12191	eval-mlogloss:1.21361
[22]	train-mlogloss:1.11467	eval-mlogloss:1.20902
[23]	train-mlogloss:1.10691	eval-mlogloss:1.20510
[24]	train-mlogloss:1.09917	eval-mlogloss:1.20158
[25]	train-mlogloss:1.09165	eval-mlogloss:1.19773
[26]	train-mlogloss:1.08443	eval-mlogloss:1.19429
[27


Parameters: { "early_stopping_rounds", "verbose" } are not used.




[5]	train-mlogloss:1.29038	eval-mlogloss:1.31575
[6]	train-mlogloss:1.27659	eval-mlogloss:1.30684
[7]	train-mlogloss:1.26318	eval-mlogloss:1.29792
[8]	train-mlogloss:1.25050	eval-mlogloss:1.28979
[9]	train-mlogloss:1.23863	eval-mlogloss:1.28161
[10]	train-mlogloss:1.22697	eval-mlogloss:1.27419
[11]	train-mlogloss:1.21569	eval-mlogloss:1.26718
[12]	train-mlogloss:1.20488	eval-mlogloss:1.26033
[13]	train-mlogloss:1.19406	eval-mlogloss:1.25394
[14]	train-mlogloss:1.18395	eval-mlogloss:1.24820
[15]	train-mlogloss:1.17411	eval-mlogloss:1.24236
[16]	train-mlogloss:1.16479	eval-mlogloss:1.23710
[17]	train-mlogloss:1.15581	eval-mlogloss:1.23143
[18]	train-mlogloss:1.14724	eval-mlogloss:1.22662
[19]	train-mlogloss:1.13832	eval-mlogloss:1.22186
[20]	train-mlogloss:1.12986	eval-mlogloss:1.21757
[21]	train-mlogloss:1.12191	eval-mlogloss:1.21361
[22]	train-mlogloss:1.11467	eval-mlogloss:1.20902
[23]	train-mlogloss:1.10691	eval-mlogloss:1.20510
[24]	train-mlogloss:1.09917	eval-mlogloss:1.20158
[25]	


Parameters: { "early_stopping_rounds", "verbose" } are not used.




[5]	train-mlogloss:1.29038	eval-mlogloss:1.31575
[6]	train-mlogloss:1.27659	eval-mlogloss:1.30684
[7]	train-mlogloss:1.26318	eval-mlogloss:1.29792
[8]	train-mlogloss:1.25050	eval-mlogloss:1.28979
[9]	train-mlogloss:1.23863	eval-mlogloss:1.28161
[10]	train-mlogloss:1.22697	eval-mlogloss:1.27419
[11]	train-mlogloss:1.21569	eval-mlogloss:1.26718
[12]	train-mlogloss:1.20488	eval-mlogloss:1.26033
[13]	train-mlogloss:1.19406	eval-mlogloss:1.25394
[14]	train-mlogloss:1.18395	eval-mlogloss:1.24820
[15]	train-mlogloss:1.17411	eval-mlogloss:1.24236
[16]	train-mlogloss:1.16479	eval-mlogloss:1.23710
[17]	train-mlogloss:1.15581	eval-mlogloss:1.23143
[18]	train-mlogloss:1.14724	eval-mlogloss:1.22662
[19]	train-mlogloss:1.13832	eval-mlogloss:1.22186
[20]	train-mlogloss:1.12986	eval-mlogloss:1.21757
[21]	train-mlogloss:1.12191	eval-mlogloss:1.21361
[22]	train-mlogloss:1.11467	eval-mlogloss:1.20902
[23]	train-mlogloss:1.10691	eval-mlogloss:1.20510
[24]	train-mlogloss:1.09917	eval-mlogloss:1.20158
[25]	


Parameters: { "early_stopping_rounds", "verbose" } are not used.




[10]	train-mlogloss:1.22697	eval-mlogloss:1.27419
[11]	train-mlogloss:1.21569	eval-mlogloss:1.26718
[12]	train-mlogloss:1.20488	eval-mlogloss:1.26033
[13]	train-mlogloss:1.19406	eval-mlogloss:1.25394
[14]	train-mlogloss:1.18395	eval-mlogloss:1.24820
[15]	train-mlogloss:1.17411	eval-mlogloss:1.24236
[16]	train-mlogloss:1.16479	eval-mlogloss:1.23710
[17]	train-mlogloss:1.15581	eval-mlogloss:1.23143
[18]	train-mlogloss:1.14724	eval-mlogloss:1.22662
[19]	train-mlogloss:1.13832	eval-mlogloss:1.22186
[20]	train-mlogloss:1.12986	eval-mlogloss:1.21757
[21]	train-mlogloss:1.12191	eval-mlogloss:1.21361
[22]	train-mlogloss:1.11467	eval-mlogloss:1.20902
[23]	train-mlogloss:1.10691	eval-mlogloss:1.20510
[24]	train-mlogloss:1.09917	eval-mlogloss:1.20158
[25]	train-mlogloss:1.09165	eval-mlogloss:1.19773
[26]	train-mlogloss:1.08443	eval-mlogloss:1.19429
[27]	train-mlogloss:1.07670	eval-mlogloss:1.19083
[28]	train-mlogloss:1.06993	eval-mlogloss:1.18811
[29]	train-mlogloss:1.06352	eval-mlogloss:1.18502



Parameters: { "early_stopping_rounds", "verbose" } are not used.




[13]	train-mlogloss:1.19406	eval-mlogloss:1.25394
[14]	train-mlogloss:1.18395	eval-mlogloss:1.24820
[15]	train-mlogloss:1.17411	eval-mlogloss:1.24236
[16]	train-mlogloss:1.16479	eval-mlogloss:1.23710
[17]	train-mlogloss:1.15581	eval-mlogloss:1.23143
[18]	train-mlogloss:1.14724	eval-mlogloss:1.22662
[19]	train-mlogloss:1.13832	eval-mlogloss:1.22186
[20]	train-mlogloss:1.12986	eval-mlogloss:1.21757
[21]	train-mlogloss:1.12191	eval-mlogloss:1.21361
[22]	train-mlogloss:1.11467	eval-mlogloss:1.20902
[23]	train-mlogloss:1.10691	eval-mlogloss:1.20510
[24]	train-mlogloss:1.09917	eval-mlogloss:1.20158
[25]	train-mlogloss:1.09165	eval-mlogloss:1.19773
[26]	train-mlogloss:1.08443	eval-mlogloss:1.19429
[27]	train-mlogloss:1.07670	eval-mlogloss:1.19083
[28]	train-mlogloss:1.06993	eval-mlogloss:1.18811
[29]	train-mlogloss:1.06352	eval-mlogloss:1.18502
[30]	train-mlogloss:1.05835	eval-mlogloss:1.18271
[31]	train-mlogloss:1.05230	eval-mlogloss:1.18003
[32]	train-mlogloss:1.04671	eval-mlogloss:1.17775



Parameters: { "early_stopping_rounds", "verbose" } are not used.




[12]	train-mlogloss:1.20488	eval-mlogloss:1.26033
[13]	train-mlogloss:1.19406	eval-mlogloss:1.25394
[14]	train-mlogloss:1.18395	eval-mlogloss:1.24820
[15]	train-mlogloss:1.17411	eval-mlogloss:1.24236
[16]	train-mlogloss:1.16479	eval-mlogloss:1.23710
[17]	train-mlogloss:1.15581	eval-mlogloss:1.23143
[18]	train-mlogloss:1.14724	eval-mlogloss:1.22662
[19]	train-mlogloss:1.13832	eval-mlogloss:1.22186
[20]	train-mlogloss:1.12986	eval-mlogloss:1.21757
[21]	train-mlogloss:1.12191	eval-mlogloss:1.21361
[22]	train-mlogloss:1.11467	eval-mlogloss:1.20902
[23]	train-mlogloss:1.10691	eval-mlogloss:1.20510
[24]	train-mlogloss:1.09917	eval-mlogloss:1.20158
[25]	train-mlogloss:1.09165	eval-mlogloss:1.19773
[26]	train-mlogloss:1.08443	eval-mlogloss:1.19429
[27]	train-mlogloss:1.07670	eval-mlogloss:1.19083
[28]	train-mlogloss:1.06993	eval-mlogloss:1.18811
[29]	train-mlogloss:1.06352	eval-mlogloss:1.18502
[30]	train-mlogloss:1.05835	eval-mlogloss:1.18271
[31]	train-mlogloss:1.05230	eval-mlogloss:1.18003


In [121]:
# 테스트 데이터셋을 DMatrix로 변환
dtest = xgb.DMatrix(test_df.drop(["target", "ID"], axis=1))

# 테스트 데이터에 대한 예측 수행
y_test_pred_prob = xgb_model.predict(dtest)

# 다중 클래스 예측일 경우 가장 높은 확률을 가진 클래스를 예측값으로 변환
y_test_pred = y_test_pred_prob.argmax(axis=1)

# 예측 결과 출력
y_test_pred

submission_df = submission_df.assign(target=pd.DataFrame(y_test_pred))
submission_df['target'].value_counts()

target
1    1209
2    1203
3     283
0      97
Name: count, dtype: int64

----------------------------------------------------------------------

In [84]:
df['scaled_hashrate_diff'] = df['scaled_log_hashrate'].diff()
bar_df = df.groupby("target").agg({
    "scaled_log_hashrate":"mean",
    "scaled_hashrate_diff": "mean",
}).reset_index()
bar_df

Unnamed: 0,target,scaled_log_hashrate,scaled_hashrate_diff
0,0.0,-0.027912,-0.042294
1,1.0,-0.005848,-0.007021
2,2.0,0.016258,0.016389
3,3.0,-0.022734,-0.005294


In [85]:
fig: go.Figure = make_subplots(
    rows=1,
    cols=2,
    shared_xaxes=True,
    subplot_titles=(
        "hashrate",
        "hashrate_diff",
    ),
)
fig.add_trace(go.Bar(x=bar_df["target"], y=bar_df["scaled_log_hashrate"]), row=1, col=1)
fig.add_trace(go.Bar(x=bar_df["target"], y=bar_df["scaled_hashrate_diff"]), row=1, col=2)
fig.update_xaxes(title_text="Target")
fig.update_layout(title_text="Target statistics", showlegend=False)
fig.show()

In [11]:
hashrate_diff = pd.read_csv('hashrate_diff.csv')
hashrate_diff

Unnamed: 0,hashrate_diff
0,
1,-3.375278e+11
2,1.687639e+11
3,-1.265729e+11
4,8.438195e+10
...,...
11547,-1.051124e+11
11548,3.153372e+11
11549,0.000000e+00
11550,-2.102248e+11


In [12]:
concat_df = pd.concat([df, hashrate_diff], axis=1)
concat_df

Unnamed: 0,ID,_type,target,scaled_log_hashrate,scaled_log_open_interest,scaled_log_coinbase_premium_index,scaled_funding_rates,scaled_estimated_block_reward,scaled_liquidation_diff,scaled_log_total_liquidation,scaled_log_total_taker_volume,scaled_utxo_count,scaled_total_transactions_count,taker_buy_sell_ratio,moving_avg_scaled_log_total_volume,open_interest_diff,hashrate_diff
0,2023-01-01 00:00:00,train,2.0,0.343899,-2.074545,-1.004418,-0.190599,-2.283932,-0.013822,-1.407440,-1.507090,-1.216481,-0.616412,0.904774,-1.507090,,
1,2023-01-01 01:00:00,train,1.0,-0.479569,-2.055443,-0.920481,-0.190611,-2.271191,-0.020702,-1.089259,-1.998687,-1.216149,-1.202730,1.655721,-1.752889,1.733962e+07,-3.375278e+11
2,2023-01-01 02:00:00,train,1.0,0.039981,-2.057519,-0.982775,-0.190679,-2.285286,-0.013936,-1.414659,-2.337608,-1.216188,-1.232124,1.027512,-1.947795,-1.886794e+06,1.687639e+11
3,2023-01-01 03:00:00,train,1.0,-0.312311,-2.059965,-0.995009,-0.188985,-2.286295,-0.008301,-1.132860,-2.043796,-1.215593,-1.263915,0.874477,-1.971795,-2.221901e+06,-1.265729e+11
4,2023-01-01 04:00:00,train,2.0,-0.060107,-2.052255,-1.026415,-0.083030,-2.282372,-0.010506,-1.228120,-2.015961,-1.214962,-1.087030,0.966796,-1.980628,7.007341e+06,8.438195e+10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11547,2024-04-26 03:00:00,test,,-0.369730,0.534373,-0.235206,,-1.782316,-0.032620,-1.383570,,2.350929,-0.872964,,,-2.193716e+07,-1.051124e+11
11548,2024-04-26 04:00:00,test,,0.191102,,-0.472610,,-1.918750,0.052758,-0.438363,,2.352974,,,,,3.153372e+11
11549,2024-04-26 05:00:00,test,,0.191102,,-0.157720,,-1.981323,-0.088241,-0.412978,,2.355665,,,,,0.000000e+00
11550,2024-04-26 06:00:00,test,,-0.136964,,-0.247183,,-2.150662,-0.051507,-0.990694,,2.355852,,,,,-2.102248e+11


In [86]:
# open_interest_diff drop
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# _type에 따라 train, test 분리
train_df = df.loc[df["_type"] == "train"].drop(columns=["_type"])
train_df = train_df.ffill()
test_df = df.loc[df["_type"] == "test"].drop(columns=["_type"])

# train_test_split 으로 valid set, train set 분리
x_train, x_valid, y_train, y_valid = train_test_split(
    train_df.drop(["target", "ID"], axis=1),
    train_df["target"].astype(int),
    test_size=0.2,
    random_state=42,
)

sampling_strategy = {0: 1000, 1: 3544, 2: 3671, 3: 1500}
smote = SMOTE(sampling_strategy=sampling_strategy,random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

# XGBoost 모델을 위한 DMatrix로 변환
dtrain = xgb.DMatrix(x_train_resampled, label=y_train_resampled)
dvalid = xgb.DMatrix(x_valid, label=y_valid)

# XGBoost 모델 학습 파라미터
params = config["xgboost"]
params["objective"] = "multi:softprob"  # 다중 클래스 분류를 위한 설정
params["num_class"] = len(y_train_resampled.unique())  # 클래스 개수
params["eval_metric"] = "mlogloss"  # 손실 함수 (멀티클래스 로지스틱 손실)
params['max_depth'] = 7
params['learning_rate'] = 0.05
params['subsample'] = 0.8
params['colsample_bytree'] = 0.8

# 학습 시 평가 데이터로 검증 셋 사용
evals = [(dtrain, "train"), (dvalid, "eval")]

# XGBoost 모델 훈련
xgb_model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=200,  # 트리의 개수
    evals=evals,
    early_stopping_rounds=10,  # 조기 중단
)

# 예측 (훈련 데이터셋)
y_train_pred_prob = xgb_model.predict(dtrain)
y_train_pred = y_train_pred_prob.argmax(axis=1)

# 예측 (검증 데이터셋)
y_valid_pred_prob = xgb_model.predict(dvalid)
y_valid_pred = y_valid_pred_prob.argmax(axis=1)

# 성능 평가 (훈련 데이터셋)
train_accuracy = accuracy_score(y_train_resampled, y_train_pred)
train_classification_rep = classification_report(y_train_resampled, y_train_pred)

# 성능 평가 (검증 데이터셋)
valid_accuracy = accuracy_score(y_valid, y_valid_pred)
valid_classification_rep = classification_report(y_valid, y_valid_pred)

print(f"훈련 정확도: {train_accuracy}")
print(train_classification_rep)
print(f"검증 정확도: {valid_accuracy}")
print(valid_classification_rep)

[0]	train-mlogloss:1.36845	eval-mlogloss:1.37269
[1]	train-mlogloss:1.35183	eval-mlogloss:1.36028
[2]	train-mlogloss:1.33538	eval-mlogloss:1.34855
[3]	train-mlogloss:1.32004	eval-mlogloss:1.33733
[4]	train-mlogloss:1.30488	eval-mlogloss:1.32656
[5]	train-mlogloss:1.29040	eval-mlogloss:1.31703
[6]	train-mlogloss:1.27649	eval-mlogloss:1.30776
[7]	train-mlogloss:1.26313	eval-mlogloss:1.29911



Parameters: { "early_stopping_rounds", "verbose" } are not used.




[8]	train-mlogloss:1.25032	eval-mlogloss:1.29096
[9]	train-mlogloss:1.23832	eval-mlogloss:1.28275
[10]	train-mlogloss:1.22640	eval-mlogloss:1.27540
[11]	train-mlogloss:1.21501	eval-mlogloss:1.26835
[12]	train-mlogloss:1.20414	eval-mlogloss:1.26139
[13]	train-mlogloss:1.19322	eval-mlogloss:1.25490
[14]	train-mlogloss:1.18290	eval-mlogloss:1.24894
[15]	train-mlogloss:1.17302	eval-mlogloss:1.24293
[16]	train-mlogloss:1.16366	eval-mlogloss:1.23778
[17]	train-mlogloss:1.15429	eval-mlogloss:1.23245
[18]	train-mlogloss:1.14523	eval-mlogloss:1.22769
[19]	train-mlogloss:1.13612	eval-mlogloss:1.22290
[20]	train-mlogloss:1.12757	eval-mlogloss:1.21823
[21]	train-mlogloss:1.11942	eval-mlogloss:1.21438
[22]	train-mlogloss:1.11217	eval-mlogloss:1.20995
[23]	train-mlogloss:1.10450	eval-mlogloss:1.20616
[24]	train-mlogloss:1.09693	eval-mlogloss:1.20275
[25]	train-mlogloss:1.08938	eval-mlogloss:1.19893
[26]	train-mlogloss:1.08205	eval-mlogloss:1.19563
[27]	train-mlogloss:1.07418	eval-mlogloss:1.19243
[2

In [87]:
# 테스트 데이터셋을 DMatrix로 변환
dtest = xgb.DMatrix(test_df.drop(["target", "ID"], axis=1))

# 테스트 데이터에 대한 예측 수행
y_test_pred_prob = xgb_model.predict(dtest)

# 다중 클래스 예측일 경우 가장 높은 확률을 가진 클래스를 예측값으로 변환
y_test_pred = y_test_pred_prob.argmax(axis=1)

# 예측 결과 출력
y_test_pred

array([1, 1, 2, ..., 1, 1, 3], dtype=int64)

In [88]:
submission_df = submission_df.assign(target=pd.DataFrame(y_test_pred))
submission_df['target'].value_counts()

target
2    1213
1    1209
3     270
0     100
Name: count, dtype: int64

In [91]:
submission_df.to_csv("XGBoost_scaled_smote_add_scaled_hash.csv",index=False)

In [89]:
import plotly.express as px

# Feature importance 평가
importance = xgb_model.get_score(importance_type='weight')
importance_df = pd.DataFrame({
    'Feature': [k for k in importance.keys()],
    'Importance': importance.values()
}).sort_values(by='Importance', ascending=False)

# 시각화
fig = px.bar(importance_df, x='Feature', y='Importance', title='Feature Importance')
fig.show()

### bsr, tv, fr diff 추가

In [27]:
diff_df = pd.read_csv('eda_df2.csv')
final_df = pd.concat([df, diff_df], axis=1)
final_df = final_df.drop(columns=[
    #'open_interest_diff', #0.414
    #'hashrate_diff', #0.421
    #'bsr_diff', #0.417
    #'bsr_ma_diff', #0.416
    'tv_diff', #0.414
    #'tv_ma_diff', #0.433
    'fr_diff', #0.416 0.434
    #'fr_ma_diff' #0.417
    ])

In [28]:
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# _type에 따라 train, test 분리
train_df = final_df.loc[final_df["_type"] == "train"].drop(columns=["_type"])
train_df = train_df.ffill()
test_df = final_df.loc[final_df["_type"] == "test"].drop(columns=["_type"])

# train_test_split 으로 valid set, train set 분리
x_train, x_valid, y_train, y_valid = train_test_split(
    train_df.drop(["target", "ID"], axis=1),
    train_df["target"].astype(int),
    test_size=0.2,
    random_state=42,
)

# SMOTE를 이용한 오버샘플링
sampling_strategy = {0: 1000, 1: 3544, 2: 3671, 3: 1500}
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

# XGBoost 모델을 위한 DMatrix로 변환
dtrain = xgb.DMatrix(x_train_resampled, label=y_train_resampled)
dvalid = xgb.DMatrix(x_valid, label=y_valid)

# XGBoost 모델 학습 파라미터
params = {
    "objective": "multi:softprob",  # 다중 클래스 분류
    "num_class": len(y_train_resampled.unique()),  # 클래스 개수
    "eval_metric": "mlogloss",  # 손실 함수 (멀티클래스 로지스틱 손실)
    "max_depth": 7,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "seed": 42
}

# 교차 검증 설정 (5-fold)
cv_results = xgb.cv(
    params=params,
    dtrain=dtrain,
    num_boost_round=200,
    nfold=5,
    early_stopping_rounds=10,
    metrics="mlogloss",
    as_pandas=True,
    seed=42
)

# 최적의 부스팅 라운드 수 확인
best_num_boost_rounds = len(cv_results)

# 교차 검증 결과 출력
print("최적 부스팅 라운드 수:", best_num_boost_rounds)
print(cv_results.tail())

# 최적의 부스팅 라운드로 모델 학습
xgb_model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=best_num_boost_rounds,
    evals=[(dtrain, "train"), (dvalid, "eval")],
    early_stopping_rounds=10
)

# 예측 (훈련 데이터셋)
y_train_pred_prob = xgb_model.predict(dtrain)
y_train_pred = y_train_pred_prob.argmax(axis=1)

# 예측 (검증 데이터셋)
y_valid_pred_prob = xgb_model.predict(dvalid)
y_valid_pred = y_valid_pred_prob.argmax(axis=1)

# 성능 평가 (훈련 데이터셋)
train_accuracy = accuracy_score(y_train_resampled, y_train_pred)
train_classification_rep = classification_report(y_train_resampled, y_train_pred)

# 성능 평가 (검증 데이터셋)
valid_accuracy = accuracy_score(y_valid, y_valid_pred)
valid_classification_rep = classification_report(y_valid, y_valid_pred)

# 결과 출력
print(f"훈련 정확도: {train_accuracy}")
print(train_classification_rep)
print(f"검증 정확도: {valid_accuracy}")
print(valid_classification_rep)


최적 부스팅 라운드 수: 200
     train-mlogloss-mean  train-mlogloss-std  test-mlogloss-mean  \
195             0.528163            0.004034            1.054295   
196             0.526586            0.004098            1.053986   
197             0.524939            0.003937            1.053719   
198             0.523536            0.004245            1.053444   
199             0.521795            0.004440            1.053108   

     test-mlogloss-std  
195           0.013811  
196           0.013820  
197           0.013998  
198           0.014096  
199           0.014118  
[0]	train-mlogloss:1.36646	eval-mlogloss:1.37245
[1]	train-mlogloss:1.34892	eval-mlogloss:1.35977
[2]	train-mlogloss:1.33179	eval-mlogloss:1.34813
[3]	train-mlogloss:1.31528	eval-mlogloss:1.33624
[4]	train-mlogloss:1.29937	eval-mlogloss:1.32572
[5]	train-mlogloss:1.28426	eval-mlogloss:1.31496
[6]	train-mlogloss:1.27029	eval-mlogloss:1.30477
[7]	train-mlogloss:1.25678	eval-mlogloss:1.29589
[8]	train-mlogloss:1.24295	eval

In [29]:
import plotly.express as px

# Feature importance 평가
importance = xgb_model.get_score(importance_type='weight')
importance_df = pd.DataFrame({
    'Feature': [k for k in importance.keys()],
    'Importance': importance.values()
}).sort_values(by='Importance', ascending=False)

# 시각화
fig = px.bar(importance_df, x='Feature', y='Importance', title='Feature Importance')
fig.show()

In [30]:
# 테스트 데이터셋을 DMatrix로 변환
dtest = xgb.DMatrix(test_df.drop(["target", "ID"], axis=1))

# 테스트 데이터에 대한 예측 수행
y_test_pred_prob = xgb_model.predict(dtest)

# 다중 클래스 예측일 경우 가장 높은 확률을 가진 클래스를 예측값으로 변환
y_test_pred = y_test_pred_prob.argmax(axis=1)

# 예측 결과 출력
y_test_pred

array([1, 1, 2, ..., 1, 1, 1], dtype=int64)

In [31]:
submission_df = submission_df.assign(target=pd.DataFrame(y_test_pred))
submission_df['target'].value_counts()

target
2    1678
1     870
3     202
0      42
Name: count, dtype: int64

---

In [78]:
submission_df.to_csv("XGBoost_scaled_smote_add_diff.csv",index=False)

-------------------------------------------------------------------

In [168]:
add_diff = pd.read_csv("XGBoost_scaled_smote_add_diff.csv")
add_diff['target'].value_counts()

target
2    1213
1    1209
3     270
0     100
Name: count, dtype: int64

In [169]:
accuracy = accuracy_score(submission_df['target'], add_diff['target'])
accuracy

0.6643982808022922

In [33]:
open = pd.read_csv("XGBoost_after_eda_open_interest.csv")
open['target'].value_counts()

target
2    1163
1    1108
0     282
3     239
Name: count, dtype: int64

In [34]:
accuracy = accuracy_score(open['target'], submission_df['target'])
accuracy

0.4409025787965616

---

In [35]:
new_feauture = pd.read_csv('new_feature.csv')
new_df = pd.concat([df, new_feauture.drop(columns=['Unnamed: 0','ID'])], axis=1)
new_df = pd.concat([new_df, diff_df], axis=1)
new_df.columns

Index(['ID', '_type', 'target', 'scaled_log_hashrate',
       'scaled_log_open_interest', 'scaled_log_coinbase_premium_index',
       'scaled_funding_rates', 'scaled_estimated_block_reward',
       'scaled_liquidation_diff', 'scaled_log_total_liquidation',
       'scaled_log_total_taker_volume', 'scaled_utxo_count',
       'scaled_total_transactions_count', 'taker_buy_sell_ratio',
       'moving_avg_scaled_log_total_volume', 'open_interest_diff',
       'network_activity_ratio_diff', 'average_transaction_value_diff',
       'network_load_diff', 'fee_burden_diff', 'market_pressure_diff',
       'liquidation_risk_diff', 'bsr_diff', 'bsr_ma_diff', 'tv_diff',
       'tv_ma_diff', 'fr_diff', 'fr_ma_diff'],
      dtype='object')

In [43]:
new_df2 = new_df.drop(columns=[
    'network_activity_ratio_diff', # 0.437
    #'average_transaction_value_diff',
    #'network_load_diff',
    'fee_burden_diff',
    #'market_pressure_diff', #0.4355
    #'liquidation_risk_diff'
    #'open_interest_diff',
    'bsr_diff', #0.417
    'bsr_ma_diff', #0.416
    'tv_diff', #0.414
    'tv_ma_diff', #0.433
    'fr_diff', #0.416 0.434
    'fr_ma_diff' #0.417
    ],inplace=False)

In [145]:
new_df2 = new_df.drop(columns=[
    #'scaled_log_hashrate',
    #'scaled_log_open_interest', 
    #'scaled_log_coinbase_premium_index',
    #'scaled_funding_rates', 
    'scaled_estimated_block_reward',
    'scaled_liquidation_diff', 
    'scaled_log_total_liquidation',
    #'scaled_log_total_taker_volume', 
    #'scaled_utxo_count',
    #'scaled_total_transactions_count', 
    #'taker_buy_sell_ratio',
    #'moving_avg_scaled_log_total_volume', 
    #'open_interest_diff',
    #'network_activity_ratio_diff', 
    #'average_transaction_value_diff',
    #'network_load_diff', 
    #'fee_burden_diff', 
    #'market_pressure_diff',
    #'liquidation_risk_diff', 
    #'bsr_diff', 
    #'bsr_ma_diff', 
    #'tv_diff',
    #'tv_ma_diff', 
    #'fr_diff', 
    #'fr_ma_diff'
    ],inplace=False)

In [146]:
# 제준님이 주신 새로운 피쳐들 사용

import xgboost as xgb
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# _type에 따라 train, test 분리
train_df = new_df2.loc[new_df2["_type"] == "train"].drop(columns=["_type"])
train_df = train_df.ffill()
test_df = new_df2.loc[new_df2["_type"] == "test"].drop(columns=["_type"])

# train_test_split 으로 valid set, train set 분리
x_train, x_valid, y_train, y_valid = train_test_split(
    train_df.drop(["target", "ID"], axis=1),
    train_df["target"].astype(int),
    test_size=0.2,
    random_state=42,
)

# SMOTE를 이용한 오버샘플링
sampling_strategy = {0: 1000, 1: 3544, 2: 3671, 3: 1500}
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

# XGBoost 모델을 위한 DMatrix로 변환
dtrain = xgb.DMatrix(x_train_resampled, label=y_train_resampled)
dvalid = xgb.DMatrix(x_valid, label=y_valid)

# XGBoost 모델 학습 파라미터
params = {
    "objective": "multi:softprob",  # 다중 클래스 분류
    "num_class": len(y_train_resampled.unique()),  # 클래스 개수
    "eval_metric": "mlogloss",  # 손실 함수 (멀티클래스 로지스틱 손실)
    "max_depth": 7,
    "learning_rate": 0.01,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "seed": 42
}

# 교차 검증 설정 (5-fold)
cv_results = xgb.cv(
    params=params,
    dtrain=dtrain,
    num_boost_round=200,
    nfold=5,
    early_stopping_rounds=10,
    metrics="mlogloss",
    as_pandas=True,
    seed=42
)

# 최적의 부스팅 라운드 수 확인
best_num_boost_rounds = len(cv_results)

# 교차 검증 결과 출력
print("최적 부스팅 라운드 수:", best_num_boost_rounds)
print(cv_results.tail())

# 최적의 부스팅 라운드로 모델 학습
xgb_model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=best_num_boost_rounds,
    evals=[(dtrain, "train"), (dvalid, "eval")],
    early_stopping_rounds=10
)

# 예측 (훈련 데이터셋)
y_train_pred_prob = xgb_model.predict(dtrain)
y_train_pred = y_train_pred_prob.argmax(axis=1)

# 예측 (검증 데이터셋)
y_valid_pred_prob = xgb_model.predict(dvalid)
y_valid_pred = y_valid_pred_prob.argmax(axis=1)

# 성능 평가 (훈련 데이터셋)
train_accuracy = accuracy_score(y_train_resampled, y_train_pred)
train_classification_rep = classification_report(y_train_resampled, y_train_pred)

# 성능 평가 (검증 데이터셋)
valid_accuracy = accuracy_score(y_valid, y_valid_pred)
valid_classification_rep = classification_report(y_valid, y_valid_pred)

# 결과 출력
print(f"훈련 정확도: {train_accuracy}")
print(train_classification_rep)
print(f"검증 정확도: {valid_accuracy}")
print(valid_classification_rep)


최적 부스팅 라운드 수: 200
     train-mlogloss-mean  train-mlogloss-std  test-mlogloss-mean  \
195             0.960987            0.004034            1.155836   
196             0.959657            0.003976            1.155339   
197             0.958399            0.003994            1.154817   
198             0.957141            0.003969            1.154313   
199             0.955956            0.004001            1.153856   

     test-mlogloss-std  
195           0.008422  
196           0.008412  
197           0.008466  
198           0.008485  
199           0.008496  
[0]	train-mlogloss:1.38255	eval-mlogloss:1.38355
[1]	train-mlogloss:1.37887	eval-mlogloss:1.38088
[2]	train-mlogloss:1.37519	eval-mlogloss:1.37826
[3]	train-mlogloss:1.37159	eval-mlogloss:1.37570
[4]	train-mlogloss:1.36791	eval-mlogloss:1.37309
[5]	train-mlogloss:1.36433	eval-mlogloss:1.37043
[6]	train-mlogloss:1.36079	eval-mlogloss:1.36797
[7]	train-mlogloss:1.35724	eval-mlogloss:1.36550
[8]	train-mlogloss:1.35361	eval

In [147]:
import plotly.express as px

# Feature importance 평가
importance = xgb_model.get_score(importance_type='weight')
importance_df = pd.DataFrame({
    'Feature': [k for k in importance.keys()],
    'Importance': importance.values()
}).sort_values(by='Importance', ascending=False)

# 시각화
fig = px.bar(importance_df, x='Feature', y='Importance', title='Feature Importance')
fig.show()

In [148]:
# 테스트 데이터셋을 DMatrix로 변환
dtest = xgb.DMatrix(test_df.drop(["target", "ID"], axis=1))

# 테스트 데이터에 대한 예측 수행
y_test_pred_prob = xgb_model.predict(dtest)

# 다중 클래스 예측일 경우 가장 높은 확률을 가진 클래스를 예측값으로 변환
y_test_pred = y_test_pred_prob.argmax(axis=1)

# 예측 결과 출력
y_test_pred

submission_df = submission_df.assign(target=pd.DataFrame(y_test_pred))
submission_df['target'].value_counts()

target
2    1515
1    1022
3     225
0      30
Name: count, dtype: int64

In [150]:
new_df2 = new_df.drop(columns=[
    'network_activity_ratio_diff', # 0.437
    #'average_transaction_value_diff',
    #'network_load_diff',
    'fee_burden_diff',
    #'market_pressure_diff', #0.4355
    #'liquidation_risk_diff'
    #'open_interest_diff',
    'bsr_diff', #0.417
    'bsr_ma_diff', #0.416
    'tv_diff', #0.414
    'tv_ma_diff', #0.433
    'fr_diff', #0.416 0.434
    'fr_ma_diff' #0.417
    ],inplace=False)

In [158]:
new_df2.columns

Index(['ID', '_type', 'target', 'scaled_log_hashrate',
       'scaled_log_open_interest', 'scaled_log_coinbase_premium_index',
       'scaled_funding_rates', 'scaled_estimated_block_reward',
       'scaled_liquidation_diff', 'scaled_log_total_liquidation',
       'scaled_log_total_taker_volume', 'scaled_utxo_count',
       'scaled_total_transactions_count', 'taker_buy_sell_ratio',
       'moving_avg_scaled_log_total_volume', 'open_interest_diff',
       'average_transaction_value_diff', 'network_load_diff',
       'market_pressure_diff', 'liquidation_risk_diff'],
      dtype='object')

In [175]:
# 제준님이 주신 새로운 피쳐들 사용 (하이퍼 파라미터 적용)

import xgboost as xgb
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# _type에 따라 train, test 분리
train_df = new_df2.loc[new_df2["_type"] == "train"].drop(columns=["_type"])
train_df = train_df.ffill()
test_df = new_df2.loc[new_df2["_type"] == "test"].drop(columns=["_type"])

# train_test_split 으로 valid set, train set 분리
x_train, x_valid, y_train, y_valid = train_test_split(
    train_df.drop(["target", "ID"], axis=1),
    train_df["target"].astype(int),
    test_size=0.2,
    random_state=42,
)

# SMOTE를 이용한 오버샘플링
sampling_strategy = {0: 3671, 1: 3671, 2: 3671, 3: 3671}
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

# XGBoost 모델을 위한 DMatrix로 변환
dtrain = xgb.DMatrix(x_train_resampled, label=y_train_resampled)
dvalid = xgb.DMatrix(x_valid, label=y_valid)

# XGBoost 모델 학습 파라미터
params = {
    "objective": "multi:softprob",  # 다중 클래스 분류
    "num_class": len(y_train_resampled.unique()),  # 클래스 개수
    "eval_metric": "mlogloss",  # 손실 함수 (멀티클래스 로지스틱 손실)
    "max_depth": 7,
    "learning_rate": 0.01,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "seed": 42
}

# 교차 검증 설정 (5-fold)
cv_results = xgb.cv(
    params=params,
    dtrain=dtrain,
    num_boost_round=275,
    nfold=5,
    early_stopping_rounds=10,
    metrics="mlogloss",
    as_pandas=True,
    seed=42
)

# 최적의 부스팅 라운드 수 확인
best_num_boost_rounds = len(cv_results)

# 교차 검증 결과 출력
print("최적 부스팅 라운드 수:", best_num_boost_rounds)
print(cv_results.tail())

# 최적의 부스팅 라운드로 모델 학습
xgb_model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=best_num_boost_rounds,
    evals=[(dtrain, "train"), (dvalid, "eval")],
    early_stopping_rounds=10
)

# 예측 (훈련 데이터셋)
y_train_pred_prob = xgb_model.predict(dtrain)
y_train_pred = y_train_pred_prob.argmax(axis=1)

# 예측 (검증 데이터셋)
y_valid_pred_prob = xgb_model.predict(dvalid)
y_valid_pred = y_valid_pred_prob.argmax(axis=1)

# 성능 평가 (훈련 데이터셋)
train_accuracy = accuracy_score(y_train_resampled, y_train_pred)
train_classification_rep = classification_report(y_train_resampled, y_train_pred)

# 성능 평가 (검증 데이터셋)
valid_accuracy = accuracy_score(y_valid, y_valid_pred)
valid_classification_rep = classification_report(y_valid, y_valid_pred)

# 결과 출력
print(f"훈련 정확도: {train_accuracy}")
print(train_classification_rep)
print(f"검증 정확도: {valid_accuracy}")
print(valid_classification_rep)


최적 부스팅 라운드 수: 275
     train-mlogloss-mean  train-mlogloss-std  test-mlogloss-mean  \
270             0.944511            0.001327            1.124133   
271             0.943618            0.001355            1.123675   
272             0.942835            0.001331            1.123303   
273             0.941983            0.001327            1.122899   
274             0.941114            0.001326            1.122475   

     test-mlogloss-std  
270           0.004062  
271           0.004039  
272           0.004035  
273           0.004067  
274           0.004143  
[0]	train-mlogloss:1.38309	eval-mlogloss:1.38472
[1]	train-mlogloss:1.37999	eval-mlogloss:1.38320
[2]	train-mlogloss:1.37709	eval-mlogloss:1.38159
[3]	train-mlogloss:1.37430	eval-mlogloss:1.38006
[4]	train-mlogloss:1.37116	eval-mlogloss:1.37845
[5]	train-mlogloss:1.36824	eval-mlogloss:1.37710
[6]	train-mlogloss:1.36545	eval-mlogloss:1.37561
[7]	train-mlogloss:1.36264	eval-mlogloss:1.37425
[8]	train-mlogloss:1.35999	eval

In [161]:
import plotly.express as px

# Feature importance 평가
importance = xgb_model.get_score(importance_type='weight')
importance_df = pd.DataFrame({
    'Feature': [k for k in importance.keys()],
    'Importance': importance.values()
}).sort_values(by='Importance', ascending=False)

# 시각화
fig = px.bar(importance_df, x='Feature', y='Importance', title='Feature Importance')
fig.show()

In [174]:
# 테스트 데이터셋을 DMatrix로 변환
dtest = xgb.DMatrix(test_df.drop(["target", "ID"], axis=1))

# 테스트 데이터에 대한 예측 수행
y_test_pred_prob = xgb_model.predict(dtest)

# 다중 클래스 예측일 경우 가장 높은 확률을 가진 클래스를 예측값으로 변환
y_test_pred = y_test_pred_prob.argmax(axis=1)

# 예측 결과 출력
y_test_pred

submission_df = submission_df.assign(target=pd.DataFrame(y_test_pred))
submission_df['target'].value_counts()

target
2    1822
1     796
3     113
0      61
Name: count, dtype: int64

In [109]:
submission_df.to_csv('XGBoost_smote_cv_add_feature5.csv',index=False)

In [110]:
a = pd.read_csv('XGBoost_smote_cv_add_feature4.csv')

In [160]:
a = pd.read_csv('XGBoost_smote_cv_add_feature4.csv')
accuracy = accuracy_score(a['target'], submission_df['target'])
accuracy

0.9527220630372493

---