In [1]:
import os
import sys
from typing import List, Dict

import lightgbm as lgb
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

# Code 경로 추가
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(""))))
print(sys.path[-1])

c:\Users\com\Documents\GitHub\level1-classificationinmachinelearning-recsys-06


In [2]:
# 파일 호출
data_path: str = "../../data"
## raw.csv가 없는 경우 실행
# from Code.dataset.merge_all import merge_all
# df = merge_all(data_path)
df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "scaled_eda2.csv"))
submission_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv"))  # ID, target 열만 가진 데이터 미리 호출

In [3]:
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN, SVMSMOTE

def get_over_sampler(x_train, y_train, strategy='ADASYN', sampling_strategy='auto'):
    if strategy == 'SMOTE':
        sampler = SMOTE(sampling_strategy=sampling_strategy)
    elif strategy == 'BorderlineSMOTE':
        sampler = BorderlineSMOTE(sampling_strategy=sampling_strategy)
    elif strategy == 'ADASYN':
        sampler = ADASYN(sampling_strategy=sampling_strategy)
    elif strategy == 'SVMSMOTE':
        sampler = SVMSMOTE(sampling_strategy=sampling_strategy)
    else:
        raise ValueError(f"Unknown strategy: {strategy}")

    x_resampled, y_resampled = sampler.fit_resample(x_train, y_train)
    return x_resampled, y_resampled

In [4]:
new_feauture = pd.read_csv('new_feature.csv')
new_df = pd.concat([df, new_feauture.drop(columns=['Unnamed: 0','ID'])], axis=1)
new_df.columns

Index(['ID', '_type', 'target', 'scaled_log_hashrate',
       'scaled_log_open_interest', 'scaled_log_coinbase_premium_index',
       'scaled_funding_rates', 'scaled_estimated_block_reward',
       'scaled_liquidation_diff', 'scaled_log_total_liquidation',
       'scaled_log_total_taker_volume', 'scaled_utxo_count',
       'scaled_total_transactions_count', 'taker_buy_sell_ratio',
       'moving_avg_scaled_log_total_volume', 'open_interest_diff',
       'network_activity_ratio_diff', 'average_transaction_value_diff',
       'network_load_diff', 'fee_burden_diff', 'market_pressure_diff',
       'liquidation_risk_diff'],
      dtype='object')

In [5]:
new_df2 = new_df.drop(columns=[
    'network_activity_ratio_diff',
    #'average_transaction_value_diff',
    #'network_load_diff',
    'fee_burden_diff',
    #'market_pressure_diff',
    #'liquidation_risk_diff'
    ],inplace=False)

In [40]:
# 제준님이 주신 새로운 피쳐들 사용 (하이퍼 파라미터 적용)

import xgboost as xgb
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# _type에 따라 train, test 분리
train_df = new_df2.loc[new_df2["_type"] == "train"].drop(columns=["_type"])
train_df = train_df.ffill()
test_df = new_df2.loc[new_df2["_type"] == "test"].drop(columns=["_type"])

# train_test_split 으로 valid set, train set 분리
x_train, x_valid, y_train, y_valid = train_test_split(
    train_df.drop(["target", "ID"], axis=1),
    train_df["target"].astype(int),
    test_size=0.2,
    random_state=42,
)

# SMOTE를 이용한 오버샘플링
sampling_strategy = {0: 1000, 1: 3544, 2: 3671, 3: 1500}
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

# XGBoost 모델을 위한 DMatrix로 변환
dtrain = xgb.DMatrix(x_train_resampled, label=y_train_resampled)
dvalid = xgb.DMatrix(x_valid, label=y_valid)

# XGBoost 모델 학습 파라미터
params = {
    "objective": "multi:softprob",  # 다중 클래스 분류
    "num_class": len(y_train_resampled.unique()),  # 클래스 개수
    "eval_metric": "mlogloss",  # 손실 함수 (멀티클래스 로지스틱 손실)
    "max_depth": 7,
    "learning_rate": 0.01,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "seed": 42
}

# 교차 검증 설정 (5-fold)
cv_results = xgb.cv(
    params=params,
    dtrain=dtrain,
    num_boost_round=275,
    nfold=5,
    early_stopping_rounds=10,
    metrics="mlogloss",
    as_pandas=True,
    seed=42
)

# 최적의 부스팅 라운드 수 확인
best_num_boost_rounds = len(cv_results)

# 교차 검증 결과 출력
print("최적 부스팅 라운드 수:", best_num_boost_rounds)
print(cv_results.tail())

# 최적의 부스팅 라운드로 모델 학습
xgb_model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=best_num_boost_rounds,
    evals=[(dtrain, "train"), (dvalid, "eval")],
    early_stopping_rounds=10
)

# 예측 (훈련 데이터셋)
y_train_pred_prob = xgb_model.predict(dtrain)
y_train_pred = y_train_pred_prob.argmax(axis=1)

# 예측 (검증 데이터셋)
y_valid_pred_prob = xgb_model.predict(dvalid)
y_valid_pred = y_valid_pred_prob.argmax(axis=1)

# 성능 평가 (훈련 데이터셋)
train_accuracy = accuracy_score(y_train_resampled, y_train_pred)
train_classification_rep = classification_report(y_train_resampled, y_train_pred)

# 성능 평가 (검증 데이터셋)
valid_accuracy = accuracy_score(y_valid, y_valid_pred)
valid_classification_rep = classification_report(y_valid, y_valid_pred)

# 결과 출력
print(f"훈련 정확도: {train_accuracy}")
print(train_classification_rep)
print(f"검증 정확도: {valid_accuracy}")
print(valid_classification_rep)


최적 부스팅 라운드 수: 275
     train-mlogloss-mean  train-mlogloss-std  test-mlogloss-mean  \
270             0.892901            0.003722            1.136598   
271             0.891928            0.003711            1.136278   
272             0.890998            0.003705            1.136007   
273             0.890115            0.003736            1.135770   
274             0.889159            0.003706            1.135462   

     test-mlogloss-std  
270           0.010328  
271           0.010333  
272           0.010360  
273           0.010366  
274           0.010407  
[0]	train-mlogloss:1.38253	eval-mlogloss:1.38359
[1]	train-mlogloss:1.37885	eval-mlogloss:1.38078
[2]	train-mlogloss:1.37524	eval-mlogloss:1.37818
[3]	train-mlogloss:1.37162	eval-mlogloss:1.37551
[4]	train-mlogloss:1.36797	eval-mlogloss:1.37272
[5]	train-mlogloss:1.36441	eval-mlogloss:1.37005
[6]	train-mlogloss:1.36075	eval-mlogloss:1.36755
[7]	train-mlogloss:1.35740	eval-mlogloss:1.36512
[8]	train-mlogloss:1.35395	eval

In [28]:
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# _type에 따라 train, test 분리
train_df = new_df2.loc[new_df2["_type"] == "train"].drop(columns=["_type"])
train_df = train_df.ffill().dropna()
test_df = new_df2.loc[new_df2["_type"] == "test"].drop(columns=["_type"])

# train_test_split 으로 valid set, train set 분리
x_train, x_valid, y_train, y_valid = train_test_split(
    train_df.drop(["target", "ID"], axis=1),
    train_df["target"].astype(int),
    test_size=0.2,
    random_state=42,
)

# SMOTE를 이용한 오버샘플링
sampling_strategy = {0: 1000, 1: 3544, 2: 3671, 3: 1500}
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

In [32]:
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

# XGBoost 모델 (하이퍼파라미터는 그대로 사용)
xgb_model = xgb.XGBClassifier(
    objective="multi:softprob",  # 다중 클래스 분류
    num_class=len(y_train_resampled.unique()),  # 클래스 개수
    eval_metric="mlogloss",  # 손실 함수 (멀티클래스 로지스틱 손실)
    max_depth=7,
    learning_rate=0.01,
    subsample=0.8,
    colsample_bytree=0.8,
    seed=42
)

# 추가 모델들 (로지스틱 회귀 및 랜덤 포레스트)
log_model = LogisticRegression(max_iter=1000)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
lgb_model = LGBMClassifier()
gbm_model = GradientBoostingClassifier()
svm_model = SVC(probability=True)

# Voting Classifier 앙상블 (하드 보팅: voting='hard' 또는 소프트 보팅: voting='soft')
ensemble_model = VotingClassifier(
    estimators=[
        ('xgb', xgb_model),
        ('log', log_model),
        ('rf', rf_model),
        ('lgb',lgb_model),
        ('gbm',gbm_model),
        ('svm',svm_model),
    ],
    voting='soft'  # 확률을 기반으로 투표
)

# 앙상블 모델 학습
ensemble_model.fit(x_train_resampled, y_train_resampled)

# 예측 (훈련 데이터셋)
y_train_pred = ensemble_model.predict(x_train_resampled)

# 예측 (검증 데이터셋)
y_valid_pred = ensemble_model.predict(x_valid)

# 성능 평가 (훈련 데이터셋)
train_accuracy = accuracy_score(y_train_resampled, y_train_pred)
train_classification_rep = classification_report(y_train_resampled, y_train_pred)

# 성능 평가 (검증 데이터셋)
valid_accuracy = accuracy_score(y_valid, y_valid_pred)
valid_classification_rep = classification_report(y_valid, y_valid_pred)

# 결과 출력
print(f"훈련 정확도: {train_accuracy}")
print(train_classification_rep)
print(f"검증 정확도: {valid_accuracy}")
print(valid_classification_rep)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000896 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4335
[LightGBM] [Info] Number of data points in the train set: 9715, number of used features: 17
[LightGBM] [Info] Start training from score -2.273671
[LightGBM] [Info] Start training from score -1.008415
[LightGBM] [Info] Start training from score -0.973207
[LightGBM] [Info] Start training from score -1.868206
훈련 정확도: 0.9418425115800309
              precision    recall  f1-score   support

           0       1.00      0.72      0.84      1000
           1       0.94      0.99      0.97      3544
           2       0.91      0.99      0.95      3671
           3       0.99      0.84      0.91      1500

    accuracy                           0.94      9715
   macro avg       0.96      0.89      0.92      9715
weighted avg       0.95  

In [35]:
# 테스트 데이터에 대한 예측 수행
y_test_pred_prob = VotingClassifier.predict(test_df)

# 다중 클래스 예측일 경우 가장 높은 확률을 가진 클래스를 예측값으로 변환
y_test_pred = y_test_pred_prob.argmax(axis=1)

# 예측 결과 출력
y_test_pred

submission_df = submission_df.assign(target=pd.DataFrame(y_test_pred))
submission_df['target'].value_counts()

TypeError: VotingClassifier.predict() missing 1 required positional argument: 'X'

In [33]:
from sklearn.ensemble import StackingClassifier

# XGBoost 모델 (하이퍼파라미터는 그대로 사용)
xgb_model = xgb.XGBClassifier(
    objective="multi:softprob",  # 다중 클래스 분류
    num_class=len(y_train_resampled.unique()),  # 클래스 개수
    eval_metric="mlogloss",  # 손실 함수 (멀티클래스 로지스틱 손실)
    max_depth=7,
    learning_rate=0.01,
    subsample=0.8,
    colsample_bytree=0.8,
    seed=42
)

# 추가 모델들 (로지스틱 회귀 및 랜덤 포레스트)
log_model = LogisticRegression(max_iter=1000)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
lgb_model = LGBMClassifier()
gbm_model = GradientBoostingClassifier()
svm_model = SVC(probability=True)

# 스태킹 앙상블 모델 (최종 메타 모델로 로지스틱 회귀 사용)
stacking_model = StackingClassifier(
    estimators=[
        ('xgb', xgb_model),
        ('log', log_model),
        ('rf', rf_model),
        ('lgb',lgb_model),
        ('gbm',gbm_model),
        ('svm',svm_model),
    ],
    final_estimator=LogisticRegression()  # 메타 모델
)

# 앙상블 모델 학습
stacking_model.fit(x_train_resampled, y_train_resampled)

# 예측 (훈련 데이터셋)
y_train_pred = stacking_model.predict(x_train_resampled)

# 예측 (검증 데이터셋)
y_valid_pred = stacking_model.predict(x_valid)

# 성능 평가 (훈련 데이터셋)
train_accuracy = accuracy_score(y_train_resampled, y_train_pred)
train_classification_rep = classification_report(y_train_resampled, y_train_pred)

# 성능 평가 (검증 데이터셋)
valid_accuracy = accuracy_score(y_valid, y_valid_pred)
valid_classification_rep = classification_report(y_valid, y_valid_pred)

# 결과 출력
print(f"훈련 정확도: {train_accuracy}")
print(train_classification_rep)
print(f"검증 정확도: {valid_accuracy}")
print(valid_classification_rep)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000799 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4335
[LightGBM] [Info] Number of data points in the train set: 9715, number of used features: 17
[LightGBM] [Info] Start training from score -2.273671
[LightGBM] [Info] Start training from score -1.008415
[LightGBM] [Info] Start training from score -0.973207
[LightGBM] [Info] Start training from score -1.868206
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000335 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4335
[LightGBM] [Info] Number of data points in the train set: 7772, number of used features: 17
[LightGBM] [Info] Start training from score -2.273671
[LightGBM] [Info] Start training from score -1.008133
[LightGBM] [Info] Start training from score -0.973479
[LightGBM] [Info] Start training from score -1

In [17]:
# 테스트 데이터셋을 DMatrix로 변환
dtest = xgb.DMatrix(test_df.drop(["target", "ID"], axis=1))

# 테스트 데이터에 대한 예측 수행
y_test_pred_prob = xgb_model.predict(dtest)

# 다중 클래스 예측일 경우 가장 높은 확률을 가진 클래스를 예측값으로 변환
y_test_pred = y_test_pred_prob.argmax(axis=1)

# 예측 결과 출력
y_test_pred

submission_df = submission_df.assign(target=pd.DataFrame(y_test_pred))
submission_df['target'].value_counts()

target
2    1815
1     789
3     128
0      60
Name: count, dtype: int64