In [33]:
import os
import sys
from typing import List, Dict

import lightgbm as lgb
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
import random
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.metrics import mean_squared_error
from tensorflow.keras.callbacks import EarlyStopping
import plotly.express as px

# Code 경로 추가
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(""))))
print(sys.path[-1])

c:\Users\com\Desktop\AIT Project1\level1-classificationinmachinelearning-recsys-06


In [34]:
# 파일 호출
data_path: str = "..\..\data"
## raw.csv가 없는 경우 실행
# from Code.dataset.merge_all import merge_all
# df = merge_all(data_path)
data: pd.DataFrame = pd.read_csv(os.path.join(data_path, "raw.csv"))
submission_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv"))  # ID, target 열만 가진 데이터 미리 호출

  data_path: str = "..\..\data"


In [35]:
# 시차(Lag) 특성을 생성할 컬럼 리스트 정의
columns_to_lag = [
    'hourly_market-data_coinbase-premium-index_coinbase_premium_gap',
    'hourly_market-data_coinbase-premium-index_coinbase_premium_index',
    'hourly_market-data_funding-rates_all_exchange_funding_rates',
    'hourly_market-data_funding-rates_binance_funding_rates',
    'hourly_market-data_funding-rates_bitmex_funding_rates',
    'hourly_network-data_tokens-transferred_tokens_transferred_median',
    'hourly_network-data_transactions-count_transactions_count_total',
    'hourly_network-data_utxo-count_utxo_count'
]

In [36]:
# 시차 (Lag) 특성 생성 함수 정의
def create_lag_features(df, columns, max_lag):
    df = df.copy()
    for col in columns:
        for lag in range(1, max_lag + 1):
            df[f'{col}_lag_{lag}'] = df[col].shift(lag)
    return df

In [37]:
# 최대 시차 생성
max_lag = 3

# 시차 특성 생성
data_with_lags = create_lag_features(data, columns_to_lag, max_lag)

data_with_lags

Unnamed: 0,ID,target,_type,hourly_market-data_coinbase-premium-index_coinbase_premium_gap,hourly_market-data_coinbase-premium-index_coinbase_premium_index,hourly_market-data_funding-rates_all_exchange_funding_rates,hourly_market-data_funding-rates_binance_funding_rates,hourly_market-data_funding-rates_bitmex_funding_rates,hourly_market-data_funding-rates_bybit_funding_rates,hourly_market-data_funding-rates_deribit_funding_rates,...,hourly_market-data_funding-rates_bitmex_funding_rates_lag_3,hourly_network-data_tokens-transferred_tokens_transferred_median_lag_1,hourly_network-data_tokens-transferred_tokens_transferred_median_lag_2,hourly_network-data_tokens-transferred_tokens_transferred_median_lag_3,hourly_network-data_transactions-count_transactions_count_total_lag_1,hourly_network-data_transactions-count_transactions_count_total_lag_2,hourly_network-data_transactions-count_transactions_count_total_lag_3,hourly_network-data_utxo-count_utxo_count_lag_1,hourly_network-data_utxo-count_utxo_count_lag_2,hourly_network-data_utxo-count_utxo_count_lag_3
0,2023-01-01 00:00:00,2.0,train,-9.86,-0.059650,0.005049,0.010000,0.001400,0.01,0.000571,...,,,,,,,,,,
1,2023-01-01 01:00:00,1.0,train,-8.78,-0.053047,0.005049,0.010000,0.001400,0.01,0.000570,...,,0.020125,,,11457.0,,,83308092.0,,
2,2023-01-01 02:00:00,1.0,train,-9.59,-0.057952,0.005049,0.010000,0.001400,0.01,0.000566,...,,0.021293,0.020125,,5832.0,11457.0,,83314883.0,83308092.0,
3,2023-01-01 03:00:00,1.0,train,-9.74,-0.058912,0.005067,0.010000,0.001518,0.01,0.000557,...,0.0014,0.022980,0.021293,0.020125,5550.0,5832.0,11457.0,83314090.0,83314883.0,83308092.0
4,2023-01-01 04:00:00,2.0,train,-10.14,-0.061373,0.006210,0.010000,0.008400,0.01,0.000536,...,0.0014,0.024678,0.022980,0.021293,5245.0,5550.0,5832.0,83326258.0,83314090.0,83314883.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11547,2024-04-26 03:00:00,,test,-1.53,-0.002376,,0.004867,,,,...,,0.003008,0.001942,0.004097,13621.0,18553.0,20329.0,179811932.0,179793126.0,179753959.0
11548,2024-04-26 04:00:00,,test,-11.73,-0.018268,,0.006169,,,,...,,0.004827,0.003008,0.001942,10048.0,13621.0,18553.0,179820708.0,179811932.0,179793126.0
11549,2024-04-26 05:00:00,,test,1.85,0.002866,,,,,,...,,0.002810,0.004827,0.003008,,10048.0,13621.0,179833897.0,179820708.0,179811932.0
11550,2024-04-26 06:00:00,,test,-2.05,-0.003184,,,,,,...,,0.001104,0.002810,0.004827,,,10048.0,179851249.0,179833897.0,179820708.0


In [38]:
# 이동 평균 (Moving Average) 특성 생성 함수 정의
def create_moving_average_features(df, columns, windows):
    df = df.copy()
    for col in columns:
        for window in windows:
            df[f'{col}_ma_{window}'] = df[col].rolling(window=window).mean()
    return df

In [40]:
# 이동 평균을 계산할 윈도우 크기 설정
windows = [3,6,12]

# 이동 평균 특성 생성
data_with_moving_average = create_moving_average_features(data_with_lags, columns_to_lag, windows)
data_with_moving_average


Unnamed: 0,ID,target,_type,hourly_market-data_coinbase-premium-index_coinbase_premium_gap,hourly_market-data_coinbase-premium-index_coinbase_premium_index,hourly_market-data_funding-rates_all_exchange_funding_rates,hourly_market-data_funding-rates_binance_funding_rates,hourly_market-data_funding-rates_bitmex_funding_rates,hourly_market-data_funding-rates_bybit_funding_rates,hourly_market-data_funding-rates_deribit_funding_rates,...,hourly_market-data_funding-rates_bitmex_funding_rates_ma_12,hourly_network-data_tokens-transferred_tokens_transferred_median_ma_3,hourly_network-data_tokens-transferred_tokens_transferred_median_ma_6,hourly_network-data_tokens-transferred_tokens_transferred_median_ma_12,hourly_network-data_transactions-count_transactions_count_total_ma_3,hourly_network-data_transactions-count_transactions_count_total_ma_6,hourly_network-data_transactions-count_transactions_count_total_ma_12,hourly_network-data_utxo-count_utxo_count_ma_3,hourly_network-data_utxo-count_utxo_count_ma_6,hourly_network-data_utxo-count_utxo_count_ma_12
0,2023-01-01 00:00:00,2.0,train,-9.86,-0.059650,0.005049,0.010000,0.001400,0.01,0.000571,...,,,,,,,,,,
1,2023-01-01 01:00:00,1.0,train,-8.78,-0.053047,0.005049,0.010000,0.001400,0.01,0.000570,...,,,,,,,,,,
2,2023-01-01 02:00:00,1.0,train,-9.59,-0.057952,0.005049,0.010000,0.001400,0.01,0.000566,...,,0.021466,,,7613.000000,,,8.331236e+07,,
3,2023-01-01 03:00:00,1.0,train,-9.74,-0.058912,0.005067,0.010000,0.001518,0.01,0.000557,...,,0.022983,,,5542.333333,,,8.331841e+07,,
4,2023-01-01 04:00:00,2.0,train,-10.14,-0.061373,0.006210,0.010000,0.008400,0.01,0.000536,...,,0.025418,,,5912.333333,,,8.332651e+07,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11547,2024-04-26 03:00:00,,test,-1.53,-0.002376,,0.004867,,,,...,,0.003259,0.002513,0.003276,14074.000000,13372.666667,20616.083333,1.798086e+08,1.797712e+08,1.797049e+08
11548,2024-04-26 04:00:00,,test,-11.73,-0.018268,,0.006169,,,,...,,0.003548,0.002781,0.002656,,,,1.798222e+08,1.797904e+08,1.797242e+08
11549,2024-04-26 05:00:00,,test,1.85,0.002866,,,,,,...,,0.002914,0.002965,0.002024,,,,1.798353e+08,1.798108e+08,1.797426e+08
11550,2024-04-26 06:00:00,,test,-2.05,-0.003184,,,,,,...,,0.001952,0.002606,0.002066,,,,1.798459e+08,1.798272e+08,1.797598e+08


In [41]:
# 추가적인 시계열 특성 생성 함수 정의
def create_additional_ts_features(df, columns, windows):
    df = df.copy()
    for col in columns:
        for window in windows:
            # 이동 평균 편차 (Volatility)
            df[f'{col}_std_{window}'] = df[col].rolling(window=window).std()
            # 차분 (Difference)
            df[f'{col}_diff_{window}'] = df[col] - df[col].shift(window)
            # 지수 이동 평균 (EMA)
            df[f'{col}_ema_{window}'] = df[col].ewm(span=window, adjust=False).mean()
    return df

In [42]:
# 결측값 제거 함수 (시계열 특성에서 발생한 결측값만 제거)
def remove_ts_related_missing_values(df, columns, max_lag):
    # 시차(Lag) 및 시계열 특성으로 인한 결측값을 처리할 컬럼 정의
    lagged_columns = [f'{col}_lag_{lag}' for col in columns for lag in range(1, max_lag + 1)]
    moving_avg_columns = [f'{col}_ma_{window}' for col in columns for window in [3, 6, 12]]
    volatility_columns = [f'{col}_std_{window}' for col in columns for window in [3, 6, 12]]
    
    # 해당 컬럼들에서 발생한 결측값만 제거
    relevant_columns = lagged_columns + moving_avg_columns + volatility_columns
    df_cleaned = df.dropna(subset=relevant_columns).reset_index(drop=True)
    
    return df_cleaned

In [71]:
# 추가적인 시계열 특성 생성
data_with_ts_features = create_additional_ts_features(data_with_moving_average, columns_to_lag, windows)

# 최종 데이터셋에서 결측값 제거 (시계열 특성 생성으로 인한 결측값)
data_final = remove_ts_related_missing_values(data_with_ts_features,columns_to_lag,max_lag)

In [82]:
_target = data_final["target"]

# _type이 train인 데이터의 결측치만 삭제
df1 = remove_ts_related_missing_values(data_with_ts_features[data_with_ts_features['_type']=='train'],columns_to_lag,max_lag)

# _type이 test인 데이터는 결측치를 삭제하지 않고 그대로 유지
df2 = data_with_ts_features[data_with_ts_features['_type']=='test'].ffill().assign(target=_target)

# train과 test 데이터를 다시 합침
final_data = pd.concat([df1, df2], axis=0).reset_index(drop=True)

# 결과 확인
print(f"train 데이터: {len(df1)}, test 데이터: {len(df2)}, 최종 데이터: {len(final_data)}")


train 데이터: 8710, test 데이터: 2792, 최종 데이터: 11502


In [84]:
# XGBoost 모델 훈련 및 평가를 위해 데이터 분할
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# _type에 따라 train, test 분리
train_df = final_data.loc[final_data["_type"] == "train"].drop(columns=["_type"])
test_df = final_data.loc[final_data["_type"] == "test"].drop(columns=["_type"])

# train_test_split 으로 valid set, train set 분리
x_train, x_valid, y_train, y_valid = train_test_split(
    train_df.drop(["target", "ID"], axis=1),
    train_df["target"].astype(int),
    test_size=0.2,
    random_state=42,
)

# XGBoost 모델 생성 및 학습
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)

# XGBoost 모델 훈련
xgb_model.fit(x_train, y_train)

# 예측
y_train_pred = xgb_model.predict(x_train)
y_valid_pred = xgb_model.predict(x_valid)

# 성능 평가 (훈련)
train_accuracy = accuracy_score(y_train, y_train_pred)
train_classification_rep = classification_report(y_train, y_train_pred)

# 성능 평가 (검증)
valid_accuracy = accuracy_score(y_valid, y_valid_pred)
valid_classification_rep = classification_report(y_valid, y_valid_pred)

print(f"훈련 정확도: {train_accuracy}")
print(train_classification_rep)
print(f"검증 정확도: {valid_accuracy}")
print(valid_classification_rep)

훈련 정확도: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       600
           1       1.00      1.00      1.00      2813
           2       1.00      1.00      1.00      2926
           3       1.00      1.00      1.00       629

    accuracy                           1.00      6968
   macro avg       1.00      1.00      1.00      6968
weighted avg       1.00      1.00      1.00      6968

검증 정확도: 0.4305396096440873
              precision    recall  f1-score   support

           0       0.26      0.06      0.10       134
           1       0.45      0.49      0.47       712
           2       0.43      0.53      0.47       721
           3       0.30      0.07      0.12       175

    accuracy                           0.43      1742
   macro avg       0.36      0.29      0.29      1742
weighted avg       0.41      0.43      0.41      1742



In [85]:
print(test_df)

                        ID  target  \
8710   2024-01-01 00:00:00     NaN   
8711   2024-01-01 01:00:00     NaN   
8712   2024-01-01 02:00:00     NaN   
8713   2024-01-01 03:00:00     NaN   
8714   2024-01-01 04:00:00     NaN   
...                    ...     ...   
11497  2024-04-26 03:00:00     NaN   
11498  2024-04-26 04:00:00     NaN   
11499  2024-04-26 05:00:00     NaN   
11500  2024-04-26 06:00:00     NaN   
11501  2024-04-26 07:00:00     NaN   

       hourly_market-data_coinbase-premium-index_coinbase_premium_gap  \
8710                                              -22.57                
8711                                              -18.88                
8712                                               -9.78                
8713                                               -5.38                
8714                                              -10.22                
...                                                  ...                
11497                           

In [86]:
y_test_pred = xgb_model.predict(test_df.drop(["target", "ID"],axis=1))

In [87]:
len(y_test_pred)

2792

In [88]:
submission_df = submission_df.assign(target=pd.DataFrame(y_test_pred))
submission_df

Unnamed: 0,ID,target
0,2024-01-01 00:00:00,2
1,2024-01-01 01:00:00,1
2,2024-01-01 02:00:00,2
3,2024-01-01 03:00:00,2
4,2024-01-01 04:00:00,2
...,...,...
2787,2024-04-26 03:00:00,0
2788,2024-04-26 04:00:00,0
2789,2024-04-26 05:00:00,0
2790,2024-04-26 06:00:00,0


In [90]:
submission_df['target'].value_counts()

target
1    1267
2     864
0     622
3      39
Name: count, dtype: int64

In [100]:
fig = make_subplots(rows=1, cols=1)

# target의 고유 값과 해당 값의 빈도를 구함
target_counts = submission_df['target'].value_counts()

fig.add_trace(go.Bar(x=target_counts.index, y=target_counts.values))
fig.show()

In [101]:
submission_df.to_csv("xgb_test1_lag.csv",index=False)