## 데이터 증강 / Time Series CV

### Library import

In [1]:
import os
from typing import Dict
import numpy as np
import pandas as pd

### Data Load

In [2]:
data_path: str = "../../data"
df: pd.DataFrame = pd.read_csv(os.path.join(data_path,"raw.csv"))
submission_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv"))

### EDA

In [3]:
df.shape

(11552, 255)

In [5]:
# 사용할 컬럼만 추출해서 eda_df 생성
eda_columns = [
    'ID', 'target', 'hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close', 
    'hourly_network-data_hashrate_hashrate', 
    'hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_sell_ratio', 
    'hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_volume', 
    'hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_volume', 
    'hourly_market-data_open-interest_all_exchange_all_symbol_open_interest', 
    'hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations', 
    'hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations', 
    'hourly_market-data_coinbase-premium-index_coinbase_premium_index', 
    'hourly_market-data_funding-rates_all_exchange_funding_rates'
]

eda_df = df[eda_columns]

print(eda_df.columns)

Index(['ID', 'target',
       'hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close',
       'hourly_network-data_hashrate_hashrate',
       'hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_sell_ratio',
       'hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_volume',
       'hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_volume',
       'hourly_market-data_open-interest_all_exchange_all_symbol_open_interest',
       'hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations',
       'hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations',
       'hourly_market-data_coinbase-premium-index_coinbase_premium_index',
       'hourly_market-data_funding-rates_all_exchange_funding_rates'],
      dtype='object')


In [6]:
# 컬럼명 간소화
cols_dict: Dict[str, str] = {
    'ID': 'ID',
    'target': 'target',
    'hourly_network-data_hashrate_hashrate' : 'hashrate', 
    'hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_sell_ratio' : 'taker_buy_sell_ratio', 
    'hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_volume' : 'taker_sell_volume', 
    'hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_volume' : 'taker_buy_volume', 
    'hourly_market-data_open-interest_all_exchange_all_symbol_open_interest' : 'open_interest', 
    'hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations' : 'long_liquidations', 
    'hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations' : 'short_liquidations', 
    'hourly_market-data_coinbase-premium-index_coinbase_premium_index' : 'coinbase_premium_index', 
    'hourly_market-data_funding-rates_all_exchange_funding_rates' : 'funding_rates'
}

eda_df = eda_df[list(cols_dict.keys())].rename(cols_dict, axis=1)
eda_df.head()


Unnamed: 0,ID,target,hashrate,taker_buy_sell_ratio,taker_sell_volume,taker_buy_volume,open_interest,long_liquidations,short_liquidations,coinbase_premium_index,funding_rates
0,2023-01-01 00:00:00,2.0,506291700000.0,0.904774,51375370.0,46483090.0,6271344000.0,0.012,0.0,-0.05965,0.005049
1,2023-01-01 01:00:00,1.0,168763900000.0,1.655721,24011290.0,39755990.0,6288683000.0,0.0,0.712,-0.053047,0.005049
2,2023-01-01 02:00:00,1.0,337527800000.0,1.027512,23409950.0,24054020.0,6286796000.0,0.0,0.0,-0.057952,0.005049
3,2023-01-01 03:00:00,1.0,210954900000.0,0.874477,32707730.0,28602150.0,6284575000.0,0.593,0.0,-0.058912,0.005067
4,2023-01-01 04:00:00,2.0,295336800000.0,0.966796,31937660.0,30877200.0,6291582000.0,0.361,0.0,-0.061373,0.00621


In [7]:
# from sklearn.preprocessing import StandardScaler

# # 특정 feature standardization 수행
# standard_scaler = StandardScaler()

# eda_df[['hashrate', 'taker_buy_sell_ratio', 'open_interest', 'funding_rates']] = standard_scaler.fit_transform(
#     eda_df[['hashrate', 'taker_buy_sell_ratio', 'open_interest', 'funding_rates']]
# )

# print(eda_df[['ID', 'hashrate', 'taker_buy_sell_ratio', 'open_interest', 'funding_rates']].head())

                    ID  hashrate  taker_buy_sell_ratio  open_interest  \
0  2023-01-01 00:00:00  0.389863             -0.552604      -1.268021   
1  2023-01-01 01:00:00 -1.248180              3.479284      -1.261736   
2  2023-01-01 02:00:00 -0.429158              0.106389      -1.262420   
3  2023-01-01 03:00:00 -1.043424             -0.715270      -1.263225   
4  2023-01-01 04:00:00 -0.633914             -0.219603      -1.260686   

   funding_rates  
0      -0.343185  
1      -0.343193  
2      -0.343247  
3      -0.341936  
4      -0.259929  


In [7]:
# coinbase_premium_index 의 결측치 0으로 대체
eda_df['coinbase_premium_index'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  eda_df['coinbase_premium_index'].fillna(0, inplace=True)


### Model Training

In [8]:
# '_type' 컬럼 eda_df에 추가
eda_df['_type'] = eda_df['ID'].apply(lambda x: 'train' if x < '2024-01-01 00:00:00' else 'test')

print(eda_df[['ID', '_type']].tail())

                        ID _type
11547  2024-04-26 03:00:00  test
11548  2024-04-26 04:00:00  test
11549  2024-04-26 05:00:00  test
11550  2024-04-26 06:00:00  test
11551  2024-04-26 07:00:00  test


In [9]:
eda_df.columns

Index(['ID', 'target', 'hashrate', 'taker_buy_sell_ratio', 'taker_sell_volume',
       'taker_buy_volume', 'open_interest', 'long_liquidations',
       'short_liquidations', 'coinbase_premium_index', 'funding_rates',
       '_type'],
      dtype='object')

In [10]:
import imblearn
from imblearn.over_sampling import RandomOverSampler, SMOTE, BorderlineSMOTE, ADASYN, SVMSMOTE

print(imblearn.__version__)

0.12.3


### 데이터 증강 X

In [12]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import label_binarize

train_df = eda_df.loc[df["_type"]=="train"].drop(columns=["_type"])
test_df = eda_df.loc[df["_type"]=="test"].drop(columns=["_type"])

# 특징과 타겟 변수 정의
features = [
    'hashrate', 'taker_buy_sell_ratio', 'taker_sell_volume',
    'taker_buy_volume', 'open_interest', 'long_liquidations',
    'short_liquidations', 'coinbase_premium_index', 'funding_rates'
]

X_train = train_df[features]
y_train = train_df['target']
X_test = test_df[features]

# 데이터 분할 (훈련 데이터와 검증 데이터)
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# XGBoost DMatrix 생성
train_data = xgb.DMatrix(X_train_split, label=y_train_split)
val_data = xgb.DMatrix(X_val_split, label=y_val_split)
test_data = xgb.DMatrix(X_test)

# XGBoost 파라미터 설정
params = {
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'learning_rate': 0.05,
    'num_class': 4,
    'max_depth': 7,
    'subsample': 0.9,
    'colsample_bytree': 0.9
}

# 모델 훈련
watchlist = [(train_data, 'train'), (val_data, 'eval')]
model = xgb.train(params, train_data, evals=watchlist)


# 검증 데이터에 대한 예측
val_preds = model.predict(val_data)
val_preds_class = val_preds.argmax(axis=1)

# 검증 데이터의 정확도 계산
val_accuracy = accuracy_score(y_val_split, val_preds_class)

# F1 Score 계산 (Macro와 Weighted)
f1_macro = f1_score(y_val_split, val_preds_class, average='macro')
f1_weighted = f1_score(y_val_split, val_preds_class, average='weighted')

# ROC-AUC 계산
y_val_binarized = label_binarize(y_val_split, classes=[0, 1, 2, 3])
roc_auc = roc_auc_score(y_val_binarized, val_preds, average='macro')

print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"F1 Score (Macro): {f1_macro:.4f}")
print(f"F1 Score (Weighted): {f1_weighted:.4f}")
print(f"ROC-AUC (Macro): {roc_auc:.4f}")


[0]	train-mlogloss:1.36494	eval-mlogloss:1.37108
[1]	train-mlogloss:1.34495	eval-mlogloss:1.35643
[2]	train-mlogloss:1.32572	eval-mlogloss:1.34318
[3]	train-mlogloss:1.30729	eval-mlogloss:1.33086
[4]	train-mlogloss:1.28974	eval-mlogloss:1.31883
[5]	train-mlogloss:1.27383	eval-mlogloss:1.30827
[6]	train-mlogloss:1.25821	eval-mlogloss:1.29844
[7]	train-mlogloss:1.24310	eval-mlogloss:1.28925
[8]	train-mlogloss:1.22944	eval-mlogloss:1.28023
[9]	train-mlogloss:1.21585	eval-mlogloss:1.27154
Validation Accuracy: 0.4349
F1 Score (Macro): 0.2692
F1 Score (Weighted): 0.4000
ROC-AUC (Macro): 0.6099


In [14]:
# Feature importance 평가
importance = model.get_score(importance_type='weight')
importance_df = pd.DataFrame({
    'Feature': [k for k in importance.keys()],
    'Importance': importance.values()
}).sort_values(by='Importance', ascending=False)

# Plotly를 이용한 시각화
import plotly.express as px

fig = px.bar(importance_df, x='Feature', y='Importance', title='Feature Importance')
fig.show()

In [None]:
## Validaion을 마지막 달로 변경
# # datetime 형식으로 변환 (ID 열이 날짜/시간 데이터일 경우)
# train_df['ID'] = pd.to_datetime(train_df['ID'])

# # validation 데이터의 기준 날짜 설정
# validation_start_date = '2023-11-01 00:00:00'

# # 훈련 데이터와 검증 데이터 나누기
# train_split = train_df[train_df['ID'] < validation_start_date]
# val_split = train_df[train_df['ID'] >= validation_start_date]

# # 특징과 타겟 변수 정의
# X_train_split = train_split[features]
# y_train_split = train_split['target']

# X_val_split = val_split[features]
# y_val_split = val_split['target']

### 데이터증강 RandomOverSampler 적용

In [15]:
# 데이터 분할
train_df = eda_df.loc[eda_df["_type"]=="train"].drop(columns=["_type"])
test_df = eda_df.loc[eda_df["_type"]=="test"].drop(columns=["_type"])

# 타겟 변수 정의
features = [
    'hashrate', 'taker_buy_sell_ratio', 'taker_sell_volume',
    'taker_buy_volume', 'open_interest', 'long_liquidations',
    'short_liquidations', 'coinbase_premium_index', 'funding_rates'
]

# X_train = train_df[features]
# y_train = train_df['target']
# X_test = test_df[features]

# # 데이터 분할 (훈련 데이터와 검증 데이터)
# X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
#     X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
# )

# Validaion을 마지막 달로 변경
# datetime 형식으로 변환 (ID 열이 날짜/시간 데이터일 경우)
train_df['ID'] = pd.to_datetime(train_df['ID'])

# validation 데이터의 기준 날짜 설정
validation_start_date = '2023-10-15 00:00:00'

# 훈련 데이터와 검증 데이터 나누기
train_split = train_df[train_df['ID'] < validation_start_date]
val_split = train_df[train_df['ID'] >= validation_start_date]

# 특징과 타겟 변수 정의
X_train_split = train_split[features]
y_train_split = train_split['target']

X_val_split = val_split[features]
y_val_split = val_split['target']

# BorderlineSMOTE 적용
category_dict = {0: 1000, 1: 3544, 2: 3671, 3: 1500}
smote = RandomOverSampler(random_state=42, sampling_strategy= category_dict)
X_train_res, y_train_res = smote.fit_resample(X_train_split, y_train_split)

# XGBoost DMatrix 생성
train_data = xgb.DMatrix(X_train_res, label=y_train_res)
val_data = xgb.DMatrix(X_val_split, label=y_val_split)
test_data = xgb.DMatrix(X_test)

# XGBoost 파라미터 설정
params = {
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'learning_rate': 0.05,
    'num_class': 4,
    'max_depth': 6,
    'subsample': 0.9,
    'colsample_bytree': 0.9
}

# 모델 훈련
watchlist = [(train_data, 'train'), (val_data, 'eval')]
model = xgb.train(params, train_data, evals=watchlist)

# 검증 데이터에 대한 예측
val_preds_proba = model.predict(val_data)
val_preds_class = np.argmax(val_preds_proba, axis=1)

# Accuracy 계산
val_accuracy = accuracy_score(y_val_split, val_preds_class)

# F1 Score 계산 (Macro와 Weighted)
f1_macro = f1_score(y_val_split, val_preds_class, average='macro')
f1_weighted = f1_score(y_val_split, val_preds_class, average='weighted')

# ROC-AUC 계산
y_val_binarized = label_binarize(y_val_split, classes=[0, 1, 2, 3])
roc_auc = roc_auc_score(y_val_binarized, val_preds_proba, average='macro', multi_class='ovr')

print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"F1 Score (Macro): {f1_macro:.4f}")
print(f"F1 Score (Weighted): {f1_weighted:.4f}")
print(f"ROC-AUC (Macro): {roc_auc:.4f}")


[0]	train-mlogloss:1.36837	eval-mlogloss:1.38562
[1]	train-mlogloss:1.35199	eval-mlogloss:1.38155
[2]	train-mlogloss:1.33585	eval-mlogloss:1.38271
[3]	train-mlogloss:1.32099	eval-mlogloss:1.37972
[4]	train-mlogloss:1.30642	eval-mlogloss:1.37799
[5]	train-mlogloss:1.29281	eval-mlogloss:1.37783
[6]	train-mlogloss:1.28000	eval-mlogloss:1.37824
[7]	train-mlogloss:1.26752	eval-mlogloss:1.37891
[8]	train-mlogloss:1.25613	eval-mlogloss:1.37535
[9]	train-mlogloss:1.24486	eval-mlogloss:1.37612
Validation Accuracy: 0.2473
F1 Score (Macro): 0.2157
F1 Score (Weighted): 0.2787
ROC-AUC (Macro): 0.5093


In [16]:
# Feature importance 평가
importance = model.get_score(importance_type='weight')
importance_df = pd.DataFrame({
    'Feature': [k for k in importance.keys()],
    'Importance': importance.values()
}).sort_values(by='Importance', ascending=False)

# Plotly를 이용한 시각화
fig = px.bar(importance_df, x='Feature', y='Importance', title='Feature Importance')
fig.show()

### Time Series CV 적용

In [17]:
from sklearn.model_selection import TimeSeriesSplit

# TimeSeriesSplit 객체 생성
tscv = TimeSeriesSplit(n_splits=6)  # 5개로 데이터를 나눠 교차 검증

# 모델 성능을 저장할 리스트
val_accuracies = []
f1_macros = []
f1_weighteds = []
roc_aucs = []

# TimeSeriesSplit을 이용한 교차 검증
for train_index, val_index in tscv.split(X_train):
    X_train_split, X_val_split = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_split, y_val_split = y_train.iloc[train_index], y_train.iloc[val_index]
    
    # 사용
    category_dict = {0: 1000, 1: 3544, 2: 3671, 3: 2000}
    smote = RandomOverSampler(random_state=42, sampling_strategy= category_dict)
    X_train_res, y_train_res = smote.fit_resample(X_train_split, y_train_split)

    # XGBoost DMatrix 생성
    train_data = xgb.DMatrix(X_train_res, label=y_train_res)
    val_data = xgb.DMatrix(X_val_split, label=y_val_split)

    # XGBoost 파라미터 설정
    params = {
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'learning_rate': 0.05,
        'num_class': 4,
        'max_depth': 6,
        'subsample': 0.9,
        'colsample_bytree': 0.9
    }

    # 모델 훈련
    watchlist = [(train_data, 'train'), (val_data, 'eval')]
    model = xgb.train(params, train_data, evals=watchlist)

    # 검증 데이터에 대한 예측
    val_preds_proba = model.predict(val_data)
    val_preds_class = np.argmax(val_preds_proba, axis=1)

    # Accuracy 계산
    val_accuracy = accuracy_score(y_val_split, val_preds_class)
    val_accuracies.append(val_accuracy)

    # F1 Score 계산 (Macro와 Weighted)
    f1_macro = f1_score(y_val_split, val_preds_class, average='macro')
    f1_weighted = f1_score(y_val_split, val_preds_class, average='weighted')
    f1_macros.append(f1_macro)
    f1_weighteds.append(f1_weighted)

    # ROC-AUC 계산
    y_val_binarized = label_binarize(y_val_split, classes=[0, 1, 2, 3])
    roc_auc = roc_auc_score(y_val_binarized, val_preds_proba, average='macro', multi_class='ovr')
    roc_aucs.append(roc_auc)

# 각 fold에 대한 평균 성능 출력
print(f"Average Validation Accuracy: {np.mean(val_accuracies):.4f}")
print(f"Average F1 Score (Macro): {np.mean(f1_macros):.4f}")
print(f"Average F1 Score (Weighted): {np.mean(f1_weighteds):.4f}")
print(f"Average ROC-AUC (Macro): {np.mean(roc_aucs):.4f}")


[0]	train-mlogloss:1.35634	eval-mlogloss:1.37917
[1]	train-mlogloss:1.32693	eval-mlogloss:1.37038
[2]	train-mlogloss:1.29997	eval-mlogloss:1.36453
[3]	train-mlogloss:1.27270	eval-mlogloss:1.36020
[4]	train-mlogloss:1.24796	eval-mlogloss:1.35600
[5]	train-mlogloss:1.22491	eval-mlogloss:1.35187
[6]	train-mlogloss:1.20221	eval-mlogloss:1.34720
[7]	train-mlogloss:1.18078	eval-mlogloss:1.34573
[8]	train-mlogloss:1.16153	eval-mlogloss:1.34105
[9]	train-mlogloss:1.14241	eval-mlogloss:1.34027
[0]	train-mlogloss:1.36133	eval-mlogloss:1.37162
[1]	train-mlogloss:1.33938	eval-mlogloss:1.35928
[2]	train-mlogloss:1.31736	eval-mlogloss:1.34836
[3]	train-mlogloss:1.29654	eval-mlogloss:1.33850
[4]	train-mlogloss:1.27539	eval-mlogloss:1.32925
[5]	train-mlogloss:1.25534	eval-mlogloss:1.32041
[6]	train-mlogloss:1.23599	eval-mlogloss:1.31154
[7]	train-mlogloss:1.21834	eval-mlogloss:1.30198
[8]	train-mlogloss:1.20173	eval-mlogloss:1.29438
[9]	train-mlogloss:1.18503	eval-mlogloss:1.28670
[0]	train-mlogloss:1

In [18]:
# Feature importance 평가
importance = model.get_score(importance_type='weight')
importance_df = pd.DataFrame({
    'Feature': [k for k in importance.keys()],
    'Importance': importance.values()
}).sort_values(by='Importance', ascending=False)

# Plotly를 이용한 시각화
fig = px.bar(importance_df, x='Feature', y='Importance', title='Feature Importance')
fig.show()

In [228]:
# 테스트 데이터에 대한 예측
test_preds = model.predict(test_data)
test_preds_class = test_preds.argmax(axis=1)

# 예측 결과를 submission_df에 추가
submission_df['target'] = test_preds_class
#submission_df.to_csv("output_xgboost_5.csv", index=False)

In [229]:
import plotly.express as px

# submission_df['target']의 분포 시각화
fig = px.histogram(submission_df, x='target', title='Distribution of Target in Submission Data')
fig.show()

In [88]:
fig = px.histogram(eda_df, x='target', title='Distribution of Target in Submission Data')
fig.show()