In [1]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from dataset_generator import DatasetGenerator
import xgboost as xgb

# Data Load

In [2]:
# Raw (original) data
# Build the working DataFrame through the project's DatasetGenerator, pointed at ../data.
data_path = "../data"
generator = DatasetGenerator(data_path)
df = generator.generate_dataset()
# Preview the first rows of the generated frame.
df.head()

100%|██████████| 107/107 [00:09<00:00, 11.51it/s]


Unnamed: 0,ID,target,_type,coinbase_premium_gap,coinbase_premium_index,funding_rates,funding_rates_bitmex,long_liquidations,long_liquidations_usd,short_liquidations,...,fees_transaction_median,fees_transaction_median_usd,blockreward,blockreward_usd,block_interval,tokens_transferred_total,tokens_transferred_mean,tokens_transferred_median,block_bytes,velocity_supply_total
0,2023-01-01 00:00:00,2.0,train,-9.86,-0.05965,0.005049,0.0014,0.012,197.5161,0.0,...,1.4e-05,0.235973,75.561037,1248565.0,427.333333,33057.024011,2.885312,0.020125,783554.416667,65.978971
1,2023-01-01 01:00:00,1.0,train,-8.78,-0.053047,0.005049,0.0014,0.0,0.0,0.712,...,1.4e-05,0.237108,25.256248,417632.2,782.5,12933.965951,2.217758,0.021293,890760.5,65.977755
2,2023-01-01 02:00:00,1.0,train,-9.59,-0.057952,0.005049,0.0014,0.0,0.0,0.0,...,1.4e-05,0.234878,50.312978,832173.5,365.125,26960.250177,4.857703,0.02298,558209.75,65.97557
3,2023-01-01 03:00:00,1.0,train,-9.74,-0.058912,0.005067,0.001518,0.593,9754.76891,0.0,...,1.4e-05,0.234608,31.469829,519975.4,667.8,32747.924338,6.243646,0.024678,640371.8,65.975092
4,2023-01-01 04:00:00,2.0,train,-10.14,-0.061373,0.00621,0.0084,0.361,5944.43714,0.0,...,1.4e-05,0.237996,44.094866,728603.8,703.0,26550.999095,3.82469,0.028596,712739.0,65.969825


# 결측치 처리
파생 변수를 생성하기 전에 결측치를 먼저 처리한다. (아래 셀에서 IterativeImputer를 train 데이터에 학습시킨 뒤 test 데이터에 적용한다.)

In [3]:
# Split rows by the `_type` flag and keep only the feature columns:
# the split flag, the label and the row identifier are dropped from both frames.
train_df = df.loc[df['_type'] == 'train'].drop(columns=['_type', 'target', 'ID'])
test_df = df.loc[df['_type'] == 'test'].drop(columns=['_type', 'target', 'ID'])

In [4]:
# MICE-style imputation: scikit-learn's IterativeImputer with an XGBoost
# regressor as the per-feature estimator.
iimp = IterativeImputer(
    estimator=xgb.XGBRegressor(),
    max_iter=15,
    random_state=42,
    verbose=0,
)

# The imputer is fitted on the training features only and then applied,
# already fitted, to the test features.
imp_train_df = iimp.fit_transform(train_df)
imp_test_df = iimp.transform(test_df)



In [6]:
# The imputer returns plain ndarrays, so wrap them back into DataFrames.
# Keep each row's ORIGINAL index label: the metadata columns are re-attached
# below by index alignment, and a fresh 0..n-1 index would only line up with
# `df` if `df` happened to be a RangeIndex with all train rows before all test rows.
train_df = pd.DataFrame(imp_train_df, columns=train_df.columns, index=train_df.index)
test_df = pd.DataFrame(imp_test_df, columns=test_df.columns, index=test_df.index)

# Stack train on top of test (same row order as before), still carrying the original labels.
df2 = pd.concat([train_df, test_df], axis=0)

# Column assignment aligns on index labels, so every row receives its own ID/target/_type.
df2[['ID', 'target', '_type']] = df[['ID', 'target', '_type']]

# Only now switch to a clean positional index for the downstream steps.
df2 = df2.reset_index(drop=True)

In [7]:
# Create derived features.
# feature_engineering is unpacked into three values: the engineered frame and two
# column collections (presumably categorical and continuous column names —
# confirm against dataset_generator).
df3, category_cols, conti_cols = generator.feature_engineering(df2)

# 변수 선택 (correlation)

In [13]:
# Variables that Sumi added later
drop_cols0 = [
    'hashrate_difficulty_reward_ratio',
    'fees_activity_ratio',
    'tokens_fee_ratio',
    'block_interval_difficulty_reward_ratio',
    'taker_interest_ratio',
    'premium_liquidation_ratio',
    'velocity_supply_ratio',
    'funding_taker_ratio',
    'M',
    'Is_Afternoon_Evening',
    'Is_Weekend',
]

# Variables that were needed only to build the derived features
drop_cols1 = [
    'long_liquidations_bybit',
    'short_liquidations_bybit',
    'long_liquidations_bitfinex',
    'short_liquidations_bitfinex',
    'long_liquidations_binance',
    'short_liquidations_binance',
    'buy_volume_deribit',
    'sell_volume_deribit',
    'buy_volume_bybit',
    'sell_volume_bybit',
    'buy_volume_okx',
    'sell_volume_okx',
]

# Original-data variables with correlation of 0.9 or higher
drop_cols2 = [
    'coinbase_premium_gap',
    'long_liquidations_usd',
    'short_liquidations_usd',
    'buy_sell_ratio',
    'sell_ratio',
    'sell_volume',
    'block_mean',
    'block_mean_usd',
    'blockreward',
    'fees_total',
    'fees_transaction_mean',
    'fees_transaction_median',
    'supply_total',
    'utxo_count',
    'supply_new',
    'velocity_supply_total',
    'fees_reward_percent',
    'hashrate',
    'sender_count',
]

# Derived variables with correlation of 0.9 or higher
drop_cols3 = [
    'long_liquidation_interest_ratio',
    'short_liquidation_interest_ratio',
    'volume_interest_ratio',
    'buy_sell_volume_ratio',
    'volume_index',
]

In [14]:
# Final variable removal: merge the four drop lists (in order) and
# remove those columns from the engineered frame.
drop_cols = [*drop_cols0, *drop_cols1, *drop_cols2, *drop_cols3]
df_select = df3.drop(drop_cols, axis=1)

In [15]:
# Display the frame that remains after the column drop.
df_select

Unnamed: 0,coinbase_premium_index,funding_rates,funding_rates_bitmex,long_liquidations,short_liquidations,open_interest,buy_ratio,buy_volume,buy_sell_ratio_huobi,active_count,...,short_liquidation_volume_ratio,market_pressure,network_active,Hodler,profitability,investment,leverage,fee_index,market_health,exchange_center
0,-0.059650,0.005049,0.001400,0.012000,0.000000,6.271344e+09,0.475003,4.648309e+07,4.046784,67987.0,...,0.000000e+00,1.123308e+12,2.326274,5.953640e+17,1.928170,-8.551771,0.033485,1.100307e-07,2.868199e+18,-4.986310e-13
1,-0.053047,0.005049,0.001400,0.000000,0.712000,6.288683e+09,0.623454,3.975599e+07,14.473239,30593.0,...,1.116560e-08,2.061061e+12,0.255973,1.786271e+18,0.470355,-5.524810,11.315818,3.350545e-07,7.251386e+15,1.434318e-13
2,-0.057952,0.005049,0.001400,0.000000,0.000000,6.286796e+09,0.506785,2.405402e+07,3.529514,33897.0,...,0.000000e+00,1.279022e+12,0.923029,8.931563e+17,1.534312,-12.249997,0.000000,1.846562e-07,1.144118e+22,3.011496e-13
3,-0.058912,0.005067,0.001518,0.593000,0.000000,6.284575e+09,0.466518,2.860215e+07,6.477528,32717.0,...,0.000000e+00,1.084253e+12,0.401273,1.429269e+18,0.853960,-14.904649,3.478350,2.150006e-07,1.163878e+16,2.271793e-13
4,-0.061373,0.006210,0.008400,0.361000,0.000000,6.291582e+09,0.491559,3.087720e+07,0.711501,45176.0,...,0.000000e+00,9.791537e+11,0.625902,1.021146e+18,1.067807,-8.208274,1.893557,1.744939e-07,3.695872e+16,5.027650e-14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11547,-0.002376,0.038292,0.009616,0.710000,0.243500,1.486836e+10,0.499913,2.681646e+08,0.451684,29250.0,...,4.542147e-10,3.880145e+11,0.204287,8.036011e+19,0.017919,-1.659402,14.307354,6.374078e-06,9.673417e+15,2.435759e-13
11548,-0.018268,0.044642,0.015465,6.577208,0.146000,1.036679e+10,0.499913,2.663238e+08,1.605582,56580.0,...,2.732576e-10,2.320575e+11,1.145499,5.381149e+19,0.040336,-20.965214,23.861224,5.546222e-06,5.307519e+15,-1.303028e-04
11549,0.002866,0.048401,0.017206,1.797163,5.216490,1.078753e+10,0.500765,2.685918e+08,0.783058,51858.0,...,9.715385e-09,2.235498e+11,1.164970,5.396266e+19,0.042930,2.720016,65.536283,4.469784e-06,4.663120e+15,-1.915257e-05
11550,-0.003184,0.039396,0.014390,0.803000,1.656000,1.080558e+10,0.499816,2.691089e+08,1.771881,36270.0,...,3.075452e-09,2.740356e+11,0.615420,8.050545e+19,0.035490,-4.053028,21.967081,4.627599e-06,6.201586e+15,-1.536108e-05


In [16]:
# Add moving-average (ma) variables.
# `intervals` is the list of window sizes handed to generator.moving_average.
intervals = [6, 12, 24, 48, 72]

# Continuous columns that survived the drop step. Built as an order-preserving,
# de-duplicated list rather than a set difference: set iteration order for
# strings changes between interpreter runs (hash randomisation), which would make
# the order of the generated columns differ from one kernel restart to the next.
dropped_cols = set(drop_cols)
ma_col_lst = [col for col in dict.fromkeys(conti_cols) if col not in dropped_cols]

df_select_ma = generator.moving_average(df_select, ma_col_lst, intervals)

In [17]:
# Variables that showed high correlation in the CCF analysis (see the EDA notebook).
shift_col_lst = [
    'funding_rates', 'funding_rates_bitmex', 'open_interest',
    'transactions_count_total', 'transactions_count_mean',
    'block_mean', 'block_mean_usd',
    'fees_total', 'fees_total_usd', 'fees_reward_percent',
    'difficulty', 'utxo_count', 'supply_total', 'hashrate',
    'fees_transaction_mean', 'fees_transaction_mean_usd',
    'fees_transaction_median', 'fees_transaction_median_usd',
    'blockreward_usd',
    'tokens_transferred_mean', 'tokens_transferred_median',
    'velocity_supply_total', 'address_diff', 'profitability', 'fee_index',
]

# Add shift variables.
# Candidates that were already dropped from the frame are filtered out. The
# filter keeps the list order above instead of using a set difference, whose
# iteration order is not stable across interpreter runs and would shuffle the
# order of the generated columns between kernel restarts.
dropped_cols = set(drop_cols)
shift_col_lst = [col for col in shift_col_lst if col not in dropped_cols]

# Shift steps 1 through 23 are requested for every remaining candidate column.
df_select_ma_shift = generator.shift_feature(
    df_select_ma,
    shift_col_lst,
    intervals=list(range(1, 24)),
)

In [18]:
df_select_ma_shift.head()

Unnamed: 0,coinbase_premium_index,funding_rates,funding_rates_bitmex,long_liquidations,short_liquidations,open_interest,buy_ratio,buy_volume,buy_sell_ratio_huobi,active_count,...,open_interest_14,open_interest_15,open_interest_16,open_interest_17,open_interest_18,open_interest_19,open_interest_20,open_interest_21,open_interest_22,open_interest_23
0,-0.05965,0.005049,0.0014,0.012,0.0,6271344000.0,0.475003,46483090.0,4.046784,67987.0,...,,,,,,,,,,
1,-0.053047,0.005049,0.0014,0.0,0.712,6288683000.0,0.623454,39755990.0,14.473239,30593.0,...,,,,,,,,,,
2,-0.057952,0.005049,0.0014,0.0,0.0,6286796000.0,0.506785,24054020.0,3.529514,33897.0,...,,,,,,,,,,
3,-0.058912,0.005067,0.001518,0.593,0.0,6284575000.0,0.466518,28602150.0,6.477528,32717.0,...,,,,,,,,,,
4,-0.061373,0.00621,0.0084,0.361,0.0,6291582000.0,0.491559,30877200.0,0.711501,45176.0,...,,,,,,,,,,


In [22]:
# Persist the final feature table as CSV; the row index is not written.
df_select_ma_shift.to_csv(
    '../data/train_yh.csv',
    index=False,
)