In [34]:
import os
from typing import List, Dict
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import lightgbm as lgb
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [35]:
# 파일 호출
data_path: str = "./data/"
train_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "train.csv")).assign(_type="train") # train 에는 _type = train 
test_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv")).assign(_type="test") # test 에는 _type = test
submission_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv")) # ID, target 열만 가진 데이터 미리 호출
df: pd.DataFrame = pd.concat([train_df, test_df], axis=0)

In [36]:
# HOURLY_ 로 시작하는 .csv 파일 이름을 file_names 에 할딩
file_names: List[str] = [
    f for f in os.listdir(data_path) if f.startswith("HOURLY_") and f.endswith(".csv")
]

# 파일명 : 데이터프레임으로 딕셔너리 형태로 저장
file_dict: Dict[str, pd.DataFrame] = {
    f.replace(".csv", ""): pd.read_csv(os.path.join(data_path, f)) for f in file_names
}

for _file_name, _df in tqdm(file_dict.items()):
    # 열 이름 중복 방지를 위해 {_file_name.lower()}_{col.lower()}로 변경, datetime 열을 ID로 변경
    _rename_rule = {
        col: f"{_file_name.lower()}_{col.lower()}" if col != "datetime" else "ID"
        for col in _df.columns
    }
    _df = _df.rename(_rename_rule, axis=1)
    df = df.merge(_df, on="ID", how="left")


100%|████████████████████████████████████████████████████████████████████████████████| 107/107 [00:11<00:00,  9.18it/s]


In [37]:
df['minus'] = df["hourly_network-data_addresses-count_addresses_count_sender"] - df['hourly_network-data_addresses-count_addresses_count_receiver']
#df = df.drop("hourly_network-data_addresses-count_addresses_count_sender",axis=1)
#df = df.drop('hourly_network-data_addresses-count_addresses_count_receiver',axis=1)

# 'hourly_network-data_fees-transaction_fees_transaction_mean' 만 살리기
fees_col = [ 'hourly_network-data_fees-transaction_fees_transaction_mean_usd',
 'hourly_network-data_fees-transaction_fees_transaction_median',
 'hourly_network-data_fees-transaction_fees_transaction_median_usd',
 'hourly_network-data_fees_fees_block_mean',
 'hourly_network-data_fees_fees_block_mean_usd',
 'hourly_network-data_fees_fees_total',
 'hourly_network-data_fees_fees_total_usd',
 'hourly_network-data_fees_fees_reward_percent']

df = df.drop(columns=fees_col)


In [38]:
df

Unnamed: 0,ID,target,_type,hourly_market-data_coinbase-premium-index_coinbase_premium_gap,hourly_market-data_coinbase-premium-index_coinbase_premium_index,hourly_market-data_funding-rates_all_exchange_funding_rates,hourly_market-data_funding-rates_binance_funding_rates,hourly_market-data_funding-rates_bitmex_funding_rates,hourly_market-data_funding-rates_bybit_funding_rates,hourly_market-data_funding-rates_deribit_funding_rates,...,hourly_network-data_supply_supply_total,hourly_network-data_supply_supply_new,hourly_network-data_tokens-transferred_tokens_transferred_total,hourly_network-data_tokens-transferred_tokens_transferred_mean,hourly_network-data_tokens-transferred_tokens_transferred_median,hourly_network-data_transactions-count_transactions_count_total,hourly_network-data_transactions-count_transactions_count_mean,hourly_network-data_utxo-count_utxo_count,hourly_network-data_velocity_velocity_supply_total,minus
0,2023-01-01 00:00:00,2.0,train,-9.86,-0.059650,0.005049,0.010000,0.001400,0.01,0.000571,...,1.924871e+07,75.00,33057.024011,2.885312,0.020125,11457.0,954.750000,83308092.0,65.978971,-445
1,2023-01-01 01:00:00,1.0,train,-8.78,-0.053047,0.005049,0.010000,0.001400,0.01,0.000570,...,1.924874e+07,25.00,12933.965951,2.217758,0.021293,5832.0,1458.000000,83314883.0,65.977755,-8192
2,2023-01-01 02:00:00,1.0,train,-9.59,-0.057952,0.005049,0.010000,0.001400,0.01,0.000566,...,1.924879e+07,50.00,26960.250177,4.857703,0.022980,5550.0,693.750000,83314090.0,65.975570,-1632
3,2023-01-01 03:00:00,1.0,train,-9.74,-0.058912,0.005067,0.010000,0.001518,0.01,0.000557,...,1.924882e+07,31.25,32747.924338,6.243646,0.024678,5245.0,1049.000000,83326258.0,65.975092,-12378
4,2023-01-01 04:00:00,2.0,train,-10.14,-0.061373,0.006210,0.010000,0.008400,0.01,0.000536,...,1.924886e+07,43.75,26550.999095,3.824690,0.028596,6942.0,991.714286,83339168.0,65.969825,-14392
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11547,2024-04-26 03:00:00,,test,-1.53,-0.002376,,0.004867,,,,...,,,33876.505685,3.371468,0.004827,10048.0,3349.333333,179820708.0,15.708903,4553
11548,2024-04-26 04:00:00,,test,-11.73,-0.018268,,0.006169,,,,...,,,51625.600562,3.225794,0.002810,,,179833897.0,15.709585,2224
11549,2024-04-26 05:00:00,,test,1.85,0.002866,,,,,,...,,,22262.243597,1.048326,0.001104,,,179851249.0,15.709207,11989
11550,2024-04-26 06:00:00,,test,-2.05,-0.003184,,,,,,...,,,33349.297426,2.474718,0.001943,,,179852452.0,15.710891,13518


In [39]:
# 모델에 사용할 컬럼, 컬럼의 rename rule을 미리 할당함
cols_dict: Dict[str, str] = {
    "ID": "ID",
    "target": "target",
    "_type": "_type",
    "hourly_market-data_coinbase-premium-index_coinbase_premium_gap": "coinbase_premium_gap",
    "hourly_market-data_coinbase-premium-index_coinbase_premium_index": "coinbase_premium_index",
    "hourly_market-data_funding-rates_all_exchange_funding_rates": "funding_rates",
    "hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations": "long_liquidations",
    "hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations_usd": "long_liquidations_usd",
    "hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations": "short_liquidations",
    "hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations_usd": "short_liquidations_usd",
    "hourly_market-data_open-interest_all_exchange_all_symbol_open_interest": "open_interest",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_ratio": "buy_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_sell_ratio": "buy_sell_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_volume": "buy_volume",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_ratio": "sell_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_volume": "sell_volume",
    "hourly_network-data_addresses-count_addresses_count_active": "active_count",
    "hourly_network-data_addresses-count_addresses_count_receiver": "receiver_count",
    "hourly_network-data_addresses-count_addresses_count_sender": "sender_count",
}
df = df[cols_dict.keys()].rename(cols_dict, axis=1)
df.shape

(11552, 19)

In [40]:
# eda 에서 파악한 차이와 차이의 음수, 양수 여부를 새로운 피쳐로 생성
df = df.assign(
    liquidation_diff=df["long_liquidations"] - df["short_liquidations"],
    liquidation_usd_diff=df["long_liquidations_usd"] - df["short_liquidations_usd"],
    volume_diff=df["buy_volume"] - df["sell_volume"],
    liquidation_diffg=np.sign(df["long_liquidations"] - df["short_liquidations"]),
    liquidation_usd_diffg=np.sign(df["long_liquidations_usd"] - df["short_liquidations_usd"]),
    volume_diffg=np.sign(df["buy_volume"] - df["sell_volume"]),
    buy_sell_volume_ratio=df["buy_volume"] / (df["sell_volume"] + 1),
)
# category, continuous 열을 따로 할당해둠
category_cols: List[str] = ["liquidation_diffg", "liquidation_usd_diffg", "volume_diffg"]
conti_cols: List[str] = [_ for _ in cols_dict.values() if _ not in ["ID", "target", "_type"]] + [
    "buy_sell_volume_ratio",
    "liquidation_diff",
    "liquidation_usd_diff",
    "volume_diff",
]

In [41]:
def shift_feature(
    df: pd.DataFrame,
    conti_cols: List[str],
    intervals: List[int],
) -> List[pd.Series]:
    """
    연속형 변수의 shift feature 생성
    Args:
        df (pd.DataFrame)
        conti_cols (List[str]): continuous colnames
        intervals (List[int]): shifted intervals
    Return:
        List[pd.Series]
    """
    df_shift_dict = [
        df[conti_col].shift(interval).rename(f"{conti_col}_{interval}")
        for conti_col in conti_cols
        for interval in intervals
    ]
    return df_shift_dict

# 최대 24시간의 shift 피쳐를 계산
shift_list = shift_feature(
    df=df, conti_cols=conti_cols, intervals=[_ for _ in range(1, 24)]
)

In [42]:
col = ["receiver_count","sender_count"]
df = df.drop(col,axis=1)

In [43]:
# concat 하여 df 에 할당
df = pd.concat([df, pd.concat(shift_list, axis=1)], axis=1)

# 타겟 변수를 제외한 변수를 forwardfill, -999로 결측치 대체
_target = df["target"]
df = df.ffill().fillna(-999).assign(target = _target)

# _type에 따라 train, test 분리
train_df = df.loc[df["_type"]=="train"].drop(columns=["_type"])
test_df = df.loc[df["_type"]=="test"].drop(columns=["_type"])

In [44]:
duplicate_cols = x_train.columns[x_train.columns.duplicated()]
print(duplicate_cols)


Index(['coinbase_premium_gap_1', 'coinbase_premium_gap_2',
       'coinbase_premium_gap_3', 'coinbase_premium_gap_4',
       'coinbase_premium_gap_5', 'coinbase_premium_gap_6',
       'coinbase_premium_gap_7', 'coinbase_premium_gap_8',
       'coinbase_premium_gap_9', 'coinbase_premium_gap_10',
       ...
       'volume_diff_14', 'volume_diff_15', 'volume_diff_16', 'volume_diff_17',
       'volume_diff_18', 'volume_diff_19', 'volume_diff_20', 'volume_diff_21',
       'volume_diff_22', 'volume_diff_23'],
      dtype='object', length=460)


In [45]:
# train_test_split 으로 valid set, train set 분리
x_train, x_valid, y_train, y_valid = train_test_split(
    train_df.drop(["target", "ID"], axis = 1), 
    train_df["target"].astype(int), 
    test_size=0.2,
    random_state=42,
)

# lgb dataset
train_data = lgb.Dataset(x_train, label=y_train)
valid_data = lgb.Dataset(x_valid, label=y_valid, reference=train_data)

# lgb params
params = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "metric": "multi_logloss",
    "num_class": 4,
    "num_leaves": 50,
    "learning_rate": 0.05,
    "n_estimators": 30,
    "random_state": 42,
    "verbose": 0,
}

# lgb train
lgb_model = lgb.train(
    params=params,
    train_set=train_data,
    valid_sets=valid_data,
)

# lgb predict
#y_valid_pred = lgb_model.predict(x_valid)
# lgb predict with force_col_wise
y_valid_pred = lgb_model.predict(x_valid, force_col_wise=True)

y_valid_pred_class = np.argmax(y_valid_pred, axis = 1)

# score check
accuracy = accuracy_score(y_valid, y_valid_pred_class)
auroc = roc_auc_score(y_valid, y_valid_pred, multi_class="ovr")

print(f"acc: {accuracy}, auroc: {auroc}")



You can set `force_col_wise=true` to remove the overhead.
acc: 0.4223744292237443, auroc: 0.6100381650701248


In [46]:
# 전체 학습 데이터로 모델 재학습
x_train = train_df.drop(["target", "ID"], axis = 1)
y_train = train_df["target"].astype(int)
train_data = lgb.Dataset(x_train, label=y_train)

# 모델 학습
lgb_model = lgb.train(
    params=params,
    train_set=train_data,
)




You can set `force_col_wise=true` to remove the overhead.


In [47]:
# lgb predict
y_test_pred = lgb_model.predict(test_df.drop(["target", "ID"], axis = 1))
y_test_pred_class = np.argmax(y_test_pred, axis = 1)

In [49]:
import pandas as pd
import lightgbm as lgb

# 변수 중요도 추출
feature_importances = lgb_model.feature_importance()
feature_names = x_train.columns

# 중요도와 변수 이름을 데이터프레임으로 변환
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
})

# 중요도를 기준으로 정렬
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# 상위 20개 변수
top_20_features = importance_df.head(20)

# 하위 20개 변수
bottom_20_features = importance_df.tail(20)

print("상위 20개 변수:")
print(top_20_features)

print("\n하위 20개 변수:")
print(bottom_20_features)


상위 20개 변수:
               Feature  Importance
12         sell_volume          94
10          buy_volume          62
478     volume_diff_21          49
374     sender_count_9          48
16         volume_diff          44
378    sender_count_13          43
360  receiver_count_18          41
8            buy_ratio          41
354  receiver_count_12          35
383    sender_count_18          35
251       buy_volume_1          33
297      sell_volume_1          32
351   receiver_count_9          32
362  receiver_count_20          31
470     volume_diff_13          31
349   receiver_count_7          30
358  receiver_count_16          27
255       buy_volume_5          26
474     volume_diff_17          26
357  receiver_count_15          26

하위 20개 변수:
                      Feature  Importance
393   buy_sell_volume_ratio_5           0
395   buy_sell_volume_ratio_7           0
405  buy_sell_volume_ratio_17           0
243         buy_sell_ratio_16           0
411  buy_sell_volume_ratio_23   

In [50]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

# 변수 중요도 추출
feature_importances = lgb_model.feature_importance()
feature_names = x_train.columns

# 중요도와 변수 이름을 데이터프레임으로 변환
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
})

# 중요도를 기준으로 정렬
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# 하위 30개 변수 선택
bottom_30_features = importance_df.tail(30)['Feature']

# 하위 30개 변수 제거
x_train_reduced = x_train.drop(columns=bottom_30_features)
x_valid_reduced = x_valid.drop(columns=bottom_30_features)

# 데이터셋 생성
train_data_reduced = lgb.Dataset(x_train_reduced, label=y_train)
valid_data_reduced = lgb.Dataset(x_valid_reduced, label=y_valid, reference=train_data_reduced)

# 모델 파라미터
params = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "metric": "multi_logloss",
    "num_class": 4,
    "num_leaves": 50,
    "learning_rate": 0.05,
    "n_estimators": 30,
    "random_state": 42,
    "verbose": 0,
}

# 모델 학습
lgb_model_reduced = lgb.train(
    params=params,
    train_set=train_data_reduced,
    valid_sets=valid_data_reduced,
)

# 예측
y_valid_pred_reduced = lgb_model_reduced.predict(x_valid_reduced, force_col_wise=True)
y_valid_pred_class_reduced = np.argmax(y_valid_pred_reduced, axis=1)

# 성능 평가
accuracy_reduced = accuracy_score(y_valid, y_valid_pred_class_reduced)
auroc_reduced = roc_auc_score(y_valid, y_valid_pred_reduced, multi_class="ovr")

print(f"Revised model - Accuracy: {accuracy_reduced}, AUROC: {auroc_reduced}")




You can set `force_col_wise=true` to remove the overhead.
Revised model - Accuracy: 0.8247716894977168, AUROC: 0.974651654894387


In [53]:
# 전체 학습 데이터로 모델 재학습
x_train = train_df.drop(["target", "ID"], axis = 1)
y_train = train_df["target"].astype(int)
train_data = lgb.Dataset(x_train, label=y_train)

# 모델 학습
lgb_model = lgb.train(
    params=params,
    train_set=train_data,
)




You can set `force_col_wise=true` to remove the overhead.


In [55]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import numpy as np

# 전체 데이터셋 로드
x_train_full = train_df.drop(["target", "ID"], axis=1)
y_train_full = train_df["target"].astype(int)

# 하위 30개 변수 선택
feature_importances = lgb_model.feature_importance()
feature_names = x_train.columns
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
bottom_30_features = importance_df.tail(30)['Feature']

# 하위 30개 변수 제거
x_train_reduced_full = x_train_full.drop(columns=bottom_30_features)

# 데이터셋 생성
train_data_reduced_full = lgb.Dataset(x_train_reduced_full, label=y_train_full)

# 모델 파라미터
params = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "metric": "multi_logloss",
    "num_class": 4,
    "num_leaves": 50,
    "learning_rate": 0.05,
    "n_estimators": 30,
    "random_state": 42,
    "verbose": 0,
}

# 모델 학습
lgb_model_reduced_full = lgb.train(
    params=params,
    train_set=train_data_reduced_full,
)

# 예측
# train_df에서 ID 컬럼만 제거하고 동일한 변수로 예측
x_train_full_reduced = x_train_full.drop(columns=bottom_30_features)
y_train_pred_reduced_full = lgb_model_reduced_full.predict(x_train_full_reduced, force_col_wise=True)
y_train_pred_class_reduced_full = np.argmax(y_train_pred_reduced_full, axis=1)

# 성능 평가 (학습 데이터에 대한 평가)
accuracy_reduced_full = accuracy_score(y_train_full, y_train_pred_class_reduced_full)
auroc_reduced_full = roc_auc_score(y_train_full, y_train_pred_reduced_full, multi_class="ovr")

print(f"Revised model on full train data - Accuracy: {accuracy_reduced_full}, AUROC: {auroc_reduced_full}")




You can set `force_col_wise=true` to remove the overhead.
Revised model on full train data - Accuracy: 0.8252283105022831, AUROC: 0.9750699513496484


In [61]:
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import accuracy_score, roc_auc_score
import numpy as np

# 전체 학습 데이터로 모델을 학습할 때와 동일한 feature set을 유지하도록 조정
# 하위 30개 변수를 제거한 후 모델 재학습
x_train_full = train_df.drop(["target", "ID"], axis=1)
y_train_full = train_df["target"].astype(int)

# 모델 학습
lgb_model_reduced_full = lgb.train(
    params=params,
    train_set=lgb.Dataset(x_train_full.drop(columns=bottom_30_features), label=y_train_full),
)

# 테스트 데이터에 대해 예측 수행
x_test = test_df.drop(["target", "ID"], axis=1)

# 테스트 데이터에서 하위 30개 변수 제거
x_test_reduced = x_test.drop(columns=bottom_30_features, errors='ignore')

# 예측
y_test_pred = lgb_model_reduced_full.predict(x_test_reduced, force_col_wise=True)
y_test_pred_class = np.argmax(y_test_pred, axis=1)

# 예측 결과 확인
print("Test predictions (first 10 samples):")
print(y_test_pred_class[:10])




You can set `force_col_wise=true` to remove the overhead.
Test predictions (first 10 samples):
[1 1 1 1 2 2 1 1 1 1]


In [63]:
len(y_test_pred_class)

2792

In [74]:
pd.DataFrame(y_test_pred_class).value_counts()

2    1701
1    1048
3      30
0      13
dtype: int64

In [67]:
len(df[df['_type']=='test'])

2792

In [75]:
submission = pd.read_csv('./data/sample_submission.csv')
submission['target'] = y_test_pred_class
submission

Unnamed: 0,ID,target
0,2024-01-01 00:00:00,1
1,2024-01-01 01:00:00,1
2,2024-01-01 02:00:00,1
3,2024-01-01 03:00:00,1
4,2024-01-01 04:00:00,2
...,...,...
2787,2024-04-26 03:00:00,1
2788,2024-04-26 04:00:00,2
2789,2024-04-26 05:00:00,1
2790,2024-04-26 06:00:00,1


In [76]:
submission.to_csv('submision1.csv')