In [521]:
import os
from typing import List, Dict
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import lightgbm as lgb
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [522]:
# 파일 호출
data_path: str = "./data"
train_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "train.csv")).assign(_type="train") # train 에는 _type = train 
test_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv")).assign(_type="test") # test 에는 _type = test
submission_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv")) # ID, target 열만 가진 데이터 미리 호출
df: pd.DataFrame = pd.concat([train_df, test_df], axis=0)

In [523]:
# HOURLY_ 로 시작하는 .csv 파일 이름을 file_names 에 할딩
file_names: List[str] = [
    f for f in os.listdir(data_path) if f.startswith("HOURLY_") and f.endswith(".csv")
]

# 파일명 : 데이터프레임으로 딕셔너리 형태로 저장
file_dict: Dict[str, pd.DataFrame] = {
    f.replace(".csv", ""): pd.read_csv(os.path.join(data_path, f)) for f in file_names
}

for _file_name, _df in tqdm(file_dict.items()):
    # 열 이름 중복 방지를 위해 {_file_name.lower()}_{col.lower()}로 변경, datetime 열을 ID로 변경
    _rename_rule = {
        col: f"{_file_name.lower()}_{col.lower()}" if col != "datetime" else "ID"
        for col in _df.columns
    }
    _df = _df.rename(_rename_rule, axis=1)
    df = df.merge(_df, on="ID", how="left")


100%|██████████| 107/107 [00:00<00:00, 122.70it/s]


In [524]:
import random
import torch
def set_seed(seed):
    np.random.seed(seed)
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    torch.manual_seed(seed)
seed=42
set_seed(seed)

In [525]:
# 모델에 사용할 컬럼, 컬럼의 rename rule을 미리 할당함
cols_dict: Dict[str, str] = {
    "ID": "ID",
    "target": "target",
    "_type": "_type",
    "hourly_market-data_coinbase-premium-index_coinbase_premium_gap": "coinbase_premium_gap",
    "hourly_market-data_coinbase-premium-index_coinbase_premium_index": "coinbase_premium_index",
    "hourly_market-data_funding-rates_all_exchange_funding_rates": "funding_rates",
    "hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations": "long_liquidations",
    "hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations_usd": "long_liquidations_usd",
    "hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations": "short_liquidations",
    "hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations_usd": "short_liquidations_usd",
    "hourly_market-data_open-interest_all_exchange_all_symbol_open_interest": "open_interest",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_ratio": "buy_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_sell_ratio": "buy_sell_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_volume": "buy_volume",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_ratio": "sell_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_volume": "sell_volume",
    "hourly_network-data_addresses-count_addresses_count_active": "active_count",
    "hourly_network-data_addresses-count_addresses_count_receiver": "receiver_count",
    "hourly_network-data_addresses-count_addresses_count_sender": "sender_count",
}
df = df[cols_dict.keys()].rename(cols_dict, axis=1)
df.shape

(11552, 19)

In [526]:
# eda 에서 파악한 차이와 차이의 음수, 양수 여부를 새로운 피쳐로 생성
df = df.assign(
    liquidation_diff=df["long_liquidations"] - df["short_liquidations"],
    liquidation_usd_diff=df["long_liquidations_usd"] - df["short_liquidations_usd"],
    volume_diff=df["buy_volume"] - df["sell_volume"],
    liquidation_diffg=np.sign(df["long_liquidations"] - df["short_liquidations"]),
    liquidation_usd_diffg=np.sign(df["long_liquidations_usd"] - df["short_liquidations_usd"]),
    volume_diffg=np.sign(df["buy_volume"] - df["sell_volume"]),
    buy_sell_volume_ratio=df["buy_volume"] / (df["sell_volume"] + 1),
)
# category, continuous 열을 따로 할당해둠
category_cols: List[str] = ["liquidation_diffg", "liquidation_usd_diffg", "volume_diffg"]
conti_cols: List[str] = [_ for _ in cols_dict.values() if _ not in ["ID", "target", "_type"]] + [
    "buy_sell_volume_ratio",
    "liquidation_diff",
    "liquidation_usd_diff",
    "volume_diff",
]

In [527]:
df["sub_target"] = df["target"].apply(lambda x: 1 if x >= 2 else 0)

In [528]:
def shift_feature(
    df: pd.DataFrame,
    conti_cols: List[str],
    intervals: List[int],
) -> List[pd.Series]:
    """
    연속형 변수의 shift feature 생성
    Args:
        df (pd.DataFrame)
        conti_cols (List[str]): continuous colnames
        intervals (List[int]): shifted intervals
    Return:
        List[pd.Series]
    """
    df_shift_dict = [
        df[conti_col].shift(interval).rename(f"{conti_col}_{interval}")
        for conti_col in conti_cols
        for interval in intervals
    ]
    # future shift df
    # df_future_shift_dict = [
    #     df[conti_col].shift(-interval).rename(f"{conti_col}_{-interval}")
    #     for conti_col in conti_cols
    #     for interval in intervals
    # ]
    # df_shift_dict.extend(df_future_shift_dict)
    
    return df_shift_dict

# test에서 shift 된 것은 사용할 수 없기 때문에 test에서 shift하기 전에 train, test를 분리
train_df: pd.DataFrame = df.loc[df["_type"] == "train"]
test_df: pd.DataFrame = df.loc[df["_type"] == "test"]
train_df = train_df.drop("_type", axis=1)
test_df = test_df.drop("_type", axis=1)

train_shift_list = shift_feature(
    df=train_df, conti_cols=conti_cols, intervals=[_ for _ in range(1, 24)]
)
test_shift_list = shift_feature(
    df=test_df, conti_cols=conti_cols, intervals=[_ for _ in range(1, 24)]
)


In [529]:
# concat 하여 df 에 할당
train_df = pd.concat([train_df, pd.concat(train_shift_list, axis=1)], axis=1)
test_df = pd.concat([test_df, pd.concat(test_shift_list, axis=1)], axis=1)

_target = train_df["target"]
train_df = train_df.ffill().fillna(-999).assign(target = _target)

In [530]:
# train_test_split 으로 valid set, train set 분리
x_train, x_valid, y_train, y_valid = train_test_split(
    train_df.drop(["target", "ID", "sub_target"], axis = 1), 
    train_df["target"].astype(int), 
    test_size=0.2,
    random_state=42,
    stratify=train_df["target"],
)

# lgb dataset
train_data = lgb.Dataset(x_train, label=y_train)
valid_data = lgb.Dataset(x_valid, label=y_valid, reference=train_data)

# lgb params
# params = {
#     "boosting_type": "gbdt",
#     "objective": "multiclass",
#     "metric": "multi_logloss",
#     "num_class": 4,
#     "num_leaves": 12,
#     "learning_rate": 0.05,
#     "n_estimators": 7,
#     "random_state": 42,
#     "verbose": 0,
# }
params = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "metric": "multi_logloss",
    "num_class": 4,
    "num_leaves": 50,
    "learning_rate": 0.05,
    "n_estimators": 30,
    "random_state": 42,
    "verbose": 0,
}

# lgb train
lgb_model = lgb.train(
    params=params,
    train_set=train_data,
    valid_sets=valid_data,
)

# lgb predict
y_valid_pred = lgb_model.predict(x_valid)
print(y_valid_pred)
y_valid_pred_class = np.argmax(y_valid_pred, axis = 1)

# score check
accuracy = accuracy_score(y_valid, y_valid_pred_class)
auroc = roc_auc_score(y_valid, y_valid_pred, multi_class="ovr")

print(f"acc: {accuracy}, auroc: {auroc}")



[[0.04772351 0.36522757 0.52773973 0.05930919]
 [0.07512526 0.46812406 0.40794568 0.048805  ]
 [0.06298173 0.42031876 0.44661894 0.07008058]
 ...
 [0.09574566 0.36012216 0.44876471 0.09536747]
 [0.06425714 0.42336402 0.37548182 0.13689702]
 [0.04171443 0.51829904 0.35301667 0.08696986]]
acc: 0.4280821917808219, auroc: 0.5987788886723954


In [531]:
# test predict
test_df = test_df.drop(["target", "ID", "sub_target"], axis = 1)
test_pred = lgb_model.predict(test_df)
test_pred_class = np.argmax(test_pred, axis = 1)
submission_df["target"] = test_pred_class
submission_df["target"].value_counts()

target
2    1810
1     948
3      29
0       5
Name: count, dtype: int64

In [534]:
# train_test_split 으로 valid set, train set 분리
x_train_split, x_valid_split, y_train_split, y_valid_split = train_test_split(
    train_df.drop(["target", "ID", "sub_target"], axis = 1), 
    train_df["sub_target"].astype(int), 
    test_size=0.2,
    random_state=42,
    stratify=train_df["sub_target"],
)

# lgb dataset
train_data_split = lgb.Dataset(x_train_split, label=y_train_split)
valid_data_split = lgb.Dataset(x_valid_split, label=y_valid_split, reference=train_data_split)

# lgb params
# params_split = {
#     "boosting_type": "gbdt",
#     # "objective": "multiclass",
#     'objective': 'binary',
#     "metric": "binary_logloss",
#     # "metric": "multi_logloss",
#     # "num_class": 4,
#     "num_leaves": 12,
#     "learning_rate": 0.05,
#     "n_estimators": 7,
#     "random_state": 42,
#     "verbose": 0,
# }
params_split = {
    "boosting_type": "gbdt",
    'objective': 'binary',
   "metric": "binary_logloss",
    "num_leaves": 50,
    "learning_rate": 0.05,
    "n_estimators": 30,
    "random_state": 42,
    "verbose": 0,
}

# lgb train
lgb_model_split = lgb.train(
    params=params_split,
    train_set=train_data_split,
    valid_sets=valid_data_split,
)

# lgb predict
y_valid_pred_split = lgb_model_split.predict(x_valid_split)
# y_valid_pred_class = np.argmax(y_valid_pred, axis = 1)
# print(y_valid_pred[:20])
y_valid_pred_4 = np.array([[1-p, 1-p, p, p] for p in y_valid_pred_split])
print(y_valid_pred_4)
y_valid_pred_class_split = np.where(y_valid_pred_split > 0.5, 1, 0)
# score check
accuracy_split = accuracy_score(y_valid_split, y_valid_pred_class_split)
# auroc = roc_auc_score(y_valid, y_valid_pred, multi_class="ovr")

# print(f"acc: {accuracy}, auroc: {auroc}")
print(accuracy_split)



[[0.53998006 0.53998006 0.46001994 0.46001994]
 [0.45564541 0.45564541 0.54435459 0.54435459]
 [0.50229933 0.50229933 0.49770067 0.49770067]
 ...
 [0.45773468 0.45773468 0.54226532 0.54226532]
 [0.54714097 0.54714097 0.45285903 0.45285903]
 [0.47944265 0.47944265 0.52055735 0.52055735]]
0.5188356164383562


In [535]:
# lgb, lgb_split ensemble coef search
best_acc = 0
best_coef = 0
for coef in np.linspace(0, 1, 10000):
    y_valid_pred_ensemble = y_valid_pred * coef + y_valid_pred_4 * (1 - coef)
    y_valid_pred_class = np.argmax(y_valid_pred_ensemble, axis = 1)
    accuracy = accuracy_score(y_valid, y_valid_pred_class)
    if accuracy > best_acc:
        best_acc = accuracy
        best_coef = coef
print(best_acc, best_coef)


0.4394977168949772 0.43994399439944


In [536]:
# test predict and submission
# test_df = test_df.drop(["target", "ID", "sub_target"], axis = 1)
test_pred = lgb_model.predict(test_df)
test_pred_split = lgb_model_split.predict(test_df)
test_pred_4 = np.array([[1-p, 1-p, p, p] for p in test_pred_split])
test_pred_ensemble = test_pred * best_coef + test_pred_4 * (1 - best_coef)
test_pred_class = np.argmax(test_pred_ensemble, axis = 1)
submission_df["target"] = test_pred_class
submission_df.to_csv("./split_model_ensemble.csv", index=False)


In [537]:
submission_df["target"].value_counts()

target
2    1670
1    1100
3      17
0       5
Name: count, dtype: int64