### Library Import

In [239]:
import os
from typing import List, Dict
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import lightgbm as lgb
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt
import networkx as nx

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

### Data Load

In [240]:
# 파일 호출
root_path = '/data/ephemeral/home/level1-classificationinmachinelearning-recsys-01'
data_path: str = os.path.join(root_path, 'data')
train_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "train.csv")).assign(_type="train") # train 에는 _type = train 
test_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv")).assign(_type="test") # test 에는 _type = test
submission_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv")) # ID, target 열만 가진 데이터 미리 호출
df: pd.DataFrame = pd.concat([train_df, test_df], axis=0)

In [241]:
# HOURLY_ 로 시작하는 .csv 파일 이름을 file_names 에 할딩
file_names: List[str] = [
    f for f in os.listdir(data_path) if f.startswith("HOURLY_") and f.endswith(".csv")
]

# 파일명 : 데이터프레임으로 딕셔너리 형태로 저장
file_dict: Dict[str, pd.DataFrame] = {
    f.replace(".csv", ""): pd.read_csv(os.path.join(data_path, f)) for f in file_names
}
for _file_name, _df in tqdm(file_dict.items()):
    # 열 이름 중복 방지를 위해 {_file_name.lower()}_{col.lower()}로 변경, datetime 열을 ID로 변경
    _rename_rule = {
        col: f"{_file_name.lower()}_{col.lower()}" if col != "datetime" else "ID"
        for col in _df.columns
    }
    _df = _df.rename(_rename_rule, axis=1)
    df = df.merge(_df, on="ID", how="left")


100%|██████████| 107/107 [00:03<00:00, 34.17it/s]


# 데이터 전처리

## 결측 100% column 제거

In [242]:
# 각 열에서 누락된 값의 수를 계산
missing_values = df.isnull().sum()

# 누락된 값의 백분율 계산
missing_percentage = (missing_values / len(df)) * 100

null_columns = missing_percentage[missing_percentage != 100].keys()
df = df.loc[:,null_columns]
df

Unnamed: 0,ID,target,_type,hourly_network-data_block-count_block_count,hourly_market-data_funding-rates_deribit_funding_rates,hourly_market-data_liquidations_bitfinex_btc_usdt_long_liquidations,hourly_market-data_liquidations_bitfinex_btc_usdt_short_liquidations,hourly_market-data_liquidations_bitfinex_btc_usdt_long_liquidations_usd,hourly_market-data_liquidations_bitfinex_btc_usdt_short_liquidations_usd,hourly_market-data_open-interest_bybit_btc_usd_open_interest,...,hourly_market-data_liquidations_gate_io_btc_usdt_long_liquidations_usd,hourly_market-data_liquidations_gate_io_btc_usdt_short_liquidations_usd,hourly_market-data_taker-buy-sell-stats_htx_global_taker_buy_volume,hourly_market-data_taker-buy-sell-stats_htx_global_taker_sell_volume,hourly_market-data_taker-buy-sell-stats_htx_global_taker_buy_ratio,hourly_market-data_taker-buy-sell-stats_htx_global_taker_sell_ratio,hourly_market-data_taker-buy-sell-stats_htx_global_taker_buy_sell_ratio,hourly_network-data_addresses-count_addresses_count_active,hourly_network-data_addresses-count_addresses_count_sender,hourly_network-data_addresses-count_addresses_count_receiver
0,2023-01-01 00:00:00,2.0,train,12.0,0.000571,0.0,0.0,0.0,0.0,379138258.0,...,0.0,0.0,415200.0,102600.0,0.801854,0.198146,4.046784,67987,37307,37752
1,2023-01-01 01:00:00,1.0,train,4.0,0.000570,0.0,0.0,0.0,0.0,382072537.0,...,0.0,0.0,1027600.0,71000.0,0.935372,0.064628,14.473239,30593,12342,20534
2,2023-01-01 02:00:00,1.0,train,8.0,0.000566,0.0,0.0,0.0,0.0,381636197.0,...,0.0,0.0,406600.0,115200.0,0.779226,0.220774,3.529514,33897,17737,19369
3,2023-01-01 03:00:00,1.0,train,5.0,0.000557,0.0,0.0,0.0,0.0,382229253.0,...,0.0,0.0,922400.0,142400.0,0.866266,0.133734,6.477528,32717,11421,23799
4,2023-01-01 04:00:00,2.0,train,7.0,0.000536,0.0,0.0,0.0,0.0,385126773.0,...,0.0,0.0,73000.0,102600.0,0.415718,0.584282,0.711501,45176,17320,31712
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11547,2024-04-26 03:00:00,,test,3.0,,0.0,0.0,0.0,0.0,974276825.0,...,,,86000.0,203800.0,0.296756,0.703244,0.421982,29250,18154,13601
11548,2024-04-26 04:00:00,,test,,,0.0,0.0,0.0,0.0,970952780.0,...,,,382200.0,381000.0,0.500786,0.499214,1.003150,56580,31320,29096
11549,2024-04-26 05:00:00,,test,,,0.0,0.0,0.0,0.0,970067075.0,...,,,,,,,,51858,34083,22094
11550,2024-04-26 06:00:00,,test,,,0.0,0.0,0.0,0.0,972346702.0,...,,,,,,,,36270,26186,12668


## 결측치 처리 - interpolation

In [243]:
original_dict = dict(df.isnull().sum())
filtered_dict = {key: value for key, value in original_dict.items() if value != 0}
filtered_dict

{'target': 2792,
 'hourly_network-data_block-count_block_count': 4,
 'hourly_market-data_funding-rates_deribit_funding_rates': 33,
 'hourly_market-data_liquidations_bitfinex_btc_usdt_long_liquidations': 2,
 'hourly_market-data_liquidations_bitfinex_btc_usdt_short_liquidations': 2,
 'hourly_market-data_liquidations_bitfinex_btc_usdt_long_liquidations_usd': 2,
 'hourly_market-data_liquidations_bitfinex_btc_usdt_short_liquidations_usd': 2,
 'hourly_market-data_open-interest_bybit_btc_usd_open_interest': 1,
 'hourly_network-data_fees-transaction_fees_transaction_mean': 24,
 'hourly_network-data_fees-transaction_fees_transaction_mean_usd': 24,
 'hourly_market-data_liquidations_bitmex_btc_usd_long_liquidations': 5,
 'hourly_market-data_liquidations_bitmex_btc_usd_short_liquidations': 5,
 'hourly_market-data_liquidations_bitmex_btc_usd_long_liquidations_usd': 5,
 'hourly_market-data_liquidations_bitmex_btc_usd_short_liquidations_usd': 5,
 'hourly_network-data_block-interval_block_interval': 2

In [244]:
# # 타겟 변수를 제외한 변수를 forwardfill, -999로 결측치 대체
_target = df["target"]
df = df.interpolate(method='linear').assign(target = _target)

  df = df.interpolate(method='linear').assign(target = _target)


## columns 이름 단순화

In [245]:
original_dic = {'ID' : 'ID', 'target' : 'target', '_type' : '_type'}
renamed_dic = {}
for column in df.columns[3:]:
    renamed_col = '_'.join(column.split('_')[3:])
    renamed_dic[column] = renamed_col
original_dic.update(renamed_dic)

In [246]:
df = df[original_dic.keys()].rename(original_dic, axis=1)
df

Unnamed: 0,ID,target,_type,block_count,deribit_funding_rates,bitfinex_btc_usdt_long_liquidations,bitfinex_btc_usdt_short_liquidations,bitfinex_btc_usdt_long_liquidations_usd,bitfinex_btc_usdt_short_liquidations_usd,bybit_btc_usd_open_interest,...,gate_io_btc_usdt_long_liquidations_usd,gate_io_btc_usdt_short_liquidations_usd,htx_global_taker_buy_volume,htx_global_taker_sell_volume,htx_global_taker_buy_ratio,htx_global_taker_sell_ratio,htx_global_taker_buy_sell_ratio,addresses_count_active,addresses_count_sender,addresses_count_receiver
0,2023-01-01 00:00:00,2.0,train,12.0,0.000571,0.0,0.0,0.0,0.0,379138258.0,...,0.0,0.00000,415200.0,102600.0,0.801854,0.198146,4.046784,67987,37307,37752
1,2023-01-01 01:00:00,1.0,train,4.0,0.000570,0.0,0.0,0.0,0.0,382072537.0,...,0.0,0.00000,1027600.0,71000.0,0.935372,0.064628,14.473239,30593,12342,20534
2,2023-01-01 02:00:00,1.0,train,8.0,0.000566,0.0,0.0,0.0,0.0,381636197.0,...,0.0,0.00000,406600.0,115200.0,0.779226,0.220774,3.529514,33897,17737,19369
3,2023-01-01 03:00:00,1.0,train,5.0,0.000557,0.0,0.0,0.0,0.0,382229253.0,...,0.0,0.00000,922400.0,142400.0,0.866266,0.133734,6.477528,32717,11421,23799
4,2023-01-01 04:00:00,2.0,train,7.0,0.000536,0.0,0.0,0.0,0.0,385126773.0,...,0.0,0.00000,73000.0,102600.0,0.415718,0.584282,0.711501,45176,17320,31712
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11547,2024-04-26 03:00:00,,test,3.0,-0.001519,0.0,0.0,0.0,0.0,974276825.0,...,0.0,170655.56551,86000.0,203800.0,0.296756,0.703244,0.421982,29250,18154,13601
11548,2024-04-26 04:00:00,,test,3.0,-0.001519,0.0,0.0,0.0,0.0,970952780.0,...,0.0,170655.56551,382200.0,381000.0,0.500786,0.499214,1.003150,56580,31320,29096
11549,2024-04-26 05:00:00,,test,3.0,-0.001519,0.0,0.0,0.0,0.0,970067075.0,...,0.0,170655.56551,382200.0,381000.0,0.500786,0.499214,1.003150,51858,34083,22094
11550,2024-04-26 06:00:00,,test,3.0,-0.001519,0.0,0.0,0.0,0.0,972346702.0,...,0.0,170655.56551,382200.0,381000.0,0.500786,0.499214,1.003150,36270,26186,12668


# 상관관계 높은 것들끼리 묶어서 차원 축소

In [247]:
def find_high_corr_groups(correlation_matrix, threshold=0.8):
    G = nx.Graph()
    for col1 in correlation_matrix.columns:
        for col2 in correlation_matrix.columns:
            if col1 != col2 and abs(correlation_matrix.loc[col1, col2]) > threshold:
                G.add_edge(col1, col2)
    groups = list(nx.connected_components(G))
    return [list(group) for group in groups]

In [248]:
correlation_matrix = df.iloc[:,3:].corr()
groups = find_high_corr_groups(correlation_matrix, threshold=0.8)
groups

[['addresses_count_active',
  'supply_new',
  'hashrate',
  'blockreward',
  'addresses_count_sender',
  'blockreward_usd',
  'block_count'],
 ['deribit_funding_rates', 'all_exchange_funding_rates'],
 ['bitfinex_btc_usdt_long_liquidations_usd',
  'bitfinex_btc_usdt_long_liquidations',
  'bitfinex_all_symbol_long_liquidations',
  'bitfinex_all_symbol_long_liquidations_usd'],
 ['bitfinex_btc_usdt_short_liquidations',
  'bitfinex_all_symbol_short_liquidations_usd',
  'bitfinex_all_symbol_short_liquidations',
  'bitfinex_btc_usdt_short_liquidations_usd'],
 ['bybit_btc_usdt_open_interest',
  'bitfinex_all_symbol_open_interest',
  'binance_all_symbol_open_interest',
  'binance_btc_busd_open_interest',
  'utxo_count',
  'bybit_all_symbol_open_interest',
  'velocity_supply_total',
  'binance_btc_usdt_open_interest',
  'bitmex_btc_usd_open_interest',
  'okx_btc_usdt_open_interest',
  'gate_io_btc_usdt_open_interest',
  'bitfinex_btc_usdt_open_interest',
  'all_exchange_all_symbol_open_interest'

In [249]:
def apply_pca_to_groups(df, groups, n_components=1):
    grouped_list = [word for group in groups for word in group]
    non_grouped_list = list(set(df.columns) - set(grouped_list))
    
    X = df[non_grouped_list].drop(['ID', '_type', 'target'], axis=1)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    non_grouped_df = pd.DataFrame(X_scaled, columns = X.columns)

    pca_results = []
    for idx, group in enumerate(groups):
        if len(group) > 1:  # Apply PCA only if the group has more than one variable
            X = df[group]
            scaler = StandardScaler()
            X_scaled = scaler.fit_transform(X)
            pca = PCA(n_components=n_components)
            pca_result = pca.fit_transform(X_scaled)
            # Add PCA result to list
            pca_results.append(pd.DataFrame(pca_result, columns = [f"pca_{idx}"]))
    
    grouped_df = pd.concat(pca_results, axis=1)
    fixed_df = df[['ID', '_type', 'target']]

    concat_df = pd.concat([fixed_df, non_grouped_df, grouped_df], axis=1)
    return concat_df

In [250]:
df = apply_pca_to_groups(df, groups, n_components=1)

In [251]:
df

Unnamed: 0,ID,_type,target,block_bytes,tokens_transferred_median,okx_btc_usd_open_interest,block_interval,gate_io_btc_usd_open_interest,binance_funding_rates,bitmex_taker_buy_sell_ratio,...,pca_35,pca_36,pca_37,pca_38,pca_39,pca_40,pca_41,pca_42,pca_43,pca_44
0,2023-01-01 00:00:00,train,2.0,-3.120704,0.720442,1.725523,-0.665114,-0.105341,-0.013715,-0.082980,...,-0.103446,-0.051419,-0.840622,-3.477198,0.522937,-0.418894,-0.552640,-0.175332,-0.212792,0.993011
1,2023-01-01 01:00:00,train,1.0,-2.752206,0.790489,1.727773,0.207934,-0.105190,-0.013715,0.823876,...,-0.103446,-0.051419,-0.426978,-5.032325,2.733126,-0.418894,-0.552640,-0.175332,-0.212792,3.255304
2,2023-01-01 02:00:00,train,1.0,-3.895279,0.891629,1.730125,-0.818031,-0.104435,-0.013715,0.164731,...,-0.103446,-0.051419,-0.838217,-3.213640,0.413287,-0.418894,-0.552640,-0.175332,-0.212792,-1.720674
3,2023-01-01 03:00:00,train,1.0,-3.612864,0.993442,1.729217,-0.074014,-0.102025,-0.013715,0.008826,...,-0.103446,-0.051419,-0.453460,-4.227423,1.038204,-0.418894,-0.552640,-0.175332,-0.212792,0.984722
4,2023-01-01 04:00:00,train,2.0,-3.364117,1.228390,1.728728,0.012512,-0.102059,-0.013715,-0.212271,...,-0.103446,-0.051419,-1.083691,1.020247,-0.184072,-0.418894,-0.552640,-0.175332,-0.212792,-0.532362
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11547,2024-04-26 03:00:00,test,,-0.399125,-0.196768,1.904656,0.484475,-0.526173,-0.450572,-0.208589,...,-0.103446,-0.051419,-0.823249,3.053547,-0.266057,-0.418894,-0.552640,-0.175332,-0.212792,-0.444267
11548,2024-04-26 04:00:00,test,,-0.399125,-0.317740,1.904656,-0.001417,-0.526173,-0.339789,-0.208589,...,-0.103446,-0.051419,-0.658450,1.863263,-0.204458,-0.412026,-0.552640,-0.175332,-0.212792,-1.516583
11549,2024-04-26 05:00:00,test,,-0.399125,-0.420045,1.904656,-0.098104,-0.526173,-0.339789,-0.208589,...,-0.103446,-0.051419,-0.658450,1.863263,-0.204458,-0.405183,-0.472657,-0.175332,-0.212792,-0.213868
11550,2024-04-26 06:00:00,test,,-0.399125,-0.369692,1.904656,-0.484031,-0.526173,-0.339789,-0.208589,...,-0.103446,-0.051419,-0.658450,1.863263,-0.204458,-0.418894,-0.552640,-0.175332,-0.212792,0.216733


In [252]:

# _type에 따라 train, test 분리
train_df = df.loc[df["_type"]=="train"].drop(columns=["_type"])
test_df = df.loc[df["_type"]=="test"].drop(columns=["_type"])

### Model Training

In [253]:
# train_test_split 으로 valid set, train set 분리
x_train, x_valid, y_train, y_valid = train_test_split(
    train_df.drop(["target", "ID"], axis = 1), 
    train_df["target"].astype(int), 
    test_size=0.2,
    random_state=42,
)

# lgb dataset
train_data = lgb.Dataset(x_train, label=y_train)
valid_data = lgb.Dataset(x_valid, label=y_valid, reference=train_data)

# lgb params
params = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "metric": "multi_logloss",
    "num_class": 4,
    "num_leaves": 50,
    "learning_rate": 0.05,
    "n_estimators": 30,
    "random_state": 42,
    "verbose": 0,
}

# lgb train
lgb_model = lgb.train(
    params=params,
    train_set=train_data,
    valid_sets=valid_data,
)

# lgb predict
y_valid_pred = lgb_model.predict(x_valid)
y_valid_pred_class = np.argmax(y_valid_pred, axis = 1)

# score check
accuracy = accuracy_score(y_valid, y_valid_pred_class)
auroc = roc_auc_score(y_valid, y_valid_pred, multi_class="ovr")

print(f"acc: {accuracy}, auroc: {auroc}")



acc: 0.442351598173516, auroc: 0.6507407778404106


In [16]:
# performance 체크후 전체 학습 데이터로 다시 재학습
x_train = train_df.drop(["target", "ID"], axis = 1)
y_train = train_df["target"].astype(int)
train_data = lgb.Dataset(x_train, label=y_train)
lgb_model = lgb.train(
    params=params,
    train_set=train_data,
)


Found `n_estimators` in params. Will use it instead of argument



### Inference

In [17]:
# lgb predict
y_test_pred = lgb_model.predict(test_df.drop(["target", "ID"], axis = 1))
y_test_pred_class = np.argmax(y_test_pred, axis = 1)

### Output File Save

In [18]:
# output file 할당후 save 
submission_df = submission_df.assign(target = y_test_pred_class)
submission_df.to_csv("output.csv", index=False)