# Train - lgb

In [1]:
import os
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

import torch
import lightgbm as lgb

from glob import glob
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings(action='ignore')




def random_seed(seed=42):
    """Seed every random number generator used by this notebook.

    Args:
        seed: Integer seed applied to Python's ``random`` module, NumPy's
            global generator, and PyTorch (CPU plus every CUDA device).

    Returns:
        True, after all generators have been seeded.
    """
    # Local import: the notebook's import cell does not bring in `random`.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds all GPUs rather than only the current device; it is a silent
    # no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # Ask cuDNN for deterministic algorithms and disable its autotuner,
    # which could otherwise pick different kernels from run to run.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    return True
random_seed(seed=42)

True

In [2]:
'''
Load the combined CSV and split it back into its two partitions.

all_df   : every row of data.csv, including the `_type` marker column
train_df : rows whose `_type` is "train", with `_type` removed
test_df  : rows whose `_type` is "test", with `_type` removed
'''

# Directory that the submission cell later reads test.csv from.
dir_path = "/data/ephemeral/home/data"

data_path = "./data.csv"
all_df = pd.read_csv(data_path)

# Select each partition with a boolean mask, then discard the marker column.
train_df = all_df[all_df["_type"].eq("train")].drop("_type", axis=1)
test_df = all_df[all_df["_type"].eq("test")].drop("_type", axis=1)

# Sanity check: the two partitions should account for the rows of all_df.
print("data shape: ", all_df.shape)
print("train shape: ", train_df.shape)
print("test shape: ", test_df.shape)

data shape:  (11552, 509)
train shape:  (8760, 508)
test shape:  (2792, 508)


In [3]:
# Preview the first five rows of the training partition.
train_df.head(n=5)

Unnamed: 0,ID,target,coinbase_premium_gap,coinbase_premium_index,funding_rates,long_liquidations,long_liquidations_usd,short_liquidations,short_liquidations_usd,open_interest,...,active_count_shift_10,receiver_count_shift_10,sender_count_shift_10,liquidation_diff_shift_10,liquidation_usd_diff_shift_10,volume_diff_shift_10,liquidation_diffg_shift_10,liquidation_usd_diffg_shift_10,volume_diffg_shift_10,buy_sell_volume_ratio_shift_10
0,2023-01-01 00:00:00,2.0,-9.86,-0.05965,0.005049,0.012,197.5161,0.0,0.0,6271344000.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
1,2023-01-01 01:00:00,1.0,-8.78,-0.053047,0.005049,0.0,0.0,0.712,11833.56104,6288683000.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
2,2023-01-01 02:00:00,1.0,-9.59,-0.057952,0.005049,0.0,0.0,0.0,0.0,6286796000.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
3,2023-01-01 03:00:00,1.0,-9.74,-0.058912,0.005067,0.593,9754.76891,0.0,0.0,6284575000.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
4,2023-01-01 04:00:00,2.0,-10.14,-0.061373,0.00621,0.361,5944.43714,0.0,0.0,6291582000.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0


In [4]:
# Preview the first five rows of the test partition.
test_df.head(n=5)

Unnamed: 0,ID,target,coinbase_premium_gap,coinbase_premium_index,funding_rates,long_liquidations,long_liquidations_usd,short_liquidations,short_liquidations_usd,open_interest,...,active_count_shift_10,receiver_count_shift_10,sender_count_shift_10,liquidation_diff_shift_10,liquidation_usd_diff_shift_10,volume_diff_shift_10,liquidation_diffg_shift_10,liquidation_usd_diffg_shift_10,volume_diffg_shift_10,buy_sell_volume_ratio_shift_10
8760,2024-01-01 00:00:00,,-22.57,-0.053137,0.042276,0.661,27996.83627,17.251477,735019.5,10345540000.0,...,80380.0,25484.0,58539.0,-3.275801,-139375.3,63116770.0,-1.0,-1.0,1.0,1.475138
8761,2024-01-01 01:00:00,,-18.88,-0.044305,0.042578,0.483,20552.07715,36.152847,1546929.0,10401650000.0,...,66899.0,17439.0,51709.0,1.924824,80879.24,-47173190.0,1.0,1.0,-1.0,0.701483
8762,2024-01-01 02:00:00,,-9.78,-0.022968,0.043265,6.724,285861.94051,1.623,69411.07,10395940000.0,...,65812.0,15247.0,52548.0,35.532924,1493391.0,-62402640.0,1.0,1.0,-1.0,0.740465
8763,2024-01-01 03:00:00,,-5.38,-0.01271,0.043927,20.10708,849377.45559,0.071,3003.284,10337490000.0,...,41020.0,15500.0,26760.0,-0.651,-27879.25,37785800.0,-1.0,-1.0,1.0,1.495168
8764,2024-01-01 04:00:00,,-10.22,-0.024104,0.045189,12.654933,533700.00186,1.651,70143.76,10392050000.0,...,57263.0,20099.0,38653.0,-5.0008,-212232.0,23436550.0,-1.0,-1.0,1.0,1.234126


In [5]:
# Separate the label column from the feature matrix.
y = train_df['target']    # target (class labels)
x = train_df.drop(['ID', 'target'], axis=1)    # features

# NOTE(review): the rows look hourly and many columns are lagged
# (`*_shift_N`), so a shuffled split may put neighbouring hours on both sides
# and inflate the validation score. Consider a time-ordered hold-out; the
# split is left unchanged here.
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)
print(x_train.shape, y_train.shape)
print(x_val.shape, y_val.shape)

# lgb dataset; the validation set reuses the training set's binning via `reference`.
train_data = lgb.Dataset(x_train, label=y_train)
val_data = lgb.Dataset(x_val, label=y_val, reference=train_data)

params = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "metric": "multi_logloss",
    "num_class": 4,
    "num_leaves": 50,
    "learning_rate": 0.05,
    "random_state": 42,
    "verbose": 1,
}

# Train
# The number of boosting rounds goes through `num_boost_round`, the argument
# `lgb.train` documents for it, instead of the sklearn-style `n_estimators`
# alias inside `params`. `valid_sets` is passed as a list, as documented.
model = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=30,
    valid_sets=[val_data],
)

# Val
# predict() yields one score column per class; argmax picks the class index.
y_valid_pred = model.predict(x_val)
y_valid_pred_class = np.argmax(y_valid_pred, axis = 1)

accuracy = accuracy_score(y_val, y_valid_pred_class)
print(f"acc: {accuracy}")

(7008, 506) (7008,)
(1752, 506) (1752,)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012731 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 109039
[LightGBM] [Info] Number of data points in the train set: 7008, number of used features: 483
[LightGBM] [Info] Start training from score -2.471301
[LightGBM] [Info] Start training from score -0.905010
[LightGBM] [Info] Start training from score -0.869664
[LightGBM] [Info] Start training from score -2.387109
acc: 0.4474885844748858


In [6]:
# Build the test feature matrix with the same column drop used for training.
test_data = test_df.drop(["target", "ID"], axis=1)

# One score column per class; argmax converts them to a class index per row.
y_test_pred = model.predict(test_data)
y_test_pred_class = np.argmax(y_test_pred, axis=1)

submission_df = pd.read_csv(f"{dir_path}/test.csv")

# Predictions are attached by position, so the submission template must list
# the same IDs in the same order as test_df. Fail loudly instead of writing a
# silently misaligned file.
assert len(submission_df) == len(test_df), (
    f"test.csv has {len(submission_df)} rows but {len(test_df)} predictions were made"
)
assert (submission_df["ID"].to_numpy() == test_df["ID"].to_numpy()).all(), (
    "ID order in test.csv does not match test_df; predictions would be misaligned"
)

submission_df = submission_df.assign(target = y_test_pred_class)
submission_df.to_csv("output.csv", index=False)

# Class balance of the predictions, followed by the frame that was written.
print(submission_df.target.value_counts())
submission_df

target
2    1728
1     995
0      44
3      25
Name: count, dtype: int64


Unnamed: 0,ID,target
0,2024-01-01 00:00:00,2
1,2024-01-01 01:00:00,1
2,2024-01-01 02:00:00,2
3,2024-01-01 03:00:00,2
4,2024-01-01 04:00:00,2
...,...,...
2787,2024-04-26 03:00:00,1
2788,2024-04-26 04:00:00,1
2789,2024-04-26 05:00:00,1
2790,2024-04-26 06:00:00,1
