In [1]:
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
from sklearn.metrics import roc_auc_score  # or accuracy_score for classification
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib as plt
from scipy.stats import pearsonr

In [2]:
# Load the train and test frames from their parquet files.
train_df = pd.read_parquet("../data/train_clean_v3.parquet")

# reset_index() moves the test frame's index into an ordinary column and
# leaves a default integer index in its place.
test_df = pd.read_parquet("../data/test_clean_v3.parquet").reset_index()

In [3]:
# Treat +/-inf as missing values so the NaN check below also catches them.
train_df.replace([np.inf, -np.inf], np.nan, inplace=True)
test_df.replace([np.inf, -np.inf], np.nan, inplace=True)

# The columns to drop are decided from the training frame only: any column
# that contains at least one NaN in train is removed from both frames.
nan_cols = train_df.columns[train_df.isna().any()].tolist()

train_df.drop(columns=nan_cols, inplace=True)
# errors="ignore": a column flagged in train is not guaranteed to exist in the
# test frame; skip it there instead of aborting the cell with a KeyError.
test_df.drop(columns=nan_cols, inplace=True, errors="ignore")

In [4]:
# Count distinct (non-NaN) values per training column in a single pass and
# keep the names of the columns that hold exactly one distinct value.
distinct_counts = train_df.nunique()
constant_cols = distinct_counts.index[distinct_counts == 1].tolist()

# Remove those columns from both frames.
for frame in (train_df, test_df):
    frame.drop(columns=constant_cols, inplace=True)

In [5]:
# Per-column skewness over the numeric columns of the training frame.
skew_vals = train_df.skew(numeric_only=True)

# Flag the columns whose absolute skewness is greater than 5 and report them.
is_heavily_skewed = skew_vals.abs().gt(5)
skewed_cols = skew_vals.index[is_heavily_skewed].tolist()
print(skewed_cols)

# Dropping the flagged columns is currently disabled; the list is only printed.
# train_df.drop(columns=skewed_cols, inplace=True)
# test_df.drop(columns=skewed_cols, inplace=True)

['bid_qty', 'ask_qty', 'buy_qty', 'sell_qty', 'volume', 'X229', 'X230', 'X231', 'X232', 'X233', 'X234', 'X236', 'X241', 'X478', 'X479', 'X495', 'X501', 'X502', 'X507', 'X508', 'X514', 'X515', 'X521', 'X522', 'X544', 'X549', 'X551', 'X594', 'X595', 'X596', 'X597', 'X615', 'X618', 'X621', 'X624', 'X627', 'X630', 'X633', 'X636', 'X639', 'X642', 'X645', 'X648', 'X654', 'X660', 'X666', 'X672', 'X678', 'X684', 'X690', 'X696', 'X850', 'X873', 'X874', 'X877', 'X878']


In [6]:
# Imputation statistics are taken from the training frame so that both frames
# are filled with the same values and nothing is estimated from the test set.
# numeric_only=True keeps median() from raising on non-numeric columns.
train_medians = train_df.median(numeric_only=True)

# Columns that exist only in the test frame have no training median; for
# those (and only those) fall back to the test frame's own median.
test_fill_values = train_medians.combine_first(test_df.median(numeric_only=True))

train_df.fillna(train_medians, inplace=True)
test_df.fillna(test_fill_values, inplace=True)

# NOTE(review): these are the same paths the frames were loaded from, so the
# input files are overwritten and a re-run starts from already-processed data.
# Consider writing to a new file name -- confirm what downstream code reads.
train_df.to_parquet("../data/train_clean_v3.parquet")
test_df.to_parquet("../data/test_clean_v3.parquet")

In [7]:
# Display summary statistics (count, mean, std, min, quartiles, max) for the
# columns of the processed training frame as a final sanity check.
train_df.describe()

Unnamed: 0,bid_qty,ask_qty,buy_qty,sell_qty,volume,X1,X2,X3,X4,X5,...,X882,X883,X884,X885,X886,X887,X888,X889,X890,label
count,525887.0,525887.0,525887.0,525887.0,525887.0,525887.0,525887.0,525887.0,525887.0,525887.0,...,525887.0,525887.0,525887.0,525887.0,525887.0,525887.0,525887.0,525887.0,525887.0,525887.0
mean,9.968003,10.17417,131.726639,132.674149,264.400177,-0.006026,-0.000243,-0.000353,-0.000424,-0.000263,...,1.95933,1.954978,0.498085,0.498037,0.497974,0.497855,0.497525,0.497372,0.995394,0.036126
std,15.645656,15.889657,307.265228,309.806061,588.615845,0.538341,0.613745,0.771744,0.857115,0.464695,...,0.844731,0.598278,0.712772,0.698641,0.684912,0.66691,0.61544,0.546439,0.850509,1.009914
min,0.001,0.001,0.0,0.0,0.0,-2.787109,-5.863281,-6.125,-6.09375,-2.855469,...,0.43335,0.686523,4e-06,6e-06,9e-06,2.4e-05,0.003902,0.015656,0.092896,-24.421875
25%,2.634766,2.677734,26.40625,27.015625,60.6875,-0.370605,-0.314209,-0.44165,-0.521484,-0.29541,...,1.367188,1.521484,0.000162,0.000193,0.000241,0.000997,0.033142,0.066895,0.244385,-0.381592
50%,6.414062,6.539062,57.0,58.0625,120.8125,-0.015991,-0.001713,-0.00301,-0.003948,-0.002544,...,1.832031,1.882812,0.000634,0.001036,0.004593,0.039185,0.127808,0.161743,0.634277,0.016266
75%,13.085938,13.328125,127.625,129.125,256.75,0.349121,0.310303,0.431885,0.507812,0.287842,...,2.369141,2.273438,1.473633,1.47168,1.442383,1.379883,1.120117,0.999512,1.702148,0.434082
max,1115.0,1353.0,17616.0,17680.0,28704.0,2.972656,6.152344,6.488281,6.394531,3.185547,...,6.882812,4.746094,1.583984,1.583984,1.583984,1.583984,1.583984,1.557617,2.769531,20.734375
