In [1]:
# Modeling, evaluation and data-handling dependencies for this notebook.
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
from sklearn.metrics import roc_auc_score  # or accuracy_score for classification
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
# The plotting API lives in the pyplot submodule; aliasing the top-level
# matplotlib package as `plt` would leave calls such as plt.subplots() undefined.
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

In [2]:
# Load the train and test tables from parquet.
train_df = pd.read_parquet("../data/train_clean.parquet")

# reset_index() moves the test frame's index into an ordinary column.
test_df = pd.read_parquet("../data/test_clean.parquet").reset_index()

In [3]:
import numpy as np
import pandas as pd

# Report the five columns with the most missing values.
print("NaNs in training set:")
nan_counts = train_df.isna().sum()
print(nan_counts.sort_values(ascending=False).head())

# Report the five numeric columns with the most +/-inf entries.
print("\nInfs in training set:")
numeric_part = train_df.select_dtypes(include=[np.number])
inf_counts = np.isinf(numeric_part).sum()
print(inf_counts.sort_values(ascending=False).head())

NaNs in training set:
X852    0
X853    0
X854    0
X855    0
X856    0
dtype: int64

Infs in training set:
X702    525887
X703    525887
X704    525887
X705    525887
X706    525887
dtype: int64


In [4]:
# Treat +/-inf as missing in both frames.
for frame in (train_df, test_df):
    frame.replace([np.inf, -np.inf], np.nan, inplace=True)

# Columns that contain at least one missing value in the training frame.
has_nan = train_df.isna().any()
nan_cols = train_df.columns[has_nan].tolist()

# Remove those columns from both frames so their feature sets stay aligned.
for frame in (train_df, test_df):
    frame.drop(columns=nan_cols, inplace=True)

In [5]:
# Count distinct non-null values for every training column in one pass;
# a count of exactly 1 marks a constant column.
unique_counts = train_df.nunique()
constant_cols = unique_counts[unique_counts == 1].index.tolist()

# Drop the constant columns from both frames.
for frame in (train_df, test_df):
    frame.drop(columns=constant_cols, inplace=True)

In [6]:
# Disabled experiment: drop features whose absolute skewness exceeds 5.
# skew_vals = train_df.skew(numeric_only=True)
# skewed_cols = skew_vals[skew_vals.abs() > 5].index.tolist()

# train_df.drop(columns=skewed_cols, inplace=True)
# test_df.drop(columns=skewed_cols, inplace=True)

In [7]:
# Impute remaining gaps with per-column medians computed on the training frame
# only, and reuse those same values for the test frame. Filling the test frame
# with its own medians would give the two sets different imputation values and
# let test-set statistics leak into preprocessing.
# numeric_only=True keeps median() from failing on any non-numeric column.
train_medians = train_df.median(numeric_only=True)

train_df.fillna(train_medians, inplace=True)
# fillna with a Series fills only columns whose name appears in its index, so
# columns that exist solely in the test frame are left untouched.
test_df.fillna(train_medians, inplace=True)

# Persist the cleaned frames.
train_df.to_parquet("../data/train_clean_v2.parquet")
test_df.to_parquet("../data/test_clean_v2.parquet")

In [8]:
# Display summary statistics (count, mean, std, min, quartiles, max) for the training frame.
train_df.describe()

Unnamed: 0,bid_qty,ask_qty,buy_qty,sell_qty,volume,X1,X2,X3,X4,X5,...,X882,X883,X884,X885,X886,X887,X888,X889,X890,label
count,525887.0,525887.0,525887.0,525887.0,525887.0,525887.0,525887.0,525887.0,525887.0,525887.0,...,525887.0,525887.0,525887.0,525887.0,525887.0,525887.0,525887.0,525887.0,525887.0,525887.0
mean,9.968003,10.174168,131.726685,132.673935,264.400665,-0.006026,-0.000243,-0.000353,-0.000425,-0.000263,...,1.95933,1.954978,0.498112,0.498062,0.497997,0.497872,0.497525,0.497372,0.995394,0.036126
std,15.645741,15.889582,307.267242,309.80304,588.618774,0.53834,0.613746,0.771744,0.857115,0.464695,...,0.844732,0.598279,0.712811,0.698678,0.684946,0.666936,0.61544,0.546439,0.850509,1.009914
min,0.001,0.001,0.0,0.0,0.0,-2.787539,-5.861927,-6.125439,-6.093792,-2.85504,...,0.433416,0.686524,4e-06,6e-06,9e-06,2.4e-05,0.003903,0.015654,0.092913,-24.416615
25%,2.634,2.678,26.407,27.021,60.688499,-0.370635,-0.314135,-0.441536,-0.521693,-0.295502,...,1.367222,1.521202,0.000162,0.000193,0.000241,0.000997,0.033156,0.066908,0.244373,-0.381585
50%,6.415,6.538,57.014999,58.047001,120.799004,-0.015991,-0.001713,-0.00301,-0.003948,-0.002545,...,1.831805,1.882957,0.000634,0.001036,0.004593,0.039178,0.127865,0.161757,0.634448,0.016262
75%,13.085,13.33,127.639,129.110001,256.73349,0.349104,0.310181,0.43196,0.50783,0.287728,...,2.369648,2.274296,1.473225,1.472025,1.442417,1.380184,1.12048,0.999344,1.701838,0.434135
max,1114.932007,1352.964966,17614.400391,17686.234375,28701.419922,2.972741,6.151366,6.488532,6.394355,3.184776,...,6.881902,4.74538,1.584315,1.584315,1.584315,1.584315,1.583862,1.55791,2.769604,20.740271
