# Новые данные

In [1]:
!pip install catboost scikit-learn seaborn

Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Collecting matplotlib (from catboost)
  Downloading matplotlib-3.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting numpy<2.0,>=1.16.0 (from catboost)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting pandas>=0.24 (from catboost)
  Downloading pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting scipy (from catboost)
  Downloading scipy-1.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collec

In [1]:
import numpy as np
import pandas as pd
# import yfinance as yf
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
# import shap
from typing import Tuple, Dict, Any
import warnings
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor


# from google.colab import drive
# drive.mount('/content/drive')

warnings.filterwarnings("ignore")

In [7]:
df = pd.read_csv("BTCUSDT.csv")

In [8]:
df.head()

Unnamed: 0,date,open_time,open,high,low,close,asset_volume,volume,number_of_trades,buy_volume_quote,sell_volume_quote
0,2024-09-01,2024-09-01 00:00:00,58941.9,58941.9,58941.9,58941.9,0.224,13202.9856,1.0,13202.9856,0.0
1,2024-09-01,2024-09-01 00:00:01,58941.9,58941.9,58941.9,58941.9,0.0,0.0,0.0,0.0,0.0
2,2024-09-01,2024-09-01 00:00:02,58941.8,58941.8,58941.8,58941.8,0.115,6778.307,3.0,0.0,6778.307
3,2024-09-01,2024-09-01 00:00:03,58941.8,58941.9,58934.9,58934.9,13.513,796478.3917,116.0,335202.5853,461275.8064
4,2024-09-01,2024-09-01 00:00:04,58935.0,58935.0,58934.9,58934.9,0.751,44260.1801,17.0,41372.37,2887.8101


In [9]:
df.shape

(7603200, 11)

In [10]:
df.rename(columns={'open': 'Open', 'high': 'High', 'low': 'Low', 'close': 'Close', 'volume': 'Volume'}, inplace=True)

In [11]:
df.isna().sum()

date                 0
open_time            0
Open                 0
High                 0
Low                  0
Close                0
asset_volume         0
Volume               0
number_of_trades     0
buy_volume_quote     0
sell_volume_quote    0
dtype: int64

## Feture Engineering

In [12]:
# Отражает максимальный размер заказа по сравнению с общим объемом торгов.
# Чем больше заказ, тем большее влияние он может оказать на рынок.

max_order_pct = 0.03

In [13]:
def calculate_price_metrics(df):
    # Доходность и логарифмическая доходность
    df["returns"] = df["Close"].pct_change() # процентное изменение между текущим и предыдущим элементом
    df["log_returns"] = np.log1p(df["returns"])

    # Волатильность
    df["vol_short"] = df["returns"].rolling(5).std() * np.sqrt(252)
    df["vol_mid"] = df["returns"].rolling(21).std() * np.sqrt(252)
    df["vol_long"] = df["returns"].rolling(63).std() * np.sqrt(252)

    # Ranges and spreads (Диапазоны и спреды):
    df["true_range"] = np.maximum(
        df["High"] - df["Low"],
        np.maximum(
            abs(df["High"] - df["Close"].shift(1)),
            abs(df["Low"] - df["Close"].shift(1)),
        ),
    )
    df["rel_spread"] = (df["High"] - df["Low"]) / df["Close"]

    # Нелинейные функции
    df["price_acceleration"] = df["returns"].diff()
    df["range_intensity"] = (df["true_range"] / df["Close"]) * np.sqrt(df["Volume"])

    price_cols = df.filter(
        regex="^(vol_|returns|true_range|rel_spread|price|range)"
    ).columns
    df[price_cols] = df[price_cols].fillna(method="ffill").fillna(0)

    return df

In [14]:
def calculate_volume_metrics(df):
    # Основные метрики объема рынка
    df["log_volume"] = np.log1p(df["Volume"])

    # Скользящие средние объема с разными временными интервалами
    df["vol_ma_short"] = df["Volume"].rolling(5).mean()
    df["vol_ma_mid"] = df["Volume"].rolling(20).mean()
    df["vol_ma_long"] = df["Volume"].rolling(60).mean()
    df["vol_30_min"] = df["Volume"].rolling(1800).mean()
    df["vol_hour"] = df["Volume"].rolling(3600).mean()
    df["vol_2hour"] = df["Volume"].rolling(7200).mean()
    df["vol_day"] = df["Volume"].rolling(86400).mean()

    # Отношения объемов
    df["vol_ratio_short"] = df["Volume"] / df["vol_ma_short"]
    df["vol_ratio_mid"] = df["Volume"] / df["vol_ma_mid"]
    df["vol_ratio_long"] = df["Volume"] / df["vol_ma_long"]

    # Нелинейные метрики объема
    df["vol_impact"] = np.power(df["Volume"] / df["vol_ma_mid"], 0.6)
    df["vol_surge"] = (df["Volume"] / df["vol_ma_long"]).clip(upper=5)

    volume_cols = df.filter(regex="^(vol_|log_volume)").columns
    df[volume_cols] = df[volume_cols].fillna(method="ffill").fillna(1)

    return df

In [15]:
def calculate_market_impact(df):
    # Меры ликвидности
    df["amihud_illiq"] = abs(df["returns"]) / (df["Volume"] * df["Close"])
    df["turnover"] = df["Volume"] * df["Close"]
    df["turnover_vol"] = (
        df["turnover"].rolling(21).std() / df["turnover"].rolling(21).mean()
    )

    # Компоненты рыночного воздействия
    df["base_impact"] = np.power(
        max_order_pct / df["vol_ratio_mid"].clip(lower=1e-8), 0.5
    )
    df["vol_adjusted_impact"] = df["base_impact"] * np.exp(df["vol_surge"] - 1)

    # Нелинейные комбинации
    df["impact_score"] = (
        df["base_impact"]
        * np.exp(df["vol_short"] * 2)
        * np.power(df["amihud_illiq"].clip(lower=1e-8), 0.3)
    )

    impact_cols = [
        "amihud_illiq",
        "turnover",
        "turnover_vol",
        "base_impact",
        "vol_adjusted_impact",
        "impact_score",
    ]
    df[impact_cols] = df[impact_cols].fillna(method="ffill").fillna(0)

    return df

In [16]:
def calculate_slippage(df):
    # Базовые компоненты
    df["spread_cost"] = df["rel_spread"] * 0.5
    df["volatility_cost"] = df["vol_mid"] * np.exp(df["vol_surge"] - 1) * 0.2

    # Рыночное воздействие со спадом
    df["market_impact"] = (
        0.1
        * np.power(max_order_pct / df["vol_ratio_mid"].clip(lower=1e-8), 0.6)
        * df["vol_mid"]
        * np.exp(-df["turnover_vol"])
    )

    # Некоторые дополнительные компоненты
    df["momentum_cost"] = (
        0.05 * abs(df["returns"]) * np.sign(df["price_acceleration"])
    )
    df["liquidity_cost"] = 0.1 * np.power(df["amihud_illiq"].clip(lower=1e-8), 0.3)

    # Случайный шум
    noise = np.random.normal(0, 0.0005, len(df))

    # Комбинирование компонент
    df["slippage"] = (
        df["spread_cost"]
        + df["market_impact"] * (1 + df["volatility_cost"])
        + df["momentum_cost"] * df["liquidity_cost"]
        + noise
    ).clip(
        0, 0.05
    )  # Сверху ограничим 5%, снизу 0%

    cost_cols = [
        "spread_cost",
        "volatility_cost",
        "market_impact",
        "momentum_cost",
        "liquidity_cost",
        "slippage",
    ]
    df[cost_cols] = df[cost_cols].fillna(method="ffill").fillna(0)

    return df

In [17]:
df = calculate_price_metrics(df)
df = calculate_volume_metrics(df)
df = calculate_market_impact(df)
df = calculate_slippage(df)
df.head()

Unnamed: 0,date,open_time,Open,High,Low,Close,asset_volume,Volume,number_of_trades,buy_volume_quote,...,turnover_vol,base_impact,vol_adjusted_impact,impact_score,spread_cost,volatility_cost,market_impact,momentum_cost,liquidity_cost,slippage
0,2024-09-01,2024-09-01 00:00:00,58941.9,58941.9,58941.9,58941.9,0.224,13202.9856,1.0,13202.9856,...,0.0,0.173205,0.173205,0.00069,0.0,0.0,0.0,0.0,0.000398,0.000559
1,2024-09-01,2024-09-01 00:00:01,58941.9,58941.9,58941.9,58941.9,0.0,0.0,0.0,0.0,...,0.0,0.173205,0.173205,0.00069,0.0,0.0,0.0,0.0,0.000398,0.0
2,2024-09-01,2024-09-01 00:00:02,58941.8,58941.8,58941.8,58941.8,0.115,6778.307,3.0,0.0,...,0.0,0.173205,0.173205,0.00069,0.0,0.0,0.0,-8.48293e-08,0.000398,0.0
3,2024-09-01,2024-09-01 00:00:03,58941.8,58941.9,58934.9,58934.9,13.513,796478.3917,116.0,335202.5853,...,0.0,0.173205,0.173205,0.00069,5.938756e-05,0.0,0.0,-5.853231e-06,0.000398,0.0
4,2024-09-01,2024-09-01 00:00:04,58935.0,58935.0,58934.9,58934.9,0.751,44260.1801,17.0,41372.37,...,0.0,0.173205,0.173205,0.00069,8.483937e-07,0.0,0.0,0.0,0.000398,0.000237


## Применение моделей

In [18]:
df['slippage'] = (df['Close'] - df['Open']) / df['Open']
df['slippage'] = df['slippage']
df['slippage'] = df['slippage'].shift(-1)
df = df.dropna(subset=['slippage'])
# df['slippage'] = (df['Close'].shift(-1) - df['Close']) / df['Close'] * 100

In [19]:
df.head(10)

Unnamed: 0,date,open_time,Open,High,Low,Close,asset_volume,Volume,number_of_trades,buy_volume_quote,...,turnover_vol,base_impact,vol_adjusted_impact,impact_score,spread_cost,volatility_cost,market_impact,momentum_cost,liquidity_cost,slippage
0,2024-09-01,2024-09-01 00:00:00,58941.9,58941.9,58941.9,58941.9,0.224,13202.9856,1.0,13202.9856,...,0.0,0.173205,0.173205,0.00069,0.0,0.0,0.0,0.0,0.000398,0.0
1,2024-09-01,2024-09-01 00:00:01,58941.9,58941.9,58941.9,58941.9,0.0,0.0,0.0,0.0,...,0.0,0.173205,0.173205,0.00069,0.0,0.0,0.0,0.0,0.000398,0.0
2,2024-09-01,2024-09-01 00:00:02,58941.8,58941.8,58941.8,58941.8,0.115,6778.307,3.0,0.0,...,0.0,0.173205,0.173205,0.00069,0.0,0.0,0.0,-8.48293e-08,0.000398,-0.000117
3,2024-09-01,2024-09-01 00:00:03,58941.8,58941.9,58934.9,58934.9,13.513,796478.3917,116.0,335202.5853,...,0.0,0.173205,0.173205,0.00069,5.938756e-05,0.0,0.0,-5.853231e-06,0.000398,-2e-06
4,2024-09-01,2024-09-01 00:00:04,58935.0,58935.0,58934.9,58934.9,0.751,44260.1801,17.0,41372.37,...,0.0,0.173205,0.173205,0.00069,8.483937e-07,0.0,0.0,0.0,0.000398,-2e-06
5,2024-09-01,2024-09-01 00:00:05,58935.0,58935.0,58934.9,58934.9,1.684,99246.3798,15.0,4832.67,...,0.0,0.173205,0.173205,0.000691,8.483937e-07,0.0,0.0,0.0,0.000398,0.0
6,2024-09-01,2024-09-01 00:00:06,58935.0,58935.0,58934.9,58935.0,2.477,145981.9942,41.0,145510.515,...,0.0,0.173205,0.173205,0.000691,8.483923e-07,0.0,0.0,8.483937e-08,0.000398,1.4e-05
7,2024-09-01,2024-09-01 00:00:07,58935.0,58935.9,58935.0,58935.8,2.088,123056.7478,38.0,89463.3418,...,0.0,0.173205,0.173205,0.000691,7.635427e-06,0.0,0.0,6.787138e-07,0.000398,0.0
8,2024-09-01,2024-09-01 00:00:08,58935.9,58935.9,58935.8,58935.9,0.423,24929.8852,6.0,24635.2062,...,0.0,0.173205,0.173205,0.00069,8.483793e-07,0.0,0.0,-8.483808e-08,0.000398,-4.1e-05
9,2024-09-01,2024-09-01 00:00:09,58935.8,58935.9,58933.4,58933.4,0.642,37836.6912,38.0,6070.3977,...,0.0,0.173205,0.173205,0.00069,2.121038e-05,0.0,0.0,-2.120948e-06,0.000398,0.0


In [20]:
correlation_matrix = df.drop(columns=['date', 'open_time'], errors='ignore').corr()

slippage_correlation = correlation_matrix['slippage']
slippage_correlation.abs().sort_values(ascending=False)

slippage               1.000000
returns                0.078784
log_returns            0.078784
momentum_cost          0.058953
price_acceleration     0.039787
buy_volume_quote       0.017553
sell_volume_quote      0.014130
vol_ma_long            0.004128
vol_ma_mid             0.003993
range_intensity        0.003656
asset_volume           0.003163
vol_mid                0.003084
number_of_trades       0.002834
vol_ma_short           0.002626
Volume                 0.002397
vol_long               0.002360
vol_short              0.001842
vol_30_min             0.001707
turnover               0.001578
rel_spread             0.001349
spread_cost            0.001349
vol_hour               0.001293
log_volume             0.001152
true_range             0.001137
vol_2hour              0.000908
vol_surge              0.000837
vol_impact             0.000706
vol_ratio_short        0.000654
vol_ratio_long         0.000646
vol_day                0.000622
volatility_cost        0.000567
vol_adju

In [None]:
X = df[df['slippage'] != 0].drop(columns=['date', 'open_time', 'slippage'], errors='ignore')
y = df[df['slippage'] != 0]['slippage']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [33]:
def viewing_metrics(y_test, y_pred):
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"MSE: {mse}")
    print(f"MAE: {mae}")
    print(f"R²: {r2}")

### CatBoost

In [None]:
model = CatBoostRegressor(iterations=500, learning_rate=0.1, depth=6, verbose=0)
model.fit(X_train, y_train)

<catboost.core.CatBoostRegressor at 0x7fa5bb07c4d0>

In [None]:
y_pred = model.predict(X_test)

In [None]:
viewing_metrics(y_test, y_pred)

MSE: 9.318195200919647e-09
MAE: 5.249651789380657e-05
R²: 0.021741407670804014


In [None]:
y_test.value_counts()

Unnamed: 0_level_0,count
slippage,Unnamed: 1_level_1
0.000001,45
0.000002,44
0.000002,37
-0.000001,37
-0.000002,35
...,...
0.000217,1
-0.000002,1
0.000026,1
-0.000001,1


In [None]:
y_test

Unnamed: 0,slippage
4726858,-0.000072
919625,0.000124
4447654,-0.000001
4731469,0.000249
6087757,-0.000001
...,...
6260613,0.000252
2978405,0.000002
6949372,0.000001
3121105,0.000002


In [None]:
y_pred

array([ 2.32251593e-05, -1.59176283e-05,  6.12496154e-06, ...,
       -9.41022043e-06, -1.74830164e-05, -2.80137321e-05])

1. Подобрать гиперпараметры
2. Глянуть на feature importance, отбросить мб ненужные столбцы
3. Скорее всего будет сильно влять volume (мб еще добавить несколько размеров окон для volume)
4. Возможно стоит сделать сначала классификатор на определение 0, а затем уже регрессор на ненулевых проскальзываниях

### Default LinReg

In [None]:
X_train = X_train.fillna(0)
y_train = y_train.fillna(0)

In [None]:
from sklearn.linear_model import LinearRegression

linreg_model = LinearRegression()
linreg_model.fit(X_train, y_train)

In [None]:
X_test = X_test.fillna(0)
y_test = y_test.fillna(0)

In [None]:
linreg_y_pred = linreg_model.predict(X_test)

In [None]:
viewing_metrics(y_test, linreg_y_pred)

MSE: 9.516715521880043e-09
MAE: 5.1820266556890074e-05
R²: 0.0009000102173261082


### RandomForest

In [None]:
random_forest_model = RandomForestRegressor(n_estimators=100, max_depth=6, random_state=42)
random_forest_model.fit(X_train, y_train)

In [None]:
random_forest_y_pred = random_forest_model.predict(X_test)

In [None]:
viewing_metrics(y_test, random_forest_y_pred)

### With Orderbook data

In [4]:
df_avg = pd.read_csv('order_book_perp_futures_BTCUSDT/avg_order_book_BTCUSDT.csv')
df_vwap = pd.read_csv('order_book_perp_futures_BTCUSDT/vwap_order_book_BTCUSDT.csv')
df_avg = df_avg.drop(columns=['exchange', 'symbol'], errors='ignore')
df_vwap = df_vwap.drop(columns=['exchange', 'symbol'], errors='ignore')
df_avg = df_avg.rename(columns={'timestamp': 'open_time'})
df_vwap = df_vwap.rename(columns={'timestamp': 'open_time'})

In [21]:
df_merged  = pd.merge(df, df_avg, on='open_time', how='outer')
df_merged = pd.merge(df_merged, df_vwap, on='open_time', how='outer')

In [22]:
df_merged = df_merged.dropna()
df_merged.head()

Unnamed: 0,date,open_time,Open,High,Low,Close,asset_volume,Volume,number_of_trades,buy_volume_quote,...,vwap_asks[5],vwap_bids[5],vwap_asks[6],vwap_bids[6],vwap_asks[7],vwap_bids[7],vwap_asks[8],vwap_bids[8],vwap_asks[9],vwap_bids[9]
3,2024-09-01,2024-09-01 00:00:03,58941.8,58941.9,58934.9,58934.9,13.513,796478.3917,116.0,335202.5853,...,58938.571524,58936.944414,58938.129546,58934.927939,58940.546601,58933.484346,58943.093458,58935.463352,58941.363568,58933.501245
4,2024-09-01,2024-09-01 00:00:04,58935.0,58935.0,58934.9,58934.9,0.751,44260.1801,17.0,41372.37,...,58935.910636,58933.063511,58936.012766,58933.010116,58936.118798,58932.972572,58936.818345,58932.502222,58936.595,58932.207843
5,2024-09-01,2024-09-01 00:00:05,58935.0,58935.0,58934.9,58934.9,1.684,99246.3798,15.0,4832.67,...,58936.1,58933.399311,58936.2,58933.007075,58936.795402,58932.999203,58937.072859,58932.653846,58937.102075,58932.444444
6,2024-09-01,2024-09-01 00:00:06,58935.0,58935.0,58934.9,58935.0,2.477,145981.9942,41.0,145510.515,...,58936.110787,58933.454805,58936.850291,58933.39996,58936.965719,58933.010413,58937.087629,58932.999334,58937.396359,58932.691304
7,2024-09-01,2024-09-01 00:00:07,58935.0,58935.9,58935.0,58935.8,2.088,123056.7478,38.0,89463.3418,...,58937.087076,58933.401656,58936.751979,58933.983962,58937.473755,58933.473873,58937.418367,58933.409219,58937.605031,58933.411333


In [56]:
def calculate_imbalance(bids, asks):
    return (bids - asks) / (bids + asks + 1e-9)  

df_merged['depth_imbalance'] = calculate_imbalance(
    df_merged[[f'bids[{i}].amount' for i in range(11)]].sum(axis=1),
    df_merged[[f'asks[{i}].amount' for i in range(11)]].sum(axis=1)
)

# Price-weighted imbalance
df_merged['price_weighted_imbalance'] = calculate_imbalance(
    (df_merged[[f'bids[{i}].price' for i in range(11)]] * df_merged[[f'bids[{i}].amount' for i in range(11)]]).sum(axis=1),
    (df_merged[[f'asks[{i}].price' for i in range(11)]] * df_merged[[f'asks[{i}].amount' for i in range(11)]]).sum(axis=1)
)

# Liquidity Spread
df_merged['liquidity_spread'] = df_merged['vwap_asks[0]'] - df_merged['vwap_bids[0]']

# Order Book Depth Ratio
df_merged['depth_ratio'] = (
    df_merged[[f'bids[{i}].amount' for i in range(11)]].sum(axis=1) /
    df_merged[[f'asks[{i}].amount' for i in range(11)]].sum(axis=1)
)

# Effective Spread
df_merged['effective_spread'] = (df_merged['vwap_asks[0]'] - df_merged['vwap_bids[0]']) / ((df_merged['vwap_asks[0]'] + df_merged['vwap_bids[0]']) / 2)

# Order Book Volatility
df_merged['order_book_volatility'] = df_merged[[f'bids[{i}].price' for i in range(11)] + [f'asks[{i}].price' for i in range(11)]].std(axis=1)

# VWAP Slope
df_merged['vwap_slope'] = (
    df_merged['vwap_asks[9]'] - df_merged['vwap_asks[0]']
) / 9

# Best Bid-Ask Ratio
df_merged['best_bid_ask_ratio'] = df_merged['bids[0].amount'] / df_merged['asks[0].amount']

# Impact Volume
df_merged['impact_volume'] = (
    df_merged[[f'bids[{i}].amount' for i in range(11)]].sum(axis=1) +
    df_merged[[f'asks[{i}].amount' for i in range(11)]].sum(axis=1)
)

# Rolling Metrics
rolling_windows = [5, 10, 30]  # seconds
for window in rolling_windows:
    df_merged[f'rolling_vwap_spread_mean_{window}s'] = (
        df_merged['liquidity_spread'].rolling(window=window, min_periods=1).mean()
    )
    df_merged[f'rolling_vwap_spread_std_{window}s'] = (
        df_merged['liquidity_spread'].rolling(window=window, min_periods=1).std()
    )
    df_merged[f'rolling_depth_imbalance_mean_{window}s'] = (
        df_merged['depth_imbalance'].rolling(window=window, min_periods=1).mean()
    )
    df_merged[f'rolling_depth_imbalance_std_{window}s'] = (
        df_merged['depth_imbalance'].rolling(window=window, min_periods=1).std()
    )

# Volume imbalance between buys and sells
df_merged['volume_imbalance'] = (
    df_merged['buy_volume_quote'] - df_merged['sell_volume_quote']
) / (df_merged['buy_volume_quote'] + df_merged['sell_volume_quote'] + 1e-9)

# Traded volume relative to order book depth
df_merged['buy_volume_to_depth_ratio'] = df_merged['buy_volume_quote'] / (
    sum([df_merged[f'asks[{i}].price'] * df_merged[f'asks[{i}].amount'] for i in range(11)])+ 1e-9
)
df_merged['sell_volume_to_depth_ratio'] = df_merged['sell_volume_quote'] / (
    sum([df_merged[f'bids[{i}].price'] * df_merged[f'bids[{i}].amount'] for i in range(11)]) + 1e-9
)

# Traded volume relative to impact volume
df_merged['buy_volume_to_impact_ratio'] = df_merged['buy_volume_quote'] / (df_merged['impact_volume'] + 1e-9)
df_merged['sell_volume_to_impact_ratio'] = df_merged['sell_volume_quote'] / (df_merged['impact_volume'] + 1e-9)

# Rolling Metrics for Traded Volumes
for window in rolling_windows:
    df_merged[f'rolling_buy_volume_mean_{window}s'] = (
        df_merged['buy_volume_quote'].rolling(window=window, min_periods=1).mean()
    )
    df_merged[f'rolling_sell_volume_mean_{window}s'] = (
        df_merged['sell_volume_quote'].rolling(window=window, min_periods=1).mean()
    )
    df_merged[f'rolling_buy_volume_std_{window}s'] = (
        df_merged['buy_volume_quote'].rolling(window=window, min_periods=1).std()
    )
    df_merged[f'rolling_sell_volume_std_{window}s'] = (
        df_merged['sell_volume_quote'].rolling(window=window, min_periods=1).std()
    )

df_merged = df_merged.fillna(0)

In [59]:
correlation_matrix = df_merged.drop(columns=['date', 'open_time'], errors='ignore').corr()

slippage_correlation = correlation_matrix['slippage']
slippage_correlation.abs().sort_values(ascending=False)

slippage                        1.000000
depth_imbalance                 0.150232
depth_ratio                     0.083867
returns                         0.078787
log_returns                     0.078787
                                  ...   
Open                            0.000194
rolling_vwap_spread_mean_30s    0.000119
amihud_illiq                    0.000053
liquidity_cost                       NaN
price_weighted_imbalance             NaN
Name: slippage, Length: 145, dtype: float64

### CatBoost with orderbook data+features

In [60]:
X_merged = df_merged[df_merged['slippage'] != 0].drop(columns=['date', 'open_time', 'slippage'], errors='ignore')
y_merged = df_merged[df_merged['slippage'] != 0]['slippage']
X_train_merged, X_test_merged, y_train_merged, y_test_merged = train_test_split(X_merged, y_merged, test_size=0.2, random_state=42)

model = CatBoostRegressor(iterations=500, learning_rate=0.1, depth=6, verbose=0)
model.fit(X_train_merged, y_train_merged)
y_pred_merged = model.predict(X_test_merged)

In [61]:
viewing_metrics(y_test_merged, y_pred_merged) 

MSE: 9.0120725874809e-09
MAE: 5.283424335720582e-05
R²: 0.055714983186860634


In [58]:
print(df_merged['sell_volume_to_depth_ratio'])

3          1.660803
4          0.003863
5          0.108289
6          0.000626
7          0.030745
             ...   
7603194    0.020706
7603195    0.005320
7603196    0.123901
7603197    0.077945
7603198    0.133593
Name: sell_volume_to_depth_ratio, Length: 7601594, dtype: float64
