# Новые данные

In [12]:
import numpy as np
import pandas as pd
# import yfinance as yf
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
# import shap
from typing import Tuple, Dict, Any
import warnings
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor

warnings.filterwarnings("ignore")

In [13]:
df = pd.read_csv("BTCUSDT.csv")

In [14]:
df.head()

Unnamed: 0,date,open_time,open,high,low,close,asset_volume,volume,number_of_trades,buy_volume_quote,sell_volume_quote
0,2024-09-01,2024-09-01 00:00:00,58941.9,58941.9,58941.9,58941.9,0.224,13202.9856,1.0,13202.9856,0.0
1,2024-09-01,2024-09-01 00:00:01,58941.9,58941.9,58941.9,58941.9,0.0,0.0,0.0,0.0,0.0
2,2024-09-01,2024-09-01 00:00:02,58941.8,58941.8,58941.8,58941.8,0.115,6778.307,3.0,0.0,6778.307
3,2024-09-01,2024-09-01 00:00:03,58941.8,58941.9,58934.9,58934.9,13.513,796478.3917,116.0,335202.5853,461275.8064
4,2024-09-01,2024-09-01 00:00:04,58935.0,58935.0,58934.9,58934.9,0.751,44260.1801,17.0,41372.37,2887.8101


In [15]:
df.shape

(7603200, 11)

In [16]:
df.rename(columns={'open': 'Open', 'high': 'High', 'low': 'Low', 'close': 'Close', 'volume': 'Volume'}, inplace=True)

In [17]:
df.isna().sum()

date                 0
open_time            0
Open                 0
High                 0
Low                  0
Close                0
asset_volume         0
Volume               0
number_of_trades     0
buy_volume_quote     0
sell_volume_quote    0
dtype: int64

## Feture Engineering

In [18]:
# Отражает максимальный размер заказа по сравнению с общим объемом торгов.
# Чем больше заказ, тем большее влияние он может оказать на рынок.

max_order_pct = 0.03

In [19]:
def calculate_price_metrics(df):
    # Доходность и логарифмическая доходность
    df["returns"] = df["Close"].pct_change() # процентное изменение между текущим и предыдущим элементом
    df["log_returns"] = np.log1p(df["returns"])

    # Волатильность
    df["vol_short"] = df["returns"].rolling(5).std() * np.sqrt(252)
    df["vol_mid"] = df["returns"].rolling(21).std() * np.sqrt(252)
    df["vol_long"] = df["returns"].rolling(63).std() * np.sqrt(252)

    # Ranges and spreads (Диапазоны и спреды):
    df["true_range"] = np.maximum(
        df["High"] - df["Low"],
        np.maximum(
            abs(df["High"] - df["Close"].shift(1)),
            abs(df["Low"] - df["Close"].shift(1)),
        ),
    )
    df["rel_spread"] = (df["High"] - df["Low"]) / df["Close"]

    # Нелинейные функции
    df["price_acceleration"] = df["returns"].diff()
    df["range_intensity"] = (df["true_range"] / df["Close"]) * np.sqrt(df["Volume"])

    price_cols = df.filter(
        regex="^(vol_|returns|true_range|rel_spread|price|range)"
    ).columns
    df[price_cols] = df[price_cols].fillna(method="ffill").fillna(0)

    return df

In [20]:
def calculate_volume_metrics(df):
    # Основные метрики объема рынка
    df["log_volume"] = np.log1p(df["Volume"])

    # Скользящие средние объема с разными временными интервалами
    df["vol_ma_short"] = df["Volume"].rolling(5).mean()
    df["vol_ma_mid"] = df["Volume"].rolling(20).mean()
    df["vol_ma_long"] = df["Volume"].rolling(60).mean()
    df["vol_30_min"] = df["Volume"].rolling(1800).mean()
    df["vol_hour"] = df["Volume"].rolling(3600).mean()
    df["vol_2hour"] = df["Volume"].rolling(7200).mean()
    df["vol_day"] = df["Volume"].rolling(86400).mean()

    # Отношения объемов
    df["vol_ratio_short"] = df["Volume"] / df["vol_ma_short"]
    df["vol_ratio_mid"] = df["Volume"] / df["vol_ma_mid"]
    df["vol_ratio_long"] = df["Volume"] / df["vol_ma_long"]

    # Нелинейные метрики объема
    df["vol_impact"] = np.power(df["Volume"] / df["vol_ma_mid"], 0.6)
    df["vol_surge"] = (df["Volume"] / df["vol_ma_long"]).clip(upper=5)

    volume_cols = df.filter(regex="^(vol_|log_volume)").columns
    df[volume_cols] = df[volume_cols].fillna(method="ffill").fillna(1)

    return df

In [21]:
def calculate_market_impact(df):
    # Меры ликвидности
    df["amihud_illiq"] = abs(df["returns"]) / (df["Volume"] * df["Close"])
    df["turnover"] = df["Volume"] * df["Close"]
    df["turnover_vol"] = (
        df["turnover"].rolling(21).std() / df["turnover"].rolling(21).mean()
    )

    # Компоненты рыночного воздействия
    df["base_impact"] = np.power(
        max_order_pct / df["vol_ratio_mid"].clip(lower=1e-8), 0.5
    )
    df["vol_adjusted_impact"] = df["base_impact"] * np.exp(df["vol_surge"] - 1)

    # Нелинейные комбинации
    df["impact_score"] = (
        df["base_impact"]
        * np.exp(df["vol_short"] * 2)
        * np.power(df["amihud_illiq"].clip(lower=1e-8), 0.3)
    )

    impact_cols = [
        "amihud_illiq",
        "turnover",
        "turnover_vol",
        "base_impact",
        "vol_adjusted_impact",
        "impact_score",
    ]
    df[impact_cols] = df[impact_cols].fillna(method="ffill").fillna(0)

    return df

In [22]:
def calculate_slippage(df):
    # Базовые компоненты
    df["spread_cost"] = df["rel_spread"] * 0.5
    df["volatility_cost"] = df["vol_mid"] * np.exp(df["vol_surge"] - 1) * 0.2

    # Рыночное воздействие со спадом
    df["market_impact"] = (
        0.1
        * np.power(max_order_pct / df["vol_ratio_mid"].clip(lower=1e-8), 0.6)
        * df["vol_mid"]
        * np.exp(-df["turnover_vol"])
    )

    # Некоторые дополнительные компоненты
    df["momentum_cost"] = (
        0.05 * abs(df["returns"]) * np.sign(df["price_acceleration"])
    )
    df["liquidity_cost"] = 0.1 * np.power(df["amihud_illiq"].clip(lower=1e-8), 0.3)

    # Случайный шум
    noise = np.random.normal(0, 0.0005, len(df))

    # Комбинирование компонент
    df["slippage"] = (
        df["spread_cost"]
        + df["market_impact"] * (1 + df["volatility_cost"])
        + df["momentum_cost"] * df["liquidity_cost"]
        + noise
    ).clip(
        0, 0.05
    )  # Сверху ограничим 5%, снизу 0%

    cost_cols = [
        "spread_cost",
        "volatility_cost",
        "market_impact",
        "momentum_cost",
        "liquidity_cost",
        "slippage",
    ]
    df[cost_cols] = df[cost_cols].fillna(method="ffill").fillna(0)

    return df

In [None]:
df = calculate_price_metrics(df)
df = calculate_volume_metrics(df)
df = calculate_market_impact(df)
df = calculate_slippage(df)
df.head()

## Применение моделей

In [None]:
df['slippage'] = (df['Close'] - df['Open']) / df['Open']
df['slippage'] = df['slippage']
df['slippage'] = df['slippage'].shift(-1)
df = df.dropna(subset=['slippage'])
# df['slippage'] = (df['Close'].shift(-1) - df['Close']) / df['Close'] * 100

In [None]:
df.head(10)

Unnamed: 0,date,open_time,Open,High,Low,Close,asset_volume,Volume,number_of_trades,buy_volume_quote,...,turnover_vol,base_impact,vol_adjusted_impact,impact_score,spread_cost,volatility_cost,market_impact,momentum_cost,liquidity_cost,slippage
0,2024-09-01,2024-09-01 00:00:00,58941.9,58941.9,58941.9,58941.9,0.224,13202.9856,1.0,13202.9856,...,0.0,0.173205,0.173205,0.00069,0.0,0.0,0.0,0.0,0.000398,0.0
1,2024-09-01,2024-09-01 00:00:01,58941.9,58941.9,58941.9,58941.9,0.0,0.0,0.0,0.0,...,0.0,0.173205,0.173205,0.00069,0.0,0.0,0.0,0.0,0.000398,0.0
2,2024-09-01,2024-09-01 00:00:02,58941.8,58941.8,58941.8,58941.8,0.115,6778.307,3.0,0.0,...,0.0,0.173205,0.173205,0.00069,0.0,0.0,0.0,-8.48293e-08,0.000398,-0.000117
3,2024-09-01,2024-09-01 00:00:03,58941.8,58941.9,58934.9,58934.9,13.513,796478.3917,116.0,335202.5853,...,0.0,0.173205,0.173205,0.00069,5.938756e-05,0.0,0.0,-5.853231e-06,0.000398,-2e-06
4,2024-09-01,2024-09-01 00:00:04,58935.0,58935.0,58934.9,58934.9,0.751,44260.1801,17.0,41372.37,...,0.0,0.173205,0.173205,0.00069,8.483937e-07,0.0,0.0,0.0,0.000398,-2e-06
5,2024-09-01,2024-09-01 00:00:05,58935.0,58935.0,58934.9,58934.9,1.684,99246.3798,15.0,4832.67,...,0.0,0.173205,0.173205,0.000691,8.483937e-07,0.0,0.0,0.0,0.000398,0.0
6,2024-09-01,2024-09-01 00:00:06,58935.0,58935.0,58934.9,58935.0,2.477,145981.9942,41.0,145510.515,...,0.0,0.173205,0.173205,0.000691,8.483923e-07,0.0,0.0,8.483937e-08,0.000398,1.4e-05
7,2024-09-01,2024-09-01 00:00:07,58935.0,58935.9,58935.0,58935.8,2.088,123056.7478,38.0,89463.3418,...,0.0,0.173205,0.173205,0.000691,7.635427e-06,0.0,0.0,6.787138e-07,0.000398,0.0
8,2024-09-01,2024-09-01 00:00:08,58935.9,58935.9,58935.8,58935.9,0.423,24929.8852,6.0,24635.2062,...,0.0,0.173205,0.173205,0.00069,8.483793e-07,0.0,0.0,-8.483808e-08,0.000398,-4.1e-05
9,2024-09-01,2024-09-01 00:00:09,58935.8,58935.9,58933.4,58933.4,0.642,37836.6912,38.0,6070.3977,...,0.0,0.173205,0.173205,0.00069,2.121038e-05,0.0,0.0,-2.120948e-06,0.000398,0.0


In [None]:
correlation_matrix = df.drop(columns=['date', 'open_time'], errors='ignore').corr()

slippage_correlation = correlation_matrix['slippage']
slippage_correlation.abs().sort_values(ascending=False)

Unnamed: 0,slippage
slippage,1.0
returns,0.078784
log_returns,0.078784
momentum_cost,0.058953
price_acceleration,0.039787
buy_volume_quote,0.017553
sell_volume_quote,0.01413
vol_ma_long,0.004128
vol_ma_mid,0.003993
range_intensity,0.003656


In [None]:
X = df[df['slippage'] != 0].drop(columns=['date', 'open_time', 'slippage'], errors='ignore')
y = df[df['slippage'] != 0]['slippage']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
def viewing_metrics(y_test, y_pred):
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"MSE: {mse}")
    print(f"MAE: {mae}")
    print(f"R²: {r2}")

### CatBoost

In [None]:
model = CatBoostRegressor(iterations=500, learning_rate=0.1, depth=6, verbose=0)
model.fit(X_train, y_train)

<catboost.core.CatBoostRegressor at 0x793f955e4580>

In [None]:
y_pred = model.predict(X_test)

In [None]:
viewing_metrics(y_test, y_pred)

MSE: 9.356997621302116e-09
MAE: 5.2590829734703365e-05
R²: 0.022148738259964995


In [None]:
y_test.value_counts()

Unnamed: 0_level_0,count
slippage,Unnamed: 1_level_1
0.000000,583912
0.000001,44
-0.000002,41
-0.000001,40
0.000002,40
...,...
-0.000002,1
-0.000148,1
0.000067,1
-0.000103,1


In [None]:
y_test

Unnamed: 0,slippage
7147257,-0.000050
397238,-0.000082
6279830,0.000001
237076,0.000000
335465,0.000000
...,...
4716662,-0.000149
7414844,0.000062
6866974,0.000000
3628505,0.000000


In [None]:
y_pred

array([-1.66708164e-05, -2.18524505e-05, -1.23802283e-05, ...,
       -9.77967575e-07,  1.04344939e-07, -1.60492226e-06])

1. Подобрать гиперпараметры
2. Глянуть на feature importance, отбросить мб ненужные столбцы
3. Скорее всего будет сильно влять volume (мб еще добавить несколько размеров окон для volume)
4. Возможно стоит сделать сначала классификатор на определение 0, а затем уже регрессор на ненулевых проскальзываниях

### Default LinReg

In [None]:
X_train = X_train.fillna(0)
y_train = y_train.fillna(0)

In [None]:
from sklearn.linear_model import LinearRegression

linreg_model = LinearRegression()
linreg_model.fit(X_train, y_train)

In [None]:
X_test = X_test.fillna(0)
y_test = y_test.fillna(0)

In [None]:
linreg_y_pred = linreg_model.predict(X_test)

In [None]:
viewing_metrics(y_test, linreg_y_pred)

MSE: 9.240778613824831e-05
MAE: 0.005029909839015048
R²: 0.0007709240169512244


### RandomForest

In [None]:
random_forest_model = RandomForestRegressor(n_estimators=100, max_depth=6, random_state=42)
random_forest_model.fit(X_train, y_train)

In [None]:
random_forest_y_pred = random_forest_model.predict(X_test)

In [None]:
viewing_metrics(y_test, random_forest_y_pred)