In [467]:
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis

In [468]:
import os
print(os.getcwd())

/Users/alecmitchell-thomson/Desktop/Alpha Fund Bootcamp/Project/Alpha-Fund-Project/notebooks


In [469]:
df = pd.read_csv('../data/processed/cleaned_5_min_data.csv', parse_dates=['timestamp'])
df

Unnamed: 0,ticker,timestamp,date,open,high,low,close,volume,vwap,num_trades,missing_bars_day
0,AAPL,2023-11-29 09:30:00+00:00,2023-11-29,190.1500,190.2500,190.1400,190.2500,7436.0,190.1900,141,8
1,AAPL,2023-11-29 09:40:00+00:00,2023-11-29,190.1600,190.1600,190.1600,190.1600,1696.0,190.1408,51,8
2,AAPL,2023-11-29 09:45:00+00:00,2023-11-29,190.1700,190.2000,190.1700,190.2000,3189.0,190.1975,42,8
3,AAPL,2023-11-29 09:50:00+00:00,2023-11-29,190.2900,190.2900,190.2800,190.2900,1764.0,190.2869,24,8
4,AAPL,2023-11-29 09:55:00+00:00,2023-11-29,190.2500,190.3000,190.2500,190.3000,1159.0,190.2794,34,8
...,...,...,...,...,...,...,...,...,...,...,...
318643,TSLA,2025-11-21 15:40:00+00:00,2025-11-21,385.5600,388.8600,384.2400,387.5231,1787242.0,386.3321,46055,0
318644,TSLA,2025-11-21 15:45:00+00:00,2025-11-21,387.5851,390.2391,386.2100,389.7200,1686153.0,388.7163,39877,0
318645,TSLA,2025-11-21 15:50:00+00:00,2025-11-21,389.6750,390.1000,386.5000,388.9600,1114335.0,388.1680,30916,0
318646,TSLA,2025-11-21 15:55:00+00:00,2025-11-21,389.0600,389.0600,385.5109,385.8300,1044840.0,386.9534,30918,0


## **0. Helper structure from per-ticker rolling operations**

In [470]:
g = df.groupby('ticker', group_keys=False)

## **1. Price base features**

### **1.1 1-bar log return**
- base signal, used absolutely everywhere (vol, labels, tail risk)

In [471]:
df["logret"] = g["close"].apply(lambda x: np.log(x).diff())
df

Unnamed: 0,ticker,timestamp,date,open,high,low,close,volume,vwap,num_trades,missing_bars_day,logret
0,AAPL,2023-11-29 09:30:00+00:00,2023-11-29,190.1500,190.2500,190.1400,190.2500,7436.0,190.1900,141,8,
1,AAPL,2023-11-29 09:40:00+00:00,2023-11-29,190.1600,190.1600,190.1600,190.1600,1696.0,190.1408,51,8,-0.000473
2,AAPL,2023-11-29 09:45:00+00:00,2023-11-29,190.1700,190.2000,190.1700,190.2000,3189.0,190.1975,42,8,0.000210
3,AAPL,2023-11-29 09:50:00+00:00,2023-11-29,190.2900,190.2900,190.2800,190.2900,1764.0,190.2869,24,8,0.000473
4,AAPL,2023-11-29 09:55:00+00:00,2023-11-29,190.2500,190.3000,190.2500,190.3000,1159.0,190.2794,34,8,0.000053
...,...,...,...,...,...,...,...,...,...,...,...,...
318643,TSLA,2025-11-21 15:40:00+00:00,2025-11-21,385.5600,388.8600,384.2400,387.5231,1787242.0,386.3321,46055,0,0.005001
318644,TSLA,2025-11-21 15:45:00+00:00,2025-11-21,387.5851,390.2391,386.2100,389.7200,1686153.0,388.7163,39877,0,0.005653
318645,TSLA,2025-11-21 15:50:00+00:00,2025-11-21,389.6750,390.1000,386.5000,388.9600,1114335.0,388.1680,30916,0,-0.001952
318646,TSLA,2025-11-21 15:55:00+00:00,2025-11-21,389.0600,389.0600,385.5109,385.8300,1044840.0,386.9534,30918,0,-0.008080


### **1.2 Multi-horizon backwards returns**
- Captures short-term momentum/mean reversion

In [472]:
for k in [2, 3, 5, 10]:
    df[f"logret_{k}"] = g["close"].apply(lambda x: np.log(x).diff(k))
df

Unnamed: 0,ticker,timestamp,date,open,high,low,close,volume,vwap,num_trades,missing_bars_day,logret,logret_2,logret_3,logret_5,logret_10
0,AAPL,2023-11-29 09:30:00+00:00,2023-11-29,190.1500,190.2500,190.1400,190.2500,7436.0,190.1900,141,8,,,,,
1,AAPL,2023-11-29 09:40:00+00:00,2023-11-29,190.1600,190.1600,190.1600,190.1600,1696.0,190.1408,51,8,-0.000473,,,,
2,AAPL,2023-11-29 09:45:00+00:00,2023-11-29,190.1700,190.2000,190.1700,190.2000,3189.0,190.1975,42,8,0.000210,-0.000263,,,
3,AAPL,2023-11-29 09:50:00+00:00,2023-11-29,190.2900,190.2900,190.2800,190.2900,1764.0,190.2869,24,8,0.000473,0.000683,0.000210,,
4,AAPL,2023-11-29 09:55:00+00:00,2023-11-29,190.2500,190.3000,190.2500,190.3000,1159.0,190.2794,34,8,0.000053,0.000526,0.000736,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318643,TSLA,2025-11-21 15:40:00+00:00,2025-11-21,385.5600,388.8600,384.2400,387.5231,1787242.0,386.3321,46055,0,0.005001,0.001726,-0.001761,-0.014467,-0.031927
318644,TSLA,2025-11-21 15:45:00+00:00,2025-11-21,387.5851,390.2391,386.2100,389.7200,1686153.0,388.7163,39877,0,0.005653,0.010654,0.007379,-0.007821,-0.013862
318645,TSLA,2025-11-21 15:50:00+00:00,2025-11-21,389.6750,390.1000,386.5000,388.9600,1114335.0,388.1680,30916,0,-0.001952,0.003701,0.008702,0.001940,-0.018649
318646,TSLA,2025-11-21 15:55:00+00:00,2025-11-21,389.0600,389.0600,385.5109,385.8300,1044840.0,386.9534,30918,0,-0.008080,-0.010032,-0.004379,-0.002653,-0.027884


### **1.3 OHLC details**
- shape of each bar in every 5 minute window can tell you about intrabar agression, reversals, uncertainty

In [473]:
df["body"]       = df["close"] - df["open"]
df["upper_wick"] = df["high"] - df[["open", "close"]].max(axis=1)
df["lower_wick"] = df[["open", "close"]].min(axis=1) - df["low"]
df["hl_range"]   = df["high"] - df["low"]
df

Unnamed: 0,ticker,timestamp,date,open,high,low,close,volume,vwap,num_trades,missing_bars_day,logret,logret_2,logret_3,logret_5,logret_10,body,upper_wick,lower_wick,hl_range
0,AAPL,2023-11-29 09:30:00+00:00,2023-11-29,190.1500,190.2500,190.1400,190.2500,7436.0,190.1900,141,8,,,,,,0.1000,0.0000,0.0100,0.1100
1,AAPL,2023-11-29 09:40:00+00:00,2023-11-29,190.1600,190.1600,190.1600,190.1600,1696.0,190.1408,51,8,-0.000473,,,,,0.0000,0.0000,0.0000,0.0000
2,AAPL,2023-11-29 09:45:00+00:00,2023-11-29,190.1700,190.2000,190.1700,190.2000,3189.0,190.1975,42,8,0.000210,-0.000263,,,,0.0300,0.0000,0.0000,0.0300
3,AAPL,2023-11-29 09:50:00+00:00,2023-11-29,190.2900,190.2900,190.2800,190.2900,1764.0,190.2869,24,8,0.000473,0.000683,0.000210,,,0.0000,0.0000,0.0100,0.0100
4,AAPL,2023-11-29 09:55:00+00:00,2023-11-29,190.2500,190.3000,190.2500,190.3000,1159.0,190.2794,34,8,0.000053,0.000526,0.000736,,,0.0500,0.0000,0.0000,0.0500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318643,TSLA,2025-11-21 15:40:00+00:00,2025-11-21,385.5600,388.8600,384.2400,387.5231,1787242.0,386.3321,46055,0,0.005001,0.001726,-0.001761,-0.014467,-0.031927,1.9631,1.3369,1.3200,4.6200
318644,TSLA,2025-11-21 15:45:00+00:00,2025-11-21,387.5851,390.2391,386.2100,389.7200,1686153.0,388.7163,39877,0,0.005653,0.010654,0.007379,-0.007821,-0.013862,2.1349,0.5191,1.3751,4.0291
318645,TSLA,2025-11-21 15:50:00+00:00,2025-11-21,389.6750,390.1000,386.5000,388.9600,1114335.0,388.1680,30916,0,-0.001952,0.003701,0.008702,0.001940,-0.018649,-0.7150,0.4250,2.4600,3.6000
318646,TSLA,2025-11-21 15:55:00+00:00,2025-11-21,389.0600,389.0600,385.5109,385.8300,1044840.0,386.9534,30918,0,-0.008080,-0.010032,-0.004379,-0.002653,-0.027884,-3.2300,0.0000,0.3191,3.5491


## **2. Volatility Features**

### **1.2 Realised volatility (RV) over rolling windows**
- cascades happen in high-vol states, so knowing the vol regime is absolutely crucial

In [474]:
for w in [5, 10, 20, 50]:
    df[f"rv_{w}"] = g["logret"].apply(
        lambda x: np.sqrt((x**2).rolling(window=w).sum())
    )
df

Unnamed: 0,ticker,timestamp,date,open,high,low,close,volume,vwap,num_trades,...,logret_5,logret_10,body,upper_wick,lower_wick,hl_range,rv_5,rv_10,rv_20,rv_50
0,AAPL,2023-11-29 09:30:00+00:00,2023-11-29,190.1500,190.2500,190.1400,190.2500,7436.0,190.1900,141,...,,,0.1000,0.0000,0.0100,0.1100,,,,
1,AAPL,2023-11-29 09:40:00+00:00,2023-11-29,190.1600,190.1600,190.1600,190.1600,1696.0,190.1408,51,...,,,0.0000,0.0000,0.0000,0.0000,,,,
2,AAPL,2023-11-29 09:45:00+00:00,2023-11-29,190.1700,190.2000,190.1700,190.2000,3189.0,190.1975,42,...,,,0.0300,0.0000,0.0000,0.0300,,,,
3,AAPL,2023-11-29 09:50:00+00:00,2023-11-29,190.2900,190.2900,190.2800,190.2900,1764.0,190.2869,24,...,,,0.0000,0.0000,0.0100,0.0100,,,,
4,AAPL,2023-11-29 09:55:00+00:00,2023-11-29,190.2500,190.3000,190.2500,190.3000,1159.0,190.2794,34,...,,,0.0500,0.0000,0.0000,0.0500,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318643,TSLA,2025-11-21 15:40:00+00:00,2025-11-21,385.5600,388.8600,384.2400,387.5231,1787242.0,386.3321,46055,...,-0.014467,-0.031927,1.9631,1.3369,1.3200,4.6200,0.013641,0.019782,0.022584,0.026826
318644,TSLA,2025-11-21 15:45:00+00:00,2025-11-21,387.5851,390.2391,386.2100,389.7200,1686153.0,388.7163,39877,...,-0.007821,-0.013862,2.1349,0.5191,1.3751,4.0291,0.014732,0.016409,0.023066,0.027380
318645,TSLA,2025-11-21 15:50:00+00:00,2025-11-21,389.6750,390.1000,386.5000,388.9600,1114335.0,388.1680,30916,...,0.001940,-0.018649,-0.7150,0.4250,2.4600,3.6000,0.009147,0.016280,0.023142,0.027436
318646,TSLA,2025-11-21 15:55:00+00:00,2025-11-21,389.0600,389.0600,385.5109,385.8300,1044840.0,386.9534,30918,...,-0.002653,-0.027884,-3.2300,0.0000,0.3191,3.5491,0.011695,0.018138,0.024481,0.028518


### **2.2 Parkinson volatility, using high/low**
- Uses intrabar range, not just close to close between timestamps
$$ \sigma_{parkinson} = \sqrt{\frac{1}{4ln(2)} \sum{ln(\frac{h_i}{l_i})^2}}$$

In [475]:
df["parkinson_bar"] = (1 / (4 * np.log(2))) * (np.log(df["high"] / df["low"]))**2
df["parkinson_20"] = g["parkinson_bar"].apply(lambda s: s.rolling(20).mean())
df

Unnamed: 0,ticker,timestamp,date,open,high,low,close,volume,vwap,num_trades,...,body,upper_wick,lower_wick,hl_range,rv_5,rv_10,rv_20,rv_50,parkinson_bar,parkinson_20
0,AAPL,2023-11-29 09:30:00+00:00,2023-11-29,190.1500,190.2500,190.1400,190.2500,7436.0,190.1900,141,...,0.1000,0.0000,0.0100,0.1100,,,,,1.206429e-07,
1,AAPL,2023-11-29 09:40:00+00:00,2023-11-29,190.1600,190.1600,190.1600,190.1600,1696.0,190.1408,51,...,0.0000,0.0000,0.0000,0.0000,,,,,0.000000e+00,
2,AAPL,2023-11-29 09:45:00+00:00,2023-11-29,190.1700,190.2000,190.1700,190.2000,3189.0,190.1975,42,...,0.0300,0.0000,0.0000,0.0300,,,,,8.974382e-09,
3,AAPL,2023-11-29 09:50:00+00:00,2023-11-29,190.2900,190.2900,190.2800,190.2900,1764.0,190.2869,24,...,0.0000,0.0000,0.0100,0.0100,,,,,9.961057e-10,
4,AAPL,2023-11-29 09:55:00+00:00,2023-11-29,190.2500,190.3000,190.2500,190.3000,1159.0,190.2794,34,...,0.0500,0.0000,0.0000,0.0500,,,,,2.490526e-08,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318643,TSLA,2025-11-21 15:40:00+00:00,2025-11-21,385.5600,388.8600,384.2400,387.5231,1787242.0,386.3321,46055,...,1.9631,1.3369,1.3200,4.6200,0.013641,0.019782,0.022584,0.026826,5.152257e-05,0.000034
318644,TSLA,2025-11-21 15:45:00+00:00,2025-11-21,387.5851,390.2391,386.2100,389.7200,1686153.0,388.7163,39877,...,2.1349,0.5191,1.3751,4.0291,0.014732,0.016409,0.023066,0.027380,3.884834e-05,0.000035
318645,TSLA,2025-11-21 15:50:00+00:00,2025-11-21,389.6750,390.1000,386.5000,388.9600,1114335.0,388.1680,30916,...,-0.7150,0.4250,2.4600,3.6000,0.009147,0.016280,0.023142,0.027436,3.100209e-05,0.000037
318646,TSLA,2025-11-21 15:55:00+00:00,2025-11-21,389.0600,389.0600,385.5109,385.8300,1044840.0,386.9534,30918,...,-3.2300,0.0000,0.3191,3.5491,0.011695,0.018138,0.024481,0.028518,3.028968e-05,0.000038


### **2.3 ATR and compression**
- ATR (average true range) measures the typical size of a price bar, (the average high-low movement of recent bars)
- ATR tells you:
    1. How much the price typically moves per bar
    2. Whether volatility is high or low
    3. Whether the market is expanding or contracting in activity
    4. How active/stressed/quiet the market really is
- High ATR -> high intraday volatility

- Compression is essentially low-term volatility compared to long-term volatility
- Detect it using a ratio:
$$ \text{compression} = \frac{\text{ATR}_{5}}{\text{ATR}_{50}}$$

$$ \text{compression} = \frac{\text{rv}_{5}}{\text{rv}_{50}} $$
- compression << 1, price is moving much less than usual (typical right before the market breaks out)
- compression >> 1, volatility spike or explosive move is already happening

In [476]:
df["atr_5"] = g["hl_range"].apply(lambda s: s.rolling(5).mean())
df["atr_50"] = g["hl_range"].apply(lambda s: s.rolling(50).mean())

df["compression_atr"] = df["atr_5"] / df["atr_50"]
df

Unnamed: 0,ticker,timestamp,date,open,high,low,close,volume,vwap,num_trades,...,hl_range,rv_5,rv_10,rv_20,rv_50,parkinson_bar,parkinson_20,atr_5,atr_50,compression_atr
0,AAPL,2023-11-29 09:30:00+00:00,2023-11-29,190.1500,190.2500,190.1400,190.2500,7436.0,190.1900,141,...,0.1100,,,,,1.206429e-07,,,,
1,AAPL,2023-11-29 09:40:00+00:00,2023-11-29,190.1600,190.1600,190.1600,190.1600,1696.0,190.1408,51,...,0.0000,,,,,0.000000e+00,,,,
2,AAPL,2023-11-29 09:45:00+00:00,2023-11-29,190.1700,190.2000,190.1700,190.2000,3189.0,190.1975,42,...,0.0300,,,,,8.974382e-09,,,,
3,AAPL,2023-11-29 09:50:00+00:00,2023-11-29,190.2900,190.2900,190.2800,190.2900,1764.0,190.2869,24,...,0.0100,,,,,9.961057e-10,,,,
4,AAPL,2023-11-29 09:55:00+00:00,2023-11-29,190.2500,190.3000,190.2500,190.3000,1159.0,190.2794,34,...,0.0500,,,,,2.490526e-08,,0.04000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318643,TSLA,2025-11-21 15:40:00+00:00,2025-11-21,385.5600,388.8600,384.2400,387.5231,1787242.0,386.3321,46055,...,4.6200,0.013641,0.019782,0.022584,0.026826,5.152257e-05,0.000034,4.18570,2.392430,1.749560
318644,TSLA,2025-11-21 15:45:00+00:00,2025-11-21,387.5851,390.2391,386.2100,389.7200,1686153.0,388.7163,39877,...,4.0291,0.014732,0.016409,0.023066,0.027380,3.884834e-05,0.000035,4.08142,2.452012,1.664519
318645,TSLA,2025-11-21 15:50:00+00:00,2025-11-21,389.6750,390.1000,386.5000,388.9600,1114335.0,388.1680,30916,...,3.6000,0.009147,0.016280,0.023142,0.027436,3.100209e-05,0.000037,3.62542,2.499412,1.450509
318646,TSLA,2025-11-21 15:55:00+00:00,2025-11-21,389.0600,389.0600,385.5109,385.8300,1044840.0,386.9534,30918,...,3.5491,0.011695,0.018138,0.024481,0.028518,3.028968e-05,0.000038,3.65364,2.552994,1.431120


## **3. Volume and trade-count features**

### **3.1 Rolling average volume and z-score**

In [477]:
for w in [5, 20, 50]:
    df[f"volume_ma_{w}"] = g["volume"].apply(lambda s: s.rolling(w).mean())
    df[f"volume_z_{w}"]  = g["volume"].apply(lambda s: (s - s.rolling(w).mean()) / s.rolling(w).std())

df

Unnamed: 0,ticker,timestamp,date,open,high,low,close,volume,vwap,num_trades,...,parkinson_20,atr_5,atr_50,compression_atr,volume_ma_5,volume_z_5,volume_ma_20,volume_z_20,volume_ma_50,volume_z_50
0,AAPL,2023-11-29 09:30:00+00:00,2023-11-29,190.1500,190.2500,190.1400,190.2500,7436.0,190.1900,141,...,,,,,,,,,,
1,AAPL,2023-11-29 09:40:00+00:00,2023-11-29,190.1600,190.1600,190.1600,190.1600,1696.0,190.1408,51,...,,,,,,,,,,
2,AAPL,2023-11-29 09:45:00+00:00,2023-11-29,190.1700,190.2000,190.1700,190.2000,3189.0,190.1975,42,...,,,,,,,,,,
3,AAPL,2023-11-29 09:50:00+00:00,2023-11-29,190.2900,190.2900,190.2800,190.2900,1764.0,190.2869,24,...,,,,,,,,,,
4,AAPL,2023-11-29 09:55:00+00:00,2023-11-29,190.2500,190.3000,190.2500,190.3000,1159.0,190.2794,34,...,,0.04000,,,3048.8,-0.736731,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318643,TSLA,2025-11-21 15:40:00+00:00,2025-11-21,385.5600,388.8600,384.2400,387.5231,1787242.0,386.3321,46055,...,0.000034,4.18570,2.392430,1.749560,1944728.2,-0.557152,1424436.70,0.411562,617011.02,1.343054
318644,TSLA,2025-11-21 15:45:00+00:00,2025-11-21,387.5851,390.2391,386.2100,389.7200,1686153.0,388.7163,39877,...,0.000035,4.08142,2.452012,1.664519,1803649.6,-0.795364,1505602.00,0.219566,650570.26,1.177244
318645,TSLA,2025-11-21 15:50:00+00:00,2025-11-21,389.6750,390.1000,386.5000,388.9600,1114335.0,388.1680,30916,...,0.000037,3.62542,2.499412,1.450509,1676190.2,-1.624456,1558417.10,-0.587672,672614.40,0.503587
318646,TSLA,2025-11-21 15:55:00+00:00,2025-11-21,389.0600,389.0600,385.5109,385.8300,1044840.0,386.9534,30918,...,0.000038,3.65364,2.552994,1.431120,1538464.0,-1.118561,1608408.05,-0.829375,693267.34,0.402520


### **3.2 Trade-count features**
- Some may be quiet big trades, others may be lots of small trades

In [478]:
for w in [5, 20]:
    df[f"trades_ma_{w}"] = g["num_trades"].apply(lambda s: s.rolling(w).mean())
    df[f"trades_z_{w}"]  = (
        (df["num_trades"] - df[f"trades_ma_{w}"])
        / g["num_trades"].apply(lambda s: s.rolling(w).std())
    )
df

Unnamed: 0,ticker,timestamp,date,open,high,low,close,volume,vwap,num_trades,...,volume_ma_5,volume_z_5,volume_ma_20,volume_z_20,volume_ma_50,volume_z_50,trades_ma_5,trades_z_5,trades_ma_20,trades_z_20
0,AAPL,2023-11-29 09:30:00+00:00,2023-11-29,190.1500,190.2500,190.1400,190.2500,7436.0,190.1900,141,...,,,,,,,,,,
1,AAPL,2023-11-29 09:40:00+00:00,2023-11-29,190.1600,190.1600,190.1600,190.1600,1696.0,190.1408,51,...,,,,,,,,,,
2,AAPL,2023-11-29 09:45:00+00:00,2023-11-29,190.1700,190.2000,190.1700,190.2000,3189.0,190.1975,42,...,,,,,,,,,,
3,AAPL,2023-11-29 09:50:00+00:00,2023-11-29,190.2900,190.2900,190.2800,190.2900,1764.0,190.2869,24,...,,,,,,,,,,
4,AAPL,2023-11-29 09:55:00+00:00,2023-11-29,190.2500,190.3000,190.2500,190.3000,1159.0,190.2794,34,...,3048.8,-0.736731,,,,,58.4,-0.516548,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318643,TSLA,2025-11-21 15:40:00+00:00,2025-11-21,385.5600,388.8600,384.2400,387.5231,1787242.0,386.3321,46055,...,1944728.2,-0.557152,1424436.70,0.411562,617011.02,1.343054,49518.6,-0.489017,33307.05,0.638734
318644,TSLA,2025-11-21 15:45:00+00:00,2025-11-21,387.5851,390.2391,386.2100,389.7200,1686153.0,388.7163,39877,...,1803649.6,-0.795364,1505602.00,0.219566,650570.26,1.177244,45452.0,-1.134266,35212.85,0.251328
318645,TSLA,2025-11-21 15:50:00+00:00,2025-11-21,389.6750,390.1000,386.5000,388.9600,1114335.0,388.1680,30916,...,1676190.2,-1.624456,1558417.10,-0.587672,672614.40,0.503587,42642.2,-1.432009,36683.45,-0.342663
318646,TSLA,2025-11-21 15:55:00+00:00,2025-11-21,389.0600,389.0600,385.5109,385.8300,1044840.0,386.9534,30918,...,1538464.0,-1.118561,1608408.05,-0.829375,693267.34,0.402520,40190.0,-0.957203,38165.50,-0.492278


### **3.3 Volume/trade efficiency**
- How much price moves per unit of activity gives us a signal for market liquidity

In [479]:
df["price_move_abs"] = df["hl_range"]  # or abs(body)
df["volume_per_trade"]  = df["volume"] / (df["num_trades"].replace(0, np.nan))
df["move_per_volume"]   = df["price_move_abs"] / df["volume"].replace(0, np.nan)
df["move_per_trade"] = df["price_move_abs"] / df["num_trades"].replace(0, np.nan)
df

Unnamed: 0,ticker,timestamp,date,open,high,low,close,volume,vwap,num_trades,...,volume_ma_50,volume_z_50,trades_ma_5,trades_z_5,trades_ma_20,trades_z_20,price_move_abs,volume_per_trade,move_per_volume,move_per_trade
0,AAPL,2023-11-29 09:30:00+00:00,2023-11-29,190.1500,190.2500,190.1400,190.2500,7436.0,190.1900,141,...,,,,,,,0.1100,52.737589,0.000015,0.000780
1,AAPL,2023-11-29 09:40:00+00:00,2023-11-29,190.1600,190.1600,190.1600,190.1600,1696.0,190.1408,51,...,,,,,,,0.0000,33.254902,0.000000,0.000000
2,AAPL,2023-11-29 09:45:00+00:00,2023-11-29,190.1700,190.2000,190.1700,190.2000,3189.0,190.1975,42,...,,,,,,,0.0300,75.928571,0.000009,0.000714
3,AAPL,2023-11-29 09:50:00+00:00,2023-11-29,190.2900,190.2900,190.2800,190.2900,1764.0,190.2869,24,...,,,,,,,0.0100,73.500000,0.000006,0.000417
4,AAPL,2023-11-29 09:55:00+00:00,2023-11-29,190.2500,190.3000,190.2500,190.3000,1159.0,190.2794,34,...,,,58.4,-0.516548,,,0.0500,34.088235,0.000043,0.001471
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318643,TSLA,2025-11-21 15:40:00+00:00,2025-11-21,385.5600,388.8600,384.2400,387.5231,1787242.0,386.3321,46055,...,617011.02,1.343054,49518.6,-0.489017,33307.05,0.638734,4.6200,38.806688,0.000003,0.000100
318644,TSLA,2025-11-21 15:45:00+00:00,2025-11-21,387.5851,390.2391,386.2100,389.7200,1686153.0,388.7163,39877,...,650570.26,1.177244,45452.0,-1.134266,35212.85,0.251328,4.0291,42.283848,0.000002,0.000101
318645,TSLA,2025-11-21 15:50:00+00:00,2025-11-21,389.6750,390.1000,386.5000,388.9600,1114335.0,388.1680,30916,...,672614.40,0.503587,42642.2,-1.432009,36683.45,-0.342663,3.6000,36.043958,0.000003,0.000116
318646,TSLA,2025-11-21 15:55:00+00:00,2025-11-21,389.0600,389.0600,385.5109,385.8300,1044840.0,386.9534,30918,...,693267.34,0.402520,40190.0,-0.957203,38165.50,-0.492278,3.5491,33.793906,0.000003,0.000115


## **4. Microstructure features**

### **4.1 Directional efficiency**
$$ \text{efficiency} = \frac{\text{close} - \text{open}}{\text{high} - \text{low}}$$
- interpretation:
    - +1 -> bar was straight up, no variation within the bar
    - -1 -> bar was straight down
    - 0 -> bar was choppy
- high magnitude efficiency means uni-directional, aggessive order flow:
    - useful for cascade analysis beacuse they happen when one side becomes extremely aggressive (stop loss orders)
    - this would mean they happen around several bars in a row having high directional efficiency


In [480]:
df["efficiency"] = df["body"] / df["hl_range"].replace(0, np.nan)
df

Unnamed: 0,ticker,timestamp,date,open,high,low,close,volume,vwap,num_trades,...,volume_z_50,trades_ma_5,trades_z_5,trades_ma_20,trades_z_20,price_move_abs,volume_per_trade,move_per_volume,move_per_trade,efficiency
0,AAPL,2023-11-29 09:30:00+00:00,2023-11-29,190.1500,190.2500,190.1400,190.2500,7436.0,190.1900,141,...,,,,,,0.1100,52.737589,0.000015,0.000780,0.909091
1,AAPL,2023-11-29 09:40:00+00:00,2023-11-29,190.1600,190.1600,190.1600,190.1600,1696.0,190.1408,51,...,,,,,,0.0000,33.254902,0.000000,0.000000,
2,AAPL,2023-11-29 09:45:00+00:00,2023-11-29,190.1700,190.2000,190.1700,190.2000,3189.0,190.1975,42,...,,,,,,0.0300,75.928571,0.000009,0.000714,1.000000
3,AAPL,2023-11-29 09:50:00+00:00,2023-11-29,190.2900,190.2900,190.2800,190.2900,1764.0,190.2869,24,...,,,,,,0.0100,73.500000,0.000006,0.000417,0.000000
4,AAPL,2023-11-29 09:55:00+00:00,2023-11-29,190.2500,190.3000,190.2500,190.3000,1159.0,190.2794,34,...,,58.4,-0.516548,,,0.0500,34.088235,0.000043,0.001471,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318643,TSLA,2025-11-21 15:40:00+00:00,2025-11-21,385.5600,388.8600,384.2400,387.5231,1787242.0,386.3321,46055,...,1.343054,49518.6,-0.489017,33307.05,0.638734,4.6200,38.806688,0.000003,0.000100,0.424913
318644,TSLA,2025-11-21 15:45:00+00:00,2025-11-21,387.5851,390.2391,386.2100,389.7200,1686153.0,388.7163,39877,...,1.177244,45452.0,-1.134266,35212.85,0.251328,4.0291,42.283848,0.000002,0.000101,0.529870
318645,TSLA,2025-11-21 15:50:00+00:00,2025-11-21,389.6750,390.1000,386.5000,388.9600,1114335.0,388.1680,30916,...,0.503587,42642.2,-1.432009,36683.45,-0.342663,3.6000,36.043958,0.000003,0.000116,-0.198611
318646,TSLA,2025-11-21 15:55:00+00:00,2025-11-21,389.0600,389.0600,385.5109,385.8300,1044840.0,386.9534,30918,...,0.402520,40190.0,-0.957203,38165.50,-0.492278,3.5491,33.793906,0.000003,0.000115,-0.910090


### **4.2 Impact Proxy**
$$ \lambda = \frac{|\text{return}|}{\text{volume}} $$
- Interpretation:
    - high lambda -> big price move with small volume
    - indicates thin liquidity/fragile book
    - a small sell can trigger a cascade
- Cascades happen in regimes where liquidity is low -> price usually jumps unusually far per unit volume

In [481]:
df["impact_lambda"] = np.abs(df["logret"]) / df["volume"].replace(0, np.nan)
df

Unnamed: 0,ticker,timestamp,date,open,high,low,close,volume,vwap,num_trades,...,trades_ma_5,trades_z_5,trades_ma_20,trades_z_20,price_move_abs,volume_per_trade,move_per_volume,move_per_trade,efficiency,impact_lambda
0,AAPL,2023-11-29 09:30:00+00:00,2023-11-29,190.1500,190.2500,190.1400,190.2500,7436.0,190.1900,141,...,,,,,0.1100,52.737589,0.000015,0.000780,0.909091,
1,AAPL,2023-11-29 09:40:00+00:00,2023-11-29,190.1600,190.1600,190.1600,190.1600,1696.0,190.1408,51,...,,,,,0.0000,33.254902,0.000000,0.000000,,2.789939e-07
2,AAPL,2023-11-29 09:45:00+00:00,2023-11-29,190.1700,190.2000,190.1700,190.2000,3189.0,190.1975,42,...,,,,,0.0300,75.928571,0.000009,0.000714,1.000000,6.595392e-08
3,AAPL,2023-11-29 09:50:00+00:00,2023-11-29,190.2900,190.2900,190.2800,190.2900,1764.0,190.2869,24,...,,,,,0.0100,73.500000,0.000006,0.000417,0.000000,2.681827e-07
4,AAPL,2023-11-29 09:55:00+00:00,2023-11-29,190.2500,190.3000,190.2500,190.3000,1159.0,190.2794,34,...,58.4,-0.516548,,,0.0500,34.088235,0.000043,0.001471,1.000000,4.534080e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318643,TSLA,2025-11-21 15:40:00+00:00,2025-11-21,385.5600,388.8600,384.2400,387.5231,1787242.0,386.3321,46055,...,49518.6,-0.489017,33307.05,0.638734,4.6200,38.806688,0.000003,0.000100,0.424913,2.798072e-09
318644,TSLA,2025-11-21 15:45:00+00:00,2025-11-21,387.5851,390.2391,386.2100,389.7200,1686153.0,388.7163,39877,...,45452.0,-1.134266,35212.85,0.251328,4.0291,42.283848,0.000002,0.000101,0.529870,3.352645e-09
318645,TSLA,2025-11-21 15:50:00+00:00,2025-11-21,389.6750,390.1000,386.5000,388.9600,1114335.0,388.1680,30916,...,42642.2,-1.432009,36683.45,-0.342663,3.6000,36.043958,0.000003,0.000116,-0.198611,1.751737e-09
318646,TSLA,2025-11-21 15:55:00+00:00,2025-11-21,389.0600,389.0600,385.5109,385.8300,1044840.0,386.9534,30918,...,40190.0,-0.957203,38165.50,-0.492278,3.5491,33.793906,0.000003,0.000115,-0.910090,7.732909e-09


### **4.3 Signed volume**
$$ \text{signed volume} = sign(\text{close} - \text{open}) * \text{volume} $$
- Interpretation:
    - measures direction + intensity of order flow
    - many consecutive positive signed-volume bars -> strong buyer pressure
    - many negative volume bars -> sustained sell pressure (pre cascade)
- Before a sharp drop, seller typically dominate order flow for 10-30 mins

In [482]:
df["signed_volume"] = df["volume"] * np.sign(df["body"])
df

Unnamed: 0,ticker,timestamp,date,open,high,low,close,volume,vwap,num_trades,...,trades_z_5,trades_ma_20,trades_z_20,price_move_abs,volume_per_trade,move_per_volume,move_per_trade,efficiency,impact_lambda,signed_volume
0,AAPL,2023-11-29 09:30:00+00:00,2023-11-29,190.1500,190.2500,190.1400,190.2500,7436.0,190.1900,141,...,,,,0.1100,52.737589,0.000015,0.000780,0.909091,,7436.0
1,AAPL,2023-11-29 09:40:00+00:00,2023-11-29,190.1600,190.1600,190.1600,190.1600,1696.0,190.1408,51,...,,,,0.0000,33.254902,0.000000,0.000000,,2.789939e-07,0.0
2,AAPL,2023-11-29 09:45:00+00:00,2023-11-29,190.1700,190.2000,190.1700,190.2000,3189.0,190.1975,42,...,,,,0.0300,75.928571,0.000009,0.000714,1.000000,6.595392e-08,3189.0
3,AAPL,2023-11-29 09:50:00+00:00,2023-11-29,190.2900,190.2900,190.2800,190.2900,1764.0,190.2869,24,...,,,,0.0100,73.500000,0.000006,0.000417,0.000000,2.681827e-07,0.0
4,AAPL,2023-11-29 09:55:00+00:00,2023-11-29,190.2500,190.3000,190.2500,190.3000,1159.0,190.2794,34,...,-0.516548,,,0.0500,34.088235,0.000043,0.001471,1.000000,4.534080e-08,1159.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318643,TSLA,2025-11-21 15:40:00+00:00,2025-11-21,385.5600,388.8600,384.2400,387.5231,1787242.0,386.3321,46055,...,-0.489017,33307.05,0.638734,4.6200,38.806688,0.000003,0.000100,0.424913,2.798072e-09,1787242.0
318644,TSLA,2025-11-21 15:45:00+00:00,2025-11-21,387.5851,390.2391,386.2100,389.7200,1686153.0,388.7163,39877,...,-1.134266,35212.85,0.251328,4.0291,42.283848,0.000002,0.000101,0.529870,3.352645e-09,1686153.0
318645,TSLA,2025-11-21 15:50:00+00:00,2025-11-21,389.6750,390.1000,386.5000,388.9600,1114335.0,388.1680,30916,...,-1.432009,36683.45,-0.342663,3.6000,36.043958,0.000003,0.000116,-0.198611,1.751737e-09,-1114335.0
318646,TSLA,2025-11-21 15:55:00+00:00,2025-11-21,389.0600,389.0600,385.5109,385.8300,1044840.0,386.9534,30918,...,-0.957203,38165.50,-0.492278,3.5491,33.793906,0.000003,0.000115,-0.910090,7.732909e-09,-1044840.0


## **5. VWAP based features**

### **5.1 Distance from VWAP**
- volume weighted average price
- cascades often happen when price is far from VWAP, and VWAP reversion fails

In [483]:
df["dist_vwap"] = df["close"] - df["vwap"]
df["dist_vwap_pct"] = df["dist_vwap"] / df["vwap"]
df

Unnamed: 0,ticker,timestamp,date,open,high,low,close,volume,vwap,num_trades,...,trades_z_20,price_move_abs,volume_per_trade,move_per_volume,move_per_trade,efficiency,impact_lambda,signed_volume,dist_vwap,dist_vwap_pct
0,AAPL,2023-11-29 09:30:00+00:00,2023-11-29,190.1500,190.2500,190.1400,190.2500,7436.0,190.1900,141,...,,0.1100,52.737589,0.000015,0.000780,0.909091,,7436.0,0.0600,0.000315
1,AAPL,2023-11-29 09:40:00+00:00,2023-11-29,190.1600,190.1600,190.1600,190.1600,1696.0,190.1408,51,...,,0.0000,33.254902,0.000000,0.000000,,2.789939e-07,0.0,0.0192,0.000101
2,AAPL,2023-11-29 09:45:00+00:00,2023-11-29,190.1700,190.2000,190.1700,190.2000,3189.0,190.1975,42,...,,0.0300,75.928571,0.000009,0.000714,1.000000,6.595392e-08,3189.0,0.0025,0.000013
3,AAPL,2023-11-29 09:50:00+00:00,2023-11-29,190.2900,190.2900,190.2800,190.2900,1764.0,190.2869,24,...,,0.0100,73.500000,0.000006,0.000417,0.000000,2.681827e-07,0.0,0.0031,0.000016
4,AAPL,2023-11-29 09:55:00+00:00,2023-11-29,190.2500,190.3000,190.2500,190.3000,1159.0,190.2794,34,...,,0.0500,34.088235,0.000043,0.001471,1.000000,4.534080e-08,1159.0,0.0206,0.000108
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318643,TSLA,2025-11-21 15:40:00+00:00,2025-11-21,385.5600,388.8600,384.2400,387.5231,1787242.0,386.3321,46055,...,0.638734,4.6200,38.806688,0.000003,0.000100,0.424913,2.798072e-09,1787242.0,1.1910,0.003083
318644,TSLA,2025-11-21 15:45:00+00:00,2025-11-21,387.5851,390.2391,386.2100,389.7200,1686153.0,388.7163,39877,...,0.251328,4.0291,42.283848,0.000002,0.000101,0.529870,3.352645e-09,1686153.0,1.0037,0.002582
318645,TSLA,2025-11-21 15:50:00+00:00,2025-11-21,389.6750,390.1000,386.5000,388.9600,1114335.0,388.1680,30916,...,-0.342663,3.6000,36.043958,0.000003,0.000116,-0.198611,1.751737e-09,-1114335.0,0.7920,0.002040
318646,TSLA,2025-11-21 15:55:00+00:00,2025-11-21,389.0600,389.0600,385.5109,385.8300,1044840.0,386.9534,30918,...,-0.492278,3.5491,33.793906,0.000003,0.000115,-0.910090,7.732909e-09,-1044840.0,-1.1234,-0.002903


### **5.2 VWAP slop and curvature**
- Strong negative slope + stretched VWAP_pct signifies a dangerous environment

In [484]:
df["vwap_slope"] = g["vwap"].apply(lambda x: x.diff())
df["vwap_slope2"] = g["vwap_slope"].apply(lambda x: x.diff())  # acceleration
df

Unnamed: 0,ticker,timestamp,date,open,high,low,close,volume,vwap,num_trades,...,volume_per_trade,move_per_volume,move_per_trade,efficiency,impact_lambda,signed_volume,dist_vwap,dist_vwap_pct,vwap_slope,vwap_slope2
0,AAPL,2023-11-29 09:30:00+00:00,2023-11-29,190.1500,190.2500,190.1400,190.2500,7436.0,190.1900,141,...,52.737589,0.000015,0.000780,0.909091,,7436.0,0.0600,0.000315,,
1,AAPL,2023-11-29 09:40:00+00:00,2023-11-29,190.1600,190.1600,190.1600,190.1600,1696.0,190.1408,51,...,33.254902,0.000000,0.000000,,2.789939e-07,0.0,0.0192,0.000101,-0.0492,
2,AAPL,2023-11-29 09:45:00+00:00,2023-11-29,190.1700,190.2000,190.1700,190.2000,3189.0,190.1975,42,...,75.928571,0.000009,0.000714,1.000000,6.595392e-08,3189.0,0.0025,0.000013,0.0567,0.1059
3,AAPL,2023-11-29 09:50:00+00:00,2023-11-29,190.2900,190.2900,190.2800,190.2900,1764.0,190.2869,24,...,73.500000,0.000006,0.000417,0.000000,2.681827e-07,0.0,0.0031,0.000016,0.0894,0.0327
4,AAPL,2023-11-29 09:55:00+00:00,2023-11-29,190.2500,190.3000,190.2500,190.3000,1159.0,190.2794,34,...,34.088235,0.000043,0.001471,1.000000,4.534080e-08,1159.0,0.0206,0.000108,-0.0075,-0.0969
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318643,TSLA,2025-11-21 15:40:00+00:00,2025-11-21,385.5600,388.8600,384.2400,387.5231,1787242.0,386.3321,46055,...,38.806688,0.000003,0.000100,0.424913,2.798072e-09,1787242.0,1.1910,0.003083,0.6112,3.0810
318644,TSLA,2025-11-21 15:45:00+00:00,2025-11-21,387.5851,390.2391,386.2100,389.7200,1686153.0,388.7163,39877,...,42.283848,0.000002,0.000101,0.529870,3.352645e-09,1686153.0,1.0037,0.002582,2.3842,1.7730
318645,TSLA,2025-11-21 15:50:00+00:00,2025-11-21,389.6750,390.1000,386.5000,388.9600,1114335.0,388.1680,30916,...,36.043958,0.000003,0.000116,-0.198611,1.751737e-09,-1114335.0,0.7920,0.002040,-0.5483,-2.9325
318646,TSLA,2025-11-21 15:55:00+00:00,2025-11-21,389.0600,389.0600,385.5109,385.8300,1044840.0,386.9534,30918,...,33.793906,0.000003,0.000115,-0.910090,7.732909e-09,-1044840.0,-1.1234,-0.002903,-1.2146,-0.6663


## **6. Marget regime and cross asset context**

### **6.1 Pivot index returns by timestamp**
- Use SPY/QQQ/IWM as context for every ticker

In [485]:
df_reset = df.reset_index(drop=True)

idx_rets = (
    df_reset[df_reset["ticker"].isin(["SPY", "QQQ"])]
    .pivot(index="timestamp", columns="ticker", values="logret")
)

idx_rets.columns = [f"logret_{c}" for c in idx_rets.columns]

df_reset = df_reset.merge(idx_rets, on="timestamp", how="left")
df = df_reset.sort_values(["ticker", "timestamp"]).reset_index(drop=True)
g = df.groupby("ticker", group_keys=False)
df

Unnamed: 0,ticker,timestamp,date,open,high,low,close,volume,vwap,num_trades,...,move_per_trade,efficiency,impact_lambda,signed_volume,dist_vwap,dist_vwap_pct,vwap_slope,vwap_slope2,logret_QQQ,logret_SPY
0,AAPL,2023-11-29 09:30:00+00:00,2023-11-29,190.1500,190.2500,190.1400,190.2500,7436.0,190.1900,141,...,0.000780,0.909091,,7436.0,0.0600,0.000315,,,0.005772,0.002633
1,AAPL,2023-11-29 09:40:00+00:00,2023-11-29,190.1600,190.1600,190.1600,190.1600,1696.0,190.1408,51,...,0.000000,,2.789939e-07,0.0,0.0192,0.000101,-0.0492,,-0.000485,-0.000373
2,AAPL,2023-11-29 09:45:00+00:00,2023-11-29,190.1700,190.2000,190.1700,190.2000,3189.0,190.1975,42,...,0.000714,1.000000,6.595392e-08,3189.0,0.0025,0.000013,0.0567,0.1059,0.000689,0.000438
3,AAPL,2023-11-29 09:50:00+00:00,2023-11-29,190.2900,190.2900,190.2800,190.2900,1764.0,190.2869,24,...,0.000417,0.000000,2.681827e-07,0.0,0.0031,0.000016,0.0894,0.0327,0.000383,0.000394
4,AAPL,2023-11-29 09:55:00+00:00,2023-11-29,190.2500,190.3000,190.2500,190.3000,1159.0,190.2794,34,...,0.001471,1.000000,4.534080e-08,1159.0,0.0206,0.000108,-0.0075,-0.0969,-0.000357,-0.000175
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318643,TSLA,2025-11-21 15:40:00+00:00,2025-11-21,385.5600,388.8600,384.2400,387.5231,1787242.0,386.3321,46055,...,0.000100,0.424913,2.798072e-09,1787242.0,1.1910,0.003083,0.6112,3.0810,0.003287,0.002177
318644,TSLA,2025-11-21 15:45:00+00:00,2025-11-21,387.5851,390.2391,386.2100,389.7200,1686153.0,388.7163,39877,...,0.000101,0.529870,3.352645e-09,1686153.0,1.0037,0.002582,2.3842,1.7730,0.002568,0.001867
318645,TSLA,2025-11-21 15:50:00+00:00,2025-11-21,389.6750,390.1000,386.5000,388.9600,1114335.0,388.1680,30916,...,0.000116,-0.198611,1.751737e-09,-1114335.0,0.7920,0.002040,-0.5483,-2.9325,-0.000299,0.000084
318646,TSLA,2025-11-21 15:55:00+00:00,2025-11-21,389.0600,389.0600,385.5109,385.8300,1044840.0,386.9534,30918,...,0.000115,-0.910090,7.732909e-09,-1044840.0,-1.1234,-0.002903,-1.2146,-0.6663,-0.002664,-0.002074


### **6.2 Relative return relative to QQQ/SPY/IWM**
- If a certain ticker drops more than SPY, this shows true weakness, crowding and that it is vulnerable to cascades
- If AAPL holds stead while SPY drops, this shows relative strength relative to SPY, so lower risk of cascades

In [486]:
for ticker in ["SPY", "QQQ"]:
    df[f"rel_ret_{ticker}"] = df["logret"] - df[f"logret_{ticker}"]
df

Unnamed: 0,ticker,timestamp,date,open,high,low,close,volume,vwap,num_trades,...,impact_lambda,signed_volume,dist_vwap,dist_vwap_pct,vwap_slope,vwap_slope2,logret_QQQ,logret_SPY,rel_ret_SPY,rel_ret_QQQ
0,AAPL,2023-11-29 09:30:00+00:00,2023-11-29,190.1500,190.2500,190.1400,190.2500,7436.0,190.1900,141,...,,7436.0,0.0600,0.000315,,,0.005772,0.002633,,
1,AAPL,2023-11-29 09:40:00+00:00,2023-11-29,190.1600,190.1600,190.1600,190.1600,1696.0,190.1408,51,...,2.789939e-07,0.0,0.0192,0.000101,-0.0492,,-0.000485,-0.000373,-0.000101,0.000012
2,AAPL,2023-11-29 09:45:00+00:00,2023-11-29,190.1700,190.2000,190.1700,190.2000,3189.0,190.1975,42,...,6.595392e-08,3189.0,0.0025,0.000013,0.0567,0.1059,0.000689,0.000438,-0.000228,-0.000479
3,AAPL,2023-11-29 09:50:00+00:00,2023-11-29,190.2900,190.2900,190.2800,190.2900,1764.0,190.2869,24,...,2.681827e-07,0.0,0.0031,0.000016,0.0894,0.0327,0.000383,0.000394,0.000079,0.000090
4,AAPL,2023-11-29 09:55:00+00:00,2023-11-29,190.2500,190.3000,190.2500,190.3000,1159.0,190.2794,34,...,4.534080e-08,1159.0,0.0206,0.000108,-0.0075,-0.0969,-0.000357,-0.000175,0.000228,0.000410
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318643,TSLA,2025-11-21 15:40:00+00:00,2025-11-21,385.5600,388.8600,384.2400,387.5231,1787242.0,386.3321,46055,...,2.798072e-09,1787242.0,1.1910,0.003083,0.6112,3.0810,0.003287,0.002177,0.002823,0.001714
318644,TSLA,2025-11-21 15:45:00+00:00,2025-11-21,387.5851,390.2391,386.2100,389.7200,1686153.0,388.7163,39877,...,3.352645e-09,1686153.0,1.0037,0.002582,2.3842,1.7730,0.002568,0.001867,0.003786,0.003085
318645,TSLA,2025-11-21 15:50:00+00:00,2025-11-21,389.6750,390.1000,386.5000,388.9600,1114335.0,388.1680,30916,...,1.751737e-09,-1114335.0,0.7920,0.002040,-0.5483,-2.9325,-0.000299,0.000084,-0.002036,-0.001653
318646,TSLA,2025-11-21 15:55:00+00:00,2025-11-21,389.0600,389.0600,385.5109,385.8300,1044840.0,386.9534,30918,...,7.732909e-09,-1044840.0,-1.1234,-0.002903,-1.2146,-0.6663,-0.002664,-0.002074,-0.006006,-0.005416


### **6.3 Rolling correlation with SPY and QQQ**
- Correlation spikes during stress/panic/liquidity withdrawl/etc...
- Ie. when correlation rises, shocks propagate more easily, cascades become systematic, not isolated

In [487]:
for ticker in ["SPY", "QQQ"]:
    df[f"corr_{ticker}"] = (
    df.groupby("ticker", group_keys=False)[["logret", f"logret_{ticker}"]]
    .apply(lambda sub: sub["logret"].rolling(50).corr(sub[f"logret_{ticker}"]))
    )
df[df.ticker == "SPY"]

Unnamed: 0,ticker,timestamp,date,open,high,low,close,volume,vwap,num_trades,...,dist_vwap,dist_vwap_pct,vwap_slope,vwap_slope2,logret_QQQ,logret_SPY,rel_ret_SPY,rel_ret_QQQ,corr_SPY,corr_QQQ
246381,SPY,2023-11-27 09:35:00+00:00,2023-11-27,454.23,454.23,454.23,454.230,720.0,454.2475,16,...,-0.0175,-0.000039,,,-0.000489,,,,,
246382,SPY,2023-11-27 09:40:00+00:00,2023-11-27,454.17,454.19,454.17,454.190,1098.0,454.1870,14,...,0.0030,0.000007,-0.0605,,0.000129,-0.000088,0.0,-0.000217,,
246383,SPY,2023-11-27 09:50:00+00:00,2023-11-27,454.17,454.17,454.17,454.170,1213.0,454.1665,15,...,0.0035,0.000008,-0.0205,0.0400,-0.000360,-0.000044,0.0,0.000316,,
246384,SPY,2023-11-27 09:55:00+00:00,2023-11-27,454.11,454.11,454.00,454.000,1113.0,454.0668,26,...,-0.0668,-0.000147,-0.0997,-0.0792,-0.000360,-0.000374,0.0,-0.000014,,
246385,SPY,2023-11-27 10:00:00+00:00,2023-11-27,454.07,454.07,454.07,454.070,202.0,454.0665,10,...,0.0035,0.000008,-0.0003,0.0994,0.000360,0.000154,0.0,-0.000206,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283877,SPY,2025-11-21 15:40:00+00:00,2025-11-21,651.41,653.32,650.94,652.850,2073232.0,651.9802,36505,...,0.8698,0.001334,0.3971,2.0289,0.003287,0.002177,0.0,-0.001109,1.0,0.979783
283878,SPY,2025-11-21 15:45:00+00:00,2025-11-21,652.87,654.32,652.38,654.070,1241401.0,653.3370,21925,...,0.7330,0.001122,1.3568,0.9597,0.002568,0.001867,0.0,-0.000701,1.0,0.980885
283879,SPY,2025-11-21 15:50:00+00:00,2025-11-21,654.05,654.39,652.63,654.125,1207024.0,653.2768,18238,...,0.8482,0.001298,-0.0602,-1.4170,-0.000299,0.000084,0.0,0.000383,1.0,0.980569
283880,SPY,2025-11-21 15:55:00+00:00,2025-11-21,654.14,654.39,652.70,652.770,1420697.0,653.4283,15294,...,-0.6583,-0.001007,0.1515,0.2117,-0.002664,-0.002074,0.0,0.000590,1.0,0.980698


### **6.4 Market breadth**
- how many tickers are down this bar
- When lots of tickers have negative returns simultaneously:
    - liquidity drains everywhere
    - market makers get hit on multiple fronts
    - flows become one-sided
    - cascades become far more profitable
- high breadth -> broad selling -> dangerous
- low breadth -> isolated move -> lower cascade probability

In [488]:
# 1 = negative return, 0 = positive/neutral
df_reset["is_down"] = (df_reset["logret"] < 0).astype(int)

# breadth = number of assets down at each timestamp
breadth = df_reset.groupby("timestamp")["is_down"].sum()
df = df_reset.merge(breadth.rename("market_breadth"), on="timestamp")
df

Unnamed: 0,ticker,timestamp,date,open,high,low,close,volume,vwap,num_trades,...,impact_lambda,signed_volume,dist_vwap,dist_vwap_pct,vwap_slope,vwap_slope2,logret_QQQ,logret_SPY,is_down,market_breadth
0,AAPL,2023-11-29 09:30:00+00:00,2023-11-29,190.1500,190.2500,190.1400,190.2500,7436.0,190.1900,141,...,,7436.0,0.0600,0.000315,,,0.005772,0.002633,0,1
1,AAPL,2023-11-29 09:40:00+00:00,2023-11-29,190.1600,190.1600,190.1600,190.1600,1696.0,190.1408,51,...,2.789939e-07,0.0,0.0192,0.000101,-0.0492,,-0.000485,-0.000373,1,5
2,AAPL,2023-11-29 09:45:00+00:00,2023-11-29,190.1700,190.2000,190.1700,190.2000,3189.0,190.1975,42,...,6.595392e-08,3189.0,0.0025,0.000013,0.0567,0.1059,0.000689,0.000438,0,1
3,AAPL,2023-11-29 09:50:00+00:00,2023-11-29,190.2900,190.2900,190.2800,190.2900,1764.0,190.2869,24,...,2.681827e-07,0.0,0.0031,0.000016,0.0894,0.0327,0.000383,0.000394,0,0
4,AAPL,2023-11-29 09:55:00+00:00,2023-11-29,190.2500,190.3000,190.2500,190.3000,1159.0,190.2794,34,...,4.534080e-08,1159.0,0.0206,0.000108,-0.0075,-0.0969,-0.000357,-0.000175,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318643,TSLA,2025-11-21 15:40:00+00:00,2025-11-21,385.5600,388.8600,384.2400,387.5231,1787242.0,386.3321,46055,...,2.798072e-09,1787242.0,1.1910,0.003083,0.6112,3.0810,0.003287,0.002177,0,1
318644,TSLA,2025-11-21 15:45:00+00:00,2025-11-21,387.5851,390.2391,386.2100,389.7200,1686153.0,388.7163,39877,...,3.352645e-09,1686153.0,1.0037,0.002582,2.3842,1.7730,0.002568,0.001867,0,0
318645,TSLA,2025-11-21 15:50:00+00:00,2025-11-21,389.6750,390.1000,386.5000,388.9600,1114335.0,388.1680,30916,...,1.751737e-09,-1114335.0,0.7920,0.002040,-0.5483,-2.9325,-0.000299,0.000084,1,5
318646,TSLA,2025-11-21 15:55:00+00:00,2025-11-21,389.0600,389.0600,385.5109,385.8300,1044840.0,386.9534,30918,...,7.732909e-09,-1044840.0,-1.1234,-0.002903,-1.2146,-0.6663,-0.002664,-0.002074,1,8


## **7. Time-of-Day features**
- often patterns at open and close are very different from mid-session
- so adding in features that allow for the model to learn patterns in relation to time of day can be very useful


In [489]:
ts = pd.to_datetime(df["timestamp"])
minute_of_day = ts.dt.hour * 60 + ts.dt.minute
SESSION_START = 9 * 60 + 30

minute_of_session = minute_of_day - SESSION_START

df["minute_of_session"] = minute_of_session

# encode minute of the day in sin and cos, as we want the model to learn that 9:30 and 16:00 are close together, not far apart
df["minute_sin"] = np.sin(2 * np.pi * minute_of_session / 390)
df["minute_cos"] = np.cos(2 * np.pi * minute_of_session / 390)

df["is_open"]   = (minute_of_session < 30).astype(int) # first 30 min
df["is_close"]  = (minute_of_session > 360).astype(int) # last 30 min
df["is_midday"] = ((minute_of_session > 120) & (minute_of_day < 270)).astype(int) # approx 11:30 - 14:00
df

Unnamed: 0,ticker,timestamp,date,open,high,low,close,volume,vwap,num_trades,...,logret_QQQ,logret_SPY,is_down,market_breadth,minute_of_session,minute_sin,minute_cos,is_open,is_close,is_midday
0,AAPL,2023-11-29 09:30:00+00:00,2023-11-29,190.1500,190.2500,190.1400,190.2500,7436.0,190.1900,141,...,0.005772,0.002633,0,1,0,0.000000e+00,1.000000,1,0,0
1,AAPL,2023-11-29 09:40:00+00:00,2023-11-29,190.1600,190.1600,190.1600,190.1600,1696.0,190.1408,51,...,-0.000485,-0.000373,1,5,10,1.604113e-01,0.987050,1,0,0
2,AAPL,2023-11-29 09:45:00+00:00,2023-11-29,190.1700,190.2000,190.1700,190.2000,3189.0,190.1975,42,...,0.000689,0.000438,0,1,15,2.393157e-01,0.970942,1,0,0
3,AAPL,2023-11-29 09:50:00+00:00,2023-11-29,190.2900,190.2900,190.2800,190.2900,1764.0,190.2869,24,...,0.000383,0.000394,0,0,20,3.166680e-01,0.948536,1,0,0
4,AAPL,2023-11-29 09:55:00+00:00,2023-11-29,190.2500,190.3000,190.2500,190.3000,1159.0,190.2794,34,...,-0.000357,-0.000175,0,5,25,3.919666e-01,0.919979,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318643,TSLA,2025-11-21 15:40:00+00:00,2025-11-21,385.5600,388.8600,384.2400,387.5231,1787242.0,386.3321,46055,...,0.003287,0.002177,0,1,370,-3.166680e-01,0.948536,0,1,0
318644,TSLA,2025-11-21 15:45:00+00:00,2025-11-21,387.5851,390.2391,386.2100,389.7200,1686153.0,388.7163,39877,...,0.002568,0.001867,0,0,375,-2.393157e-01,0.970942,0,1,0
318645,TSLA,2025-11-21 15:50:00+00:00,2025-11-21,389.6750,390.1000,386.5000,388.9600,1114335.0,388.1680,30916,...,-0.000299,0.000084,1,5,380,-1.604113e-01,0.987050,0,1,0
318646,TSLA,2025-11-21 15:55:00+00:00,2025-11-21,389.0600,389.0600,385.5109,385.8300,1044840.0,386.9534,30918,...,-0.002664,-0.002074,1,8,385,-8.046657e-02,0.996757,0,1,0


## **8. Tail risk and statistical regime features**

### **8.1 Rolling skew and kurtosis of returns**
- Skewness:
    - Skew tells you which tail is more likely
    - negative skew implies big downside moves more likely 
    - positive skew implies upside bursts more likely
    
- Kurtosis:
    - measurs fat tails, ie. how likely large outlier moves are
    - high kurtosis implies many tiny bars and a few massive ones
- Cascade development:
    1. compression - low volatility, small bars, high kurtosis, negative skew
    2. expansion (the crash) - sudden big bars, volatility explodes, kurtosis collapses AFTER the move

In [490]:
df["skew_20"] = g["logret"].apply(lambda s: s.rolling(20).apply(skew, raw=False))
df["kurt_20"] = g["logret"].apply(lambda s: s.rolling(20).apply(kurtosis, raw=False))
df

Unnamed: 0,ticker,timestamp,date,open,high,low,close,volume,vwap,num_trades,...,is_down,market_breadth,minute_of_session,minute_sin,minute_cos,is_open,is_close,is_midday,skew_20,kurt_20
0,AAPL,2023-11-29 09:30:00+00:00,2023-11-29,190.1500,190.2500,190.1400,190.2500,7436.0,190.1900,141,...,0,1,0,0.000000e+00,1.000000,1,0,0,,
1,AAPL,2023-11-29 09:40:00+00:00,2023-11-29,190.1600,190.1600,190.1600,190.1600,1696.0,190.1408,51,...,1,5,10,1.604113e-01,0.987050,1,0,0,,
2,AAPL,2023-11-29 09:45:00+00:00,2023-11-29,190.1700,190.2000,190.1700,190.2000,3189.0,190.1975,42,...,0,1,15,2.393157e-01,0.970942,1,0,0,,
3,AAPL,2023-11-29 09:50:00+00:00,2023-11-29,190.2900,190.2900,190.2800,190.2900,1764.0,190.2869,24,...,0,0,20,3.166680e-01,0.948536,1,0,0,,
4,AAPL,2023-11-29 09:55:00+00:00,2023-11-29,190.2500,190.3000,190.2500,190.3000,1159.0,190.2794,34,...,0,5,25,3.919666e-01,0.919979,1,0,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318643,TSLA,2025-11-21 15:40:00+00:00,2025-11-21,385.5600,388.8600,384.2400,387.5231,1787242.0,386.3321,46055,...,0,1,370,-3.166680e-01,0.948536,0,1,0,-0.523339,0.145877
318644,TSLA,2025-11-21 15:45:00+00:00,2025-11-21,387.5851,390.2391,386.2100,389.7200,1686153.0,388.7163,39877,...,0,0,375,-2.393157e-01,0.970942,0,1,0,-0.439663,0.003908
318645,TSLA,2025-11-21 15:50:00+00:00,2025-11-21,389.6750,390.1000,386.5000,388.9600,1114335.0,388.1680,30916,...,1,5,380,-1.604113e-01,0.987050,0,1,0,-0.372107,0.012563
318646,TSLA,2025-11-21 15:55:00+00:00,2025-11-21,389.0600,389.0600,385.5109,385.8300,1044840.0,386.9534,30918,...,1,8,385,-8.046657e-02,0.996757,0,1,0,-0.231457,-0.385542


### **8.2 Downside volatility vs total vol**
- downside volatility just measures the volatility of negative returns

- cascades almost never happen in balanced - (up/down) volatility
- they are preceded by clusters of negative returns, not random noise
- downside volatility increases *before* realized vol fully spikes
- ever when total vol may be normal, downside vol may be rising

In [491]:
def downside_vol(s, win):
    neg = s.where(s < 0, 0)
    return np.sqrt((neg**2).rolling(win).sum())

df["downside_vol_20"] = g["logret"].apply(lambda s: downside_vol(s, 20))
df["downside_ratio_20"] = df["downside_vol_20"] / df["rv_20"]
df

Unnamed: 0,ticker,timestamp,date,open,high,low,close,volume,vwap,num_trades,...,minute_of_session,minute_sin,minute_cos,is_open,is_close,is_midday,skew_20,kurt_20,downside_vol_20,downside_ratio_20
0,AAPL,2023-11-29 09:30:00+00:00,2023-11-29,190.1500,190.2500,190.1400,190.2500,7436.0,190.1900,141,...,0,0.000000e+00,1.000000,1,0,0,,,,
1,AAPL,2023-11-29 09:40:00+00:00,2023-11-29,190.1600,190.1600,190.1600,190.1600,1696.0,190.1408,51,...,10,1.604113e-01,0.987050,1,0,0,,,,
2,AAPL,2023-11-29 09:45:00+00:00,2023-11-29,190.1700,190.2000,190.1700,190.2000,3189.0,190.1975,42,...,15,2.393157e-01,0.970942,1,0,0,,,,
3,AAPL,2023-11-29 09:50:00+00:00,2023-11-29,190.2900,190.2900,190.2800,190.2900,1764.0,190.2869,24,...,20,3.166680e-01,0.948536,1,0,0,,,,
4,AAPL,2023-11-29 09:55:00+00:00,2023-11-29,190.2500,190.3000,190.2500,190.3000,1159.0,190.2794,34,...,25,3.919666e-01,0.919979,1,0,0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318643,TSLA,2025-11-21 15:40:00+00:00,2025-11-21,385.5600,388.8600,384.2400,387.5231,1787242.0,386.3321,46055,...,370,-3.166680e-01,0.948536,0,1,0,-0.523339,0.145877,0.020074,0.888892
318644,TSLA,2025-11-21 15:45:00+00:00,2025-11-21,387.5851,390.2391,386.2100,389.7200,1686153.0,388.7163,39877,...,375,-2.393157e-01,0.970942,0,1,0,-0.439663,0.003908,0.020074,0.870282
318645,TSLA,2025-11-21 15:50:00+00:00,2025-11-21,389.6750,390.1000,386.5000,388.9600,1114335.0,388.1680,30916,...,380,-1.604113e-01,0.987050,0,1,0,-0.372107,0.012563,0.020169,0.871537
318646,TSLA,2025-11-21 15:55:00+00:00,2025-11-21,389.0600,389.0600,385.5109,385.8300,1044840.0,386.9534,30918,...,385,-8.046657e-02,0.996757,0,1,0,-0.231457,-0.385542,0.021693,0.886096


## **9. Label Creation for Cascades**

### **9.1 Overview**
#### 9.1.1 Definition
- A cascade is a sharp downward acceleration that happens soon after a give timestamp, with magnitude that is large relative to the recent volatility regime

- This is exactly what stop-loss chains, liquidity withdrawl, market maker retreat, and systematic deleveraging cause
- To train an ML model, a target has to be created that the model should be expected to predict

#### 9.1.2 Label
- The best label for this is the future max drawdown over the next H bars
$$ DD^{\text{min}}(t,H), = \min_{1 < h < H}[\log{\frac{P_{t+h}}{P_t}}] $$

- This is perfect for intraday cascade prediction because:

    - Even if the final price at t+H is neutral, the deepest point still count as a cascade
    - we normalise for vol, as if the market is quietly drifiting and we drop ~0.7%, that could be a cascade, but this may be negligible in a volatile market
    - Each ticker gets its own label, even though we have cross-asset features

### **9.2 Creating the labels**
- At each timestep for each ticker, the labeler needs:

    - current close price $ P_t $
    - future close prices $ P_{t+1}, P_{t+2}, ... P{t+H} $
    - past volatility estimate, eg. rv_20 feature

- Because cascades happen over a time horizon, not istantly, use this "future window" H, with typical values of H being:
    - 6 -> next 30 mins
    - 12 -> next hour

- To actual create the label, we compare the drawdown to past volatility:
$$ \text{cascade(t)} = 1, \; \text{if} \; DD_{H}(t)<-k\sigma_{20}(t) \sqrt{H} $$
- Over H bars, volatility roughly scales with:
$$ \sigma_{H} = \sigma_1\sqrt{H}$$

- Note, we need to use an instantaneous (per bar) volatility for normalisation, NOT realised volatility, this will be ~4-5x higher
    - here used a 50/50 blend of parkinson volatility and return based volatility:
    $$ \sigma_{20}(t) = std(r_{t-19}, r_{r-18}, ... , r_t) $$

**All of this essentially explains whether the price falls by an abnomal amount at any point over the next H bars**

In [492]:
g = df.groupby("ticker", group_keys=False)

# 1. Per-bar volatility over last 20 bars
df["sigma_park"] = np.sqrt(df["parkinson_20"])
df["sigma_ret_20"] = g["logret"].apply(lambda s: s.rolling(20).std())

# 2. Compute future max-drawdown over next H bars
H = 6  # 30 minutes

def fwd_max_dd(close: pd.Series, H: int) -> pd.Series:
    logp = np.log(close.values) # type: ignore
    n = len(logp)
    out = np.full(n, np.nan)
    for i in range(n - H):
        # log returns from i -> i+h for h=1..H
        window = logp[i+1:i+H+1] - logp[i]
        out[i] = window.min()  # worst cumulative loss
    return pd.Series(out, index=close.index)

df["fwd_dd_log_H"] = g["close"].apply(lambda s: fwd_max_dd(s, H))

# 3. Binary cascade label
k = 2.5
threshold = -k * 0.5*(df["sigma_park"] + df["sigma_ret_20"]) * np.sqrt(H)

df["label_cascade"] = (df["fwd_dd_log_H"] < threshold).astype(int)

# 4. Remove rows without enough history/future
df = df.dropna(subset=["sigma_park", "fwd_dd_log_H"]).copy()

print(df["label_cascade"].value_counts())
df

label_cascade
0    303768
1     14655
Name: count, dtype: int64


Unnamed: 0,ticker,timestamp,date,open,high,low,close,volume,vwap,num_trades,...,is_close,is_midday,skew_20,kurt_20,downside_vol_20,downside_ratio_20,sigma_park,sigma_ret_20,fwd_dd_log_H,label_cascade
19,AAPL,2023-11-29 11:45:00+00:00,2023-11-29,190.17,190.17,190.1700,190.1700,724.0,190.1710,15,...,0,0,,,0.000705,,0.000109,,0.000841,0
20,AAPL,2023-11-29 11:50:00+00:00,2023-11-29,190.17,190.33,190.1700,190.3300,4557.0,190.2419,114,...,0,0,1.008466,2.060713,0.000705,0.566949,0.000136,0.000285,0.000158,0
21,AAPL,2023-11-29 11:55:00+00:00,2023-11-29,190.35,190.46,190.3500,190.4600,3026.0,190.3890,83,...,0,0,1.140805,1.115498,0.000523,0.390883,0.000157,0.000296,-0.000525,0
22,AAPL,2023-11-29 12:00:00+00:00,2023-11-29,190.40,190.41,190.3600,190.3800,7801.0,190.3974,209,...,0,0,1.043105,0.955011,0.000671,0.483859,0.000159,0.000314,-0.000105,0
23,AAPL,2023-11-29 12:05:00+00:00,2023-11-29,190.40,190.40,190.3000,190.3600,6329.0,190.3526,173,...,0,0,1.343495,2.059277,0.000679,0.519351,0.000174,0.000299,0.000210,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318637,TSLA,2025-11-21 15:10:00+00:00,2025-11-21,396.74,396.74,393.1700,394.6700,1280346.0,394.8083,33391,...,0,0,-0.871999,1.734404,0.015427,0.854699,0.004676,0.004066,-0.023275,0
318638,TSLA,2025-11-21 15:15:00+00:00,2025-11-21,394.67,395.59,391.8000,393.1700,1596182.0,393.5169,41481,...,0,0,-0.703923,1.390391,0.015890,0.861890,0.004836,0.004107,-0.019467,0
318639,TSLA,2025-11-21 15:20:00+00:00,2025-11-21,393.23,393.41,388.8595,392.7800,2391546.0,390.9001,60210,...,0,0,-0.660986,1.396418,0.015921,0.862498,0.005075,0.004094,-0.018475,0
318640,TSLA,2025-11-21 15:25:00+00:00,2025-11-21,392.87,393.67,387.7900,388.2063,1751632.0,390.2215,44965,...,0,0,-0.738800,0.521802,0.019617,0.902866,0.005439,0.004735,-0.006762,0


### **9.3 Analysis of Labelling results**
#### 9.3.1 Future max drawdown quantiles
- Most 30 minute windows (H=6) show tiny drawdowns of -0.09%
- Only 1-5% show > 1% drops
- Only 1% show > 2.4% drops

- From research, this seems to be fairly typical for U.S. equities in normal regimes

In [493]:
print("fwd_dd_log_H quantiles:")
print(df["fwd_dd_log_H"].quantile([0.01, 0.05, 0.1, 0.5]))

fwd_dd_log_H quantiles:
0.01   -0.023822
0.05   -0.010603
0.10   -0.006541
0.50   -0.000928
Name: fwd_dd_log_H, dtype: float64


#### 9.3.2 Threshold quantiles
- median threshold is -0.87% meaning half of bars need more than ~0.9% of drawdown in the next 30 mins to be labelled a cascade

- 10% threshold is -2.49% -> corresponds closely to worst drawdowns seen

In [494]:
print("threshold quantiles:")
print(threshold.quantile([0.01, 0.05, 0.1, 0.5]))

threshold quantiles:
0.01   -0.053206
0.05   -0.032031
0.10   -0.024925
0.50   -0.008660
dtype: float64


#### 9.3.3 Distrubition of labels across different tickers
- We can see a very consistent split of cascade/not cascade labels

- This shows how our volatility normalisation is working perfectly to classify different levels of cascades for different tickers

In [495]:
df.groupby("ticker")["label_cascade"].mean().sort_values()

ticker
PLTR    0.043320
IWM     0.044423
SPY     0.044909
AAPL    0.044995
NVDA    0.045128
QQQ     0.045524
TSLA    0.047379
AMD     0.049217
AMZN    0.049884
Name: label_cascade, dtype: float64

### **9.4 Final Comments**
#### 9.4.1 Machine Learning Modelling
- When fed into an actual model, we will be training the model to accurately predict this label given past information

- This means that at each timestep, we will produce an output of the form P(cascade in the next 20 mins)

#### 9.4.2 Tuning the labeller
- Currently there are 3 possible parameters to tune in this model:

    1. k (severity multiple) -> smaller k = more cascades detected (thresholds relaxed)
    2. H (horizon)  -> large H = more cascades (more chance for a drawdown if longer forward looking window)
    3. $\sigma$ type (ret, park or blend) -> pure ret gives more events, pure park gives fewer

- As is, the parameters give ~4.6% cascades, which is an idea label set for a machine learning model


## **10. Saving the feature-engineered dataframe**

In [496]:
print(os.getcwd())

/Users/alecmitchell-thomson/Desktop/Alpha Fund Bootcamp/Project/Alpha-Fund-Project/notebooks


In [497]:
out_path = "labelled_features.csv"
df.to_csv(out_path)

In [498]:
df.columns

Index(['ticker', 'timestamp', 'date', 'open', 'high', 'low', 'close', 'volume',
       'vwap', 'num_trades', 'missing_bars_day', 'logret', 'logret_2',
       'logret_3', 'logret_5', 'logret_10', 'body', 'upper_wick', 'lower_wick',
       'hl_range', 'rv_5', 'rv_10', 'rv_20', 'rv_50', 'parkinson_bar',
       'parkinson_20', 'atr_5', 'atr_50', 'compression_atr', 'volume_ma_5',
       'volume_z_5', 'volume_ma_20', 'volume_z_20', 'volume_ma_50',
       'volume_z_50', 'trades_ma_5', 'trades_z_5', 'trades_ma_20',
       'trades_z_20', 'price_move_abs', 'volume_per_trade', 'move_per_volume',
       'move_per_trade', 'efficiency', 'impact_lambda', 'signed_volume',
       'dist_vwap', 'dist_vwap_pct', 'vwap_slope', 'vwap_slope2', 'logret_QQQ',
       'logret_SPY', 'is_down', 'market_breadth', 'minute_of_session',
       'minute_sin', 'minute_cos', 'is_open', 'is_close', 'is_midday',
       'skew_20', 'kurt_20', 'downside_vol_20', 'downside_ratio_20',
       'sigma_park', 'sigma_ret_20', 'fwd_dd