# Statistical Algos

## Notebook's Environment

In [None]:
INSTALL_DEPS = False
if INSTALL_DEPS:
  %pip install matplotlib==3.8.3
  %pip installnumpy==1.26.4
  %pip installpandas==2.2.1
  %pip installpandas_market_calendars==4.4.0
  %pip installpytz==2024.1
  %pip installscipy==1.12.0
  %pip installta==0.11.0
  %pip installyfinance==0.2.37

!python --version

## Cloud Environment Setup

In [None]:
import os
import sys
import warnings

warnings.filterwarnings("ignore")

# Detect the runtime (Colab, Kaggle or local) and set DATA_PATH / MODEL_PATH.
IN_KAGGLE = IN_COLAB = False
try:
    # https://www.tensorflow.org/install/pip#windows-wsl2
    import google.colab
    from google.colab import drive

    drive.mount("/content/drive")
    # NOTE(review): this is a directory-like path, while the data cell passes
    # DATA_PATH straight to pd.read_csv; confirm it should point at the CSV.
    DATA_PATH = "/content/drive/MyDrive/EDT dataset"
    MODEL_PATH = "/content/drive/MyDrive/models"
    IN_COLAB = True
    print("Colab!")
except ImportError:
    # google.colab is not importable, so we are not running on Colab.
    IN_COLAB = False
except Exception as exc:
    # Colab is present but the Drive setup failed. Keep the original fallback to
    # the Kaggle/local branches, but report the reason instead of hiding it.
    # Unlike a bare `except:`, KeyboardInterrupt/SystemExit still propagate.
    IN_COLAB = False
    print(f"Colab setup failed ({exc!r}); falling back to Kaggle/local paths.")
if "KAGGLE_KERNEL_RUN_TYPE" in os.environ and not IN_COLAB:
    print("Running in Kaggle...")
    # List every input file so the available datasets are visible in the output.
    for dirname, _, filenames in os.walk("/kaggle/input"):
        for filename in filenames:
            print(os.path.join(dirname, filename))
    MODEL_PATH = "./models"
    DATA_PATH = "/kaggle/input/futures_1min_data.csv"
    IN_KAGGLE = True
    print("Kaggle!")
elif not IN_COLAB:
    IN_KAGGLE = False
    MODEL_PATH = "./models"
    DATA_PATH = "./data/futures_1min_data.csv"
    print("running localhost!")

## Data

In [None]:
import pandas as pd
import numpy as np

# Read the futures CSV, using its "Date" column as a parsed datetime index.
futs_df = pd.read_csv(
    DATA_PATH,
    parse_dates=True,
    index_col="Date",
)

# Quick sanity check: dimensions, then column labels, then the first two rows.
for summary in (futs_df.shape, futs_df.columns):
    print(summary)
futs_df.head(2)

In [None]:
class YFinanceOptions:
    """Namespace of constants: index label, interval codes, ranges and date formats."""

    # Label of the datetime index column.
    INDEX = "Datetime"

    # Interval codes, from finest to coarsest.
    M1 = "1m"
    M15 = "15m"
    H1 = "1h"
    D1 = "1d"

    # Range per interval, each written as a limit minus one.
    # NOTE(review): presumably the maximum look-back in days that Yahoo Finance
    # allows for the matching interval; confirm against the downloader code.
    MIN1_RANGE = 7 - 1
    MIN15_RANGE = 60 - 1
    HOUR_RANGE = 730 - 1
    DAY_RANGE = 7300 - 1

    # strftime patterns: date only, and full timestamp with timezone name.
    DATE_TIME_FORMAT = "%Y-%m-%d"
    DATE_TIME_HRS_FORMAT = "%Y-%m-%d %H:%M:%S %Z"

# Bar interval used by the notebook.
INTERVAL = YFinanceOptions.M15

# --- Ticker symbols: equity index, metal, energy, volatility and rates ---
SNP_FUT = "ES=F"
NSDQ_FUT = "NQ=F"
GOLD_FUT = "GC=F"
CRUDOIL_FUT = "CL=F"
VOLATILITY_FUT = "^VIX"
RUS_FUT = "RTY=F"
RATES_FUT = "2YY=F"

# --- Ticker symbols: agricultural and livestock futures ---
CORN_FUT = "ZC=F"
SOYOIL_FUT = "ZL=F"
KCWHEAT_FUT = "KE=F"
SOYBEAN_FUT = "ZS=F"
SOYBEANMEAL_FUT = "ZM=F"
WHEAT_FUT = "ZW=F"
LIVECATTLE_FUT = "LE=F"
LEANHOG_FUT = "HE=F"
FEEDERCATTLE_FUT = "GF=F"
MILK_FUT = "DC=F"

# Column prefix of the analysed contract: the corn ticker without "=F" ("ZC").
TARGET_FUT = CORN_FUT.replace("=F", "")

# Problem Definition

Utilize medium-frequency trade data for a set of 6 futures contracts listed on the CME over a 250-trading-day period between 01/01/2023 and 01/01/2024. The contracts form a cross-section of metals, equities and volatility. Trading runs 23 hours a day, and the data is sampled at one-minute intervals.

## Intra-Day Mean Reversion

https://learning.oreilly.com/library/view/algorithmic-trading-winning/9781118746912/OEBPS/9781118746912_epub_c02.htm#c02-sec1-0001

Using Augmented Dickey-Fuller (ADF) for stationarity:

$\Delta y(t) = \alpha + \beta t + \gamma y(t-1) + \delta_1 \Delta y(t-1) + \cdots + \delta_{p-1} \Delta y(t-p+1) + \epsilon_t$

where:
- $\Delta y(t) = y(t) - y(t-1)$ represents the first difference of the series.
- $y(t-1)$ is the lagged value of the series.
- $\beta t$, often set to zero in price series analyses, accounts for any deterministic time trend.
- $\delta_1, \delta_2, ..., \delta_{p-1}$ are coefficients for the lagged differences, adjusting for serial correlation.
- $\epsilon_t$ is the error term.


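The null hypothesis $H_0: \gamma = 0$ states that the series has a unit root (i.e. it is non-stationary). $H_0$ is rejected, and the series is treated as stationary, when the test statistic is more negative than the critical value (equivalently, when the p-value is below the significance level).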

$\text{Test Statistic} = \frac{\hat{\gamma}}{\text{SE}(\hat{\gamma})}$

where $\text{SE}(\hat{\gamma})$ is the standard error of $\hat{\gamma}$. 

Given a significance level of $\alpha=0.05$, the critical values range from about $-2.86$ to $-3.45$, depending on the deterministic terms included and the sample size.
- $\hat{\gamma} < 0$ and $p < 0.05$: mean reverting
- $\hat{\gamma} > 0$ and $p < 0.05$: trending

From our EDA:
| Series     | Lag | Coefficient | P-Value          |
|------------|-----|-------------|------------------|
| NQ_Close   | 1   | 0.999967    | 0.000000e+00     |
| NQ_Close   | 5   | -0.017068   | 8.802106e-30     |

At the 5-minute lag, the series shows **weak** mean reversion ($-0.017068 > -2.86$).
At the 1-minute lag it trends: the autoregressive coefficient is strongly positive (close to 1), so the series keeps moving in the direction of the previous step.

### Mean Reversion Probability

In [None]:
# Work on a copy of the target contract's close column with a sorted datetime index.
# The frame keeps the name `nq_fut_df`, but it holds whichever contract TARGET_FUT selects.
nq_fut_df = futs_df[[f"{TARGET_FUT}_Close"]].copy()
nq_fut_df.index = pd.to_datetime(nq_fut_df.index)
nq_fut_df = nq_fut_df.sort_index()
# Optional 15-minute resampling, currently disabled:
# nq_fut_df = nq_fut_df.resample('15T').mean().bfill()

nq_fut_df.head(2)

### Ornstein-Uhlenbeck (OU) 

The Hurst exponent $H$ classifies the series as a random walk ($H = 0.5$), mean-reverting ($H < 0.5$), or trending ($H > 0.5$).

In [None]:
from statsmodels.tsa.stattools import adfuller
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant
import statsmodels.api as sm
from hurst import compute_Hc

# Log-transform the target close prices; later cells reuse `log_prices`.
log_prices = np.log(nq_fut_df.loc[:, f"{TARGET_FUT}_Close"])

# compute_Hc returns three values; the first is the Hurst exponent estimate.
hurst_fit = compute_Hc(log_prices, simplified=True, kind='price')
HURST, c, data = hurst_fit
print(f'Hurst Exponent: {HURST}')

In [None]:
NOISE_THRESHOLD = 5.
LAGS_IN_MINS =  [1]

# Estimate the half-life of mean reversion by regressing the one-step change of
# the log price on its lagged level: dy(t) = const + slope * y(t-1) + e(t).
spread_lag = log_prices.shift(1).bfill()

spread_ret = (log_prices - spread_lag).bfill()
spread_lag2 = sm.add_constant(spread_lag)

model = sm.OLS(spread_ret, spread_lag2)
res = model.fit()

# The slope is the second fitted parameter (after the constant). Select it by
# position with .iloc: params is label-indexed, and `params[1]` relies on the
# deprecated positional fallback of Series.__getitem__.
mean_reversion_slope = res.params.iloc[1]
if not mean_reversion_slope < 0:
    # A zero or positive (or NaN) slope means there is no pull towards the mean,
    # so -ln(2)/slope would be infinite, negative or undefined.
    raise ValueError(
        f"Cannot derive a half-life: lagged-level slope is {mean_reversion_slope!r}, expected < 0."
    )
HALF_LIFE = int(round(-np.log(2) / mean_reversion_slope, 0))

HALF_LIFE

## Probabilities of Reversion

In [None]:

# For each lag, store the direction of the price change (T_<lag>M: -1, 0 or +1)
# and a 0/1 flag marking a direction different from `lag` rows earlier (MR_<lag>M).
for lag in LAGS_IN_MINS:
    trend_col, mr_col = f'T_{lag}M', f'MR_{lag}M'

    price_change = nq_fut_df[f'{TARGET_FUT}_Close'].diff(lag).bfill()
    nq_fut_df[trend_col] = np.sign(price_change)

    previous_trend = nq_fut_df[trend_col].shift(lag).bfill()
    nq_fut_df[mr_col] = nq_fut_df[trend_col].ne(previous_trend).astype(int)

nq_fut_df.head(5)

In [None]:
# Unconditional reversion probability per lag: the mean of each 0/1 MR_<lag>M column.
mr_proba = {
    col: nq_fut_df[col].mean()
    for col in nq_fut_df.columns
    if col.startswith('MR_')
}

probabilities_df = pd.DataFrame(list(mr_proba.items()), columns=['Lag', 'Probability'])
# Pull the numeric lag out of names such as "MR_1M". The pattern is a raw string
# so "\d" is not treated as an (invalid) string escape, and expand=False returns
# a Series rather than a one-column DataFrame.
probabilities_df['Lag'] = probabilities_df['Lag'].str.extract(r'(\d+)', expand=False).astype(int)
probabilities_df = probabilities_df.sort_values('Lag')
probabilities_df

### Markov Properties

Classify the series as "trending" or "mean-reverting" after a certain number of consecutive trending steps (up or down).

1. Define trending step.
2. Calculate Conditional Probabilities
3. Markov chain model of the transition from trending to not trending.

In [None]:
# Probability that the next step reverses direction, conditional on the previous
# x steps having all moved the same way (x = 0 .. MAX_TRENDS).
MAX_TRENDS = 15
revert_proba = {}
for lag in LAGS_IN_MINS:
    trend_reversion_prob = {}
    trend_col = f'T_{lag}M'
    mr_col = f'MR_{lag}M'
    # Boolean view of the 0/1 reversion flag so `&` below is a plain logical AND.
    reverted = nq_fut_df[mr_col].astype(bool)

    for x in range(0, MAX_TRENDS + 1):
        # The signs are -1/0/+1, so a window of x steps sums to +x for an
        # up-streak and -x for a down-streak. Comparing the absolute value
        # counts both directions; `sum == x` alone only counted up-streaks.
        streak_sum = nq_fut_df[trend_col].rolling(window=x).sum()
        rolling_trends = streak_sum.abs() == x
        # Align "streak ended on the previous row" with the current row.
        # fill_value keeps the mask boolean instead of object-with-NaN.
        still_trending = rolling_trends.shift(1, fill_value=False)

        reverting_after_trends = (reverted & still_trending)
        total_trends = still_trending.sum()

        # Skip streak lengths that never occur to avoid dividing by zero.
        if total_trends > 0:
            trend_reversion_prob[x] = reverting_after_trends.sum() / total_trends

    revert_proba[lag] = trend_reversion_prob

assert len(revert_proba) > 0

# Reshape {lag: {streak length: probability}} into a Trends x Lag table.
rows = [
    {'Lag': lag, 'Trends': trend, 'Probability': probability}
    for lag, probs in revert_proba.items()
    for trend, probability in probs.items()
]

temp_df = pd.DataFrame(rows)
pivot_df = temp_df.pivot(index='Trends', columns='Lag', values='Probability')
pivot_df = pivot_df.reindex(sorted(pivot_df.columns, key=int), axis=1)
pivot_df.T

In [None]:
import matplotlib.pyplot as plt

# One line per lag: reversion probability against the preceding streak length.
ax = pivot_df.plot(kind='line', marker='o', figsize=(8, 4))

ax.set_title('Mean Reversion Probability after X Trends', fontsize=16)
ax.set_xlabel('X Trends', fontsize=14)
ax.set_ylabel('Mean Reversion %', fontsize=14)
ax.legend(title='Lag (in minutes)', fontsize=12, title_fontsize=14)
ax.grid(True)
ax.set_xticks(range(1, MAX_TRENDS + 1))
plt.show()

In [None]:
from scipy.stats import norm

# Fit a normal distribution to each lag's reversion probabilities and overlay
# the fitted density on a histogram of those probabilities.
fit_records = []
for lag in pivot_df.columns:
    data = pivot_df[lag].dropna()
    if data.empty:
        continue

    mu, std = norm.fit(data)
    # Collect plain records and build the frame once after the loop. Calling
    # pd.concat every iteration is quadratic, and concatenating onto an empty
    # frame is deprecated in pandas 2.x.
    fit_records.append({'Lag': lag, 'Mean': mu, 'Std': std})

    plt.figure(figsize=(8, 4))
    plt.hist(data, bins=10, density=True, alpha=0.6, color='g')
    # Evaluate the fitted density across the histogram's x-range.
    xmin, xmax = plt.xlim()
    x = np.linspace(xmin, xmax, 100)
    p = norm.pdf(x, mu, std)
    plt.plot(x, p, 'k', linewidth=2)
    plt.title(f"Gaussian for Lag {lag}: mean = {mu:.2f}, std = {std:.2f}")
    plt.show()

# `columns=` keeps the expected layout even when no lag had any data.
distribution_params = pd.DataFrame(fit_records, columns=['Lag', 'Mean', 'Std'])
distribution_params

# Bollinger Bands

In [None]:
from tqdm import tqdm
import itertools
import math

def calculate_std_factor(hurst, base_std=2.0, adjustment=0.5):
    """Scale the Bollinger band width by the distance of `hurst` from 0.5.

    Args:
        hurst: Hurst exponent of the series.
        base_std: Band width (in standard deviations) used when hurst is 0.5.
        adjustment: Multiplier applied to the distance between hurst and 0.5.

    Returns:
        A factor above `base_std` for hurst < 0.5 (mean-reverting), below it for
        hurst > 0.5 (trending), and `base_std` itself otherwise.
    """
    # Positive for mean-reverting series, negative for trending ones.
    distance_below_random_walk = 0.5 - hurst
    if distance_below_random_walk > 0 or distance_below_random_walk < 0:
        return base_std + distance_below_random_walk * adjustment
    # Exactly 0.5, or not comparable (e.g. NaN): use the base width.
    return base_std

def bollinger_band_backtest(df, target_col, window, hurst=0.5, base_std=2.0, adjustment=0.5,
                            periods_per_year=24192):
    """Back-test a Bollinger-band mean-reversion strategy on one price column.

    A long is opened when the price crosses below the lower band and a short when
    it crosses above the upper band. A long is closed when the price crosses above
    the moving average, a short when it crosses below it. Entries and exits use
    the price of the row on which the signal appears, and only one position is
    held at a time. A position still open on the last row is never closed, so it
    contributes no return.

    Args:
        df: DataFrame containing `target_col`. It is modified in place: the
            columns MA, Std_Dev, U, L, SB, SS, SBS, SSB, Position, Ret, cRets and
            Drawdown are added or overwritten.
        target_col: Name of the price column.
        window: Rolling window (in rows) for the moving average and standard deviation.
        hurst: Hurst exponent forwarded to `calculate_std_factor`.
        base_std: Base band width in standard deviations.
        adjustment: Hurst adjustment forwarded to `calculate_std_factor`.
        periods_per_year: Number of bars per year used to annualise the Sortino
            ratio. The default 24192 is 60/15 * 24 * 252, i.e. 15-minute bars;
            pass a different value for other bar sizes.

    Returns:
        Tuple `(df, stats_df)`: the same (mutated) frame and a one-row DataFrame
        of summary statistics.
    """
    std_factor = calculate_std_factor(hurst, base_std, adjustment)

    price = df[target_col]
    rolling_price = price.rolling(window=window)
    df['MA'] = rolling_price.mean()
    df['Std_Dev'] = rolling_price.std()
    df['U'] = df['MA'] + (df['Std_Dev'] * std_factor)
    df['L'] = df['MA'] - (df['Std_Dev'] * std_factor)

    # diff().clip(0) keeps only the False -> True transition, so each signal
    # fires on the crossing row rather than on every row beyond the level.
    df['SB'] = (price < df['L']).astype(int).diff().clip(0) * +1    # open long
    df['SS'] = (price > df['U']).astype(int).diff().clip(0) * -1    # open short
    df['SBS'] = (price > df['MA']).astype(int).diff().clip(0) * -1  # close long
    df['SSB'] = (price < df['MA']).astype(int).diff().clip(0) * +1  # close short

    # Run the position state machine over plain arrays and write the results
    # back once. Compared with iterrows() + df.loc[i, ...] this is much faster,
    # cannot write to several rows sharing a duplicated index label, and keeps
    # 'Ret' a float column instead of assigning floats into an int column.
    prices = price.to_numpy(dtype=float)
    open_long = df['SB'].to_numpy()
    open_short = df['SS'].to_numpy()
    close_long = df['SBS'].to_numpy()
    close_short = df['SSB'].to_numpy()

    n_rows = len(df)
    positions = np.zeros(n_rows, dtype=int)
    returns = np.zeros(n_rows, dtype=float)
    entry = 0.0
    position = 0

    # Execute strategy
    for i in tqdm(range(n_rows), total=n_rows, desc="bollinger_band"):
        if position == 0 and (open_long[i] == 1 or open_short[i] == -1):
            entry = prices[i]
            # A long entry wins if both signals fire on the same row.
            position = 1 if open_long[i] == 1 else -1
        elif (position == 1 and close_long[i] == -1) or (position == -1 and close_short[i] == 1):
            # Signed simple return: (exit - entry) / entry for longs, negated for shorts.
            returns[i] = position * (prices[i] - entry) / entry
            position = 0
        positions[i] = position

    df['Position'] = positions
    df['Ret'] = returns

    equity = (1 + df['Ret']).cumprod()
    df['cRets'] = equity - 1

    variance = df['Ret'].var()
    df['Drawdown'] = equity.div(equity.cummax()) - 1
    max_drawdown = df['Drawdown'].min()
    # Longest run of consecutive rows spent below the previous equity peak.
    drawdown_length = (df['Drawdown'] < 0).astype(int).groupby(df['Drawdown'].eq(0).cumsum()).cumsum().max()
    negative_returns = df['Ret'][df['Ret'] < 0]
    downside_std = negative_returns.std()
    if pd.isna(downside_std) or downside_std == 0:
        # Fewer than two losing trades, or identical losses: the ratio is
        # undefined, so report NaN instead of inf or a division warning.
        sortino_ratio = np.nan
    else:
        sortino_ratio = df['Ret'].mean() / downside_std * np.sqrt(periods_per_year)
    # A trade is a row where the position changes to a non-flat value.
    trades = (df['Position'].diff().ne(0) & df['Position'].ne(0)).sum()

    stats_df = pd.DataFrame({
        "Window": [window],
        "Hurst": [hurst],
        "Standard_Factor": [std_factor],
        "Cumulative_Returns": [df['cRets'].iloc[-1]],
        "Variance": [variance],
        "STD": [np.sqrt(variance)],
        "Max_Drawdown": [max_drawdown],
        "Drawdown_Length": [drawdown_length],
        "Sortino_Ratio": [sortino_ratio],
        "Trades_Count": [trades],
        "Trades_per_Interval": [trades / len(df)],
        "Trading_Intervals": [len(df)],
    })

    return df, stats_df

def _is_better_stat(candidate, incumbent):
    """Return True when `candidate` should replace `incumbent`; NaN never beats a number."""
    if pd.isna(candidate):
        return False
    return pd.isna(incumbent) or candidate > incumbent


def param_search_bbs(df, target_col, initial_window=HALF_LIFE, intial_std_adjustment=0.5, hurst=HURST):
    """Grid-search the Bollinger-band window and Hurst adjustment.

    Windows are `initial_window` halved repeatedly while the result stays at or
    above 8 (only `initial_window` itself when it is below 8). Adjustments are
    half, one and two times `intial_std_adjustment`. Every combination is
    back-tested with `bollinger_band_backtest`.

    Note: the `initial_window` and `hurst` defaults are bound to HALF_LIFE and
    HURST when this function is defined; re-run this cell after recomputing them.

    Args:
        df: DataFrame containing `target_col`. It is not modified; each trial
            runs on a copy of the target column.
        target_col: Name of the price column.
        initial_window: Largest rolling window to try (must be >= 1).
        intial_std_adjustment: Central value of the adjustment grid.
        hurst: Hurst exponent forwarded to the backtest.

    Returns:
        DataFrame with three rows: the statistics of the trial with the best
        Sortino ratio, cumulative return and maximum drawdown (the least
        negative), labelled in the 'Metric' column. Ties keep the earliest trial.

    Raises:
        ValueError: If `initial_window` is smaller than 1.
    """
    if initial_window < 1:
        raise ValueError(f"initial_window must be >= 1, got {initial_window!r}")

    # Number of halvings that keep the window at or above 8. Below 8 the
    # original expression produced an empty grid and crashed further down.
    halvings = int(math.log2(initial_window / 8)) if initial_window >= 8 else 0
    windows = [initial_window // (2**i) for i in range(halvings + 1)]
    std_adjustments = [intial_std_adjustment/2, intial_std_adjustment, intial_std_adjustment * 2]
    combinations = list(itertools.product(windows, std_adjustments))

    # Output label -> statistics column that is maximised for it.
    metric_columns = {
        'Sortino': 'Sortino_Ratio',
        'Cumulative Returns': 'Cumulative_Returns',
        'Max Drawdown': 'Max_Drawdown',
    }
    best = {}  # label -> (best value so far, stats frame of that trial)

    for window, adjustment in tqdm(combinations, desc="param_search_bbs"):
        # The backtest adds columns to the frame it receives, so give it a
        # throwaway copy rather than the caller's data.
        trial_df = df[[target_col]].copy()
        _, stats_df = bollinger_band_backtest(trial_df, target_col, window, hurst=hurst, adjustment=adjustment)

        for label, column in metric_columns.items():
            value = stats_df[column].iloc[0]
            # Always record the first trial so a metric that is NaN everywhere
            # still yields a row instead of None.
            if label not in best or _is_better_stat(value, best[label][0]):
                best[label] = (value, stats_df)

    results_df = pd.concat(
        [best[label][1].assign(Metric=label) for label in metric_columns],
        ignore_index=True,
    )

    return results_df

# Pass HURST by keyword. The fourth positional parameter of param_search_bbs is
# `intial_std_adjustment`, so a positional HURST was used as the adjustment grid
# centre instead of the default 0.5.
stats_df = param_search_bbs(futs_df, f'{TARGET_FUT}_Close', initial_window=HALF_LIFE, hurst=HURST)
stats_df

### Results

| Window | Hurst    | Standard_Factor | Cumulative_Returns | Variance    | STD       | Max_Drawdown | Drawdown_Length | Sortino_Ratio | Trades_Count | Trades_per_Interval | Trading_Intervals | Metric             |
|--------|----------|-----------------|--------------------|-------------|-----------|--------------|-----------------|---------------|--------------|---------------------|-------------------|--------------------|
| 34     | 0.470735 | 2.006888        | 0.465803           | 1.276604e-07| 0.000357  | -0.041815    | 8440            | 0.360678      | 613          | 0.010650            | 57560             | Sortino            |
| 34     | 0.470735 | 2.006888        | 0.465803           | 1.276604e-07| 0.000357  | -0.041815    | 8440            | 0.360678      | 613          | 0.010650            | 57560             | Cumulative Returns |
| 2193   | 0.470735 | 2.006888        | 0.150270           | 7.338430e-08| 0.000271  | -0.021146    | 7518            | 0.052484      | 30           | 0.000521            | 57560             | Max Drawdown       |


In [None]:
# Parameters chosen from the search results.
BEST_WINDOW = 34
BEST_STD_DJUSTMENT = 0.5

# Re-run the backtest with those parameters on a fresh copy of the close column.
backtest_df = futs_df[[f'{TARGET_FUT}_Close']].copy()
backtest_df, stats_df = bollinger_band_backtest(
    backtest_df,
    f'{TARGET_FUT}_Close',
    BEST_WINDOW,
    hurst=HURST,
    adjustment=BEST_STD_DJUSTMENT,
)
backtest_df['cRets'] = (1 + backtest_df['Ret']).cumprod() - 1
print(f"Cumulative returns from the strategy: {backtest_df['cRets'].iloc[-1]*100.}%")

# Plot only the last 1000 rows so individual signals remain visible.
backtest_df = backtest_df.tail(1000)
buy_signals = backtest_df[backtest_df['SB'] > 0]
sell_signals = backtest_df[backtest_df['SS'] < 0]

plt.figure(figsize=(15, 10))

# Top panel (4 of 6 grid rows): price, moving average, bands and entry markers.
ax1 = plt.subplot2grid((6, 1), (0, 0), rowspan=4, colspan=1)
price_lines = [
    (f'{TARGET_FUT}_Close', dict(label=f'{TARGET_FUT} Close', color='blue', alpha=0.6)),
    ('MA', dict(label='Moving Average', color='red')),
    ('U', dict(label='Upper Bollinger Band', color='green')),
    ('L', dict(label='Lower Bollinger Band', color='green', alpha=0.7)),
]
for column, line_kwargs in price_lines:
    ax1.plot(backtest_df[column], **line_kwargs)
ax1.scatter(buy_signals.index, buy_signals[f'{TARGET_FUT}_Close'], color='green', marker='^', label='Buy Signal')
ax1.scatter(sell_signals.index, sell_signals[f'{TARGET_FUT}_Close'], color='red', marker='v', label='Sell Signal')
ax1.set(title=f'Bollinger Bands for {TARGET_FUT} Close', xlabel='Date', ylabel='Price')
ax1.legend()
ax1.grid(True)

# Bottom panel (2 of 6 grid rows): cumulative strategy returns on the same x-axis.
ax2 = plt.subplot2grid((6, 1), (4, 0), rowspan=2, colspan=1, sharex=ax1)
ax2.plot(backtest_df['cRets'], label='Cumulative rets', color='purple')
ax2.set(title='Cumulative rets', xlabel='Date', ylabel='Cumulative rets')
ax2.grid(True)

plt.tight_layout()
plt.show()