In [3]:
#This is the code for Table4.5 in Chapter4.3.2:The implementation and analysis of the strategy
#After modifying the dataset file path, it can be directly reproduce the results.

#!!! IMPORTANT REMINDER !!! 
#!!! IMPORTANT REMINDER !!! 
#!!! IMPORTANT REMINDER !!! 
# Before running this code, make sure to modify the input dataset path in the 1. Load CSV & print column names 

import pandas as pd
import numpy as np
from hmmlearn.hmm import GaussianHMM
import matplotlib.pyplot as plt

# -- 1. Load CSV & print column names --

#!!! IMPORTANT REMINDER !!! 
#!!! IMPORTANT REMINDER !!! 
#!!! IMPORTANT REMINDER !!! 
# !!! Modify "C:/Users/ZhangYinhang/ES_F_data.csv"  to your save path in the below !!!
df = pd.read_csv("C:/Users/ZhangYinhang/ES_F_data.csv")
print("Columns in your CSV:\n", df.columns.tolist())

# -- 2. Automatically detect open/close columns, exclude Adjusted Close --
open_cols  = [c for c in df.columns if "open"  in c.lower()]
close_cols = [c for c in df.columns if "close" in c.lower()]

if not open_cols:
    raise ValueError("No column containing 'open' found in CSV.")
if not close_cols:
    raise ValueError("No column containing 'close' found in CSV.")

non_adj_close = [c for c in close_cols if "adj" not in c.lower()]
close_col = non_adj_close[0] if non_adj_close else close_cols[0]
open_col  = open_cols[0]

print("Using Open column :", open_col)
print("Using Close column:", close_col)

# -- 3. Rename columns & preprocess --
date_cols = [c for c in df.columns if "date" in c.lower()]
if not date_cols:
    raise ValueError("No column containing 'date' found in CSV.")
date_col = date_cols[0]
print("Using Date column :", date_col)

df = df.rename(columns={
    open_col:  "Open",
    close_col: "Close",
    date_col:  "Date"
})
df["Date"] = pd.to_datetime(df["Date"])
df = df.sort_values("Date").reset_index(drop=True)

# Compute log returns
df["log_ret"] = np.log(df["Close"]) - np.log(df["Close"].shift(1))

# -- 4. Split training and test sets --
train_start = "2022-06-23"
train_end   = "2024-12-23"
test_start  = pd.to_datetime(train_end) + pd.Timedelta(days=1)

df_train = df[(df["Date"] >= train_start) & (df["Date"] <= train_end)].copy()
df_test  = df[df["Date"] >= test_start].copy()

obs_train = df_train["log_ret"].dropna().values.reshape(-1, 1)

# -- 5. Fit 3-state Gaussian HMM --
model = GaussianHMM(n_components=3, covariance_type="full",
                    n_iter=1000, random_state=123)
model.fit(obs_train)

# -- 6. Extract transition matrix --
Tmat = model.transmat_

# -- 7. Rolling online decoding & backtest --
obs_hist = list(obs_train.flatten())
n_test   = len(df_test)
signals  = np.zeros(n_test, dtype=int)
rets     = np.zeros(n_test)

for i, row in enumerate(df_test.itertuples()):
    o = getattr(row, "Open")
    c = getattr(row, "Close")

    new_ret = np.log(c) - np.log(o)
    obs_hist.append(new_ret)

    X      = np.array(obs_hist).reshape(-1,1)
    states = model.predict(X)
    s_prev = states[-1]

    pred_s = np.argmax(Tmat[s_prev, :])
    signals[i] =  1 if pred_s==0 else -1 if pred_s==2 else 0
    rets[i]    = signals[i] * (c/o - 1)

out = df_test.reset_index(drop=True).copy()
out["signal"] = signals
out["ret"]    = rets
out = out.dropna(subset=["ret"])

# -- 8. Performance evaluation function & window comparison --
def eval_window(df, n_days):
    dt      = df.iloc[:n_days]
    wins    = (dt["ret"] > 0).sum()
    losses  = (dt["ret"] <= 0).sum()
    winrate = wins / (wins + losses) if (wins + losses)>0 else 0.0
    eq      = (1 + dt["ret"]).cumprod()
    maxdd   = -((eq - eq.cummax()) / eq.cummax()).min()
    cumret  = eq.iloc[-1] - 1
    return {
        "Window":          f"{n_days} days",
        "CumulativeReturn": f"{cumret*100:.2f}%",
        "MaxDrawdown":     f"{maxdd*100:.2f}%",
        "WinRate":         f"{winrate*100:.2f}%",
        "Wins":            int(wins),
        "Losses":          int(losses)
    }

# Compute performance for 20, 40, 60, 80, 100-day windows
res_20   = eval_window(out,  20)
res_40   = eval_window(out,  40)
res_60   = eval_window(out,  60)
res_80   = eval_window(out,  80)
res_100  = eval_window(out, 100)
res   = pd.DataFrame([res_20, res_40, res_60, res_80, res_100])

# Reorder columns: Window, CumulativeReturn, MaxDrawdown, WinRate, Wins, Losses
res = res[["Window", "CumulativeReturn", "MaxDrawdown", "WinRate", "Wins", "Losses"]]

print("\nBacktest performance comparison (up to 100 days):")
print(res)


Model is not converging.  Current: 2024.9045023773301 is not greater than 2024.905449905886. Delta is -0.0009475285557982716


Columns in your CSV:
 ['Date', 'Adj Close_ES=F', 'Close_ES=F', 'High_ES=F', 'Low_ES=F', 'Open_ES=F', 'Volume_ES=F']
Using Open column : Open_ES=F
Using Close column: Close_ES=F
Using Date column : Date

Backtest performance comparison (up to 100 days):
     Window CumulativeReturn MaxDrawdown WinRate  Wins  Losses
0   20 days            1.12%       4.16%  50.00%    10      10
1   40 days            1.36%       4.16%  52.50%    21      19
2   60 days           -7.05%      11.50%  46.67%    28      32
3   80 days          -26.78%      29.18%  46.25%    37      43
4  100 days          -16.90%      29.18%  54.00%    54      46
