In [5]:
import numpy as np
import pandas as pd
import yfinance as yf

# Universe
# Candidate pairs as (ticker_A, ticker_B) tuples, grouped by sector.
# The data-loading cell iterates this list, unpacking each tuple as (A, B)
# and keying the result as "A-B".
PAIR_LIST = [
    # Energy
    ("XOM", "CVX"),
    ("COP", "EOG"),
    ("SLB", "HAL"),
    ("PSX", "VLO"),
    ("KMI", "WMB"),
    # Materials
    ("LIN", "APD"),
    ("FCX", "SCCO"),
    ("NUE", "STLD"),
    ("DOW", "LYB"),
    # Industrials
    ("CAT", "DE"),
    ("UNP", "CSX"),
    ("UPS", "FDX"),
    ("LMT", "NOC"),
    ("GE", "EMR"),
    ("ETN", "ROK"),
    # Consumer
    ("HD", "LOW"),
    ("MCD", "YUM"),
    ("NKE", "LULU"),
    ("GM", "F"),
    ("WMT", "COST"),
    ("KO", "PEP"),
    ("PG", "CL"),
    # Health / Financials / Tech / Telecom
    ("JNJ", "PFE"),
    ("UNH", "HUM"),
    ("JPM", "BAC"),
    ("GS", "MS"),
    ("MA", "V"),
    ("NVDA", "AMD"),
    ("AVGO", "QCOM"),
    ("TXN", "ADI"),
    ("T", "VZ"),
]


# Date ranges
# All dates are "YYYY-MM-DD" strings. The data-loading cell downloads from
# START_DATE to TEST_END. START_DATE precedes TRAIN_START, presumably to leave
# warm-up history for the rolling windows below -- TODO confirm.
START_DATE = "2016-12-01"
TRAIN_START = "2018-01-01"
TRAIN_END   = "2022-12-31"
TEST_START  = "2023-01-01"
TEST_END    = "2024-12-31"

# Strategy params
# Window lengths are counted in rows of the daily pair frame.
# ROLL_BETA_WIN is the lookback passed to calc_rolling_beta.
# ROLL_FEAT_WIN only appears in build_pair_dataframe's minimum-length guard in
# the code shown here; the feature windows actually used are HL_WIN / ZVOL_WIN.
# NOTE(review): Z_ENTRY_DAILY, MAX_HOLD_DAYS, WAIT_AFTER_FLIP_DAYS and
# COST_PER_LEG are not referenced in the cells shown here -- presumably
# consumed by later backtest cells.
ROLL_BETA_WIN    = 252   # hedge ratio lookback
ROLL_FEAT_WIN    = 60    # rolling feature window
Z_ENTRY_DAILY    = 2.0   # |z| score trigger for daily entries
MAX_HOLD_DAYS    = 10
WAIT_AFTER_FLIP_DAYS = 3

COST_PER_LEG = 0.0013    # 13 bps per leg per entry/exit

# Microstructure params
# NOTE(review): not referenced in the cells shown here -- presumably used by a
# later intraday section.
Z_ENTRY_INTRADAY = 1.5   # |z| trigger intraday (looser to get sample)
FORWARD_MINUTES  = 15    # forward evaluation horizon for intraday PnL

# Windows for OU-style features
# Read by rolling_ou_features: HL_WIN sizes the half-life window, ZVOL_WIN
# sizes the z-score and spread-volatility window.
HL_WIN   = 20   # fast mean-reversion horizon
ZVOL_WIN = 60   # zscore/vol horizon

In [6]:
# Known data caveats:
# - Survivorship bias: PAIR_LIST contains only tickers that are still listed today.
# - auto_adjust=True back-adjusts prices using splits/dividends that happened later,
#   so historical prices are slightly "hindsight cleaned".
#   => Reported out-of-sample stats are optimistic relative to true live trading.

def download_daily_adjusted(ticker, start_date, end_date):
    """Download adjusted daily close and volume for a single ticker.

    Parameters
    ----------
    ticker : str
        Symbol passed to ``yf.download``.
    start_date, end_date : str
        Date bounds forwarded unchanged to ``yf.download``.
        NOTE(review): yfinance documents ``end`` as exclusive, so a bar dated
        exactly ``end_date`` is presumably not returned -- confirm this is the
        intended behaviour for the test-period end date.

    Returns
    -------
    pandas.DataFrame
        Frame with exactly the columns ``["Close", "Volume"]``, indexed as
        returned by yfinance. An empty DataFrame is returned when the download
        raises, yields no rows, or lacks either column; callers treat an empty
        frame as "no data for this ticker".
    """
    import warnings  # local import so the cell stays self-contained

    try:
        df = yf.download(
            ticker, start=start_date, end=end_date, auto_adjust=True, progress=False
        )
    except Exception as exc:
        # Best-effort by design: one bad ticker must not abort the whole
        # universe build. Surface the reason instead of hiding it.
        warnings.warn(
            f"download failed for {ticker!r}: {exc!r}", RuntimeWarning, stacklevel=2
        )
        return pd.DataFrame()

    if df is None or df.empty:
        return pd.DataFrame()

    if isinstance(df.columns, pd.MultiIndex):
        # Some yfinance versions return two-level columns (field x ticker) even
        # for a single symbol. Flatten to whichever level holds the field names.
        for level in range(df.columns.nlevels):
            level_names = df.columns.get_level_values(level)
            if "Close" in level_names and "Volume" in level_names:
                df = df.copy()
                df.columns = level_names
                break

    missing = [col for col in ("Close", "Volume") if col not in df.columns]
    if missing:
        warnings.warn(
            f"download for {ticker!r} is missing column(s) {missing}",
            RuntimeWarning,
            stacklevel=2,
        )
        return pd.DataFrame()

    out = df.loc[:, ["Close", "Volume"]].copy()
    out.columns = ["Close", "Volume"]
    return out


def calc_rolling_beta(logA, logB, lookback):
    """Rolling OLS slope of ``logA`` regressed on ``logB`` (with intercept).

    For each position ``i >= lookback - 1`` the slope is estimated on the
    trailing ``lookback`` observations ending at ``i`` (inclusive). The two
    series are paired by position, not by index label.

    Parameters
    ----------
    logA, logB : pandas.Series
        Dependent and explanatory series. The result reuses ``logA.index``.
    lookback : int
        Window length in rows; must be at least 1.

    Returns
    -------
    pandas.Series
        Series named ``"beta"`` on ``logA.index``. Entries are NaN during the
        warm-up, for windows containing any non-finite value, and for windows
        where ``logB`` is constant (the slope is undefined there).

    Raises
    ------
    ValueError
        If ``lookback`` is smaller than 1.
    """
    if lookback < 1:
        raise ValueError(f"lookback must be >= 1, got {lookback}")

    idx = logA.index
    y_all = np.asarray(logA.values, dtype=float)
    x_all = np.asarray(logB.values, dtype=float)
    betas = np.full(len(idx), np.nan)

    for i in range(lookback - 1, len(idx)):
        start = i - lookback + 1
        y = y_all[start : i + 1]
        x = x_all[start : i + 1]

        if not (np.all(np.isfinite(x)) and np.all(np.isfinite(y))):
            continue
        # A constant regressor makes the slope undefined. Comparing max/min is
        # exact, unlike testing a floating-point variance against zero.
        if x.max() == x.min():
            continue

        # Closed-form simple-regression slope: cov(x, y) / var(x).
        x_centered = x - x.mean()
        betas[i] = np.dot(x_centered, y - y.mean()) / np.dot(x_centered, x_centered)

    return pd.Series(betas, index=idx, name="beta")


def rolling_ou_features(spread, hl_win=None, zvol_win=None):
    """Compute rolling mean-reversion features of a spread series.

    Parameters
    ----------
    spread : pandas.Series
        Spread values; the output reuses its index.
    hl_win : int, optional
        Trailing window (rows) for the half-life estimate. Defaults to the
        module-level ``HL_WIN``.
    zvol_win : int, optional
        Trailing window (rows) for the z-score and spread volatility.
        Defaults to the module-level ``ZVOL_WIN``.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``spread`` with columns:

        - ``half_life_fast``: ``-ln(2) / ln(phi)`` where ``phi`` is the
          no-intercept lag-1 regression coefficient of the demeaned window.
          NaN unless ``0 < phi < 1``.
        - ``zscore``: last value of the window standardised by the window
          mean and sample standard deviation (``ddof=1``); NaN if that
          standard deviation is not positive.
        - ``abs_z``: absolute value of ``zscore``.
        - ``spread_vol``: sample standard deviation (``ddof=1``) of the
          first differences inside the window.

        A row is NaN for a feature until its window is full, and whenever the
        window contains a non-finite value.
    """
    hl_win = HL_WIN if hl_win is None else hl_win
    zvol_win = ZVOL_WIN if zvol_win is None else zvol_win

    n = len(spread)
    idx = spread.index
    vals = spread.values

    half_life_arr = np.full(n, np.nan)
    z_arr = np.full(n, np.nan)
    absz_arr = np.full(n, np.nan)
    vol_arr = np.full(n, np.nan)

    for i in range(n):
        # Half-life from a lag-1 autoregression on the short window.
        if i >= hl_win - 1:
            w = vals[i - hl_win + 1 : i + 1]
            if np.all(np.isfinite(w)):
                y = w - w.mean()
                y_lag = y[:-1]
                y_now = y[1:]
                denom = np.sum(y_lag**2)

                hl = np.nan
                if denom > 0:
                    phi = np.sum(y_lag * y_now) / denom
                    # Only 0 < phi < 1 gives a finite positive half-life.
                    # Negative, zero, >= 1 or NaN phi leaves hl undefined.
                    if 0 < phi < 1:
                        # Clip away from 0 and 1 so the log stays bounded.
                        phi_clip = np.clip(phi, 1e-6, 1 - 1e-6)
                        hl = -np.log(2.0) / np.log(phi_clip)

                half_life_arr[i] = hl

        # Z-score and difference volatility on the longer window.
        if i >= zvol_win - 1:
            w2 = vals[i - zvol_win + 1 : i + 1]
            if np.all(np.isfinite(w2)):
                m2 = w2.mean()
                s2 = w2.std(ddof=1)
                zval = (w2[-1] - m2) / s2 if s2 > 0 else np.nan
                z_arr[i] = zval
                absz_arr[i] = np.abs(zval)

                dX = np.diff(w2)
                # ddof=1 needs at least two differences.
                vol_arr[i] = np.std(dX, ddof=1) if len(dX) > 1 else np.nan

    out = pd.DataFrame(
        {
            "half_life_fast": half_life_arr,
            "zscore": z_arr,
            "abs_z": absz_arr,
            "spread_vol": vol_arr,
        },
        index=idx,
    )
    return out


def build_pair_dataframe(tickerA, tickerB, start_date, end_date):
    """Build the aligned daily frame for one pair, with hedge ratio and features.

    Steps: download both legs, keep only dates where both have a close,
    take log prices, estimate the rolling hedge ratio, form the spread
    ``logA - beta * logB``, and append the rolling OU-style features.

    Parameters
    ----------
    tickerA, tickerB : str
        Symbols of the two legs.
    start_date, end_date : str
        Date bounds forwarded to ``download_daily_adjusted``.

    Returns
    -------
    pandas.DataFrame or None
        Frame with columns ``close_A, vol_A, close_B, vol_B, logA, logB,
        beta, spread`` plus the columns of ``rolling_ou_features``.
        ``None`` when either download is empty or the overlapping history is
        too short to fill the beta window followed by a feature window.
    """
    dfA = download_daily_adjusted(tickerA, start_date, end_date)
    dfB = download_daily_adjusted(tickerB, start_date, end_date)

    if dfA.empty or dfB.empty:
        return None

    idx_all = dfA.index.union(dfB.index).sort_values()
    out = pd.DataFrame(index=idx_all)
    out["close_A"] = dfA["Close"].reindex(idx_all)
    out["vol_A"]   = dfA["Volume"].reindex(idx_all)
    out["close_B"] = dfB["Close"].reindex(idx_all)
    out["vol_B"]   = dfB["Volume"].reindex(idx_all)

    # require both legs to have price
    out = out.dropna(subset=["close_A", "close_B"])

    # The spread is NaN until the beta window fills, and the feature windows
    # roll over the spread, so the warm-ups add up rather than overlap.
    # Requiring only max(beta window, feature window) rows would admit pairs
    # whose z-score column is entirely NaN.
    min_rows = ROLL_BETA_WIN + max(ROLL_FEAT_WIN, HL_WIN, ZVOL_WIN)
    if len(out) < min_rows:
        return None

    # Volume can be missing on a date that still has a close; carry the last
    # known value forward rather than dropping the row.
    out[["vol_A", "vol_B"]] = out[["vol_A", "vol_B"]].ffill()

    out["logA"] = np.log(out["close_A"])
    out["logB"] = np.log(out["close_B"])

    out["beta"] = calc_rolling_beta(out["logA"], out["logB"], ROLL_BETA_WIN)
    # beta is the same-row rolling estimate, so the spread at row t uses data
    # up to and including t.
    out["spread"] = out["logA"] - out["beta"] * out["logB"]

    ou_df = rolling_ou_features(out["spread"])
    out = out.join(ou_df)

    return out


In [7]:
# Build the feature frame for every pair in the universe.
# pair_data maps "A-B" -> {"tickers": (A, B), "df": frame from build_pair_dataframe}.
pair_data = {}
# Pairs for which build_pair_dataframe returned None or an empty frame.
dropped_pairs = []

for A, B in PAIR_LIST:
    key = f"{A}-{B}"
    # One download per pair spanning START_DATE..TEST_END, i.e. the full range
    # covered by the train and test date constants.
    df_pair = build_pair_dataframe(A, B, START_DATE, TEST_END)
    if df_pair is None or df_pair.empty:
        dropped_pairs.append((A, B))
        continue
    pair_data[key] = {"tickers": (A, B), "df": df_pair}

# Keys of the pairs that produced a usable frame, in PAIR_LIST order.
kept_pairs = list(pair_data.keys())

# Report which pairs survived so silent data gaps are visible in the output.
print("Kept pairs ({}):".format(len(kept_pairs)), kept_pairs)
print("Dropped pairs:", dropped_pairs)

Kept pairs (31): ['XOM-CVX', 'COP-EOG', 'SLB-HAL', 'PSX-VLO', 'KMI-WMB', 'LIN-APD', 'FCX-SCCO', 'NUE-STLD', 'DOW-LYB', 'CAT-DE', 'UNP-CSX', 'UPS-FDX', 'LMT-NOC', 'GE-EMR', 'ETN-ROK', 'HD-LOW', 'MCD-YUM', 'NKE-LULU', 'GM-F', 'WMT-COST', 'KO-PEP', 'PG-CL', 'JNJ-PFE', 'UNH-HUM', 'JPM-BAC', 'GS-MS', 'MA-V', 'NVDA-AMD', 'AVGO-QCOM', 'TXN-ADI', 'T-VZ']
Dropped pairs: []


In [8]:
import pickle
# Path the pair_data dict is pickled to; relative, so it resolves against the
# kernel's current working directory.
DATA_CACHE_FILE = "pair_data___.pkl"

# after building pair_data the first time:
# Serialises the whole pair_data dict (tickers plus DataFrames) to disk.
# NOTE(review): this cell overwrites the file on every run; presumably a later
# cell or session reloads it to skip the downloads -- confirm.
# Only unpickle files you produced yourself: loading a pickle can execute
# arbitrary code.
with open(DATA_CACHE_FILE, "wb") as f:
    pickle.dump(pair_data, f)
