# 01 - Naive Flat-Stake Backtest

Replicates flat Home/Draw/Away strategies on EPL using closing odds (or best available) and reports ROI and EV with no-vig probabilities.

## Data inputs
- Source: Football-Data.co.uk CSVs for EPL (2016/17 onward).
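- Closing odds preferred: Pinnacle closing (`PSCH`/`PSCD`/`PSCA`), then Bet365, Max and Avg closing (`B365C*`, `MaxC*`, `AvgC*`). Fallback: the non-closing `PS*`, `B365*`, `Max*` and `Avg*` columns, in that order. The source actually used is flagged in `odds_source`.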
- Results column: `FTR` (H/D/A). Update `RESULT_COL` below if your files differ.

In [7]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np

# The project root is taken to be the parent of the current working directory.
# NOTE(review): this presumably expects the notebook to run from a sub-folder
# of the project (e.g. notebooks/) - confirm.
PROJECT_ROOT = Path.cwd().resolve().parents[0]
DATA_DIR = PROJECT_ROOT.joinpath("data", "raw")

# Make <project>/src importable before pulling in the project helper.
sys.path.append(str(PROJECT_ROOT.joinpath("src")))
from football_betting.odds import compute_no_vig_probs

# Render floats with four decimal places in displayed frames.
pd.set_option("display.float_format", "{:.4f}".format)

## Load EPL data
Adjust the glob or parsing logic if your filenames differ. CSVs are expected in `data/raw/`.

In [8]:
def load_epl_data(data_dir: Path = DATA_DIR) -> pd.DataFrame:
    """Load every CSV in ``data_dir`` into one frame with provenance columns.

    Each file is read with ``pd.read_csv``; the per-file frames are stacked
    with a fresh 0..n-1 index. Three columns are added:

    * ``source_file`` - name of the CSV the row came from.
    * ``match_date`` - the first of ``Date`` / ``DateTime`` / ``Kickoff``
      present in that file, parsed day-first. Values that cannot be parsed,
      and rows from files with none of those columns, become ``NaT``.
    * ``season`` - the first digit-led token of at least four characters in
      the file stem (``E0_2025-26.csv`` -> ``"2025"``), else ``"unknown"``.

    Args:
        data_dir: Directory searched (non-recursively) for ``*.csv`` files.

    Returns:
        The concatenated frame with the three extra columns.

    Raises:
        FileNotFoundError: If ``data_dir`` contains no CSV files.
    """
    paths = sorted(data_dir.glob("*.csv"))
    if not paths:
        raise FileNotFoundError("Add Football-Data.co.uk CSVs to data/raw (e.g., E0_2025-26.csv).")

    def parse_match_dates(frame: pd.DataFrame) -> pd.Series:
        """Return the parsed match dates of one file, or all-NaT if none exist."""
        for candidate in ("Date", "DateTime", "Kickoff"):
            if candidate in frame.columns:
                # The files write dates day-first (e.g. 15/08/2025). Without
                # dayfirst=True pandas warns and can read an ambiguous value
                # such as 01/08/2025 month-first.
                return pd.to_datetime(frame[candidate], dayfirst=True, errors="coerce")
        return pd.Series(pd.NaT, index=frame.index, dtype="datetime64[ns]")

    def infer_season(file_name: str) -> str:
        """Pick the season token out of a file name such as ``E0_2025-26.csv``."""
        stem = Path(file_name).stem
        for token in stem.replace("-", "_").split("_"):
            if token and token[0].isdigit() and len(token) >= 4:
                return token
        return "unknown"

    frames = []
    match_dates = []
    for path in paths:
        df = pd.read_csv(path)
        df["source_file"] = path.name
        frames.append(df)
        # Parse dates file by file so that one file's date layout does not
        # dictate how the other files are interpreted.
        match_dates.append(parse_match_dates(df))

    data = pd.concat(frames, ignore_index=True)
    # Both concatenations use ignore_index=True over the same files in the
    # same order, so the rows line up positionally.
    data["match_date"] = pd.concat(match_dates, ignore_index=True)
    data["season"] = data["source_file"].apply(infer_season)
    return data


# Load every raw CSV from the default data directory and preview the first rows.
raw = load_epl_data()
raw.head()

  data["match_date"] = pd.to_datetime(data[date_cols[0]], errors="coerce")


Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA,BFECAHH,BFECAHA,source_file,match_date,season
0,E0,15/08/2025,20:00,Liverpool,Bournemouth,4,2,H,1,0,...,1.85,2.03,1.88,1.94,1.76,2.14,1.86,E0_2025-26.csv,2025-08-15,2025
1,E0,16/08/2025,12:30,Aston Villa,Newcastle,0,0,D,0,0,...,1.89,2.06,1.8,1.95,1.74,2.14,1.86,E0_2025-26.csv,2025-08-16,2025
2,E0,16/08/2025,15:00,Brighton,Fulham,1,1,D,0,0,...,2.0,1.84,2.03,1.8,1.96,1.91,2.08,E0_2025-26.csv,2025-08-16,2025
3,E0,16/08/2025,15:00,Sunderland,West Ham,3,0,H,0,0,...,1.95,1.95,1.94,1.86,1.78,2.02,1.97,E0_2025-26.csv,2025-08-16,2025
4,E0,16/08/2025,15:00,Tottenham,Burnley,3,0,H,1,0,...,1.93,1.98,1.91,1.88,1.83,2.07,1.92,E0_2025-26.csv,2025-08-16,2025


## Build market odds with closing preference and compute no-vig probabilities
Priority: Pinnacle closing (`PSCH`/`PSCD`/`PSCA`), then Bet365 closing, Max closing and Avg closing, then the non-closing columns in the same bookmaker order. The home-odds column name of the chosen source is recorded in `odds_source`.

In [9]:
# Ordered preference of 1X2 odds sources. Each entry is a (home, draw, away)
# column-name triple; earlier entries win. The closing-odds triples come first,
# followed by the non-closing triples for the same sources.
ODDS_PRIORITY = [
    ("PSCH", "PSCD", "PSCA"),
    ("B365CH", "B365CD", "B365CA"),
    ("MaxCH", "MaxCD", "MaxCA"),
    ("AvgCH", "AvgCD", "AvgCA"),
    ("PSH", "PSD", "PSA"),
    ("B365H", "B365D", "B365A"),
    ("MaxH", "MaxD", "MaxA"),
    ("AvgH", "AvgD", "AvgA"),
]


def select_market_odds(
    df: pd.DataFrame,
    priority: "list[tuple[str, str, str]] | None" = None,
) -> pd.DataFrame:
    """Attach the best-available home/draw/away odds to every row.

    For each row the first triple in ``priority`` whose three columns all
    exist and all hold a numeric, non-missing value is used. Rows with no
    complete triple get ``NaN`` odds and a missing ``odds_source``.

    The work is done column-wise (one pass per triple) instead of looping
    over rows. Entries that cannot be read as numbers are treated as missing,
    so the next source in the priority list is tried for those rows.

    Args:
        df: Match rows containing any subset of the odds columns.
        priority: Ordered ``(home, draw, away)`` column-name triples. Defaults
            to the module-level ``ODDS_PRIORITY``.

    Returns:
        A new frame: ``df`` with its index reset to 0..n-1, plus the columns
        ``odds_home``, ``odds_draw``, ``odds_away`` and ``odds_source`` (the
        home-odds column name of the triple that was used).
    """
    triples = ODDS_PRIORITY if priority is None else priority
    base = df.reset_index(drop=True)
    n_rows = len(base)

    odds = np.full((n_rows, 3), np.nan)
    source = np.full(n_rows, None, dtype=object)
    pending = np.ones(n_rows, dtype=bool)  # rows still without a full triple

    for h_col, d_col, a_col in triples:
        if not pending.any():
            break
        cols = [h_col, d_col, a_col]
        if any(col not in base.columns for col in cols):
            continue
        candidate = (
            base[cols]
            .apply(pd.to_numeric, errors="coerce")
            .to_numpy(dtype=float, na_value=np.nan)
        )
        # A triple is only usable for a row when all three prices are present.
        usable = pending & ~np.isnan(candidate).any(axis=1)
        odds[usable] = candidate[usable]
        source[usable] = h_col
        pending &= ~usable

    out = pd.DataFrame(
        {
            "odds_home": odds[:, 0],
            "odds_draw": odds[:, 1],
            "odds_away": odds[:, 2],
            "odds_source": source,
        }
    )
    return pd.concat([base, out], axis=1)


def add_no_vig_probs(df: pd.DataFrame) -> pd.DataFrame:
    """Add no-vig outcome probabilities ``pH``, ``pD``, ``pA`` to ``df``.

    Rows with all three of ``odds_home`` / ``odds_draw`` / ``odds_away``
    present are passed to ``compute_no_vig_probs``; any other row gets
    ``NaN`` in all three probability columns.

    Args:
        df: Frame holding the three odds columns. It is modified in place.

    Returns:
        The same ``df`` object, with the three probability columns added
        (or overwritten).
    """
    odds_cols = ["odds_home", "odds_draw", "odds_away"]
    prob_cols = ["pH", "pD", "pA"]

    if len(df) == 0:
        # DataFrame.apply on an empty frame returns a copy of the frame itself
        # rather than the expanded result, so the three-column assignment
        # below would fail. Add empty float columns directly instead.
        for col in prob_cols:
            df[col] = pd.Series(index=df.index, dtype="float64")
        return df

    def row_probs(row: pd.Series):
        """Return the probabilities for one row, or NaNs if any odds are missing."""
        if row[odds_cols].notna().all():
            return compute_no_vig_probs(row["odds_home"], row["odds_draw"], row["odds_away"])
        return {"pH": np.nan, "pD": np.nan, "pA": np.nan}

    df[prob_cols] = df.apply(row_probs, axis=1, result_type="expand")
    return df


# Pick the preferred odds for every match, derive no-vig probabilities from
# them, and preview the columns this cell added.
market = add_no_vig_probs(select_market_odds(raw))
market[["source_file", "odds_source", "odds_home", "odds_draw", "odds_away", "pH", "pD", "pA"]].head()

Unnamed: 0,source_file,odds_source,odds_home,odds_draw,odds_away,pH,pD,pA
0,E0_2025-26.csv,PSCH,1.29,6.55,9.75,0.7523,0.1482,0.0995
1,E0_2025-26.csv,PSCH,2.32,3.63,3.07,0.4176,0.2669,0.3156
2,E0_2025-26.csv,PSCH,1.92,3.66,4.27,0.5065,0.2657,0.2278
3,E0_2025-26.csv,PSCH,3.55,3.24,2.29,0.2743,0.3005,0.4252
4,E0_2025-26.csv,PSCH,1.56,4.2,6.7,0.6233,0.2315,0.1451


## Backtest flat Home/Draw/Away strategies
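The summary below is run with stake = 100 (the functions themselves default to a stake of 1), so P&L is in currency units and EV is the mean expected profit per 100-unit bet. ROI is P&L divided by total stakes.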

In [10]:
# Column holding the match outcome; its values are compared against the
# pick codes used as PICK_MAP keys.
RESULT_COL = "FTR"

# Pick code -> (odds column, no-vig probability column) used when backing it.
PICK_MAP = {
    "H": ("odds_home", "pH"),
    "D": ("odds_draw", "pD"),
    "A": ("odds_away", "pA"),
}


def evaluate_flat_strategy(df: pd.DataFrame, pick: str, stake: float = 1.0) -> dict:
    """Back the same outcome on every match and summarise the result.

    Rows lacking the pick's odds, its probability, or the match result are
    excluded before anything is computed.

    Args:
        df: Frame with the odds/probability columns named in ``PICK_MAP`` and
            the result column named by ``RESULT_COL``.
        pick: A key of ``PICK_MAP`` (``"H"``, ``"D"`` or ``"A"``).
        stake: Amount staked on every bet.

    Returns:
        Dict with ``pick``, ``n_bets``, ``pnl`` (total profit), ``roi``
        (``pnl`` over total staked) and ``ev`` (mean expected profit per bet,
        scaled by ``stake``). With no usable rows the numeric fields are
        ``NaN`` and ``n_bets`` is 0.
    """
    odds_col, prob_col = PICK_MAP[pick]
    bets = df.dropna(subset=[odds_col, prob_col, RESULT_COL])
    n_bets = len(bets)
    if n_bets == 0:
        return {"pick": pick, "n_bets": 0, "pnl": np.nan, "roi": np.nan, "ev": np.nan}

    odds = bets[odds_col]
    prob = bets[prob_col]
    won = bets[RESULT_COL] == pick

    # A winner pays odds - 1 per unit staked; a loser forfeits the stake.
    per_bet = np.where(won, odds - 1, -1) * stake
    pnl = per_bet.sum()

    # Expected profit per unit: win leg minus lose leg.
    expected = prob * (odds - 1) - (1 - prob)

    return {
        "pick": pick,
        "n_bets": n_bets,
        "pnl": pnl,
        "roi": pnl / (n_bets * stake),
        "ev": expected.mean() * stake,
    }


def summarize_flat_strategies(df: pd.DataFrame, stake: float = 1.0) -> pd.DataFrame:
    """Evaluate the Home, Draw and Away flat strategies side by side.

    Args:
        df: Frame accepted by ``evaluate_flat_strategy``.
        stake: Amount staked on every bet.

    Returns:
        One row per pick, ordered by the ``pick`` column; the row labels keep
        the H/D/A evaluation order.
    """
    rows = []
    for outcome in ("H", "D", "A"):
        rows.append(evaluate_flat_strategy(df, outcome, stake))
    table = pd.DataFrame(rows)
    return table.sort_values("pick")


# Run all three flat strategies with a stake of 100 per bet and show the table.
flat_results = summarize_flat_strategies(market, stake=100.0)
flat_results

Unnamed: 0,pick,n_bets,pnl,roi,ev
2,A,150,-1464.0,-0.0976,-2.8762
1,D,150,-2865.0,-0.191,-2.8762
0,H,150,1724.0,0.1149,-2.8762


## Persist processed market dataset
Saved for downstream notebooks to avoid repeating the same parsing.

In [None]:
# Write the enriched market frame to data/processed/ as Parquet, creating the
# folder if needed, then show the path that was written.
# NOTE(review): the recorded output of this cell is an ArrowKeyError, which
# looks like a pandas/pyarrow installation mismatch rather than a data
# problem - confirm the environment before relying on this file.
processed_path = PROJECT_ROOT.joinpath("data", "processed", "market_epl.parquet")
processed_path.parent.mkdir(parents=True, exist_ok=True)
market.to_parquet(processed_path, index=False)
processed_path

ArrowKeyError: No type extension with name arrow.py_extension_type found