In [None]:
import numpy as np
import pandas as pd

# Key into `preds` (read in a later cell) choosing which model's predictions are backtested.
MODEL_NAME = "RandomForest"  # or "Logit"

def american_to_decimal(amer):
    """Convert American (moneyline) odds to decimal odds.

    Parameters
    ----------
    amer : number-like
        American odds such as +150 or -120; anything ``float()`` accepts.

    Returns
    -------
    float
        Decimal odds (payout per 1 unit staked, stake included):
        ``1 + amer/100`` for positive prices, ``1 + 100/|amer|`` for
        negative prices. ``np.nan`` when ``amer`` is missing or zero.
    """
    if pd.isna(amer):
        return np.nan
    amer = float(amer)
    if amer == 0:
        # Zero is not a valid American price; without this guard the
        # negative-odds branch below would raise ZeroDivisionError.
        return np.nan
    if amer > 0:
        return 1.0 + (amer / 100.0)
    return 1.0 + (100.0 / abs(amer))

def implied_prob_from_american(amer):
    """Implied win probability from American odds (no vig removal).

    Parameters
    ----------
    amer : number-like
        American odds such as +150 or -120; anything ``float()`` accepts.

    Returns
    -------
    float
        ``100 / (amer + 100)`` for positive prices and
        ``|amer| / (|amer| + 100)`` for negative prices. ``np.nan`` when
        ``amer`` is missing or zero.
    """
    if pd.isna(amer):
        return np.nan
    amer = float(amer)
    if amer == 0:
        # Zero is not a valid American price. Falling through would yield an
        # implied probability of 0.0, which reads as a large edge downstream;
        # report it as missing instead, consistent with american_to_decimal.
        return np.nan
    if amer > 0:
        return 100.0 / (amer + 100.0)
    return abs(amer) / (abs(amer) + 100.0)

def kelly_fraction(p, dec_odds):
    """
    Full-Kelly stake fraction for a bet priced in decimal odds.

    With net odds b = dec_odds - 1 and win probability p:
        f* = (b*p - (1-p)) / b
    The result is floored at zero. Returns 0.0 when either input is
    missing or when the odds pay no profit (b <= 0).
    """
    if not (pd.isna(p) or pd.isna(dec_odds)):
        net_odds = dec_odds - 1.0
        if net_odds > 0:
            raw_fraction = (net_odds * p - (1.0 - p)) / net_odds
            # Only a strictly positive fraction is a bet; anything else is "no stake".
            if raw_fraction > 0.0:
                return raw_fraction
    return 0.0

def max_drawdown(equity):
    """Largest peak-to-trough decline of an equity curve, as a fraction of the peak.

    Parameters
    ----------
    equity : array-like
        Bankroll over time, in chronological order.

    Returns
    -------
    float
        The minimum of ``(equity - running_peak) / running_peak``: 0.0 for a
        curve that never falls below its running peak, negative otherwise
        (e.g. -0.25 for a 25% drawdown). An empty curve returns 0.0.
    """
    eq = np.asarray(equity, dtype=float)
    if eq.size == 0:
        # No observations means no drawdown; dd.min() would raise on an empty array.
        return 0.0
    peak = np.maximum.accumulate(eq)
    # NOTE(review): assumes the running peak is never zero; a zero peak makes this ratio NaN/inf.
    dd = (eq - peak) / peak
    return dd.min()  # negative number (or 0.0)

def summarize_backtest(df_bets, equity_curve, starting_bankroll):
    """Collapse a finished backtest into a dict of headline metrics.

    df_bets is read for its "pnl", "edge" and "stake" columns,
    equity_curve for its last element (the ending bankroll) and for the
    drawdown computation. ROI here is profit relative to the starting
    bankroll. With zero bets a neutral summary is returned and the
    per-bet averages are NaN.
    """
    bet_count = len(df_bets)

    if bet_count > 0:
        final_bankroll = equity_curve[-1]
        net_profit = final_bankroll - starting_bankroll
        return_on_bankroll = net_profit / starting_bankroll
        # A bet counts as a hit when it made money.
        share_of_winners = (df_bets["pnl"] > 0).mean()
        return {
            "n_bets": int(bet_count),
            "roi": float(return_on_bankroll),
            "profit": float(net_profit),
            "ending_bankroll": float(final_bankroll),
            "max_drawdown": float(max_drawdown(equity_curve)),
            "hit_rate": float(share_of_winners),
            "avg_edge": float(df_bets["edge"].mean()),
            "avg_stake": float(df_bets["stake"].mean()),
        }

    # Nothing was staked: bankroll unchanged, per-bet statistics undefined.
    return {
        "n_bets": 0,
        "roi": 0.0,
        "profit": 0.0,
        "ending_bankroll": starting_bankroll,
        "max_drawdown": 0.0,
        "hit_rate": np.nan,
        "avg_edge": np.nan,
        "avg_stake": np.nan,
    }


In [None]:
ODDS_CSV = "nhl_odds_2023_2024_closing.csv"  # adjust path if needed
odds = pd.read_csv(ODDS_CSV)

# Case-insensitive lookup table: lowercased column name -> column name as it appears in the CSV
cols = {c.lower(): c for c in odds.columns}

def need(*names):
    """Return the CSV column matching the first candidate name found (case-insensitive), or None."""
    return next((cols[n.lower()] for n in names if n.lower() in cols), None)

c_game = need("game_id", "gamepk", "gamePk")
c_home = need("home_odds", "homeOdds", "home_ml", "home_moneyline")
c_away = need("away_odds", "awayOdds", "away_ml", "away_moneyline")
c_date = need("date", "game_date", "gameDate")

print("Detected columns:")
for label, found in (("game", c_game), ("home", c_home), ("away", c_away), ("date", c_date)):
    print(f" {label}:", found)

# Game id and both moneylines are mandatory; the date column is optional.
if any(col is None for col in (c_game, c_home, c_away)):
    raise ValueError(f"Odds CSV missing required columns. Found: {list(odds.columns)}")

# Keep only the columns the backtest needs, under canonical names.
keep = [c_game, c_home, c_away]
rename_map = {c_game: "game_id", c_home: "home_odds", c_away: "away_odds"}
if c_date:
    keep.append(c_date)
    rename_map[c_date] = "date"

odds_small = odds[keep].copy().rename(columns=rename_map)
if not c_date:
    # No date column in the CSV: add a placeholder so the schema is stable.
    odds_small["date"] = np.nan

# String keys so the later merge on game_id does not depend on numeric dtype
odds_small["game_id"] = odds_small["game_id"].astype(str)

print("Odds rows:", len(odds_small))
odds_small.head()


In [None]:
# Predicted home-win probabilities on the test set for the model selected
# by MODEL_NAME (defined in the first cell).
p_home = preds[MODEL_NAME]

# df_test must include game_id + homeWin + gameDate (or date)
# Adjust these if your column names differ
df_bt = df_test[["game_id", "gameDate", "homeWin"]].copy()
df_bt["game_id"] = df_bt["game_id"].astype(str)  # same key dtype as odds_small
# NOTE(review): assumes p_home lines up row-for-row with df_test; confirm if
# preds holds pandas Series whose index differs from df_test's.
df_bt["p_home"] = p_home
df_bt["p_away"] = 1.0 - df_bt["p_home"]

# Merge odds onto test games. validate="many_to_one" makes pandas raise a
# MergeError if odds_small contains the same game_id more than once; without
# it the inner join would silently duplicate games and the backtest would
# place the same bet several times.
df_bt = df_bt.merge(odds_small, on="game_id", how="inner", validate="many_to_one")

# Convert odds -> decimal + implied
df_bt["home_dec"] = df_bt["home_odds"].map(american_to_decimal)
df_bt["away_dec"] = df_bt["away_odds"].map(american_to_decimal)

df_bt["home_imp"] = df_bt["home_odds"].map(implied_prob_from_american)
df_bt["away_imp"] = df_bt["away_odds"].map(implied_prob_from_american)

print("Backtest games after merge:", len(df_bt))
df_bt.head()


In [None]:
# Backtest configuration
START_BANKROLL = 1000.0
KELLY_SCALE = 0.25          # 1/4 Kelly
MIN_EDGE = 0.02             # e.g. require 2% edge vs implied
MAX_STAKE_FRAC = 0.05       # optional cap: max 5% bankroll per bet

# Expected value per $1 stake:
# EV = p * dec_odds - 1
df_bt["ev_home"] = df_bt["p_home"] * df_bt["home_dec"] - 1.0
df_bt["ev_away"] = df_bt["p_away"] * df_bt["away_dec"] - 1.0

# Pick the better side per game. A missing EV (NaN odds) is treated as -inf
# for the comparison so that a game with only one priced side resolves to
# that side whether it is home or away. A bare `>=` against NaN is always
# False and would always fall through to "away".
home_is_better = df_bt["ev_home"].fillna(-np.inf) >= df_bt["ev_away"].fillna(-np.inf)
df_bt["side"] = np.where(home_is_better, "home", "away")
df_bt["ev"] = np.where(df_bt["side"] == "home", df_bt["ev_home"], df_bt["ev_away"])

# "Edge" vs implied (more intuitive): p - implied_p
df_bt["edge"] = np.where(
    df_bt["side"] == "home",
    df_bt["p_home"] - df_bt["home_imp"],
    df_bt["p_away"] - df_bt["away_imp"],
)

# Only bet when edge >= MIN_EDGE and odds are valid (NaN odds give a non-finite EV)
df_bt["bet"] = (df_bt["edge"] >= MIN_EDGE) & np.isfinite(df_bt["ev"])

# Kelly sizing based on chosen side's odds
df_bt["dec"] = np.where(df_bt["side"] == "home", df_bt["home_dec"], df_bt["away_dec"])
df_bt["p"] = np.where(df_bt["side"] == "home", df_bt["p_home"], df_bt["p_away"])
# Pairwise iteration instead of DataFrame.apply(axis=1): no per-row Series is
# built, and an empty df_bt simply produces an empty float column.
df_bt["kelly_f"] = pd.Series(
    [kelly_fraction(p, dec) for p, dec in zip(df_bt["p"], df_bt["dec"])],
    index=df_bt.index,
    dtype=float,
)
df_bt["stake_frac"] = np.minimum(MAX_STAKE_FRAC, KELLY_SCALE * df_bt["kelly_f"])
df_bt.loc[~df_bt["bet"], "stake_frac"] = 0.0

# Simulate bankroll chronologically. A stable sort keeps games that share a
# gameDate in their existing relative order.
df_bt = df_bt.sort_values("gameDate", kind="mergesort").reset_index(drop=True)

bankroll = START_BANKROLL
equity = [bankroll]   # one entry per game after the starting point, bet or not
bet_rows = []         # one record per game actually bet

for r in df_bt.itertuples(index=False):
    # Stake is a fraction of the *current* bankroll, so sizing compounds.
    stake = bankroll * float(r.stake_frac)
    if stake <= 0:
        equity.append(bankroll)
        continue

    # outcome: homeWin is 1 if home won else 0
    home_won = int(r.homeWin) == 1

    if r.side == "home":
        win = home_won
    else:
        win = not home_won

    if win:
        pnl = stake * (float(r.dec) - 1.0)     # profit only (stake returned implicitly)
    else:
        pnl = -stake

    bankroll += pnl
    equity.append(bankroll)

    bet_rows.append({
        "game_id": r.game_id,
        "gameDate": r.gameDate,
        "side": r.side,
        "p": float(r.p),
        "dec": float(r.dec),
        "edge": float(r.edge),
        "stake": float(stake),
        "pnl": float(pnl),
        "bankroll": float(bankroll),
    })

df_bets = pd.DataFrame(bet_rows)

summary = summarize_backtest(df_bets, equity, START_BANKROLL)
summary


In [None]:
import matplotlib.pyplot as plt

# Drawdown series: fractional distance of each step below the running peak
# (0 at a new high, negative otherwise).
eq = np.array(equity, dtype=float)
peak = np.maximum.accumulate(eq)
dd = (eq - peak) / peak

plt.figure(figsize=(8,5))
plt.plot(eq)
# Title fixes: a real em dash instead of the mis-encoded byte sequence, and a
# percent format for the Kelly scale because int(x * 100) truncates float
# products (e.g. 0.29 * 100 -> 28).
plt.title(f"Equity Curve — {MODEL_NAME} | min_edge={MIN_EDGE:.3f} | {KELLY_SCALE:.0%} Kelly")
plt.xlabel("Step")
plt.ylabel("Bankroll")
plt.show()

plt.figure(figsize=(8,5))
plt.plot(dd)
plt.title("Drawdown")
plt.xlabel("Step")
plt.ylabel("Drawdown")
plt.show()

print("Summary:", summary)
print("Bets placed:", len(df_bets))
display(df_bets.head(10))
