In [1]:
# Database connection
import sqlite3

# Data manipulation
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid", context="talk")

# Optimization
from scipy.optimize import linprog

# Machine learning and evaluation
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Gradient boosting
from xgboost import XGBClassifier

# Phase 1 - Build the match-level dataset

Goal:
- load `Match`, `League`, `Team`
- keep Premier League and Ligue 1
- create label `result` in {H, D, A}
- keep Bet365 odds (`B365H/D/A`)
- compute implied probabilities adjusted for margin (`b365_prob_H/D/A`)

In [2]:
# ---------- PARAMETERS ----------
DB_PATH = "database.sqlite"   # adjust if needed
LEAGUES_TO_KEEP = [
    "England Premier League",
    "France Ligue 1"
]

# ---------- CONNECTION ----------
conn = sqlite3.connect(DB_PATH)

# ---------- LOAD TABLES ----------
league_df = pd.read_sql("SELECT * FROM League", conn)
team_df   = pd.read_sql("SELECT * FROM Team", conn)
match_df  = pd.read_sql("SELECT * FROM Match", conn)

print("Available leagues:")
display(league_df[["id", "name"]])

# ---------- FILTER LEAGUES ----------
selected_leagues = league_df[league_df["name"].isin(LEAGUES_TO_KEEP)].copy()
print("Selected leagues:")
display(selected_leagues)

# Join Match and League to keep selected leagues
match_league = match_df.merge(
    selected_leagues[["id", "name"]],
    how="inner",
    left_on="league_id",
    right_on="id",
    suffixes=("", "_league")
)

# Rename for clarity
match_league.rename(columns={"name": "league_name"}, inplace=True)
match_league.drop(columns=["id_league"], inplace=True)

# ---------- CREATE H/D/A LABEL ----------
def result_from_goals(row):
    if row["home_team_goal"] > row["away_team_goal"]:
        return "H"
    elif row["home_team_goal"] < row["away_team_goal"]:
        return "A"
    else:
        return "D"

match_league["result"] = match_league.apply(result_from_goals, axis=1)

# ---------- BASE COLUMNS ----------
base_cols = [
    "id", "country_id", "league_id", "league_name",
    "season", "stage", "date",
    "match_api_id", "home_team_api_id", "away_team_api_id",
    "home_team_goal", "away_team_goal",
    "result",
    "B365H", "B365D", "B365A"
]

df = match_league[base_cols].copy()

# ---------- IMPLIED PROBABILITIES (MARGIN-ADJUSTED) ----------
for col in ["B365H", "B365D", "B365A"]:
    df[f"inv_{col}"] = 1.0 / df[col]

df["b365_inv_sum"] = df[["inv_B365H", "inv_B365D", "inv_B365A"]].sum(axis=1)

df["b365_prob_H"] = df["inv_B365H"] / df["b365_inv_sum"]
df["b365_prob_D"] = df["inv_B365D"] / df["b365_inv_sum"]
df["b365_prob_A"] = df["inv_B365A"] / df["b365_inv_sum"]

# ---------- CLEANUP ----------
df_before = len(df)
df = df.dropna(subset=["B365H", "B365D", "B365A"])
df_after = len(df)
print(f"Matches before B365 filter: {df_before}")
print(f"Matches after B365 filter: {df_after}")

df["date"] = pd.to_datetime(df["date"])
df = df.sort_values("date").reset_index(drop=True)

# ---------- PREVIEW ----------
print("\nPreview of final DataFrame (first 5 rows):")
display(df.head())

print("DataFrame columns:")
print(df.columns.tolist())

# Close connection
conn.close()

Available leagues:


Unnamed: 0,id,name
0,1,Belgium Jupiler League
1,1729,England Premier League
2,4769,France Ligue 1
3,7809,Germany 1. Bundesliga
4,10257,Italy Serie A
5,13274,Netherlands Eredivisie
6,15722,Poland Ekstraklasa
7,17642,Portugal Liga ZON Sagres
8,19694,Scotland Premier League
9,21518,Spain LIGA BBVA


Selected leagues:


Unnamed: 0,id,country_id,name
1,1729,1729,England Premier League
2,4769,4769,France Ligue 1


Matches before B365 filter: 6080
Matches after B365 filter: 6076

Preview of final DataFrame (first 5 rows):


Unnamed: 0,id,country_id,league_id,league_name,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,...,B365H,B365D,B365A,inv_B365H,inv_B365D,inv_B365A,b365_inv_sum,b365_prob_H,b365_prob_D,b365_prob_A
0,4770,4769,4769,France Ligue 1,2008/2009,1,2008-08-09,483130,9827,7819,...,1.57,3.6,6.5,0.636943,0.277778,0.153846,1.068567,0.596072,0.259954,0.143974
1,4771,4769,4769,France Ligue 1,2008/2009,1,2008-08-09,483131,9746,9831,...,2.3,3.0,3.4,0.434783,0.333333,0.294118,1.062234,0.40931,0.313804,0.276886
2,4772,4769,4769,France Ligue 1,2008/2009,1,2008-08-09,483132,8682,8689,...,2.1,3.1,3.8,0.47619,0.322581,0.263158,1.061929,0.44842,0.303769,0.247811
3,4774,4769,4769,France Ligue 1,2008/2009,1,2008-08-09,483134,9829,9847,...,2.4,3.1,3.1,0.416667,0.322581,0.322581,1.061828,0.392405,0.303797,0.303797
4,4775,4769,4769,France Ligue 1,2008/2009,1,2008-08-09,483135,8481,8639,...,2.15,3.1,3.6,0.465116,0.322581,0.277778,1.065475,0.436534,0.302758,0.260708


DataFrame columns:
['id', 'country_id', 'league_id', 'league_name', 'season', 'stage', 'date', 'match_api_id', 'home_team_api_id', 'away_team_api_id', 'home_team_goal', 'away_team_goal', 'result', 'B365H', 'B365D', 'B365A', 'inv_B365H', 'inv_B365D', 'inv_B365A', 'b365_inv_sum', 'b365_prob_H', 'b365_prob_D', 'b365_prob_A']


# Phase 2 - Team form features

For each match and team (home/away), compute rolling stats over the last $N$ matches played **before** that match:
- average goals scored and conceded
- average points (3/1/0)
- win rate

In [3]:
matches = df.copy()
N = 5  # form window

matches_small = matches[[
    "id", "date",
    "home_team_api_id", "away_team_api_id",
    "home_team_goal", "away_team_goal",
    "result"
]].copy()

def compute_team_form(features_df, team_col, prefix):
    """
    Compute rolling form features for a team identified by `team_col`.
    Returns match id plus prefixed form columns.
    """
    rows = []

    for side in ["home", "away"]:
        if side == "home":
            team_id_col = "home_team_api_id"
            opp_id_col = "away_team_api_id"
            gf_col = "home_team_goal"
            ga_col = "away_team_goal"
        else:
            team_id_col = "away_team_api_id"
            opp_id_col = "home_team_api_id"
            gf_col = "away_team_goal"
            ga_col = "home_team_goal"

        tmp = features_df[[
            "id", "date",
            team_id_col, opp_id_col,
            gf_col, ga_col
        ]].copy()

        tmp.rename(columns={
            team_id_col: "team_id",
            opp_id_col: "opponent_id",
            gf_col: "goals_for",
            ga_col: "goals_against"
        }, inplace=True)

        rows.append(tmp)

    long_df = pd.concat(rows, ignore_index=True)

    long_df["points"] = np.where(
        long_df["goals_for"] > long_df["goals_against"], 3,
        np.where(long_df["goals_for"] < long_df["goals_against"], 0, 1)
    )
    long_df["win_flag"] = (long_df["goals_for"] > long_df["goals_against"]).astype(int)

    long_df = long_df.sort_values(["team_id", "date"]).reset_index(drop=True)

    long_df["gf_shift"] = long_df["goals_for"].shift(1)
    long_df["ga_shift"] = long_df["goals_against"].shift(1)
    long_df["pts_shift"] = long_df["points"].shift(1)
    long_df["win_shift"] = long_df["win_flag"].shift(1)

    long_df[["roll_goals_for",
             "roll_goals_against",
             "roll_points",
             "roll_win_rate"]] = (
        long_df
        .groupby("team_id")[["gf_shift", "ga_shift", "pts_shift", "win_shift"]]
        .rolling(N, min_periods=1)
        .mean()
        .reset_index(level=0, drop=True)
        .rename(columns={
            "gf_shift": "roll_goals_for",
            "ga_shift": "roll_goals_against",
            "pts_shift": "roll_points",
            "win_shift": "roll_win_rate"
        })
    )

    form = long_df[[
        "id", "team_id",
        "roll_goals_for",
        "roll_goals_against",
        "roll_points",
        "roll_win_rate"
    ]].copy()

    form.rename(columns={
        "team_id": team_col,
        "roll_goals_for": f"{prefix}_roll_goals_for",
        "roll_goals_against": f"{prefix}_roll_goals_against",
        "roll_points": f"{prefix}_roll_points",
        "roll_win_rate": f"{prefix}_roll_win_rate",
    }, inplace=True)

    return form

home_form = compute_team_form(matches_small, "home_team_api_id", "home")
away_form = compute_team_form(matches_small, "away_team_api_id", "away")

df_with_form = (
    df
    .merge(home_form, how="left", on=["id", "home_team_api_id"])
    .merge(away_form, how="left", on=["id", "away_team_api_id"])
)

print("Preview of DataFrame with form features (first 5 rows):")
display(df_with_form.head())

print("New columns added:")
new_cols = [c for c in df_with_form.columns if c not in df.columns]
print(new_cols)

Preview of DataFrame with form features (first 5 rows):


Unnamed: 0,id,country_id,league_id,league_name,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,...,b365_prob_D,b365_prob_A,home_roll_goals_for,home_roll_goals_against,home_roll_points,home_roll_win_rate,away_roll_goals_for,away_roll_goals_against,away_roll_points,away_roll_win_rate
0,4770,4769,4769,France Ligue 1,2008/2009,1,2008-08-09,483130,9827,7819,...,0.259954,0.143974,1.0,4.0,0.0,0.0,2.0,1.0,3.0,1.0
1,4771,4769,4769,France Ligue 1,2008/2009,1,2008-08-09,483131,9746,9831,...,0.313804,0.276886,1.0,0.0,3.0,1.0,0.0,4.0,0.0,0.0
2,4772,4769,4769,France Ligue 1,2008/2009,1,2008-08-09,483132,8682,8689,...,0.303769,0.247811,1.0,3.0,0.0,0.0,0.0,2.0,0.0,0.0
3,4774,4769,4769,France Ligue 1,2008/2009,1,2008-08-09,483134,9829,9847,...,0.303797,0.303797,0.0,1.0,0.0,0.0,4.0,1.0,3.0,1.0
4,4775,4769,4769,France Ligue 1,2008/2009,1,2008-08-09,483135,8481,8639,...,0.302758,0.260708,2.0,2.0,1.0,0.0,2.0,3.0,0.0,0.0


New columns added:
['home_roll_goals_for', 'home_roll_goals_against', 'home_roll_points', 'home_roll_win_rate', 'away_roll_goals_for', 'away_roll_goals_against', 'away_roll_points', 'away_roll_win_rate']


# Phase 3 - Add `Team_Attributes`

For each match and team (home/away), attach the latest `Team_Attributes` record with `date` <= match date.
Then create `home_*`, `away_*`, and optional home-away differences.

In [4]:
matches = df_with_form.copy()
matches["date"] = pd.to_datetime(matches["date"])

# Load Team_Attributes
conn = sqlite3.connect(DB_PATH)
team_attr = pd.read_sql("SELECT * FROM Team_Attributes", conn)
conn.close()

team_attr["date"] = pd.to_datetime(team_attr["date"])

keep_cols = [
    "team_api_id", "date",
    "buildUpPlaySpeed",
    "buildUpPlayPassing",
    "chanceCreationPassing",
    "chanceCreationCrossing",
    "chanceCreationShooting",
    "defencePressure",
    "defenceAggression",
    "defenceTeamWidth"
]
team_attr_small = team_attr[keep_cols].copy()

def attach_team_attr_side(matches_df, team_attr_df, team_col, prefix):
    """
    Attach the most recent team attributes with date <= match date.
    """
    side = matches_df[["id", "date", team_col]].rename(
        columns={team_col: "team_api_id", "date": "match_date"}
    ).copy()

    merged = side.merge(team_attr_df, on="team_api_id", how="left")
    merged = merged[merged["date"] <= merged["match_date"]]

    merged.sort_values(["id", "date"], inplace=True)
    idx = merged.groupby("id")["date"].idxmax()
    best = merged.loc[idx].copy()

    attr_cols = [
        "buildUpPlaySpeed",
        "buildUpPlayPassing",
        "chanceCreationPassing",
        "chanceCreationCrossing",
        "chanceCreationShooting",
        "defencePressure",
        "defenceAggression",
        "defenceTeamWidth"
    ]
    keep = ["id"] + attr_cols
    best = best[keep].copy()

    rename_map = {
        "buildUpPlaySpeed": f"{prefix}_buildUpPlaySpeed",
        "buildUpPlayPassing": f"{prefix}_buildUpPlayPassing",
        "chanceCreationPassing": f"{prefix}_chanceCreationPassing",
        "chanceCreationCrossing": f"{prefix}_chanceCreationCrossing",
        "chanceCreationShooting": f"{prefix}_chanceCreationShooting",
        "defencePressure": f"{prefix}_defencePressure",
        "defenceAggression": f"{prefix}_defenceAggression",
        "defenceTeamWidth": f"{prefix}_defenceTeamWidth",
    }
    best.rename(columns=rename_map, inplace=True)

    return best

home_attr = attach_team_attr_side(matches, team_attr_small, "home_team_api_id", "home")
away_attr = attach_team_attr_side(matches, team_attr_small, "away_team_api_id", "away")

print("Preview home_attr:")
display(home_attr.head())

print("Preview away_attr:")
display(away_attr.head())

# Merge into matches
matches_with_attr = (
    matches
    .merge(home_attr, how="left", on="id")
    .merge(away_attr, how="left", on="id")
)

# Home-away differences
for col in [
    "buildUpPlaySpeed",
    "buildUpPlayPassing",
    "chanceCreationPassing",
    "chanceCreationCrossing",
    "chanceCreationShooting",
    "defencePressure",
    "defenceAggression",
    "defenceTeamWidth",
]:
    h_col = f"home_{col}"
    a_col = f"away_{col}"
    diff_col = f"diff_{col}"
    if h_col in matches_with_attr.columns and a_col in matches_with_attr.columns:
        matches_with_attr[diff_col] = matches_with_attr[h_col] - matches_with_attr[a_col]

print("Preview of final DataFrame with Team_Attributes (first 5 rows):")
display(matches_with_attr.head())

print("New columns added by Team_Attributes:")
new_cols_attr = [c for c in matches_with_attr.columns if c not in df_with_form.columns]
print(new_cols_attr)

Preview home_attr:


Unnamed: 0,id,home_buildUpPlaySpeed,home_buildUpPlayPassing,home_chanceCreationPassing,home_chanceCreationCrossing,home_chanceCreationShooting,home_defencePressure,home_defenceAggression,home_defenceTeamWidth
7856,2228,70,60,55,70,70,45,55,45
7731,2239,70,70,70,70,50,35,70,35
7850,2240,58,30,31,70,50,30,70,30
7607,2245,70,65,70,70,70,65,70,70
7589,2246,60,70,70,70,55,35,70,35


Preview away_attr:


Unnamed: 0,id,away_buildUpPlaySpeed,away_buildUpPlayPassing,away_chanceCreationPassing,away_chanceCreationCrossing,away_chanceCreationShooting,away_defencePressure,away_defenceAggression,away_defenceTeamWidth
7857,2228,60,65,60,70,45,40,70,40
7727,2239,70,59,65,70,50,30,70,30
7851,2240,70,70,70,70,70,70,70,70
7608,2245,65,70,70,70,55,35,70,35
7590,2246,55,70,70,70,45,35,70,35


Preview of final DataFrame with Team_Attributes (first 5 rows):


Unnamed: 0,id,country_id,league_id,league_name,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,...,away_defenceAggression,away_defenceTeamWidth,diff_buildUpPlaySpeed,diff_buildUpPlayPassing,diff_chanceCreationPassing,diff_chanceCreationCrossing,diff_chanceCreationShooting,diff_defencePressure,diff_defenceAggression,diff_defenceTeamWidth
0,4770,4769,4769,France Ligue 1,2008/2009,1,2008-08-09,483130,9827,7819,...,,,,,,,,,,
1,4771,4769,4769,France Ligue 1,2008/2009,1,2008-08-09,483131,9746,9831,...,,,,,,,,,,
2,4772,4769,4769,France Ligue 1,2008/2009,1,2008-08-09,483132,8682,8689,...,,,,,,,,,,
3,4774,4769,4769,France Ligue 1,2008/2009,1,2008-08-09,483134,9829,9847,...,,,,,,,,,,
4,4775,4769,4769,France Ligue 1,2008/2009,1,2008-08-09,483135,8481,8639,...,,,,,,,,,,


New columns added by Team_Attributes:
['home_buildUpPlaySpeed', 'home_buildUpPlayPassing', 'home_chanceCreationPassing', 'home_chanceCreationCrossing', 'home_chanceCreationShooting', 'home_defencePressure', 'home_defenceAggression', 'home_defenceTeamWidth', 'away_buildUpPlaySpeed', 'away_buildUpPlayPassing', 'away_chanceCreationPassing', 'away_chanceCreationCrossing', 'away_chanceCreationShooting', 'away_defencePressure', 'away_defenceAggression', 'away_defenceTeamWidth', 'diff_buildUpPlaySpeed', 'diff_buildUpPlayPassing', 'diff_chanceCreationPassing', 'diff_chanceCreationCrossing', 'diff_chanceCreationShooting', 'diff_defencePressure', 'diff_defenceAggression', 'diff_defenceTeamWidth']


# Phase 4 - Features, temporal split, and multinomial LogReg

Prepare data, split by season, and train a multinomial logistic regression to predict $P(H), P(D), P(A)$.

In [5]:
data = matches_with_attr.copy()

# ---------- 1) Target encoding ----------
y = data["result"].copy()

# ---------- 2) Numeric features ----------
feature_cols = [
    # Bet365 odds
    "B365H", "B365D", "B365A",
    # Market implied probabilities
    "b365_prob_H", "b365_prob_D", "b365_prob_A",
    # Team form
    "home_roll_goals_for", "home_roll_goals_against",
    "home_roll_points", "home_roll_win_rate",
    "away_roll_goals_for", "away_roll_goals_against",
    "away_roll_points", "away_roll_win_rate",
    # Team attributes
    "home_buildUpPlaySpeed", "home_buildUpPlayPassing",
    "home_chanceCreationPassing", "home_chanceCreationCrossing",
    "home_chanceCreationShooting", "home_defencePressure",
    "home_defenceAggression", "home_defenceTeamWidth",
    "away_buildUpPlaySpeed", "away_buildUpPlayPassing",
    "away_chanceCreationPassing", "away_chanceCreationCrossing",
    "away_chanceCreationShooting", "away_defencePressure",
    "away_defenceAggression", "away_defenceTeamWidth",
    # Home-away differences
    "diff_buildUpPlaySpeed", "diff_buildUpPlayPassing",
    "diff_chanceCreationPassing", "diff_chanceCreationCrossing",
    "diff_chanceCreationShooting", "diff_defencePressure",
    "diff_defenceAggression", "diff_defenceTeamWidth",
]

feature_cols = [c for c in feature_cols if c in data.columns]

X = data[feature_cols].copy()

# ---------- 3) Missing values ----------
X = X.fillna(X.median(numeric_only=True))

# ---------- 4) Temporal split ----------
def season_start_year(season_str):
    return int(season_str.split("/")[0])

data["season_start"] = data["season"].apply(season_start_year)

train_mask = data["season_start"] <= 2011
val_mask   = data["season_start"] == 2012
test_mask  = data["season_start"] >= 2013

X_train, y_train = X[train_mask], y[train_mask]
X_val,   y_val   = X[val_mask],   y[val_mask]
X_test,  y_test  = X[test_mask],  y[test_mask]

print("Train size:", X_train.shape)
print("Val size  :", X_val.shape)
print("Test size :", X_test.shape)

# ---------- 5) Standardization ----------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled   = scaler.transform(X_val)
X_test_scaled  = scaler.transform(X_test)

# ---------- 6) Multinomial Logistic Regression ----------
logreg = LogisticRegression(solver="lbfgs", max_iter=1000)
logreg.fit(X_train_scaled, y_train)

# ---------- 7) Evaluation ----------
proba_val = logreg.predict_proba(X_val_scaled)
proba_test = logreg.predict_proba(X_test_scaled)

y_val_pred = logreg.predict(X_val_scaled)
y_test_pred = logreg.predict(X_test_scaled)

ll_val = log_loss(y_val, proba_val, labels=logreg.classes_)
ll_test = log_loss(y_test, proba_test, labels=logreg.classes_)

acc_val = accuracy_score(y_val, y_val_pred)
acc_test = accuracy_score(y_test, y_test_pred)

print("\n=== Logistic Regression (multinomial) ===")
print("Classes:", logreg.classes_)
print(f"Validation log-loss: {ll_val:.4f}")
print(f"Test log-loss      : {ll_test:.4f}")
print(f"Validation accuracy: {acc_val:.4f}")
print(f"Test accuracy      : {acc_test:.4f}")

print("\nConfusion matrix (test):")
print(confusion_matrix(y_test, y_test_pred, labels=logreg.classes_))

# ---------- 8) Store test probabilities ----------
proba_test_df = pd.DataFrame(
    proba_test,
    columns=[f"model_prob_{cls}" for cls in logreg.classes_],
    index=X_test.index
)

data_with_proba = data.copy()
for cls, col_name in zip(logreg.classes_, proba_test_df.columns):
    data_with_proba.loc[proba_test_df.index, col_name] = proba_test_df[col_name]

print("\nPreview of model probabilities on test matches:")
cols_show = ["season", "league_name", "date", "result"] + list(proba_test_df.columns)
display(data_with_proba[test_mask][cols_show].head())

Train size: (3037, 38)
Val size  : (760, 38)
Test size : (2279, 38)

=== Logistic Regression (multinomial) ===
Classes: ['A' 'D' 'H']
Validation log-loss: 0.9935
Test log-loss      : 0.9984
Validation accuracy: 0.5105
Test accuracy      : 0.5195

Confusion matrix (test):
[[310   2 369]
 [152   1 428]
 [142   2 873]]

Preview of model probabilities on test matches:


Unnamed: 0,season,league_name,date,result,model_prob_A,model_prob_D,model_prob_H
3797,2013/2014,France Ligue 1,2013-08-09,D,0.518068,0.227228,0.254704
3798,2013/2014,France Ligue 1,2013-08-10,H,0.213557,0.307733,0.47871
3799,2013/2014,France Ligue 1,2013-08-10,A,0.403874,0.295828,0.300297
3800,2013/2014,France Ligue 1,2013-08-10,D,0.264048,0.275661,0.460291
3801,2013/2014,France Ligue 1,2013-08-10,H,0.117419,0.358546,0.524035


# Phase 5 - Betting embedding and EV

From `data_with_proba` (test set), build a bet-level table with:
- model probability $p_{o,t}$
- bookmaker odds $c_{o,t}$
- $EV = p_{o,t} \cdot c_{o,t} - 1$

In [6]:
# Start from data_with_proba (Phase 4)
data_ev = data_with_proba.copy()

# Keep test set only
test_mask = data_ev["season_start"] >= 2013
test_df = data_ev[test_mask].copy()

print("Model probability columns:", [c for c in test_df.columns if c.startswith("model_prob_")])

# Mapping outcomes -> model probability columns
col_map = {
    "A": "model_prob_A",
    "D": "model_prob_D",
    "H": "model_prob_H",
}

# Odds columns
odds_map = {
    "A": "B365A",
    "D": "B365D",
    "H": "B365H",
}

# ---------- Build candidate bets ----------
rows = []

for outcome, prob_col in col_map.items():
    odds_col = odds_map[outcome]

    tmp = test_df[[
        "id", "date", "season", "league_name",
        "home_team_api_id", "away_team_api_id",
        "result",
        odds_col,
        prob_col
    ]].copy()

    tmp.rename(columns={
        odds_col: "odds",
        prob_col: "model_prob"
    }, inplace=True)

    tmp["outcome"] = outcome

    # EV = p * c - 1
    tmp["EV"] = tmp["model_prob"] * tmp["odds"] - 1.0

    rows.append(tmp)

bets = pd.concat(rows, ignore_index=True)

print("\nPreview of bets (first 5 rows):")
display(bets.head())

print("EV summary (test):")
print(bets["EV"].describe())

Model probability columns: ['model_prob_A', 'model_prob_D', 'model_prob_H']

Preview of bets (first 5 rows):


Unnamed: 0,id,date,season,league_name,home_team_api_id,away_team_api_id,result,odds,model_prob,outcome,EV
0,6675,2013-08-09,2013/2014,France Ligue 1,10249,9847,D,1.75,0.518068,A,-0.093381
1,6676,2013-08-10,2013/2014,France Ligue 1,9830,7794,H,3.4,0.213557,A,-0.273906
2,6670,2013-08-10,2013/2014,France Ligue 1,9827,9829,A,2.5,0.403874,A,0.009685
3,6671,2013-08-10,2013/2014,France Ligue 1,4087,9874,D,3.6,0.264048,A,-0.049426
4,6673,2013-08-10,2013/2014,France Ligue 1,8639,8689,H,6.0,0.117419,A,-0.295487


EV summary (test):
count    6837.000000
mean       -0.058125
std         0.162423
min        -0.812236
25%        -0.146138
50%        -0.048502
75%         0.038637
max         0.755790
Name: EV, dtype: float64


## 5.2 Prepare team x team matrices (market vs model)

For a given league, compute:
- $M^{market}_{ij}$: average market home-win probability for team $i$ vs $j$
- $M^{model}_{ij}$: average model home-win probability for team $i$ vs $j$

We aggregate by `(home_team_api_id, away_team_api_id)` first.

In [7]:
# Test matches only
test_df = test_df.copy()
test_df["model_prob_home"] = test_df["model_prob_H"]

pair_stats = (
    test_df
    .groupby(["league_name", "home_team_api_id", "away_team_api_id"], as_index=False)
    .agg({
        "b365_prob_H": "mean",
        "model_prob_home": "mean",
        "id": "count"
    })
    .rename(columns={"id": "num_matches"})
)

print("Preview of pair_stats (first 5 rows):")
display(pair_stats.head())

Preview of pair_stats (first 5 rows):


Unnamed: 0,league_name,home_team_api_id,away_team_api_id,b365_prob_H,model_prob_home,num_matches
0,England Premier League,8191,8197,0.369841,0.473983,1
1,England Premier League,8191,8455,0.108359,0.096473,1
2,England Premier League,8191,8456,0.12973,0.225524,1
3,England Premier League,8191,8466,0.177121,0.222743,1
4,England Premier League,8191,8472,0.390011,0.462554,1


# Phase 6 - Betting strategies and optimization

Value-bet rules (applied to `bets`):
1. R1: $EV > 0$ (flat stake).
2. R2: $EV > EV_{min}$ (e.g., 0.05).
3. R3: only the market favorite per match, then $EV > 0$.
4. R4: home-only with $EV > 0$ (optional).

Next: backtest flat-stake bankroll, then a simple linear optimization on selected bets.

In [8]:
# Start from bets (Phase 5.1)
bets_all = bets.copy()

# ---------- R1: EV > 0 ----------
value_R1 = bets_all[bets_all["EV"] > 0].copy()
print("R1 - Number of value bets (EV>0):", len(value_R1))

# ---------- R2: EV > EV_MIN ----------
EV_MIN = 0.05
value_R2 = bets_all[bets_all["EV"] > EV_MIN].copy()
print("R2 - Number of value bets (EV>0.05):", len(value_R2))

# ---------- R3: only match favorite ----------
bets_all["is_favorite"] = bets_all.groupby("id")["odds"].transform(
    lambda s: s == s.min()
)

favorites = bets_all[bets_all["is_favorite"]].copy()
value_R3 = favorites[favorites["EV"] > 0].copy()
print("R3 - Number of favorite value bets (EV>0):", len(value_R3))

print("\nPreview of value_R1 (first 5 rows):")
display(value_R1.head())

R1 - Number of value bets (EV>0): 2407
R2 - Number of value bets (EV>0.05): 1525
R3 - Number of favorite value bets (EV>0): 973

Preview of value_R1 (first 5 rows):


Unnamed: 0,id,date,season,league_name,home_team_api_id,away_team_api_id,result,odds,model_prob,outcome,EV
2,6670,2013-08-10,2013/2014,France Ligue 1,9827,9829,A,2.5,0.403874,A,0.009685
11,6783,2013-08-17,2013/2014,France Ligue 1,9831,9851,H,3.2,0.333422,A,0.066951
15,6788,2013-08-17,2013/2014,France Ligue 1,9941,9827,D,3.25,0.366234,A,0.190261
17,6785,2013-08-17,2013/2014,France Ligue 1,9837,8639,H,2.1,0.47894,A,0.005773
22,3634,2013-08-17,2013/2014,England Premier League,9850,8668,D,2.4,0.461569,A,0.107766


## 3. Team x Team matrices $M^{market}$ and $M^{model}$

Using `pair_stats`, build square matrices for one league:
- $M^{market}[i,j]$: average market home-win probability
- $M^{model}[i,j]$: average model home-win probability

In [9]:
pairs = pair_stats.copy()

def build_matrices_for_league(pairs_df, league_name):
    """
    Build M_market and M_model for a given league.
    Rows/columns are ordered by sorted team_api_id.
    """
    league_pairs = pairs_df[pairs_df["league_name"] == league_name].copy()
    if league_pairs.empty:
        raise ValueError(f"No data for league: {league_name}")

    teams = pd.unique(
        pd.concat([
            league_pairs["home_team_api_id"],
            league_pairs["away_team_api_id"]
        ], ignore_index=True)
    )
    teams = np.sort(teams)

    n = len(teams)
    M_market = np.full((n, n), np.nan)
    M_model  = np.full((n, n), np.nan)
    count_mat = np.zeros((n, n), dtype=int)

    team_to_idx = {team_id: idx for idx, team_id in enumerate(teams)}

    for _, row in league_pairs.iterrows():
        i = team_to_idx[row["home_team_api_id"]]
        j = team_to_idx[row["away_team_api_id"]]

        M_market[i, j] = row["b365_prob_H"]
        M_model[i, j]  = row["model_prob_home"]
        count_mat[i, j] = row["num_matches"]

    M_market_df = pd.DataFrame(M_market, index=teams, columns=teams)
    M_model_df  = pd.DataFrame(M_model, index=teams, columns=teams)
    count_df    = pd.DataFrame(count_mat, index=teams, columns=teams)

    return M_market_df, M_model_df, count_df

# Example: France Ligue 1
league_name = "France Ligue 1"
M_market_L1, M_model_L1, count_L1 = build_matrices_for_league(pairs, league_name)

print("Market matrix (France Ligue 1):")
display(M_market_L1.head())

print("Model matrix (France Ligue 1):")
display(M_model_L1.head())

print("Match count matrix (France Ligue 1):")
display(count_L1.head())

Market matrix (France Ligue 1):


Unnamed: 0,4087,6391,7794,7819,8121,8550,8576,8588,8592,8639,...,9831,9837,9847,9851,9853,9873,9874,9941,10242,10249
4087,,,0.480098,0.394153,,0.394402,0.543394,0.448534,0.25426,0.270854,...,0.467899,0.451121,0.107223,0.373552,0.248012,0.436534,0.436534,0.369839,,0.37237
6391,,,0.500255,0.391244,0.415704,,,,0.264403,0.298995,...,0.279661,0.464441,0.12651,0.27112,0.28777,,,0.306709,0.57002,0.396166
7794,0.441762,0.452718,,0.370865,0.382716,0.430888,0.53922,0.482239,0.273015,0.277684,...,0.410026,0.419297,0.102658,0.358081,0.256661,0.447154,0.418836,0.343729,0.518263,0.364903
7819,0.540711,0.544629,0.488809,,0.529265,0.499881,,0.475951,0.281561,0.329347,...,0.440605,0.44751,0.140846,0.35462,0.326809,,,0.426574,0.634039,0.427315
8121,,0.437415,0.454046,0.38247,,,,,0.279829,0.329412,...,0.413495,0.38247,0.095068,0.330396,0.307785,,,0.201425,0.454046,0.32669


Model matrix (France Ligue 1):


Unnamed: 0,4087,6391,7794,7819,8121,8550,8576,8588,8592,8639,...,9831,9837,9847,9851,9853,9873,9874,9941,10242,10249
4087,,,0.451096,0.414975,,0.375741,0.538458,0.427785,0.280417,0.281872,...,0.427554,0.426867,0.133896,0.377924,0.280132,0.434243,0.460291,0.373965,,0.392373
6391,,,0.481415,0.441704,0.467267,,,,0.268651,0.413878,...,0.310771,0.471565,0.115235,0.34791,0.313515,,,0.3302,0.564559,0.40603
7794,0.423205,0.470723,,0.360528,0.416573,0.4008,0.558785,0.469259,0.282675,0.303894,...,0.374366,0.406878,0.097234,0.375358,0.27,0.467895,0.437131,0.354063,0.48926,0.365157
7819,0.520404,0.506497,0.467003,,0.532288,0.464859,,0.45016,0.308326,0.375009,...,0.43148,0.438567,0.154264,0.385705,0.332116,,,0.42194,0.645899,0.405722
8121,,0.44956,0.408857,0.392849,,,,,0.214493,0.399666,...,0.394849,0.39803,0.076405,0.356274,0.336226,,,0.21518,0.458773,0.339742


Match count matrix (France Ligue 1):


Unnamed: 0,4087,6391,7794,7819,8121,8550,8576,8588,8592,8639,...,9831,9837,9847,9851,9853,9873,9874,9941,10242,10249
4087,0,0,2,1,0,1,1,1,2,2,...,2,2,2,2,2,1,1,2,0,2
6391,0,0,1,1,1,0,0,0,1,1,...,1,1,1,1,1,0,0,1,1,1
7794,2,1,0,2,1,1,1,1,3,3,...,3,3,3,3,3,1,1,3,1,3
7819,1,1,2,0,1,1,0,1,2,2,...,2,2,2,2,2,0,0,2,1,2
8121,0,1,1,1,0,0,0,0,1,1,...,1,1,1,1,1,0,0,1,1,1


Interpretation:
- `M_market_L1.loc[i, j]`: market home-win probability for team `i` vs `j`.
- `M_model_L1.loc[i, j]`: model home-win probability.
- `count_L1.loc[i, j]`: number of observed matches.

Next: backtest simple strategies (R1/R2) and solve a linear allocation on the same bets.

In [10]:
def backtest_flat_stake(bets_subset, stake=1.0, initial_bankroll=100.0):
    """
    bets_subset must contain:
    - 'result' (actual outcome: 'H','D','A')
    - 'outcome' (bet outcome)
    - 'odds' (decimal odds)
    """
    df = bets_subset.copy().reset_index(drop=True)
    bankroll = initial_bankroll
    bankroll_history = [bankroll]

    gains = []

    for _, row in df.iterrows():
        if row["result"] == row["outcome"]:
            gain = stake * (row["odds"] - 1.0)
        else:
            gain = -stake

        bankroll += gain
        bankroll_history.append(bankroll)
        gains.append(gain)

    bankroll_history = np.array(bankroll_history)
    gains = np.array(gains)

    n_bets = len(df)
    total_gain = bankroll - initial_bankroll
    roi = total_gain / initial_bankroll if initial_bankroll > 0 else np.nan

    cum_max = np.maximum.accumulate(bankroll_history)
    drawdown = (bankroll_history - cum_max) / cum_max
    max_drawdown = drawdown.min()

    return {
        "n_bets": n_bets,
        "initial_bankroll": initial_bankroll,
        "final_bankroll": bankroll,
        "total_gain": total_gain,
        "ROI": roi,
        "avg_gain_per_bet": gains.mean() if n_bets > 0 else np.nan,
        "max_drawdown": max_drawdown,
        "bankroll_history": bankroll_history,
    }

# ---------- Backtest R1 and R2 ----------
bt_R1 = backtest_flat_stake(value_R1, stake=1.0, initial_bankroll=100.0)
bt_R2 = backtest_flat_stake(value_R2, stake=1.0, initial_bankroll=100.0)

print("=== Backtest R1 (EV>0, flat stake 1) ===")
for k, v in bt_R1.items():
    if k != "bankroll_history":
        print(f"{k}: {v}")

print("\n=== Backtest R2 (EV>0.05, flat stake 1) ===")
for k, v in bt_R2.items():
    if k != "bankroll_history":
        print(f"{k}: {v}")

=== Backtest R1 (EV>0, flat stake 1) ===
n_bets: 2407
initial_bankroll: 100.0
final_bankroll: -85.62000000000016
total_gain: -185.62000000000018
ROI: -1.8562000000000018
avg_gain_per_bet: -0.07711674283340257
max_drawdown: -1.9185358986391379

=== Backtest R2 (EV>0.05, flat stake 1) ===
n_bets: 1525
initial_bankroll: 100.0
final_bankroll: 5.189999999999854
total_gain: -94.81000000000014
ROI: -0.9481000000000015
avg_gain_per_bet: -0.06217049180327869
max_drawdown: -1.0322057064753132


## 5. Linear optimization (convex portfolio)

Select stakes $w_i$ for a subset of value bets to maximize expected value.
Constraints: total budget $\sum_i w_i \le B$, and per-bet cap $0 \le w_i \le w_{max}$.

In [11]:
# Example: value bets R2 in France Ligue 1
opt_universe = value_R2[value_R2["league_name"] == "France Ligue 1"].copy()

print("Optimization universe size (R2, Ligue 1):", len(opt_universe))
display(opt_universe.head())

Optimization universe size (R2, Ligue 1): 607


Unnamed: 0,id,date,season,league_name,home_team_api_id,away_team_api_id,result,odds,model_prob,outcome,EV
11,6783,2013-08-17,2013/2014,France Ligue 1,9831,9851,H,3.2,0.333422,A,0.066951
15,6788,2013-08-17,2013/2014,France Ligue 1,9941,9827,D,3.25,0.366234,A,0.190261
47,6889,2013-08-25,2013/2014,France Ligue 1,8576,9831,D,2.9,0.373045,A,0.081832
229,6715,2013-11-09,2013/2014,France Ligue 1,9847,9831,H,15.0,0.079775,A,0.196618
263,6732,2013-11-30,2013/2014,France Ligue 1,8689,9831,H,4.2,0.301305,A,0.265481


### 5.2 Solve with `linprog`

We maximize $c^T w$ with $c_i = EV_i$, so we minimize $-c^T w$.
Constraints: $\sum_i w_i \le B$ and $0 \le w_i \le w_{max}$.

In [12]:
# Keep required columns only
opt_df = opt_universe[["id", "date", "odds", "EV"]].copy().reset_index(drop=True)

n = len(opt_df)
if n == 0:
    print("\nNo bets available for optimization.")
else:
    EV_vec = opt_df["EV"].values

    # Minimize -EV^T w
    c = -EV_vec

    # Sum of stakes <= budget
    BUDGET = 100.0
    A_ub = np.ones((1, n))
    b_ub = np.array([BUDGET])

    # Bounds per bet
    W_MAX = 5.0
    bounds = [(0.0, W_MAX)] * n

    res = linprog(
        c=c,
        A_ub=A_ub, b_ub=b_ub,
        bounds=bounds,
        method="highs"
    )

    if not res.success:
        print("\nLinear optimization did not converge:", res.message)
    else:
        w_opt = res.x
        total_stake = w_opt.sum()
        expected_return = (EV_vec * w_opt).sum()

        print("=== Linear optimization result (Ligue 1, R2) ===")
        print("Number of bets:", n)
        print("Total budget:", BUDGET)
        print("Sum of chosen stakes:", total_stake)
        print("Expected total gain (sum EV_i * w_i):", expected_return)

        opt_df["w_opt"] = w_opt
        top_bets = opt_df.sort_values("w_opt", ascending=False).head(10)
        print("\nTop 10 bets by optimal stake:")
        display(top_bets)

=== Linear optimization result (Ligue 1, R2) ===
Number of bets: 607
Total budget: 100.0
Sum of chosen stakes: 100.0
Expected total gain (sum EV_i * w_i): 36.96656467678757

Top 10 bets by optimal stake:


Unnamed: 0,id,date,odds,EV,w_opt
160,7589,2016-02-02,9.0,0.315494,5.0
172,7630,2016-02-27,4.2,0.360405,5.0
162,7596,2016-02-03,21.0,0.332274,5.0
535,7340,2015-05-09,5.0,0.320344,5.0
112,7762,2015-09-13,8.5,0.47119,5.0
205,7002,2013-09-15,3.4,0.394075,5.0
420,6740,2013-12-04,8.5,0.345872,5.0
37,7415,2014-09-27,7.0,0.322312,5.0
189,7710,2016-04-24,4.2,0.321983,5.0
134,7808,2015-11-04,4.2,0.360934,5.0


**Quick takeaway**
- Simple value-bet rules are unprofitable on this dataset, despite decent classification accuracy.
- This is a useful business insight: accuracy does not imply betting profitability.

**What to report**
- LogReg: log-loss ~1.00, accuracy ~0.52; EV distribution is negative on average.
- R1/R2 backtests: strongly negative ROI and large drawdowns.
- Linear optimization allocates to the largest EVs but relies on model EV being correct.

# Phase 7 - Model improvements

In [13]:
data = matches_with_attr.copy()
data["season_start"] = data["season"].apply(lambda s: int(s.split("/")[0]))

# Labels as strings
y_str = data["result"]

# Encode labels for XGBoost
le = LabelEncoder()
y_enc = le.fit_transform(y_str)
print("Class mapping:", dict(zip(le.classes_, le.transform(le.classes_))))

# Features without odds (baseline)
feature_no_odds = [
    "home_roll_goals_for", "home_roll_goals_against",
    "home_roll_points", "home_roll_win_rate",
    "away_roll_goals_for", "away_roll_goals_against",
    "away_roll_points", "away_roll_win_rate",
    "home_buildUpPlaySpeed", "home_buildUpPlayPassing",
    "home_chanceCreationPassing", "home_chanceCreationCrossing",
    "home_chanceCreationShooting", "home_defencePressure",
    "home_defenceAggression", "home_defenceTeamWidth",
    "away_buildUpPlaySpeed", "away_buildUpPlayPassing",
    "away_chanceCreationPassing", "away_chanceCreationCrossing",
    "away_chanceCreationShooting", "away_defencePressure",
    "away_defenceAggression", "away_defenceTeamWidth",
    "diff_buildUpPlaySpeed", "diff_buildUpPlayPassing",
    "diff_chanceCreationPassing", "diff_chanceCreationCrossing",
    "diff_chanceCreationShooting", "diff_defencePressure",
    "diff_defenceAggression", "diff_defenceTeamWidth",
]

feature_no_odds = [c for c in feature_no_odds if c in data.columns]

# Features with odds (market signals)
feature_with_odds = feature_no_odds + [
    "B365H", "B365D", "B365A",
    "b365_prob_H", "b365_prob_D", "b365_prob_A",
]
feature_with_odds = [c for c in feature_with_odds if c in data.columns]

def make_splits(feature_cols):
    X = data[feature_cols].copy()
    X = X.fillna(X.median(numeric_only=True))

    train_mask = data["season_start"] <= 2011
    val_mask   = data["season_start"] == 2012
    test_mask  = data["season_start"] >= 2013

    return (
        X[train_mask], y_enc[train_mask],
        X[val_mask],   y_enc[val_mask],
        X[test_mask],  y_enc[test_mask],
        test_mask
    )

Class mapping: {'A': np.int64(0), 'D': np.int64(1), 'H': np.int64(2)}


### 1.2 XGBoost without odds vs with odds

In [14]:
def eval_xgb(feature_cols, label):
    X_train, y_train, X_val, y_val, X_test, y_test, test_mask = make_splits(feature_cols)

    xgb = XGBClassifier(
        objective="multi:softprob",
        num_class=3,
        eval_metric="mlogloss",
        tree_method="hist",
        max_depth=4,
        n_estimators=300,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )
    xgb.fit(X_train, y_train)

    proba_val = xgb.predict_proba(X_val)
    proba_test = xgb.predict_proba(X_test)
    y_val_pred = xgb.predict(X_val)
    y_test_pred = xgb.predict(X_test)

    classes_enc = xgb.classes_
    classes_str = le.inverse_transform(classes_enc)

    ll_val = log_loss(y_val, proba_val, labels=classes_enc)
    ll_test = log_loss(y_test, proba_test, labels=classes_enc)
    acc_val = accuracy_score(y_val, y_val_pred)
    acc_test = accuracy_score(y_test, y_test_pred)

    print(f"=== XGBoost ({label}) ===")
    print("Encoded classes:", classes_enc, "->", classes_str)
    print(f"Validation log-loss: {ll_val:.4f}")
    print(f"Test log-loss      : {ll_test:.4f}")
    print(f"Validation accuracy: {acc_val:.4f}")
    print(f"Test accuracy      : {acc_test:.4f}")

    return xgb, (X_train, y_train, X_val, y_val, X_test, y_test, test_mask), proba_test

# 1) without odds
xgb_no_odds, splits_no_odds, proba_test_no_odds = eval_xgb(feature_no_odds, "no odds")

# 2) with odds
xgb_with_odds, splits_with_odds, proba_test_with_odds = eval_xgb(feature_with_odds, "with odds")

=== XGBoost (no odds) ===
Encoded classes: [0 1 2] -> ['A' 'D' 'H']
Validation log-loss: 1.1013
Test log-loss      : 1.0750
Validation accuracy: 0.4434
Test accuracy      : 0.4546
=== XGBoost (with odds) ===
Encoded classes: [0 1 2] -> ['A' 'D' 'H']
Validation log-loss: 1.0462
Test log-loss      : 1.0319
Validation accuracy: 0.4987
Test accuracy      : 0.4976


## 2. XGBoost calibration (isotonic)

Calibrate the XGBoost model (with odds) using CV on train+val, then evaluate on test.

In [15]:
# ---------- 1) Splits for calibration ----------
data = matches_with_attr.copy()
data["season_start"] = data["season"].apply(lambda s: int(s.split("/")[0]))

y_enc_all = y_enc

feature_with_odds = [c for c in feature_with_odds if c in data.columns]
X_all = data[feature_with_odds].copy()
X_all = X_all.fillna(X_all.median(numeric_only=True))

train_mask = data["season_start"] <= 2011
val_mask   = data["season_start"] == 2012
test_mask  = data["season_start"] >= 2013

trainval_mask = train_mask | val_mask

X_trainval = X_all[trainval_mask]
y_trainval = y_enc_all[trainval_mask]

X_test = X_all[test_mask]
y_test = y_enc_all[test_mask]

print("Train+val size:", X_trainval.shape)
print("Test size     :", X_test.shape)

# ---------- 2) Base XGBoost ----------
xgb_base = XGBClassifier(
    objective="multi:softprob",
    num_class=3,
    eval_metric="mlogloss",
    tree_method="hist",
    max_depth=4,
    n_estimators=300,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

calib = CalibratedClassifierCV(
    estimator=xgb_base,
    method="isotonic",
    cv=3
)

calib.fit(X_trainval, y_trainval)

# ---------- 3) Test evaluation ----------
proba_test_cal = calib.predict_proba(X_test)
y_test_pred_cal = calib.predict(X_test)

classes_enc = calib.classes_
classes_str = le.inverse_transform(classes_enc)

ll_test_cal = log_loss(y_test, proba_test_cal, labels=classes_enc)
acc_test_cal = accuracy_score(y_test, y_test_pred_cal)

print("\n=== XGBoost + isotonic calibration (train+val) ===")
print("Encoded classes:", classes_enc, "->", classes_str)
print(f"Test log-loss (calibrated): {ll_test_cal:.4f}")
print(f"Test accuracy (calibrated): {acc_test_cal:.4f}")

Train+val size: (3797, 38)
Test size     : (2279, 38)

=== XGBoost + isotonic calibration (train+val) ===
Encoded classes: [0 1 2] -> ['A' 'D' 'H']
Test log-loss (calibrated): 1.0156
Test accuracy (calibrated): 0.4901


## 3. Rebuild bets and backtests with calibrated XGB

In [16]:
# ---------- 2.1 Inject calibrated probabilities ----------
data_xgb = data.copy()

proba_test_df_xgb = pd.DataFrame(
    proba_test_cal,
    columns=[f"xgb_cal_prob_{cls}" for cls in classes_enc],
    index=X_test.index
)

enc_to_label = dict(zip(classes_enc, classes_str))
rename_cols = {
    f"xgb_cal_prob_{enc}": f"xgb_cal_prob_{enc_to_label[enc]}"
    for enc in classes_enc
}
proba_test_df_xgb.rename(columns=rename_cols, inplace=True)

for col in proba_test_df_xgb.columns:
    data_xgb.loc[proba_test_df_xgb.index, col] = proba_test_df_xgb[col]

print("Preview calibrated XGB probabilities on test matches:")
cols_show = ["season", "league_name", "date", "result"] + list(proba_test_df_xgb.columns)
display(data_xgb[test_mask][cols_show].head())

# ---------- 2.2 Build bets_xgb ----------
test_df_xgb = data_xgb[test_mask].copy()

col_map_xgb = {
    "A": "xgb_cal_prob_A",
    "D": "xgb_cal_prob_D",
    "H": "xgb_cal_prob_H",
}
odds_map = {
    "A": "B365A",
    "D": "B365D",
    "H": "B365H",
}

rows = []
for outcome, prob_col in col_map_xgb.items():
    odds_col = odds_map[outcome]
    tmp = test_df_xgb[[
        "id", "date", "season", "league_name",
        "home_team_api_id", "away_team_api_id",
        "result",
        odds_col,
        prob_col
    ]].copy()
    tmp.rename(columns={odds_col: "odds", prob_col: "model_prob"}, inplace=True)
    tmp["outcome"] = outcome
    tmp["EV"] = tmp["model_prob"] * tmp["odds"] - 1.0
    rows.append(tmp)

bets_xgb = pd.concat(rows, ignore_index=True)

print("EV summary (XGB calibrated):")
print(bets_xgb["EV"].describe())

# ---------- 2.3 Value bets and backtests ----------
value_R1_xgb = bets_xgb[bets_xgb["EV"] > 0].copy()
value_R2_xgb = bets_xgb[bets_xgb["EV"] > 0.05].copy()

print("\nNumber of XGB value bets (EV>0):", len(value_R1_xgb))
print("Number of XGB value bets (EV>0.05):", len(value_R2_xgb))

bt_R1_xgb = backtest_flat_stake(value_R1_xgb, stake=1.0, initial_bankroll=100.0)
bt_R2_xgb = backtest_flat_stake(value_R2_xgb, stake=1.0, initial_bankroll=100.0)

print("\n=== Backtest XGB calibrated R1 (EV>0) ===")
for k, v in bt_R1_xgb.items():
    if k != "bankroll_history":
        print(f"{k}: {v}")

print("\n=== Backtest XGB calibrated R2 (EV>0.05) ===")
for k, v in bt_R2_xgb.items():
    if k != "bankroll_history":
        print(f"{k}: {v}")

Preview calibrated XGB probabilities on test matches:


Unnamed: 0,season,league_name,date,result,xgb_cal_prob_A,xgb_cal_prob_D,xgb_cal_prob_H
3797,2013/2014,France Ligue 1,2013-08-09,D,0.369162,0.402187,0.228651
3798,2013/2014,France Ligue 1,2013-08-10,H,0.258358,0.389907,0.351735
3799,2013/2014,France Ligue 1,2013-08-10,A,0.312524,0.395215,0.292261
3800,2013/2014,France Ligue 1,2013-08-10,D,0.251546,0.305435,0.443019
3801,2013/2014,France Ligue 1,2013-08-10,H,0.210321,0.380174,0.409506


EV summary (XGB calibrated):
count    6837.000000
mean        0.037486
std         0.296986
min        -0.639604
25%        -0.154657
50%        -0.029468
75%         0.178409
max         3.880642
Name: EV, dtype: float64

Number of XGB value bets (EV>0): 3054
Number of XGB value bets (EV>0.05): 2575

=== Backtest XGB calibrated R1 (EV>0) ===
n_bets: 3054
initial_bankroll: 100.0
final_bankroll: -78.90000000000015
total_gain: -178.90000000000015
ROI: -1.7890000000000015
avg_gain_per_bet: -0.058578912901113306
max_drawdown: -1.6552896382920872

=== Backtest XGB calibrated R2 (EV>0.05) ===
n_bets: 2575
initial_bankroll: 100.0
final_bankroll: -55.52000000000005
total_gain: -155.52000000000004
ROI: -1.5552000000000004
avg_gain_per_bet: -0.06039611650485437
max_drawdown: -1.5186507635037612


**Summary**
- XGB with odds improves log-loss vs no-odds, but ROI remains negative.
- Isotonic calibration improves log-loss yet still overestimates EV.

**Interpretation**
- The market signal dominates; model errors create false value bets.
- With this dataset, a robust positive edge is unlikely without richer features.

Next: try conservative filters and a confidence threshold.

In [17]:
bets_conv = bets_xgb.copy()

# Parameters to test
EV_MIN = 0.20
ODDS_MIN = 1.5
ODDS_MAX = 5.0
LEAGUES_KEEP = ["France Ligue 1", "England Premier League"]

mask = (
    (bets_conv["EV"] > EV_MIN) &
    (bets_conv["odds"] >= ODDS_MIN) &
    (bets_conv["odds"] <= ODDS_MAX) &
    (bets_conv["league_name"].isin(LEAGUES_KEEP))
)

bets_ultra = bets_conv[mask].copy()
print("Number of ultra-conservative bets:", len(bets_ultra))
display(bets_ultra.head())

bt_ultra = backtest_flat_stake(bets_ultra, stake=1.0, initial_bankroll=100.0)
print("=== Backtest XGB calibrated ultra-conservative ===")
for k, v in bt_ultra.items():
    if k != "bankroll_history":
        print(f"{k}: {v}")

Number of ultra-conservative bets: 955


Unnamed: 0,id,date,season,league_name,home_team_api_id,away_team_api_id,result,odds,model_prob,outcome,EV
37,3747,2013-08-24,2013/2014,England Premier League,10194,9826,H,5.0,0.246781,A,0.233904
78,7006,2013-09-14,2013/2014,France Ligue 1,9830,9874,H,4.33,0.285221,A,0.235006
90,7017,2013-09-20,2013/2014,France Ligue 1,9853,9941,A,5.0,0.260032,A,0.300162
99,3965,2013-09-21,2013/2014,England Premier League,10261,8667,A,5.0,0.259932,A,0.299659
219,3641,2013-11-03,2013/2014,England Premier League,8668,8586,D,3.0,0.420431,A,0.261294


=== Backtest XGB calibrated ultra-conservative ===
n_bets: 955
initial_bankroll: 100.0
final_bankroll: 37.929999999999886
total_gain: -62.070000000000114
ROI: -0.6207000000000011
avg_gain_per_bet: -0.06499476439790575
max_drawdown: -0.8541649012795535


We can vary `EV_MIN`, `ODDS_MAX`, and optionally restrict to one league.

## 2. Confidence filter vs market probability

Keep bets where the model strongly disagrees with the market.

In [18]:
# Merge bets_xgb with market probabilities
test_df = data_xgb[data_xgb["season_start"] >= 2013].copy()

market_prob_map = {
    "A": "b365_prob_A",
    "D": "b365_prob_D",
    "H": "b365_prob_H",
}

bets_conf = bets_conv.copy()

# Build long market-prob table and merge by id + outcome
test_long = []
for outcome, m_col in market_prob_map.items():
    tmp = test_df[["id", m_col]].copy()
    tmp["outcome"] = outcome
    tmp.rename(columns={m_col: "market_prob"}, inplace=True)
    test_long.append(tmp)
test_long = pd.concat(test_long, ignore_index=True)

bets_conf = bets_conv.merge(
    test_long[["id", "outcome", "market_prob"]],
    on=["id", "outcome"],
    how="left"
)

bets_conf["prob_diff"] = bets_conf["model_prob"] - bets_conf["market_prob"]
bets_conf["abs_prob_diff"] = bets_conf["prob_diff"].abs()

print("Preview of bets_conf:")
display(bets_conf.head())

Preview of bets_conf:


Unnamed: 0,id,date,season,league_name,home_team_api_id,away_team_api_id,result,odds,model_prob,outcome,EV,market_prob,prob_diff,abs_prob_diff
0,6675,2013-08-09,2013/2014,France Ligue 1,10249,9847,D,1.75,0.369162,A,-0.353966,0.535211,-0.166049,0.166049
1,6676,2013-08-10,2013/2014,France Ligue 1,9830,7794,H,3.4,0.258358,A,-0.121584,0.277171,-0.018813,0.018813
2,6670,2013-08-10,2013/2014,France Ligue 1,9827,9829,A,2.5,0.312524,A,-0.218689,0.375,-0.062476,0.062476
3,6671,2013-08-10,2013/2014,France Ligue 1,4087,9874,D,3.6,0.251546,A,-0.094436,0.260708,-0.009162,0.009162
4,6673,2013-08-10,2013/2014,France Ligue 1,8639,8689,H,6.0,0.210321,A,0.261925,0.156977,0.053344,0.053344


### Strategy: EV + confidence

In [19]:
# Parameters
EV_MIN = 0.10
DELTA_MIN = 0.10
ODDS_MIN = 1.5
ODDS_MAX = 5.0

mask_conf = (
    (bets_conf["EV"] > EV_MIN) &
    (bets_conf["abs_prob_diff"] > DELTA_MIN) &
    (bets_conf["odds"] >= ODDS_MIN) &
    (bets_conf["odds"] <= ODDS_MAX)
)

bets_conf_sel = bets_conf[mask_conf].copy()
print("Number of EV+confidence bets:", len(bets_conf_sel))

bt_conf = backtest_flat_stake(bets_conf_sel, stake=1.0, initial_bankroll=100.0)
print("\n=== Backtest XGB calibrated EV+confidence ===")
for k, v in bt_conf.items():
    if k != "bankroll_history":
        print(f"{k}: {v}")

Number of EV+confidence bets: 551

=== Backtest XGB calibrated EV+confidence ===
n_bets: 551
initial_bankroll: 100.0
final_bankroll: 61.22999999999993
total_gain: -38.77000000000007
ROI: -0.38770000000000066
avg_gain_per_bet: -0.07036297640653358
max_drawdown: -0.6333753744285042


Test several pairs `(EV_MIN, DELTA_MIN)` and compare ROI (often with very few bets).

## 3. Targeted feature: form x market interaction

In [20]:
data_int = matches_with_attr.copy()
data_int["season_start"] = data_int["season"].apply(lambda s: int(s.split("/")[0]))

# Home vs away form edge
data_int["home_form_edge"] = data_int["home_roll_win_rate"] - data_int["away_roll_win_rate"]

# Interaction: form edge x market prob
data_int["home_form_x_market"] = data_int["home_form_edge"] * data_int["b365_prob_H"]

X_int = data_int[feature_with_odds + ["home_form_edge", "home_form_x_market"]].copy()
X_int = X_int.fillna(X_int.median(numeric_only=True))

train_mask = data_int["season_start"] <= 2011
val_mask   = data_int["season_start"] == 2012
test_mask  = data_int["season_start"] >= 2013

X_train_int, y_train_int = X_int[train_mask], y_enc[train_mask]
X_val_int,   y_val_int   = X_int[val_mask],   y_enc[val_mask]
X_test_int,  y_test_int  = X_int[test_mask],  y_enc[test_mask]

xgb_int = XGBClassifier(
    objective="multi:softprob",
    num_class=3,
    eval_metric="mlogloss",
    tree_method="hist",
    max_depth=4,
    n_estimators=300,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
xgb_int.fit(X_train_int, y_train_int)

proba_test_int = xgb_int.predict_proba(X_test_int)
y_test_pred_int = xgb_int.predict(X_test_int)

ll_test_int = log_loss(y_test_int, proba_test_int, labels=xgb_int.classes_)
acc_test_int = accuracy_score(y_test_int, y_test_pred_int)

print("=== XGB with form x odds interactions (uncalibrated) ===")
print(f"Test log-loss: {ll_test_int:.4f}")
print(f"Test accuracy: {acc_test_int:.4f}")

=== XGB with form x odds interactions (uncalibrated) ===
Test log-loss: 1.0292
Test accuracy: 0.4958


Final attempts confirm the same pattern: calibration improves log-loss, but ROI stays negative.
This is a valid outcome for the report: the pipeline works, but the market is hard to beat.

# Phase 9 - Tables and figures for the report

In [21]:
# From df (match-level, 2 leagues, with result)
table_overview = (
    df
    .groupby("league_name")
    .agg(
        n_matches=("id", "count"),
        season_min=("season", lambda s: s.min()),
        season_max=("season", lambda s: s.max()),
        home_win_pct=("result", lambda r: (r == "H").mean()),
        draw_pct=("result", lambda r: (r == "D").mean()),
        away_win_pct=("result", lambda r: (r == "A").mean()),
    )
    .reset_index()
)

for col in ["home_win_pct", "draw_pct", "away_win_pct"]:
    table_overview[col] = (100 * table_overview[col]).round(1)

table_overview.to_csv("results/table_overview_dataset.csv", index=False)
table_overview.head()

Unnamed: 0,league_name,n_matches,season_min,season_max,home_win_pct,draw_pct,away_win_pct
0,England Premier League,3040,2008/2009,2015/2016,45.7,25.8,28.5
1,France Ligue 1,3036,2008/2009,2015/2016,44.7,28.3,27.1


### Table 2 - Form feature descriptions

In [22]:
form_features = [
    "home_roll_goals_for", "home_roll_goals_against",
    "home_roll_points", "home_roll_win_rate",
    "away_roll_goals_for", "away_roll_goals_against",
    "away_roll_points", "away_roll_win_rate",
]

descriptions = [
    "Avg goals scored (home) over last N matches",
    "Avg goals conceded (home) over last N matches",
    "Avg points (home, 3/1/0) over last N matches",
    "Win rate (home) over last N matches",
    "Avg goals scored (away) over last N matches",
    "Avg goals conceded (away) over last N matches",
    "Avg points (away, 3/1/0) over last N matches",
    "Win rate (away) over last N matches",
]

table_features_form = pd.DataFrame({
    "feature": form_features,
    "description": descriptions
})

table_features_form.to_csv("results/table_features_form.csv", index=False)
table_features_form

Unnamed: 0,feature,description
0,home_roll_goals_for,Avg goals scored (home) over last N matches
1,home_roll_goals_against,Avg goals conceded (home) over last N matches
2,home_roll_points,"Avg points (home, 3/1/0) over last N matches"
3,home_roll_win_rate,Win rate (home) over last N matches
4,away_roll_goals_for,Avg goals scored (away) over last N matches
5,away_roll_goals_against,Avg goals conceded (away) over last N matches
6,away_roll_points,"Avg points (away, 3/1/0) over last N matches"
7,away_roll_win_rate,Win rate (away) over last N matches


### Table 3 - Team_Attributes descriptions

In [23]:
team_attr_features = [
    "buildUpPlaySpeed", "buildUpPlayPassing",
    "chanceCreationPassing", "chanceCreationCrossing",
    "chanceCreationShooting",
    "defencePressure", "defenceAggression", "defenceTeamWidth"
]

descriptions_attr = [
    "Build-up play speed",
    "Build-up play passing quality",
    "Chance creation via passing",
    "Chance creation via crossing",
    "Chance creation via shooting",
    "Defensive pressure intensity",
    "Defensive aggression",
    "Defensive team width",
]

table_team_attributes = pd.DataFrame({
    "attribute": team_attr_features,
    "description": descriptions_attr
})

table_team_attributes.to_csv("results/table_team_attributes.csv", index=False)
table_team_attributes

Unnamed: 0,attribute,description
0,buildUpPlaySpeed,Build-up play speed
1,buildUpPlayPassing,Build-up play passing quality
2,chanceCreationPassing,Chance creation via passing
3,chanceCreationCrossing,Chance creation via crossing
4,chanceCreationShooting,Chance creation via shooting
5,defencePressure,Defensive pressure intensity
6,defenceAggression,Defensive aggression
7,defenceTeamWidth,Defensive team width


### Table 4 - Baselines (naive, market)

In [24]:
# From data (matches_with_attr with result and b365_prob_*)
test_mask = data["season_start"] >= 2013
y_test = data.loc[test_mask, "result"]

# Baseline 1: always predict H
y_pred_naive = np.array(["H"] * len(y_test))

# Baseline 2: most likely outcome by Bet365
probs_mkt = data.loc[test_mask, ["b365_prob_A", "b365_prob_D", "b365_prob_H"]].values
labels = np.array(["A", "D", "H"])
y_pred_market = labels[probs_mkt.argmax(axis=1)]

ll_market = log_loss(
    y_test,
    probs_mkt,
    labels=labels
)

acc_naive = accuracy_score(y_test, y_pred_naive)
acc_market = accuracy_score(y_test, y_pred_market)

table_baselines = pd.DataFrame([
    {"model": "Always Home (naive)", "log_loss_test": np.nan, "accuracy_test": acc_naive},
    {"model": "Market (Bet365 probs)", "log_loss_test": ll_market, "accuracy_test": acc_market},
]).round(4)

table_baselines.to_csv("results/table_baselines.csv", index=False)
table_baselines

Unnamed: 0,model,log_loss_test,accuracy_test
0,Always Home (naive),,0.4462
1,Market (Bet365 probs),0.9851,0.5287


### Table 5 - LogReg performance

In [25]:
# Assumes ll_val, ll_test, acc_val, acc_test were computed for LogReg
table_logreg = pd.DataFrame([{
    "model": "Logistic Regression (multinomial)",
    "log_loss_val": ll_val,
    "log_loss_test": ll_test,
    "accuracy_val": acc_val,
    "accuracy_test": acc_test,
}]).round(4)

table_logreg.to_csv("results/table_logreg_performance.csv", index=False)
table_logreg

Unnamed: 0,model,log_loss_val,log_loss_test,accuracy_val,accuracy_test
0,Logistic Regression (multinomial),0.9935,0.9984,0.5105,0.5195


### Table 6 - LogReg vs XGB (no odds / with odds)

In [26]:
table_models = pd.DataFrame([
    {
        "model": "LogReg + odds + form + attributes",
        "log_loss_test": ll_test,
        "accuracy_test": acc_test,
    },
    {
        "model": "XGBoost no odds",
        "log_loss_test": 1.0750,
        "accuracy_test": 0.4546,
    },
    {
        "model": "XGBoost with odds",
        "log_loss_test": 1.0319,
        "accuracy_test": 0.4976,
    },
]).round(4)

table_models.to_csv("results/table_models_comparison.csv", index=False)
table_models

Unnamed: 0,model,log_loss_test,accuracy_test
0,LogReg + odds + form + attributes,0.9984,0.5195
1,XGBoost no odds,1.075,0.4546
2,XGBoost with odds,1.0319,0.4976


### Table 7 - XGB before / after calibration

In [27]:
table_xgb_calib = pd.DataFrame([
    {
        "model": "XGB with odds (uncalibrated)",
        "log_loss_test": 1.0319,
        "accuracy_test": 0.4976,
    },
    {
        "model": "XGB with odds (isotonic calibrated)",
        "log_loss_test": 1.0156,
        "accuracy_test": 0.4901,
    },
])

table_xgb_calib.to_csv("results/table_xgb_calibration.csv", index=False)
table_xgb_calib

Unnamed: 0,model,log_loss_test,accuracy_test
0,XGB with odds (uncalibrated),1.0319,0.4976
1,XGB with odds (isotonic calibrated),1.0156,0.4901


### Table 8 - EV stats (LogReg)

In [28]:
# bets = LogReg bets (Phase 5.1)
ev_desc_logreg = bets["EV"].describe().to_frame(name="value").reset_index().round(4)
ev_desc_logreg.rename(columns={"index": "stat"}, inplace=True)

ev_desc_logreg.to_csv("results/table_ev_logreg.csv", index=False)
ev_desc_logreg

Unnamed: 0,stat,value
0,count,6837.0
1,mean,-0.0581
2,std,0.1624
3,min,-0.8122
4,25%,-0.1461
5,50%,-0.0485
6,75%,0.0386
7,max,0.7558


### Table 9 - EV stats (XGB calibrated)

In [29]:
# bets_xgb = bets from calibrated XGB
ev_desc_xgb = bets_xgb["EV"].describe().to_frame(name="value").reset_index().round(4)
ev_desc_xgb.rename(columns={"index": "stat"}, inplace=True)

ev_desc_xgb.to_csv("results/table_ev_xgb_calibrated.csv", index=False)
ev_desc_xgb

Unnamed: 0,stat,value
0,count,6837.0
1,mean,0.0375
2,std,0.297
3,min,-0.6396
4,25%,-0.1547
5,50%,-0.0295
6,75%,0.1784
7,max,3.8806


### Table 10 - LogReg strategies (R1/R2/R3)

In [30]:
# Reuse backtest_flat_stake + value_R1, value_R2, value_R3
bt_R1 = backtest_flat_stake(value_R1, stake=1.0, initial_bankroll=100.0)
bt_R2 = backtest_flat_stake(value_R2, stake=1.0, initial_bankroll=100.0)
bt_R3 = backtest_flat_stake(value_R3, stake=1.0, initial_bankroll=100.0)

def bt_to_row(name, bt):
    return {
        "strategy": name,
        "n_bets": bt["n_bets"],
        "final_bankroll": bt["final_bankroll"],
        "ROI": bt["ROI"],
        "max_drawdown": bt["max_drawdown"],
        "avg_gain_per_bet": bt["avg_gain_per_bet"],
    }

table_strat_logreg = pd.DataFrame([
    bt_to_row("LogReg R1 (EV>0)", bt_R1),
    bt_to_row("LogReg R2 (EV>0.05)", bt_R2),
    bt_to_row("LogReg R3 (favorite, EV>0)", bt_R3),
]).round(4)

table_strat_logreg.to_csv("results/table_strategies_logreg.csv", index=False)
table_strat_logreg

Unnamed: 0,strategy,n_bets,final_bankroll,ROI,max_drawdown,avg_gain_per_bet
0,LogReg R1 (EV>0),2407,-85.62,-1.8562,-1.9185,-0.0771
1,LogReg R2 (EV>0.05),1525,5.19,-0.9481,-1.0322,-0.0622
2,"LogReg R3 (favorite, EV>0)",973,68.19,-0.3181,-0.4331,-0.0327


### Table 11 - XGB calibrated strategies

In [31]:
# Reuse bets_ultra, bets_conf_sel (or recompute with chosen filters)
bt_R1_xgb = backtest_flat_stake(value_R1_xgb, stake=1.0, initial_bankroll=100.0)
bt_R2_xgb = backtest_flat_stake(value_R2_xgb, stake=1.0, initial_bankroll=100.0)
bt_ultra = backtest_flat_stake(bets_ultra, stake=1.0, initial_bankroll=100.0)
bt_conf = backtest_flat_stake(bets_conf_sel, stake=1.0, initial_bankroll=100.0)

table_strat_xgb = pd.DataFrame([
    bt_to_row("XGB calib R1 (EV>0)", bt_R1_xgb),
    bt_to_row("XGB calib R2 (EV>0.05)", bt_R2_xgb),
    bt_to_row("XGB calib ultra-conservative", bt_ultra),
    bt_to_row("XGB calib EV+confidence", bt_conf),
]).round(4)

table_strat_xgb.to_csv("results/table_strategies_xgb_calibrated.csv", index=False)
table_strat_xgb

Unnamed: 0,strategy,n_bets,final_bankroll,ROI,max_drawdown,avg_gain_per_bet
0,XGB calib R1 (EV>0),3054,-78.9,-1.789,-1.6553,-0.0586
1,XGB calib R2 (EV>0.05),2575,-55.52,-1.5552,-1.5187,-0.0604
2,XGB calib ultra-conservative,955,37.93,-0.6207,-0.8542,-0.065
3,XGB calib EV+confidence,551,61.23,-0.3877,-0.6334,-0.0704


## Figures

### Figure 2 - Outcome distribution (H/D/A)

In [32]:
plt.figure(figsize=(6,4))
sns.countplot(x="result", data=df, order=["H","D","A"])
plt.xlabel("Match result")
plt.ylabel("Number of matches")
plt.title("Outcome distribution (H/D/A)")
plt.tight_layout()
plt.savefig("results/fig_distribution_results.png", dpi=300)
plt.close()

### Figure 3 - B365H vs b365_prob_H distribution

In [33]:
plt.figure(figsize=(10,4))

plt.subplot(1,2,1)
sns.histplot(df["B365H"], bins=30, kde=False)
plt.xlabel("B365H odds")
plt.title("Distribution of B365H odds")

plt.subplot(1,2,2)
sns.histplot(df["b365_prob_H"], bins=30, kde=False)
plt.xlabel("Implied probability b365_prob_H")
plt.title("Distribution of implied probabilities")

plt.tight_layout()
plt.savefig("results/fig_b365_odds_vs_probs.png", dpi=300)
plt.close()

### Figure 4 - Example of home_roll_win_rate over time

In [34]:
# Pick a team with enough matches
team_example = df_with_form["home_team_api_id"].value_counts().idxmax()
df_team = df_with_form[df_with_form["home_team_api_id"] == team_example].copy()
df_team = df_team.sort_values("date")

plt.figure(figsize=(10,4))
plt.plot(df_team["date"], df_team["home_roll_win_rate"], marker="o", linestyle="-")
plt.xlabel("Date")
plt.ylabel("home_roll_win_rate")
plt.title(f"Home form over time (team_api_id={team_example})")
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig("results/fig_home_roll_win_rate_example.png", dpi=300)
plt.close()

### Figure 5 - Correlation heatmap: FIFA attributes vs b365_prob_H

In [35]:
attrs_cols = [
    "home_buildUpPlaySpeed", "home_buildUpPlayPassing",
    "home_chanceCreationPassing", "home_chanceCreationCrossing",
    "home_chanceCreationShooting",
    "home_defencePressure", "home_defenceAggression", "home_defenceTeamWidth",
    "b365_prob_H"
]

corr_df = matches_with_attr[attrs_cols].dropna()
corr = corr_df.corr()

plt.figure(figsize=(8,6))
sns.heatmap(corr, annot=False, cmap="coolwarm", center=0)
plt.title("Correlations: FIFA attributes (home) vs b365_prob_H")
plt.tight_layout()
plt.savefig("results/fig_corr_team_attributes_vs_market.png", dpi=300)
plt.close()

### Figure 6 - LogReg confusion matrix (test)

In [36]:
# Recompute if needed
cm = confusion_matrix(y_test, y_test_pred, labels=logreg.classes_)
cm_df = pd.DataFrame(cm, index=logreg.classes_, columns=logreg.classes_)

plt.figure(figsize=(5,4))
sns.heatmap(cm_df, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion matrix - LogReg (test)")
plt.tight_layout()
plt.savefig("results/fig_confusion_logreg.png", dpi=300)
plt.close()

### Figure 7 - Log-loss by model

In [37]:
# Uses table_models created above
plt.figure(figsize=(6,4))
sns.barplot(
    x="model", y="log_loss_test",
    data=table_models,
    hue="model",
    palette="viridis",
    legend=False
)
plt.xticks(rotation=20, ha="right")
plt.ylabel("Test log-loss")
plt.title("Test log-loss by model")
plt.tight_layout()
plt.savefig("results/fig_logloss_by_model.png", dpi=300)
plt.close()

### Figure 8 - Calibration curve (before/after)

In [38]:
# XGB uncalibrated probabilities for Home win
classes_enc = xgb_with_odds.classes_
classes_str = le.inverse_transform(classes_enc)
idx_H = np.where(classes_str == "H")[0][0]

proba_home_uncal = proba_test_with_odds[:, idx_H]
proba_home_cal = proba_test_cal[:, idx_H]

y_test_H = (y_test == le.transform(["H"])[0]).astype(int)

frac_pos_uncal, mean_pred_uncal = calibration_curve(
    y_test_H, proba_home_uncal, n_bins=10, strategy="uniform"
)
frac_pos_cal, mean_pred_cal = calibration_curve(
    y_test_H, proba_home_cal, n_bins=10, strategy="uniform"
)

plt.figure(figsize=(6,6))
plt.plot([0,1], [0,1], "k--", label="Perfectly calibrated")
plt.plot(mean_pred_uncal, frac_pos_uncal, "o-", label="XGB uncalibrated")
plt.plot(mean_pred_cal, frac_pos_cal, "o-", label="XGB isotonic calibrated")
plt.xlabel("Predicted probability (Home win)")
plt.ylabel("Observed frequency")
plt.title("Calibration curve - XGB (Home win)")
plt.legend(loc="best")
plt.tight_layout()
plt.savefig("results/fig_calibration_xgb.png", dpi=300)
plt.close()

### Figure 9 - EV histograms (LogReg vs XGB calibrated)

In [39]:
plt.figure(figsize=(10,4))

plt.subplot(1,2,1)
sns.histplot(bets["EV"], bins=40, kde=False)
plt.title("EV distribution - LogReg")
plt.xlabel("EV")
plt.ylabel("Number of bets")

plt.subplot(1,2,2)
sns.histplot(bets_xgb["EV"], bins=40, kde=False)
plt.title("EV distribution - XGB calibrated")
plt.xlabel("EV")
plt.ylabel("Number of bets")

plt.tight_layout()
plt.savefig("results/fig_hist_ev_logreg_vs_xgb.png", dpi=300)
plt.close()

### Figure 10 - Heatmap $M^{market}$ vs $M^{model}$ (Ligue 1)

In [40]:
# Use a smaller subset if the matrix is large
teams_L1 = M_market_L1.index[:10]
M_mkt_sub = M_market_L1.loc[teams_L1, teams_L1]
M_mod_sub = M_model_L1.loc[teams_L1, teams_L1]

plt.figure(figsize=(12,5))

plt.subplot(1,2,1)
sns.heatmap(M_mkt_sub, cmap="viridis", vmin=0, vmax=1)
plt.title("M^market (home win prob) - Ligue 1")
plt.xlabel("Away team_id")
plt.ylabel("Home team_id")

plt.subplot(1,2,2)
sns.heatmap(M_mod_sub, cmap="viridis", vmin=0, vmax=1)
plt.title("M^model (home win prob) - Ligue 1")
plt.xlabel("Away team_id")
plt.ylabel("Home team_id")

plt.tight_layout()
plt.savefig("results/fig_heatmap_matrices_ligue1.png", dpi=300)
plt.close()

### Figure 11 - Bankroll curve: LogReg R1 vs no-bet baseline

In [41]:
bt_R1 = backtest_flat_stake(value_R1, stake=1.0, initial_bankroll=100.0)
bankroll_hist = bt_R1["bankroll_history"]

plt.figure(figsize=(8,4))
plt.plot(range(len(bankroll_hist)), bankroll_hist, label="LogReg R1 (EV>0)")
plt.axhline(100.0, color="grey", linestyle="--", label="No-bet baseline")
plt.xlabel("Number of bets")
plt.ylabel("Bankroll")
plt.title("Bankroll over time - LogReg R1 vs no-bet")
plt.legend()
plt.tight_layout()
plt.savefig("results/fig_bankroll_logreg_R1.png", dpi=300)
plt.close()

### Figure 12 - Boxplot of bet gains for XGB strategies

In [42]:
# Build per-bet gain arrays for each strategy
def gains_per_bet(bets_subset, stake=1.0):
    gains = []
    for _, row in bets_subset.iterrows():
        if row["result"] == row["outcome"]:
            gain = stake * (row["odds"] - 1.0)
        else:
            gain = -stake
        gains.append(gain)
    return np.array(gains)

data_box = pd.DataFrame({
    "gain": np.concatenate([
        gains_per_bet(value_R1_xgb),
        gains_per_bet(value_R2_xgb),
        gains_per_bet(bets_ultra),
        gains_per_bet(bets_conf_sel),
    ]),
    "strategy": (
        ["XGB R1 (EV>0)"] * len(value_R1_xgb)
        + ["XGB R2 (EV>0.05)"] * len(value_R2_xgb)
        + ["XGB ultra"] * len(bets_ultra)
        + ["XGB EV+confidence"] * len(bets_conf_sel)
    )
})

plt.figure(figsize=(8,4))
sns.boxplot(x="strategy", y="gain", data=data_box)
plt.axhline(0.0, color="grey", linestyle="--")
plt.ylabel("Gain per bet")
plt.title("Distribution of gains per bet - XGB calibrated")
plt.xticks(rotation=15, ha="right")
plt.tight_layout()
plt.savefig("results/fig_boxplot_gain_per_bet_xgb.png", dpi=300)
plt.close()