In [154]:
# Fetch competition matches from the football-data.org v4 API

import requests

# Read the API token from the local .env file.
token = None
with open(".env") as f:
  for line in f:
    # Split on the first "=" only, so a token that itself contains "=" is kept intact,
    # and a line without "=" is skipped instead of raising IndexError.
    key, sep, value = line.partition("=")
    if sep and "FOOTBALL-DATA-API-KEY" in key:
      token = value.strip()
      break

# Fail here with a clear message instead of a NameError when building the headers.
if not token:
  raise RuntimeError("FOOTBALL-DATA-API-KEY not found in .env")

headers = { "X-Auth-Token": token }

def get_data_by_year(year, timeout=30):
  """Fetch the matches of competition 2021 for one season from football-data.org.

  Args:
    year: Value sent as the `season` query parameter.
    timeout: Seconds to wait for the server before giving up.

  Returns:
    The decoded JSON body of the response.

  Raises:
    requests.HTTPError: If the API answers with a 4xx/5xx status.
  """
  uri = "https://api.football-data.org/v4/competitions/2021/matches"
  response = requests.get(uri, headers=headers, params={"season": year}, timeout=timeout)
  # Surface HTTP errors here instead of returning the error payload as if it were data.
  response.raise_for_status()
  return response.json()

In [190]:
import pandas as pd
import re

def build_data_by_year(year):
    """Return one season's matches as a flat DataFrame.

    Fetches the season through ``get_data_by_year`` and keeps, in this order,
    the columns "Utc Date", "Matchday", "Home", "Away", "Home Score" and
    "Away Score".
    """

    def to_title(name):
        # Put a space in front of each capital, then title-case: "homeScore" -> "Home Score".
        return re.sub("([A-Z])", r" \1", name).title()

    matches = pd.DataFrame(get_data_by_year(year)["matches"])

    tidy = (
        # Keep only the columns we need
        matches[["utcDate", "matchday", "homeTeam", "awayTeam", "score"]]
        .assign(
            # Team cells are mappings; keep only the team name.
            home=lambda m: m["homeTeam"].apply(lambda team: team["name"]),
            away=lambda m: m["awayTeam"].apply(lambda team: team["name"]),
            # Score cells are nested mappings; keep the full-time value per side.
            homeScore=lambda m: m["score"].apply(lambda s: s["fullTime"]["home"]),
            awayScore=lambda m: m["score"].apply(lambda s: s["fullTime"]["away"]),
            # Replaces the existing column in place, so its position is unchanged.
            utcDate=lambda m: pd.to_datetime(m["utcDate"]),
        )
        .drop(columns=["homeTeam", "awayTeam", "score"])
    )

    # Turn the camelCase column names into title case
    tidy.columns = [to_title(col) for col in tidy.columns]

    return tidy

# Get data for the 2022-2023 season (used below to build the Elo ratings and fit the model)
df = build_data_by_year(2022)

# Get data for the 2023-2024 season (used below as the season to simulate)
df_2023 = build_data_by_year(2023)

## Setting up Training Data

In [191]:
# Initialize Elo ratings for every team that appears in the season, home or away.
# Seeding from the "Home" column alone would miss a team that has only played away so far,
# and the match loop would then fail with a KeyError for that team.
elo = {team: 1500 for team in pd.concat([df["Home"], df["Away"]]).unique()}

def update_elo_win(winner_elo, loser_elo, k=40):
    """Return the (winner, loser) ratings after a decisive result.

    The winner gains ``k * (1 - p)`` and the loser gives up the same amount,
    where ``p`` is the winner's expected score on the 400-point logistic curve.
    """
    rating_gap = (loser_elo - winner_elo) / 400
    win_probability = 1.0 / (1 + 10**rating_gap)
    delta = k * (1 - win_probability)
    return winner_elo + delta, loser_elo - delta

def update_elo_draw(home_elo, away_elo, k=40):
    """Return the (home, away) ratings after a draw.

    The home side moves by ``k * (0.5 - p)``, where ``p`` is its expected score
    on the 400-point logistic curve; the away side moves by the opposite amount.
    """
    rating_gap = (away_elo - home_elo) / 400
    home_expectation = 1.0 / (1 + 10**rating_gap)
    delta = k * (0.5 - home_expectation)
    return home_elo + delta, away_elo - delta

# Process matches in DataFrame order and update Elo ratings after each one.
# NOTE(review): Elo is order-dependent; this assumes df is already in chronological order.
home_elo_after, away_elo_after = [], []
for home_team, away_team, home_goals, away_goals in zip(
    df["Home"], df["Away"], df["Home Score"], df["Away Score"]
):
    if pd.isna(home_goals) or pd.isna(away_goals):
        # No full-time score on this row: leave both ratings untouched instead of
        # letting the failed comparisons fall through to the draw branch.
        pass
    elif home_goals > away_goals:  # Home team won
        elo[home_team], elo[away_team] = update_elo_win(elo[home_team], elo[away_team])
    elif away_goals > home_goals:  # Away team won
        elo[away_team], elo[home_team] = update_elo_win(elo[away_team], elo[home_team])
    else:  # Draw
        elo[home_team], elo[away_team] = update_elo_draw(elo[home_team], elo[away_team])

    # Record each side's rating as it stands after this match.
    home_elo_after.append(elo[home_team])
    away_elo_after.append(elo[away_team])

# Assign each column once rather than growing it cell by cell inside the loop.
df["Home Elo"] = pd.Series(home_elo_after, index=df.index, dtype="float64")
df["Away Elo"] = pd.Series(away_elo_after, index=df.index, dtype="float64")

# Determine outcomes: 3 for win, 1 for draw, 0 for loss
home_won = df["Home Score"] > df["Away Score"]
away_won = df["Away Score"] > df["Home Score"]

# Start every row at the draw value, then overwrite the decided matches.
# NOTE(review): comparisons against a missing score are False, so such rows keep 1 — confirm intended.
draw_points = pd.Series(1, index=df.index)
df["Home Outcome"] = draw_points.mask(home_won, 3).mask(away_won, 0)
df["Away Outcome"] = draw_points.mask(away_won, 3).mask(home_won, 0)

In [199]:
def get_season_results(df: pd.DataFrame) -> pd.DataFrame:
    """Build a season table with one row per team.

    For each team, sums "Home Outcome" over its home rows and "Away Outcome"
    over its away rows, and takes the last recorded "Home Elo" / "Away Elo".
    A team missing from one side gets 0 in that side's columns. The table is
    ordered by "Total Outcome" and then "Total Elo", both descending.
    """
    at_home = df.groupby("Home").agg({"Home Outcome": "sum", "Home Elo": "last"})
    on_road = df.groupby("Away").agg({"Away Outcome": "sum", "Away Elo": "last"})

    # Outer join keeps teams that appear on only one side; their gaps become 0.
    table = at_home.join(on_road, how="outer").fillna(0)
    table = table.assign(
        **{
            "Total Outcome": table["Home Outcome"] + table["Away Outcome"],
            "Total Elo": table["Home Elo"] + table["Away Elo"],
        }
    )

    return table.sort_values(by=["Total Outcome", "Total Elo"], ascending=False)

# Season table for `df`, ordered by total points and then by total Elo
results = get_season_results(df)

## Training

In [193]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# For simplicity, let's predict the home outcome based on Elo ratings.
# NOTE(review): "Home Elo" / "Away Elo" are written after each match's rating update, so these
# features already reflect the result being predicted — confirm whether pre-match ratings
# were intended.
RANDOM_STATE = 42  # fixed seed so the split, the forest and the printed accuracy are reproducible

x = df[["Home Elo", "Away Elo"]]
y = df["Home Outcome"]

# Split data into training and validation sets
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.2, random_state=RANDOM_STATE
)

# Standardize the features (statistics come from the training split only)
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_val = scaler.transform(x_val)

# Train the model
model = RandomForestClassifier(
    n_estimators=1000, max_depth=5, min_samples_split=5, random_state=RANDOM_STATE
)
model.fit(x_train, y_train)

# Validate the model
val_score = model.score(x_val, y_val)
print(f"Validation accuracy: {val_score * 100:.2f}%")

Validation accuracy: 69.74%


## Test Against 2023-2024 Season

In [200]:
import warnings
import numpy as np

def simulate_match(home_elo, away_elo, rng=None):
    """Sample a home-side outcome label from the fitted model's predicted probabilities.

    Args:
        home_elo: Current rating of the home team.
        away_elo: Current rating of the away team.
        rng: Optional ``numpy.random.Generator`` for reproducible sampling;
            when omitted, NumPy's global random state is used.

    Returns:
        One of the model's class labels, as a plain ``int``.
    """
    features = [[home_elo, away_elo]]
    # Re-attach the column names the scaler was fitted with (if any), so the
    # feature-name warning is not raised and no blanket warning filter is needed.
    feature_names = getattr(scaler, "feature_names_in_", None)
    if feature_names is not None:
        features = pd.DataFrame(features, columns=feature_names)

    probabilities = model.predict_proba(scaler.transform(features))[0]

    # predict_proba columns follow model.classes_, so sample from those labels
    # rather than from a hard-coded list that may not line up with them.
    choose = np.random.choice if rng is None else rng.choice
    return int(choose(model.classes_, p=probabilities))

In [202]:
# Find teams that have been relegated/promoted by taking a difference of the two dataframes
df_teams = pd.concat([df["Home"], df["Away"]]).unique()
df_2023_teams = pd.concat([df_2023["Home"], df_2023["Away"]]).unique()

relegated_teams = set(df_teams) - set(df_2023_teams)
promoted_teams = set(df_2023_teams) - set(df_teams)
teams_with_baseline = set(df_teams) & set(df_2023_teams)

# Find the average ending ELO rating for the teams that have been relegated
relegated_elo = results.loc[list(relegated_teams), "Total Elo"].mean()

# Set the starting ELO rating for the promoted teams to the average ending ELO rating of the relegated teams
elo = {team: relegated_elo for team in promoted_teams}

# Set the starting ELO rating for the teams that have been in the league for both seasons to their ending ELO rating
elo.update(results.loc[list(teams_with_baseline), "Total Elo"].to_dict())

# Divide elo by 2 to get home and away elo
home_elo = {team: elo / 2 for team, elo in elo.items()}
away_elo = {team: elo / 2 for team, elo in elo.items()}

for index, row in df_2023.iterrows():
    home_team, away_team = row["Home"], row["Away"]

    # Simulate match and update ELO ratings
    outcome = simulate_match(home_elo[home_team], away_elo[away_team])
    
    match outcome:
        case 3:  # Home team won
            home_elo[home_team], away_elo[away_team] = update_elo_win(home_elo[home_team], away_elo[away_team])
            df_2023.at[index, "Home Outcome"] = 3
            df_2023.at[index, "Away Outcome"] = 0
        case 0:  # Away team won
            away_elo[away_team], home_elo[home_team] = update_elo_win(away_elo[away_team], home_elo[home_team])
            df_2023.at[index, "Away Outcome"] = 3
            df_2023.at[index, "Home Outcome"] = 0
        case 1:  # Draw
            home_elo[home_team], away_elo[away_team] = update_elo_draw(home_elo[home_team], away_elo[away_team])
            df_2023.at[index, "Home Outcome"] = 1
            df_2023.at[index, "Away Outcome"] = 1
    
    df_2023.at[index, "Home Elo"] = home_elo[home_team]
    df_2023.at[index, "Away Elo"] = away_elo[away_team]

get_season_results(df_2023)

KeyError: "Column(s) ['Home Outcome'] do not exist"