In [154]:
# Get all matches from 2022-2023 season

import requests

# Get API token from .env file
with open(".env") as f:
  for line in f:
    if "FOOTBALL-DATA-API-KEY" in line:
      token = line.split("=")[1].strip()
      break

headers = { "X-Auth-Token": token }

def get_data_by_year(year):
  uri = f"https://api.football-data.org/v4/competitions/2021/matches?season={year}"
  response = requests.get(uri, headers=headers)
  return response.json()

In [190]:
import pandas as pd
import re

def build_data_by_year(year):
    df = pd.DataFrame(get_data_by_year(year)["matches"])

    # Drop columns we don"t need
    df = df[["utcDate", "matchday", "homeTeam", "awayTeam", "score"]]

    # Parse the homeTeam and awayTeam and get the names or ids from the column
    df["home"] = df["homeTeam"].apply(lambda x: x["name"])
    df["away"] = df["awayTeam"].apply(lambda x: x["name"])
    df = df.drop(columns=["homeTeam", "awayTeam"])

    # Get home team and away team scores from the dataframe, drop score column
    df["homeScore"] = df["score"].apply(lambda x: x["fullTime"]["home"])
    df["awayScore"] = df["score"].apply(lambda x: x["fullTime"]["away"])
    df = df.drop(columns=["score"])

    # Convert utcDate to datetime
    df["utcDate"] = pd.to_datetime(df["utcDate"])

    # Function to convert camel case to title case
    def camel_to_title(camel_str):
        title_str = re.sub("([A-Z])", r" \1", camel_str)
        return title_str.title()

    # Apply the function to each column name
    df.columns = [camel_to_title(col) for col in df.columns]

    return df

# Get data for 2022-2023 season
df = build_data_by_year(2022)

# Get data for the 2023-2024 season
df_2023 = build_data_by_year(2023)

## Setting up Training Data

In [248]:
from simulation_utils import update_elo_win, update_elo_draw

# Initialize elo ratings for each team
elo = {team: 1500 for team in df["Home"].unique()}

# Process matches and update ELO ratings
for index, row in df.iterrows():
    home_team, away_team = row["Home"], row["Away"]
    
    if row["Home Score"] > row["Away Score"]:  # Home team won
        elo[home_team], elo[away_team] = update_elo_win(elo[home_team], elo[away_team])
    elif row["Away Score"] > row["Home Score"]:  # Away team won
        elo[away_team], elo[home_team] = update_elo_win(elo[away_team], elo[home_team])
    else: # Draw
        elo[home_team], elo[away_team] = update_elo_draw(elo[home_team], elo[away_team])
    
    df.at[index, "Home Elo"] = elo[home_team]
    df.at[index, "Away Elo"] = elo[away_team]

# Determine outcomes: 3 for win, 1 for draw, 0 for loss
df["Home Outcome"] = 1
df["Away Outcome"] = 1
df.loc[df["Home Score"] > df["Away Score"], "Home Outcome"] = 3
df.loc[df["Home Score"] > df["Away Score"], "Away Outcome"] = 0
df.loc[df["Away Score"] > df["Home Score"], "Away Outcome"] = 3
df.loc[df["Away Score"] > df["Home Score"], "Home Outcome"] = 0

ImportError: cannot import name 'simulate_match' from 'simulation_utils' (/Users/brianalonso/Documents/4 Archives/PL Predictor/simulation_utils.py)

In [229]:
from simulation_utils import get_season_results

results = get_season_results(df)

## Training

In [235]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# For simplicity, let"s predict the home outcome based on Elo ratings
x = df[["Home Elo", "Away Elo"]]
y = df["Home Outcome"]

# Split data into training and validation sets
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2)

# Standardize the features
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_val = scaler.transform(x_val)

# Train the model
model = RandomForestClassifier(n_estimators=1000, max_depth=5, min_samples_split=5)
model.fit(x_train, y_train)

# Validate the model
val_score = model.score(x_val, y_val)
print(f"Validation accuracy: {val_score * 100:.2f}%")

Validation accuracy: 71.05%


### Build Elo

In [None]:
# Find teams that have been relegated/promoted by taking a difference of the two dataframes
df_teams = pd.concat([df["Home"], df["Away"]]).unique()
df_2023_teams = pd.concat([df_2023["Home"], df_2023["Away"]]).unique()

relegated_teams = set(df_teams) - set(df_2023_teams)
promoted_teams = set(df_2023_teams) - set(df_teams)
teams_with_baseline = set(df_teams) & set(df_2023_teams)

# Find the average ending ELO rating for the teams that have been relegated
relegated_elo = results.loc[list(relegated_teams), "Total Elo"].mean()

# Set the starting ELO rating for the promoted teams to the average ending ELO rating of the relegated teams
elo = {team: relegated_elo for team in promoted_teams}

# Set the starting ELO rating for the teams that have been in the league for both seasons to their ending ELO rating
elo.update(results.loc[list(teams_with_baseline), "Total Elo"].to_dict())

## Test Against 2023-2024 Season

In [243]:
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm

from simulation_utils import simulate_and_get_results

num_simulations = 1000
# num_simulations = 20000

# Initialize a list to store results
seasons = []


# Initialize a pool of workers
with ProcessPoolExecutor() as executor:
    seasons = list(tqdm(executor.map(simulate_and_get_results, range(num_simulations), [df_2023]*num_simulations, [elo]*num_simulations, [model]*num_simulations, [scaler]*num_simulations),  total=num_simulations, desc='Simulating', unit='season'))

# for i in tqdm(range(num_simulations), desc='Simulating', unit='season'):
#     simulated_df = simulate_season(df_2023, elo)
#     results = get_season_results(simulated_df)
#     results["Season"] = i
#     seasons.append(results)

Simulating: 100%|██████████| 1000/1000 [12:00:24<00:00, 43.22s/season]   


### Analyze Results Compared to actual 2023-2024 Season

In [261]:
# Sort results based on total outcome
results = results.sort_values("Total Outcome", ascending=False)

# Get the place each team finished in the league
results["Place"] = range(1, len(results) + 1)

# Get 2023 season results
# Determine outcomes: 3 for win, 1 for draw, 0 for loss
df_2023["Home Outcome"] = 1
df_2023["Away Outcome"] = 1
df_2023.loc[df_2023["Home Score"] > df_2023["Away Score"], "Home Outcome"] = 3
df_2023.loc[df_2023["Home Score"] > df_2023["Away Score"], "Away Outcome"] = 0
df_2023.loc[df_2023["Away Score"] > df_2023["Home Score"], "Away Outcome"] = 3
df_2023.loc[df_2023["Away Score"] > df_2023["Home Score"], "Home Outcome"] = 0

home_results = df_2023.groupby("Home").agg({"Home Outcome": "sum"})
away_results = df_2023.groupby("Away").agg({"Away Outcome": "sum"})
results_2023 = home_results.join(away_results, how="outer").fillna(0)
results_2023["Total Outcome"] = results_2023["Home Outcome"] + results_2023["Away Outcome"]
results_2023 = results_2023.sort_values("Total Outcome", ascending=False)
results_2023["Place"] = range(1, len(results_2023) + 1)

# For each season, get the place each team finished in the league
for index, season_df in enumerate(seasons):
    season_df["Place"] = range(1, len(season_df) + 1)

# Get the average place each team finished in the league
average_results = pd.concat(seasons).groupby("Home").agg({"Place": "mean"}).sort_values("Place")

# Get a mapping of team names to a list places they finished in the league
team_place_mapping = {}
for team in average_results.index:
    team_place_mapping[team] = [season_df.loc[team, "Place"] for season_df in seasons]

# Get the total number of seasons simulated
total_seasons = len(seasons)

# Get a mapping of times each team won the league
team_win_mapping = {}
for team in average_results.index:
    team_win_mapping[team] = sum([season_df.loc[team, "Place"] == 1 for season_df in seasons]) / total_seasons

# Get a mapping of times each team finished in the top 4
team_top_4_mapping = {}
for team in average_results.index:
    team_top_4_mapping[team] = sum([season_df.loc[team, "Place"] <= 4 for season_df in seasons]) / total_seasons

# Get a mapping of times each team finished in the bottom 3
team_bottom_3_mapping = {}
for team in average_results.index:
    team_bottom_3_mapping[team] = sum([season_df.loc[team, "Place"] >= len(season_df) - 3 for season_df in seasons]) / total_seasons

# Build a dataframe with the average place, times won, times in top 4, and times in bottom 3
average_results["Win Premier League"] = [team_win_mapping[team] for team in average_results.index]
average_results["Top 4"] = [team_top_4_mapping[team] for team in average_results.index]
average_results["Bottom 3"] = [team_bottom_3_mapping[team] for team in average_results.index]

# Show the dataframe
average_results

Unnamed: 0_level_0,Place,Win Premier League,Top 4,Bottom 3
Home,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Manchester City FC,3.067,0.286,0.791,0.0
Manchester United FC,4.235,0.147,0.577,0.0
Arsenal FC,4.366,0.147,0.532,0.0
Liverpool FC,4.458,0.139,0.53,0.0
Newcastle United FC,4.718,0.106,0.498,0.0
Aston Villa FC,5.216,0.081,0.396,0.0
Brentford FC,5.751,0.056,0.348,0.0
Brighton & Hove Albion FC,6.026,0.037,0.273,0.0
Tottenham Hotspur FC,10.73,0.0,0.026,0.053
Fulham FC,11.53,0.0,0.013,0.085
