### Build Elo

In [4]:
%load_ext autoreload
%autoreload 2

from simulation_utils import db_get_data_for_latest_season

# Get data for the latest season
df = db_get_data_for_latest_season()

df

Unnamed: 0,utc_date,season,status,matchday,home,away,home_score,away_score,home_outcome,away_outcome,home_manager,away_manager,home_manager_count,away_manager_count,home_position,away_position
0,2023-08-11 19:00:00+00:00,2023,FINISHED,1,Burnley FC,Manchester City FC,0,3,0,3,Vincent Kompany,Pep Guardiola,0,0,2,1
1,2023-08-12 12:00:00+00:00,2023,FINISHED,1,Arsenal FC,Nottingham Forest FC,2,1,3,0,Mikel Arteta,Steve Cooper,0,0,1,4
2,2023-08-12 14:00:00+00:00,2023,FINISHED,1,AFC Bournemouth,West Ham United FC,1,1,1,1,Andoni Iraola,David Moyes,0,0,6,7
3,2023-08-12 14:00:00+00:00,2023,FINISHED,1,Brighton & Hove Albion FC,Luton Town FC,4,1,3,0,Roberto De Zerbi,Rob Edwards,0,0,2,10
4,2023-08-12 14:00:00+00:00,2023,FINISHED,1,Everton FC,Fulham FC,0,1,0,3,Sean Dyche,Marco Silva,0,0,9,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,2024-05-19 15:00:00+00:00,2023,FINISHED,38,Crystal Palace FC,Aston Villa FC,5,0,3,0,Oliver Glasner,Unai Emery,1,0,10,4
376,2024-05-19 15:00:00+00:00,2023,FINISHED,38,Liverpool FC,Wolverhampton Wanderers FC,2,0,3,0,Jürgen Klopp,Gary O'Neil,0,0,3,15
377,2024-05-19 15:00:00+00:00,2023,FINISHED,38,Luton Town FC,Fulham FC,2,4,0,3,Rob Edwards,Marco Silva,0,0,18,14
378,2024-05-19 15:00:00+00:00,2023,FINISHED,38,Manchester City FC,West Ham United FC,3,1,3,0,Pep Guardiola,David Moyes,0,0,1,9


## Test Against 2023-2024 Season

### Predict 2023-2024 Season

In [5]:
from simulation_utils import build_elo_before_season

# Build the elo dataframe before the current season
elo_df = build_elo_before_season(df)
elo_df

Unnamed: 0_level_0,elo,club_value,normalized_club_value,exponential_club_value,normalized_exponential_club_value,adjusted_elo
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Manchester City FC,1668.49153,1321200000.0,1.0,1.718282,1.0,1968.49153
Arsenal FC,1577.798728,1159300000.0,0.865716,1.376706,0.801211,1818.162014
Liverpool FC,1614.665931,871900000.0,0.627338,0.872619,0.507844,1767.019099
Manchester United FC,1601.55524,724650000.0,0.505205,0.657325,0.382548,1716.319535
Newcastle United FC,1587.669295,648950000.0,0.442417,0.556465,0.323849,1684.824122
Aston Villa FC,1583.967257,655600000.0,0.447933,0.565073,0.328859,1682.625104
Chelsea FC,1463.06222,946000000.0,0.688799,0.991322,0.576926,1636.140056
Tottenham Hotspur FC,1504.303037,793300000.0,0.562145,0.754432,0.439062,1636.021514
Brighton & Hove Albion FC,1553.700192,511750000.0,0.328619,0.389049,0.226418,1621.625447
Brentford FC,1571.553978,414875000.0,0.248269,0.281804,0.164003,1620.755011


In [8]:
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path
import joblib
from tqdm import tqdm
from functools import partial

from simulation_utils import download_best_params_from_s3, download_model_and_scaler_from_s3, get_elo_dict_from_df, simulate_and_get_results

# Get the model and scalar from S3
model_file = Path("random_forest.joblib")
scaler_file = Path("standard_scaler.joblib")
download_model_and_scaler_from_s3(model_file, scaler_file)
model = joblib.load(model_file)
scaler = joblib.load(scaler_file)

# Get adjusted Elo dict
adjusted_elo = get_elo_dict_from_df(elo_df)

# Number of simulations to run
num_simulations = 10
# num_simulations = 1000

# Initialize a list to store results
seasons = []

# Get the best parameters for the model
best_params = download_best_params_from_s3()

# Create a partial function to pass the same arguments to each simulation
simulate_and_get_results_partial = partial(simulate_and_get_results, df=df, elo=adjusted_elo, model=model, scaler=scaler, k=best_params.k, half_life=best_params.half_life, decay_method=best_params.decay_method)

# Initialize a pool of workers
with ProcessPoolExecutor() as executor:
    seasons = list(tqdm(executor.map(simulate_and_get_results_partial, range(num_simulations)),  total=num_simulations, desc='Simulating', unit='season'))

# Unlink the model and scaler files
model_file.unlink()
scaler_file.unlink()

Model downloaded to random_forest.joblib
Scaler downloaded to standard_scaler.joblib


Simulating: 100%|██████████| 10/10 [00:14<00:00,  1.46s/season]


### Analyze Results Compared to actual 2023-2024 Season

### Post Process Simulation Results

In [9]:
import pandas as pd

# Aggregate the results
results = pd.concat(seasons).groupby("team").agg({"total_outcome": "sum"})

# Sort results based on total outcome
results = results.sort_values("total_outcome", ascending=False)

# Get the place each team finished in the league
results["place"] = range(1, len(results) + 1)

# Get 2023 season results
# Determine outcomes: 3 for win, 1 for draw, 0 for loss
df["home_outcome"] = 1
df["away_outcome"] = 1
df.loc[df["home_score"] > df["away_score"], "home_outcome"] = 3
df.loc[df["home_score"] > df["away_score"], "away_outcome"] = 0
df.loc[df["away_score"] > df["home_score"], "away_outcome"] = 3
df.loc[df["away_score"] > df["home_score"], "home_outcome"] = 0

home_results = df.groupby("home").agg({"home_outcome": "sum"})
away_results = df.groupby("away").agg({"away_outcome": "sum"})
results_2023 = home_results.join(away_results, how="outer").fillna(0)
results_2023["total_outcome"] = results_2023["home_outcome"] + results_2023["away_outcome"]
results_2023 = results_2023.sort_values("total_outcome", ascending=False)
results_2023["place"] = range(1, len(results_2023) + 1)

# For each season, get the place each team finished in the league
for index, season_df in enumerate(seasons):
    season_df["place"] = range(1, len(season_df) + 1)

# Get the average place each team finished in the league
average_results = pd.concat(seasons).groupby("team").agg({"place": "mean"}).sort_values("place")

# Get the season from seasons
list_of_seasons = df["season"].unique()

assert len(list_of_seasons) == 1, "All seasons should be the same"

# Get the season
simulation_season = list_of_seasons[0]

# Get a mapping of team names to a list places they finished in the league
team_place_mapping = {}
for team in average_results.index:
    team_place_mapping[team] = [season_df.loc[team, "place"] for season_df in seasons]

# Get the total number of seasons simulated
total_seasons = len(seasons)

# Get a mapping of times each team won the league
team_win_mapping = {}
for team in average_results.index:
    team_win_mapping[team] = sum([season_df.loc[team, "place"] == 1 for season_df in seasons]) / total_seasons

# Get a mapping of times each team finished in the top 4
team_top_4_mapping = {}
for team in average_results.index:
    team_top_4_mapping[team] = sum([season_df.loc[team, "place"] <= 4 for season_df in seasons]) / total_seasons

# Get a mapping of times each team finished in the bottom 3
team_bottom_3_mapping = {}
for team in average_results.index:
    team_bottom_3_mapping[team] = sum([season_df.loc[team, "place"] >= len(season_df) - 3 for season_df in seasons]) / total_seasons

# Build a dataframe with the average place, times won, times in top 4, and times in bottom 3
average_results["win_premier_league"] = [team_win_mapping[team] for team in average_results.index]
average_results["top_4"] = [team_top_4_mapping[team] for team in average_results.index]
average_results["bottom_3"] = [team_bottom_3_mapping[team] for team in average_results.index]

# Show the dataframe
average_results

Unnamed: 0_level_0,place,win_premier_league,top_4,bottom_3
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Manchester City FC,4.4,0.1,0.5,0.0
Aston Villa FC,5.5,0.1,0.3,0.0
Newcastle United FC,6.0,0.1,0.5,0.0
Chelsea FC,6.6,0.1,0.4,0.0
Tottenham Hotspur FC,6.8,0.2,0.3,0.0
Manchester United FC,7.2,0.0,0.4,0.0
Brentford FC,7.3,0.0,0.2,0.0
Liverpool FC,7.8,0.0,0.4,0.0
Brighton & Hove Albion FC,7.9,0.0,0.3,0.1
Arsenal FC,8.6,0.1,0.1,0.0


### Find the Positions of the Teams in the 2023-2024 Season for all Seasons

In [10]:
team_to_position = {team: {} for team in df["home"].unique()}
for season in seasons:
    # Assign the position of each team
    for i, team in enumerate(season.index, 1):
        if i not in team_to_position[team]:
            team_to_position[team][i] = 0
        team_to_position[team][i] += 1

# Create a dataframe where each row is a team and the one column is an array of positions in which the team finished in the league
team_positions_df = pd.DataFrame.from_dict(team_to_position, orient="index").fillna(0).astype(int)

# Stack the dataframe to get a row for each team-position pair
team_positions_df = team_positions_df.stack().reset_index().rename(columns={'level_0': 'team', 'level_1': 'position', 0: 'count'})

# Set the index to the team name-position pair
team_positions_df.set_index(["team", "position"], inplace=True)

# Sort the position index
team_positions_df = team_positions_df.sort_index()

team_positions_df

Unnamed: 0_level_0,Unnamed: 1_level_0,count
team,position,Unnamed: 2_level_1
AFC Bournemouth,1,1
AFC Bournemouth,2,0
AFC Bournemouth,3,0
AFC Bournemouth,4,0
AFC Bournemouth,5,0
...,...,...
Wolverhampton Wanderers FC,16,2
Wolverhampton Wanderers FC,17,1
Wolverhampton Wanderers FC,18,2
Wolverhampton Wanderers FC,19,1


## Store the Data

In [11]:
from simulation_utils import db_store_results

# Store the results in the database
db_store_results(simulation_season, average_results, team_positions_df)