In [1]:
from simulation_utils import build_data_by_year

# Load both seasons with the same loader, keyed by the season's starting year:
# `df` holds 2022-2023 and `df_2023` holds 2023-2024.
df, df_2023 = (build_data_by_year(start_year) for start_year in (2022, 2023))

## Setting up Training Data

### Analyze Elo vs Outcome for Real Results

In [2]:
import pandas as pd
from scipy.stats import pearsonr
from sklearn.model_selection import ParameterGrid

from simulation_utils import DecayMethod, build_elo_between_seasons, get_elo_dict_from_df, process_fixture_results

# Define the parameter grid
param_grid = {
    'k': [40],
    'decay_half_life': [19],
    'club_value_adjustment_factor': [300, 600, 900],
    'decay_method': [DecayMethod.BASE_RATING, DecayMethod.MIN_BASE_CURRENT],
}

# Snapshot the season frames as they are when this cell starts, so every
# parameter combination below is evaluated from the same inputs instead of
# from frames an earlier combination has already processed.
# NOTE(review): on a re-run of this cell `df` / `df_2023` are already processed
# frames; restart the kernel and run all cells for a clean evaluation.
input_2022 = df.copy()
input_2023 = df_2023.copy()


def process_both_seasons(params):
    """Run the Elo pipeline over both seasons for one parameter combination.

    Copies of `input_2022` / `input_2023` are passed to the helpers so that
    successive calls do not see each other's output frames.

    Args:
        params: dict with the keys 'k', 'decay_half_life',
            'club_value_adjustment_factor' and 'decay_method'.

    Returns:
        Tuple `(season_2022, table_2022, season_2023, table_2023, carried_elo)`:
        the two values returned by `process_fixture_results` for each season,
        plus the Elo dict that was fed into the 2023-2024 run.
    """
    k_factor = params['k']
    half_life = params['decay_half_life']
    value_factor = params['club_value_adjustment_factor']
    method = params['decay_method']

    # 2022-2023 starts without a prior Elo dict (None)
    season_2022, table_2022 = process_fixture_results(
        input_2022.copy(), k_factor, half_life, value_factor, method, None
    )

    # Elo carried from 2022-2023 into 2023-2024
    carried_elo = get_elo_dict_from_df(
        build_elo_between_seasons(season_2022, input_2023.copy(), value_factor)
    )

    season_2023, table_2023 = process_fixture_results(
        input_2023.copy(), k_factor, half_life, value_factor, method, carried_elo
    )
    return season_2022, table_2022, season_2023, table_2023, carried_elo


def elo_outcome_correlation(season_2022, season_2023):
    """Pearson correlation between (Home Elo - Away Elo) and Home Outcome.

    Both seasons are pooled. The outcome is cast to float so `pearsonr`
    receives plain numeric data rather than a categorical column.

    Returns:
        The `pearsonr` result, which unpacks as `(correlation, p_value)`.
    """
    columns = ["Home Elo", "Away Elo", "Home Outcome"]
    pooled = pd.concat([season_2022[columns], season_2023[columns]])
    elo_difference = pooled["Home Elo"] - pooled["Away Elo"]
    return pearsonr(elo_difference, pooled["Home Outcome"].astype(float))


# Initialize best parameters and best correlation
best_params = None
best_correlation = float("-inf")

# Iterate over all combinations of parameters
for params in ParameterGrid(param_grid):
    candidate_2022, _, candidate_2023, _, _ = process_both_seasons(params)
    candidate_correlation, _ = elo_outcome_correlation(candidate_2022, candidate_2023)

    # If the correlation is better than the best found so far, update best correlation and best parameters
    # (a NaN correlation never compares greater, so it is never selected)
    if candidate_correlation > best_correlation:
        best_correlation = candidate_correlation
        best_params = params

if best_params is None:
    raise ValueError("No parameter combination produced a valid Elo vs Outcome correlation")

print("Best parameters:", best_params)
print("Best correlation:", best_correlation)

# Calculate everything again with the best parameters; later cells read these names
k = best_params['k']
decay_half_life = best_params['decay_half_life']
club_value_adjustment_factor = best_params['club_value_adjustment_factor']
decay_method = best_params['decay_method']

df, results, df_2023, results_2023, adjusted_elo = process_both_seasons(best_params)

# Calculate correlation on the numeric outcome, with the same routine the grid search used
correlation, p_value = elo_outcome_correlation(df, df_2023)

# Build the per-match frame used for plotting
correlation_df_columns = ["Home Elo", "Away Elo", "Home Outcome", "Home", "Away", "Home Score", "Away Score", "Utc Date"]
data_2022 = df[correlation_df_columns].copy()
data_2022["Season"] = "2022-2023"
data_2023 = df_2023[correlation_df_columns].copy()
data_2023["Season"] = "2023-2024"
data_2022_2024 = pd.concat([data_2022, data_2023])

data_2022_2024["Match Info"] = data_2022_2024["Home"] + " (" + data_2022_2024["Home Score"].astype(str) + ") - " + data_2022_2024["Away"] + " (" + data_2022_2024["Away Score"].astype(str) + ") on " + data_2022_2024["Utc Date"].astype(str)

data_2022_2024["Elo Difference"] = data_2022_2024["Home Elo"] - data_2022_2024["Away Elo"]

# Convert 'Home Outcome' to categorical type with specified categories, then to
# string labels so the plot treats the outcome as discrete
data_2022_2024['Home Outcome'] = pd.Categorical(data_2022_2024['Home Outcome'], categories=[3, 1, 0], ordered=True)
data_2022_2024['Home Outcome'] = data_2022_2024['Home Outcome'].astype(str)

results

Best parameters: {'club_value_adjustment_factor': 300, 'decay_half_life': 19, 'decay_method': <DecayMethod.BASE_RATING: 2>, 'k': 40}
Best correlation: 0.625952684095195


Unnamed: 0_level_0,Home Outcome,Home Elo,Away Outcome,Away Elo,Total Outcome,Total Elo
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Manchester City FC,52,1696.335074,37,1649.687232,89,3346.022306
Arsenal FC,45,1587.086952,39,1574.586979,84,3161.673931
Manchester United FC,48,1608.114731,27,1587.043538,75,3195.15827
Newcastle United FC,39,1592.316323,32,1581.749808,71,3174.066131
Liverpool FC,44,1622.532541,23,1606.63544,67,3229.167982
Brighton & Hove Albion FC,34,1563.564545,28,1542.74759,62,3106.312135
Aston Villa FC,38,1592.753052,23,1576.970629,61,3169.723681
Tottenham Hotspur FC,37,1497.688151,23,1513.031989,60,3010.720139
Brentford FC,37,1583.187264,22,1559.503813,59,3142.691077
Fulham FC,29,1514.710629,23,1499.371743,52,3014.082372


#### Plotting Elo vs Outcome

In [4]:
import plotly.express as px
import matplotlib.pyplot as plt

# Interactive scatter of Elo difference against the home outcome, one colour per season.
# The chart is drawn by Plotly, so no matplotlib figure is opened here: a
# `plt.figure(...)` call would only emit an empty "<Figure ... with 0 Axes>" output.
fig = px.scatter(
    data_2022_2024, x="Elo Difference", y="Home Outcome", color="Season",
    hover_data=["Match Info"], title=f"Elo Difference vs Outcome - Correlation: {correlation:.2f}, P-value: {p_value:.2f}"
)
fig.show()

<Figure size 1000x600 with 0 Axes>

## Training

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Fixed seed shared by the split and the forest so the reported validation
# accuracy is the same on every Restart & Run All.
RANDOM_STATE = 42

# For simplicity, let's predict the home outcome based on Elo ratings
x = df[["Home Elo", "Away Elo"]]
y = df["Home Outcome"]

# Split data into training and validation sets (80% / 20%)
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=RANDOM_STATE)

# Standardize the features; the scaler is fitted on the training split only
# and reused unchanged for validation
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_val = scaler.transform(x_val)

# Train the model
model = RandomForestClassifier(n_estimators=1000, max_depth=5, min_samples_split=5, random_state=RANDOM_STATE)
model.fit(x_train, y_train)

# Validate the model
val_score = model.score(x_val, y_val)
print(f"Validation accuracy: {val_score * 100:.2f}%")

Validation accuracy: 69.74%


### Build Elo

In [6]:
# Build the between-season Elo table from the 2022-2023 frame (`df`), the
# 2023-2024 frame (`df_2023`) and the club-value adjustment factor chosen by
# the grid search above.
# NOTE(review): both frames have already been through process_fixture_results
# at this point; confirm build_elo_between_seasons expects processed input.
elo_df = build_elo_between_seasons(df, df_2023, club_value_adjustment_factor)
# Bare expression: display the table as the cell output
elo_df

Unnamed: 0_level_0,Elo,Club Value,Normalized Club Value,Exponential Club Value,Normalized Exponential Club Value,Adjusted Elo
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Manchester City FC,1673.011153,1320000000.0,1.0,1.718282,1.0,1973.011153
Arsenal FC,1580.836966,1160000000.0,0.867159,1.38014,0.803209,1821.799755
Liverpool FC,1614.583991,941900000.0,0.686081,0.985917,0.573781,1786.718213
Manchester United FC,1597.579135,718650000.0,0.500726,0.649919,0.378238,1711.05053
Newcastle United FC,1587.033066,648950000.0,0.442858,0.557151,0.324249,1684.3077
Aston Villa FC,1584.86184,655600000.0,0.448379,0.565772,0.329266,1683.64166
Tottenham Hotspur FC,1505.36007,793300000.0,0.562705,0.755414,0.439634,1637.250149
Chelsea FC,1459.198744,946000000.0,0.689485,0.992689,0.577722,1632.515252
Brighton & Hove Albion FC,1553.156068,511750000.0,0.328947,0.389504,0.226682,1621.160738
Brentford FC,1571.345539,414880000.0,0.24852,0.282127,0.164191,1620.602864


## Test Against 2023-2024 Season

### Predict 2023-2024 Season

In [9]:
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm
from functools import partial

from simulation_utils import simulate_and_get_results

# Starting Elo per team for the simulated season, derived from the between-season table
adjusted_elo = get_elo_dict_from_df(elo_df)

# How many independent seasons to simulate
num_simulations = 100

# Collected per-simulation results (filled in below)
seasons = []

# Bind everything that is identical across simulations; only the first
# positional argument (the simulation index from `range`) varies per call
simulate_and_get_results_partial = partial(
    simulate_and_get_results,
    df=df_2023,
    elo=adjusted_elo,
    model=model,
    scaler=scaler,
    k=k,
    half_life=decay_half_life,
    decay_method=decay_method,
)

# Fan the simulations out over worker processes and show progress as results arrive
with ProcessPoolExecutor() as executor:
    pending_seasons = executor.map(simulate_and_get_results_partial, range(num_simulations))
    seasons = list(tqdm(pending_seasons, total=num_simulations, desc='Simulating', unit='season'))

Simulating: 100%|██████████| 100/100 [09:56<00:00,  5.97s/season]


### Analyze Results Compared to Actual 2023-2024 Season

### Post-Process Simulation Results

In [10]:
# Sort results based on total outcome
results = results.sort_values("Total Outcome", ascending=False)

# Get the place each team finished in the league
results["Place"] = range(1, len(results) + 1)

# Get 2023 season results
# Determine outcomes: 3 for win, 1 for draw, 0 for loss
# NOTE(review): a fixture whose scores are missing fails both comparisons below
# and therefore keeps the default 1 (draw) - confirm df_2023 only holds played matches.
df_2023["Home Outcome"] = 1
df_2023["Away Outcome"] = 1
df_2023.loc[df_2023["Home Score"] > df_2023["Away Score"], "Home Outcome"] = 3
df_2023.loc[df_2023["Home Score"] > df_2023["Away Score"], "Away Outcome"] = 0
df_2023.loc[df_2023["Away Score"] > df_2023["Home Score"], "Away Outcome"] = 3
df_2023.loc[df_2023["Away Score"] > df_2023["Home Score"], "Home Outcome"] = 0

# Actual 2023-2024 table: points summed per team at home and away, then ranked
home_results = df_2023.groupby("Home").agg({"Home Outcome": "sum"})
away_results = df_2023.groupby("Away").agg({"Away Outcome": "sum"})
results_2023 = home_results.join(away_results, how="outer").fillna(0)
results_2023["Total Outcome"] = results_2023["Home Outcome"] + results_2023["Away Outcome"]
results_2023 = results_2023.sort_values("Total Outcome", ascending=False)
results_2023["Place"] = range(1, len(results_2023) + 1)

# For each season, get the place each team finished in the league
# NOTE(review): this numbers rows in their existing order, so it assumes each
# simulated table is already sorted best-to-worst - confirm in simulate_and_get_results.
for season_df in seasons:
    season_df["Place"] = range(1, len(season_df) + 1)

# Get the average place each team finished in the league
average_results = pd.concat(seasons).groupby("Team").agg({"Place": "mean"}).sort_values("Place")

# Get a mapping of team names to a list places they finished in the league
team_place_mapping = {
    team: [season_df.loc[team, "Place"] for season_df in seasons]
    for team in average_results.index
}

# Get the total number of seasons simulated
total_seasons = len(seasons)


def _share_of_seasons(place_matches):
    """Map each team to the fraction of simulated seasons meeting a condition.

    Args:
        place_matches: callable `(place, league_size) -> bool`, evaluated once
            per team per simulated season.

    Returns:
        dict of team name -> share of seasons (0.0 to 1.0) where the condition held.
    """
    return {
        team: sum(
            place_matches(season_df.loc[team, "Place"], len(season_df))
            for season_df in seasons
        ) / total_seasons
        for team in average_results.index
    }


# Get a mapping of times each team won the league
team_win_mapping = _share_of_seasons(lambda place, league_size: place == 1)

# Get a mapping of times each team finished in the top 4
team_top_4_mapping = _share_of_seasons(lambda place, league_size: place <= 4)

# Get a mapping of times each team finished in the bottom 3.
# Strictly greater than: with 20 teams the last three places are 18, 19 and 20;
# `>= league_size - 3` would also count 17th place.
team_bottom_3_mapping = _share_of_seasons(lambda place, league_size: place > league_size - 3)

# Build a dataframe with the average place, times won, times in top 4, and times in bottom 3
average_results["Win Premier League"] = [team_win_mapping[team] for team in average_results.index]
average_results["Top 4"] = [team_top_4_mapping[team] for team in average_results.index]
average_results["Bottom 3"] = [team_bottom_3_mapping[team] for team in average_results.index]

# Show the dataframe
average_results

Unnamed: 0_level_0,Place,Win Premier League,Top 4,Bottom 3
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Manchester City FC,3.03,0.26,0.81,0.0
Arsenal FC,3.19,0.2,0.79,0.0
Liverpool FC,4.55,0.11,0.58,0.0
Manchester United FC,4.77,0.14,0.47,0.0
Newcastle United FC,5.45,0.13,0.38,0.0
Aston Villa FC,5.88,0.07,0.31,0.0
Tottenham Hotspur FC,7.33,0.02,0.21,0.0
Chelsea FC,7.54,0.02,0.18,0.0
Brighton & Hove Albion FC,7.89,0.03,0.12,0.0
Brentford FC,8.89,0.02,0.09,0.0
