In [1]:
from simulation_utils import db_get_data_by_year

# Get data for 2022-2023 season
df = db_get_data_by_year(2022)

# Get data for the 2023-2024 season
df_2023 = db_get_data_by_year(2023)

## Setting up Training Data

### Analyze Elo vs Outcome for Real Results

In [2]:
import pandas as pd
from scipy.stats import pearsonr
from sklearn.model_selection import ParameterGrid

from simulation_utils import DecayMethod, build_elo_between_seasons, get_elo_dict_from_df, process_fixture_results

# Define the parameter grid
param_grid = {
    'k': [40],
    'decay_half_life': [19],
    'club_value_adjustment_factor': [300],
    'decay_method': [DecayMethod.BASE_RATING],
}

# Initialize best parameters and best correlation
best_params = None
best_correlation = -1

# Iterate over all combinations of parameters
for params in ParameterGrid(param_grid):
    k = params['k']
    decay_half_life = params['decay_half_life']
    club_value_adjustment_factor = params['club_value_adjustment_factor']
    decay_method = params['decay_method']
    
    # Process the fixture results for the 2022-2023 season and the 2023-2024 season
    df, results = process_fixture_results(df, k, decay_half_life, club_value_adjustment_factor, decay_method, None)

    # Get adjusted Elo dict at current club value
    adjusted_elo = get_elo_dict_from_df(build_elo_between_seasons(df, df_2023, club_value_adjustment_factor))

    df_2023, results_2023 = process_fixture_results(df_2023, k, decay_half_life, club_value_adjustment_factor, decay_method, adjusted_elo)

    # Calculate Elo vs Outcome correlation
    data_2022 = df[["home_elo", "away_elo", "home_outcome"]].copy()
    data_2022["season"] = "2022-2023"
    data_2023 = df_2023[["home_elo", "away_elo", "home_outcome"]].copy()
    data_2023["season"] = "2023-2024"
    data_2022_2024 = pd.concat([data_2022, data_2023])

    data_2022_2024["elo_difference"] = data_2022_2024["home_elo"] - data_2022_2024["away_elo"]

    # Convert "home_outcome" to categorical type with specified categories
    data_2022_2024["home_outcome"] = pd.Categorical(data_2022_2024["home_outcome"], categories=[3, 1, 0], ordered=True)

    # Calculate correlation
    correlation, p_value = pearsonr(data_2022_2024["elo_difference"], data_2022_2024["home_outcome"])
    
    # If the correlation is better than the best found so far, update best correlation and best parameters
    if correlation > best_correlation:
        best_correlation = correlation
        best_params = params

print("Best parameters:", best_params)
print("Best correlation:", best_correlation)

# Calculate everything again with the best parameters
k = best_params['k']
decay_half_life = best_params['decay_half_life']
club_value_adjustment_factor = best_params['club_value_adjustment_factor']
decay_method = best_params['decay_method']

# Process the fixture results for the 2022-2023 season and the 2023-2024 season
df, results = process_fixture_results(df, k, decay_half_life, club_value_adjustment_factor, decay_method, None)

# Get adjusted Elo dict at current club value
adjusted_elo = get_elo_dict_from_df(build_elo_between_seasons(df, df_2023, club_value_adjustment_factor))

df_2023, results_2023 = process_fixture_results(df_2023, k, decay_half_life, club_value_adjustment_factor, decay_method, adjusted_elo)

# Calculate Elo vs Outcome correlation
correlation_df_columns = ["home_elo", "away_elo", "home_outcome", "home", "away", "home_score", "away_score", "utc_date"]
data_2022 = df[correlation_df_columns].copy()
data_2022["season"] = "2022-2023"
data_2023 = df_2023[correlation_df_columns].copy()
data_2023["season"] = "2023-2024"
data_2022_2024 = pd.concat([data_2022, data_2023])

data_2022_2024["match_info"] = data_2022_2024["home"] + " (" + data_2022_2024["home_score"].astype(str) + ") - " + data_2022_2024["away"] + " (" + data_2022_2024["away_score"].astype(str) + ") on " + data_2022_2024["utc_date"].astype(str)

data_2022_2024["elo_difference"] = data_2022_2024["home_elo"] - data_2022_2024["away_elo"]

# Convert "home_outcome" to categorical type with specified categories
data_2022_2024["home_outcome"] = pd.Categorical(data_2022_2024["home_outcome"], categories=[3, 1, 0], ordered=True)

# Calculate correlation
correlation, p_value = pearsonr(data_2022_2024["elo_difference"], data_2022_2024["home_outcome"])

# Convert "home_outcome" to string type
data_2022_2024["home_outcome"] = data_2022_2024["home_outcome"].astype(str)

results

Best parameters: {'club_value_adjustment_factor': 300, 'decay_half_life': 19, 'decay_method': <DecayMethod.BASE_RATING: 2>, 'k': 40}
Best correlation: 0.6175039074681774


Unnamed: 0_level_0,home_outcome,home_elo,away_outcome,away_elo,total_outcome,total_elo
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Manchester City FC,52,1691.387023,37,1645.596037,89,3336.98306
Arsenal FC,45,1584.251581,39,1571.345875,84,3155.597456
Manchester United FC,48,1611.820563,27,1591.289918,75,3203.110481
Newcastle United FC,39,1592.880068,32,1582.458521,71,3175.33859
Liverpool FC,44,1622.5861,23,1606.745763,67,3229.331863
Brighton & Hove Albion FC,34,1564.162407,28,1543.237976,62,3107.400383
Aston Villa FC,38,1591.918598,23,1576.015916,61,3167.934514
Tottenham Hotspur FC,37,1496.564465,23,1512.041609,60,3008.606074
Brentford FC,37,1583.270673,22,1559.837282,59,3143.107955
Fulham FC,29,1514.25899,23,1499.174074,52,3013.433064


#### Plotting Elo vs Outcome

In [3]:
import plotly.express as px
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
fig = px.scatter(
    data_2022_2024, x="elo_difference", y="home_outcome", color="season",
    hover_data=["match_info"], title=f"Elo Difference vs Outcome - Correlation: {correlation:.2f}, P-value: {p_value:.2f}"
)
fig.show()

<Figure size 1000x600 with 0 Axes>

## Training

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler

# For simplicity, let"s predict the home outcome based on Elo ratings
x = df[["home_elo", "away_elo", "home_position", "away_position"]]
y = df["home_outcome"]

# Split data into training and validation sets
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2)

# Standardize the features
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_val = scaler.transform(x_val)

# Define the parameter grid for Random Forest
# Current parameters: n_estimators=1000, max_depth=5, min_samples_split=5
param_grid_rf = {
    'n_estimators': [100, 200, 500, 1000],
    'max_depth': [None, 5, 10, 15, 20, 25],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize the model
rf = RandomForestClassifier()

# Initialize GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=5, n_jobs=-1, scoring='accuracy')

# Perform the grid search
grid_search.fit(x_train, y_train)

# Extract best parameters and model
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_}")

best_rf = grid_search.best_estimator_

# Cross-validate the model against 2023-2024 season
x_2023 = df_2023[["home_elo", "away_elo", "home_position", "away_position"]]
y_2023 = df_2023["home_outcome"]

# Standardize the features
x_2023 = scaler.transform(x_2023)

# Validate the model
val_score_2023 = best_rf.score(x_2023, y_2023)
print(f"Validation accuracy on 2023-2024 data: {val_score_2023 * 100:.2f}%")

Best parameters found: {'bootstrap': False, 'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Best cross-validation accuracy: 0.6020765027322404
Validation accuracy on 2023-2024 data: 64.47%


### Build Elo

In [5]:
elo_df = build_elo_between_seasons(df, df_2023, club_value_adjustment_factor)
elo_df

Unnamed: 0_level_0,elo,club_value,normalized_club_value,exponential_club_value,normalized_exponential_club_value,adjusted_elo
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Manchester City FC,1668.49153,1321200000.0,1.0,1.718282,1.0,1968.49153
Arsenal FC,1577.798728,1159300000.0,0.865716,1.376706,0.801211,1818.162014
Liverpool FC,1614.665931,871900000.0,0.627338,0.872619,0.507844,1767.019099
Manchester United FC,1601.55524,724650000.0,0.505205,0.657325,0.382548,1716.319535
Newcastle United FC,1587.669295,648950000.0,0.442417,0.556465,0.323849,1684.824122
Aston Villa FC,1583.967257,655600000.0,0.447933,0.565073,0.328859,1682.625104
Chelsea FC,1463.06222,946000000.0,0.688799,0.991322,0.576926,1636.140056
Tottenham Hotspur FC,1504.303037,793300000.0,0.562145,0.754432,0.439062,1636.021514
Brighton & Hove Albion FC,1553.700192,511750000.0,0.328619,0.389049,0.226418,1621.625447
Brentford FC,1571.553978,414875000.0,0.248269,0.281804,0.164003,1620.755011


## Test Against 2023-2024 Season

### Predict 2023-2024 Season

In [6]:
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm
from functools import partial

from simulation_utils import simulate_and_get_results

# Get the model
model = best_rf

# Get adjusted Elo dict
adjusted_elo = get_elo_dict_from_df(elo_df)

# Number of simulations to run
num_simulations = 100

# Initialize a list to store results
seasons = []

# Create a partial function to pass the same arguments to each simulation
simulate_and_get_results_partial = partial(simulate_and_get_results, df=df_2023, elo=adjusted_elo, model=model, scaler=scaler, k=k, half_life=decay_half_life, decay_method=decay_method)

# Initialize a pool of workers
with ProcessPoolExecutor() as executor:
    seasons = list(tqdm(executor.map(simulate_and_get_results_partial, range(num_simulations)),  total=num_simulations, desc='Simulating', unit='season'))

Simulating: 100%|██████████| 100/100 [00:28<00:00,  3.52season/s]


### Analyze Results Compared to actual 2023-2024 Season

### Post Process Simulation Results

In [7]:
# Aggregate the results
results = pd.concat(seasons).groupby("team").agg({"total_outcome": "sum"})

# Sort results based on total outcome
results = results.sort_values("total_outcome", ascending=False)

# Get the place each team finished in the league
results["place"] = range(1, len(results) + 1)

# Get 2023 season results
# Determine outcomes: 3 for win, 1 for draw, 0 for loss
df_2023["home_outcome"] = 1
df_2023["away_outcome"] = 1
df_2023.loc[df_2023["home_score"] > df_2023["away_score"], "home_outcome"] = 3
df_2023.loc[df_2023["home_score"] > df_2023["away_score"], "away_outcome"] = 0
df_2023.loc[df_2023["away_score"] > df_2023["home_score"], "away_outcome"] = 3
df_2023.loc[df_2023["away_score"] > df_2023["home_score"], "home_outcome"] = 0

home_results = df_2023.groupby("home").agg({"home_outcome": "sum"})
away_results = df_2023.groupby("away").agg({"away_outcome": "sum"})
results_2023 = home_results.join(away_results, how="outer").fillna(0)
results_2023["total_outcome"] = results_2023["home_outcome"] + results_2023["away_outcome"]
results_2023 = results_2023.sort_values("total_outcome", ascending=False)
results_2023["place"] = range(1, len(results_2023) + 1)

# For each season, get the place each team finished in the league
for index, season_df in enumerate(seasons):
    season_df["place"] = range(1, len(season_df) + 1)

# Get the average place each team finished in the league
average_results = pd.concat(seasons).groupby("team").agg({"place": "mean"}).sort_values("place")

# Get the season from seasons
list_of_seasons = df_2023["season"].unique()

assert len(list_of_seasons) == 1, "All seasons should be the same"

# Get the season
simulation_season = list_of_seasons[0]

# Get a mapping of team names to a list places they finished in the league
team_place_mapping = {}
for team in average_results.index:
    team_place_mapping[team] = [season_df.loc[team, "place"] for season_df in seasons]

# Get the total number of seasons simulated
total_seasons = len(seasons)

# Get a mapping of times each team won the league
team_win_mapping = {}
for team in average_results.index:
    team_win_mapping[team] = sum([season_df.loc[team, "place"] == 1 for season_df in seasons]) / total_seasons

# Get a mapping of times each team finished in the top 4
team_top_4_mapping = {}
for team in average_results.index:
    team_top_4_mapping[team] = sum([season_df.loc[team, "place"] <= 4 for season_df in seasons]) / total_seasons

# Get a mapping of times each team finished in the bottom 3
team_bottom_3_mapping = {}
for team in average_results.index:
    team_bottom_3_mapping[team] = sum([season_df.loc[team, "place"] >= len(season_df) - 3 for season_df in seasons]) / total_seasons

# Build a dataframe with the average place, times won, times in top 4, and times in bottom 3
average_results["win_premier_league"] = [team_win_mapping[team] for team in average_results.index]
average_results["top_4"] = [team_top_4_mapping[team] for team in average_results.index]
average_results["bottom_3"] = [team_bottom_3_mapping[team] for team in average_results.index]

# Show the dataframe
average_results

Unnamed: 0_level_0,place,win_premier_league,top_4,bottom_3
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Manchester City FC,4.58,0.24,0.54,0.0
Manchester United FC,5.6,0.1,0.5,0.03
Arsenal FC,5.99,0.12,0.43,0.0
Liverpool FC,6.03,0.14,0.41,0.01
Aston Villa FC,6.38,0.08,0.4,0.0
Newcastle United FC,7.32,0.06,0.26,0.0
Chelsea FC,7.91,0.06,0.29,0.03
Tottenham Hotspur FC,8.32,0.05,0.23,0.03
Brentford FC,8.58,0.04,0.18,0.05
Brighton & Hove Albion FC,9.15,0.04,0.24,0.06


### Find the Positions of the Teams in the 2023-2024 Season for all Seasons

In [8]:
team_to_position = {team: {} for team in df_2023["home"].unique()}
for season in seasons:
    # Assign the position of each team
    for i, team in enumerate(season.index, 1):
        if i not in team_to_position[team]:
            team_to_position[team][i] = 0
        team_to_position[team][i] += 1

# Create a dataframe where each row is a team and the one column is an array of positions in which the team finished in the league
team_positions_df = pd.DataFrame.from_dict(team_to_position, orient="index").fillna(0).astype(int)

# Stack the dataframe to get a row for each team-position pair
team_positions_df = team_positions_df.stack().reset_index().rename(columns={'level_0': 'team', 'level_1': 'position', 0: 'count'})

# Set the index to the team name-position pair
team_positions_df.set_index(["team", "position"], inplace=True)

# Sort the position index
team_positions_df = team_positions_df.sort_index()

team_positions_df

Unnamed: 0_level_0,Unnamed: 1_level_0,count
team,position,Unnamed: 2_level_1
AFC Bournemouth,1,0
AFC Bournemouth,2,0
AFC Bournemouth,3,1
AFC Bournemouth,4,1
AFC Bournemouth,5,2
...,...,...
Wolverhampton Wanderers FC,16,10
Wolverhampton Wanderers FC,17,5
Wolverhampton Wanderers FC,18,3
Wolverhampton Wanderers FC,19,8


## Store the Data

In [9]:
from simulation_utils import db_store_results

# Store the results in the database
db_store_results(simulation_season, average_results, team_positions_df)