In [1]:
# Enable IPython's autoreload extension so that edits to imported modules
# (e.g. simulation_utils, imported in later cells) are picked up without
# restarting the kernel. Mode 2 reloads all modules before each execution.
%load_ext autoreload
%autoreload 2

## Setting up Training Data

In [17]:
from simulation_utils import db_get_data_by_year
import pandas as pd

# Fetch one frame per season, keyed by the year passed to the loader:
# 2022 -> 2022-2023 season, 2023 -> 2023-2024 season.
# The generator is consumed left to right, so the 2022 query runs first.
df, df_2023 = (db_get_data_by_year(season_year) for season_year in (2022, 2023))

# Preview both seasons stacked together (display only; nothing is assigned)
pd.concat([df, df_2023])

### Analyze Elo vs Outcome for Real Results

In [31]:
from scipy.stats import pearsonr
from sklearn.model_selection import ParameterGrid

from simulation_utils import DecayMethod, build_elo_between_seasons, get_elo_dict_from_df, process_fixture_results, upload_best_params_to_s3

# Define the parameter grid
param_grid = {
    'k': [40],
    'decay_half_life': [38/4],
    'club_value_adjustment_factor': [300],
    'decay_method': [DecayMethod.BASE_RATING],
}

# Category order used when labelling `home_outcome` for plotting
# (presumably points for a home win / draw / loss - confirm against simulation_utils)
OUTCOME_CATEGORIES = [3, 1, 0]


def run_elo_pipeline(fixtures_2022, fixtures_2023, params):
    """Process both seasons with a single parameter combination.

    The inputs are copied first, so the caller's frames are left untouched and
    every parameter combination starts from exactly the same data.

    Args:
        fixtures_2022: fixtures for the 2022-2023 season.
        fixtures_2023: fixtures for the 2023-2024 season.
        params: one entry of ``ParameterGrid(param_grid)``.

    Returns:
        Tuple ``(season_2022, results_2022, season_2023, results_2023)`` as
        returned by the two ``process_fixture_results`` calls.
    """
    k = params['k']
    decay_half_life = params['decay_half_life']
    club_value_adjustment_factor = params['club_value_adjustment_factor']
    decay_method = params['decay_method']

    # First season starts without any pre-computed Elo ratings
    season_2022, results_2022 = process_fixture_results(
        fixtures_2022.copy(), k, decay_half_life, club_value_adjustment_factor, decay_method, None
    )

    # Seed the second season with the Elo carried over from the first one
    fixtures_2023 = fixtures_2023.copy()
    adjusted_elo = get_elo_dict_from_df(
        build_elo_between_seasons(season_2022, fixtures_2023, club_value_adjustment_factor)
    )
    season_2023, results_2023 = process_fixture_results(
        fixtures_2023, k, decay_half_life, club_value_adjustment_factor, decay_method, adjusted_elo
    )
    return season_2022, results_2022, season_2023, results_2023


def build_elo_outcome_frame(season_2022, season_2023, columns):
    """Stack `columns` of both seasons, tag each row with its season label and
    add ``elo_difference`` (home Elo minus away Elo)."""
    labelled = []
    for season_frame, season_label in ((season_2022, "2022-2023"), (season_2023, "2023-2024")):
        subset = season_frame[columns].copy()
        subset["season"] = season_label
        labelled.append(subset)

    frame = pd.concat(labelled)
    frame["elo_difference"] = frame["home_elo"] - frame["away_elo"]
    return frame


def elo_outcome_correlation(frame):
    """Return ``(correlation, p_value)`` of the Pearson correlation between
    ``elo_difference`` and the numeric ``home_outcome``."""
    # Pass explicit numeric values instead of relying on implicit conversion
    return pearsonr(frame["elo_difference"], pd.to_numeric(frame["home_outcome"]))


# Initialize best parameters, best correlation and the frames that produced them
best_params = None
best_correlation = float("-inf")
best_run = None

# Iterate over all combinations of parameters. `df` / `df_2023` are only read
# here, never rebound, so no combination sees another combination's output.
for params in ParameterGrid(param_grid):
    run = run_elo_pipeline(df, df_2023, params)
    candidate_2022, _, candidate_2023, _ = run

    scoring_frame = build_elo_outcome_frame(
        candidate_2022, candidate_2023, ["home_elo", "away_elo", "home_outcome"]
    )
    correlation, _ = elo_outcome_correlation(scoring_frame)

    # If the correlation is better than the best found so far, keep this run
    if correlation > best_correlation:
        best_correlation = correlation
        best_params = params
        best_run = run

# A NaN correlation never compares greater, so guard against "nothing selected"
if best_run is None:
    raise ValueError("No parameter combination produced a valid Elo/outcome correlation")

print("Best parameters:", best_params)
print("Best correlation:", best_correlation)

# Publish the frames of the BEST combination under the names later cells read.
# NOTE: this rebinds `df` / `df_2023` to processed frames, so re-run the data
# loading cell before re-running this one.
df, results, df_2023, results_2023 = best_run

# Calculate Elo vs Outcome correlation
correlation_df_columns = ["home_elo", "away_elo", "home_outcome", "home", "away", "home_score", "away_score", "utc_date"]
data_2022_2024 = build_elo_outcome_frame(df, df_2023, correlation_df_columns)

data_2022_2024["match_info"] = data_2022_2024["home"] + " (" + data_2022_2024["home_score"].astype(str) + ") - " + data_2022_2024["away"] + " (" + data_2022_2024["away_score"].astype(str) + ") on " + data_2022_2024["utc_date"].astype(str)

# Calculate correlation
correlation, p_value = elo_outcome_correlation(data_2022_2024)

# Convert "home_outcome" to categorical type with specified categories, then to
# string labels for plotting
data_2022_2024["home_outcome"] = pd.Categorical(data_2022_2024["home_outcome"], categories=OUTCOME_CATEGORIES, ordered=True)
data_2022_2024["home_outcome"] = data_2022_2024["home_outcome"].astype(str)

results

Best parameters: {'club_value_adjustment_factor': 300, 'decay_half_life': 9.5, 'decay_method': <DecayMethod.BASE_RATING: 1>, 'k': 40}
Best correlation: 0.5436362891412282


Unnamed: 0_level_0,home_outcome,home_elo,away_outcome,away_elo,total_outcome,total_elo
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Manchester City FC,193.0,1627.536177,169.0,1625.018,362.0,3252.554177
Arsenal FC,186.0,1619.21519,165.0,1613.162314,351.0,3232.377504
Liverpool FC,188.0,1554.352818,125.0,1543.608059,313.0,3097.960877
Aston Villa FC,158.0,1521.373248,107.0,1502.423012,265.0,3023.79626
Tottenham Hotspur FC,154.0,1483.086006,104.0,1497.74528,258.0,2980.831286
Manchester United FC,147.0,1506.672892,108.0,1522.822359,255.0,3029.495251
Newcastle United FC,159.0,1535.935357,92.0,1528.947655,251.0,3064.883012
Chelsea FC,136.0,1581.500988,97.0,1572.523588,233.0,3154.024576
Brighton & Hove Albion FC,124.0,1455.167931,82.0,1482.846715,206.0,2938.014646
West Ham United FC,115.0,1481.358987,81.0,1471.355631,196.0,2952.714618


In [33]:
# Upload best params to s3
# NOTE(review): the meaning of the second positional argument (True) is not
# visible in this notebook - confirm against simulation_utils and consider
# passing it by keyword.
# NOTE(review): the recorded output of this cell reports the object key
# "random_forest.joblib", the same key the model upload reports later - verify
# the log message / destination inside upload_best_params_to_s3.
upload_best_params_to_s3(best_params, True)

Best Params uploaded to s3://pl-prediction/2024/random_forest.joblib


#### Plotting Elo vs Outcome

In [32]:
import plotly.express as px
import matplotlib.pyplot as plt

# Interactive scatter of Elo difference against the home outcome, one colour
# per season. The figure is drawn by plotly, so no matplotlib figure is
# created here (doing so only emits an empty "<Figure ... with 0 Axes>").
# The p-value uses scientific notation: a fixed two-decimal format would show
# any small p-value as 0.00.
fig = px.scatter(
    data_2022_2024, x="elo_difference", y="home_outcome", color="season",
    hover_data=["match_info"],
    title=f"Elo Difference vs Outcome - Correlation: {correlation:.2f}, P-value: {p_value:.2e}"
)
fig.show()

<Figure size 1000x600 with 0 Axes>

## Training

In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# Define the feature columns
feature_columns = ["home_elo", "away_elo", "home_position", "away_position", "home_manager_tenure", "away_manager_tenure", "home_form", "away_form"]

# Seed for the forest so the search and the stored model are reproducible
RANDOM_STATE = 42

# Combine the dataframes under a dedicated name. `df` is deliberately NOT
# rebound: doing so would append the 2023-2024 fixtures again on every re-run
# of this cell and would change what `df` means for the other cells.
training_df = pd.concat([df, df_2023], ignore_index=True)

# Predict the home outcome from the Elo, league position, manager tenure and
# form features listed above
x = training_df[feature_columns]
y = training_df["home_outcome"]

# Standardize the features (the fitted scaler is stored alongside the model)
scaler = StandardScaler()
x = scaler.fit_transform(x)

# Define the parameter grid for Random Forest
# Current parameters: n_estimators=1000, max_depth=5, min_samples_split=5
param_grid_rf = {
    'n_estimators': [100, 200, 500, 1000],
    'max_depth': [None, 5, 10, 15, 20, 25],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize the model
rf = RandomForestClassifier(random_state=RANDOM_STATE)

# Initialize GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=5, n_jobs=-1, scoring='accuracy')

# Perform the grid search
grid_search.fit(x, y)

# Extract best parameters and model
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_}")

best_rf = grid_search.best_estimator_

# Print feature importance (importances are in the same order as feature_columns)
feature_importance = best_rf.feature_importances_
print("Feature importance")
for feature, importance in zip(feature_columns, feature_importance):
    print(f"{feature}: {importance}")

Best parameters found: {'bootstrap': True, 'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 500}
Best cross-validation accuracy: 0.9328947368421053
Feature importance
home_elo: 0.268975994250035
away_elo: 0.23495329974415735
home_position: 0.09992848902928747
away_position: 0.08891009699512055
home_manager_tenure: 0.078311208025058
away_manager_tenure: 0.07761714087853015
home_form: 0.07880476371831084
away_form: 0.07249900735950071


### Store the Model

In [34]:
from joblib import dump
from simulation_utils import upload_model_and_scaler_to_s3

# Save the model to a local joblib file
model_filename = "random_forest.joblib"
dump(best_rf, model_filename)

# Save the fitted scaler next to it, so the same feature standardization can
# be applied wherever the model is loaded
scaler_filename = "standard_scaler.joblib"
dump(scaler, scaler_filename)

# Upload the model and scaler to S3
upload_model_and_scaler_to_s3(model_filename, scaler_filename, should_delete=True)

Model uploaded to s3://pl-prediction/2024/random_forest.joblib
Scaler uploaded to s3://pl-prediction/2024/standard_scaler.joblib
