Proof that pre-summer results are useless in predicting after-summer results


In [1]:
import pandas as pd
import numpy as np
import math

import statsmodels.api as sm
import statsmodels.formula.api as smf
from itertools import product
from scipy.stats import skellam
from sklearn.model_selection import KFold
from datetime import datetime

In [2]:
data_raw = pd.read_csv('nwsl2022.csv')     # update file path to where you saved the data

# Keep only rows whose 'Day' is present and is not the literal string 'Day'
# (presumably spacer rows and repeated header rows in the export -- confirm
# against the CSV). `.copy()` yields an independent frame, so the column
# assignments below do not act on a view of `data_raw` and do not trigger
# pandas' SettingWithCopyWarning.
data = data_raw.loc[(data_raw['Day'].notna()) & (data_raw['Day'] != 'Day')].copy()

# Pull the "<home>–<away>" pair out of 'Score' by pattern rather than by fixed
# character offsets. Scores that start with '(' (presumably penalty-shootout
# counts such as "(4) 1–1 (3)" -- confirm) are reduced to the first
# "<digits>–<digits>" pair, and multi-digit numbers are handled as well.
# Note the separator is an en dash, not an ASCII hyphen.
score_parts = data['Score'].str.extract(r'(\d+)–(\d+)')
data['Score'] = score_parts[0] + '–' + score_parts[1]
data['home_goals'] = score_parts[0].astype(int)
data['away_goals'] = score_parts[1].astype(int)

In [4]:
def train_and_predict(data, alpha_coeff, max_goal_diff=9):
    '''Train a Poisson Bradley-Terry model and produce predictions

    Every match is split into two training rows (one per side) and goals are
    modelled with a Poisson GLM, ``goals ~ home + defense + offense``, fitted
    with ``fit_regularized(alpha=alpha_coeff, L1_wt=0)``. The fitted model then
    predicts expected goals for every ordered pair of distinct teams, and the
    outcome probabilities are taken from the Skellam distribution of the
    difference between the home and away goal counts.

    Args:
        data (pandas df): dataframe with cols 'Home', 'Away', 'home_goals', 'away_goals'
        alpha_coeff: penalty weight, passed through as ``alpha`` to
            ``fit_regularized``
        max_goal_diff (int): largest absolute goal difference included when
            summing the win probabilities. The default of 9 reproduces the
            previously hard-coded range.
    Returns:
        pred (pandas df): dataframe with cols 'Home', 'Away',
            'prob_home_win', 'prob_away_win', 'prob_draw'
    '''
    # Long format: one row per (attacking team, defending team, goals scored).
    home_rows = data[['Home', 'Away', 'home_goals']].assign(home=1).rename(
        columns={'Home': 'offense', 'Away': 'defense', 'home_goals': 'goals'}
    )
    away_rows = data[['Away', 'Home', 'away_goals']].assign(home=0).rename(
        columns={'Away': 'offense', 'Home': 'defense', 'away_goals': 'goals'}
    )
    goal_model_data = pd.concat(objs=[home_rows, away_rows])

    poisson_model = smf.glm(formula="goals ~ home + defense + offense", data=goal_model_data,
                            family=sm.families.Poisson()).fit_regularized(alpha=alpha_coeff, L1_wt=0)

    # Use every team seen in either column. A team that only played away in
    # `data` is still part of the fitted model and should get predictions too.
    all_teams = np.unique(pd.concat([data['Home'], data['Away']]))

    pred_data = pd.DataFrame(product([1, 0], all_teams, all_teams),
        columns=['home', 'offense', 'defense']
    ).query('offense != defense')

    pred_raw = pred_data.assign(pred_goals=poisson_model.predict(exog=pred_data))

    # home == 1 rows: the attacking side is the home team.
    pred_home = pred_raw.query('home == 1').rename(
        columns={'offense': 'Home', 'defense': 'Away', 'pred_goals': 'pred_goals_home'}
    ).loc[:, ['Home', 'Away', 'pred_goals_home']]

    # home == 0 rows: the attacking side is the away team, so the defending
    # side is the home team of that fixture.
    pred_away = pred_raw.query('home == 0').rename(
        columns={'defense': 'Home', 'offense': 'Away', 'pred_goals': 'pred_goals_away'}
    ).loc[:, ['Home', 'Away', 'pred_goals_away']]

    fixtures = pd.merge(pred_home, pred_away, on=['Home', 'Away'])
    mu_home = fixtures['pred_goals_home'].to_numpy()
    mu_away = fixtures['pred_goals_away'].to_numpy()

    # Column vector of positive margins 1..max_goal_diff; broadcasting against
    # the per-fixture means evaluates the pmf for every (margin, fixture) pair
    # in one call instead of a Python loop per row.
    margins = np.arange(1, max_goal_diff + 1).reshape(-1, 1)

    pred = fixtures.assign(
        prob_home_win=skellam.pmf(margins, mu_home, mu_away).sum(axis=0),
        prob_away_win=skellam.pmf(-margins, mu_home, mu_away).sum(axis=0),
        prob_draw=skellam.pmf(0, mu_home, mu_away),
    ).loc[:, ['Home', 'Away', 'prob_home_win', 'prob_away_win', 'prob_draw']]

    return pred

In [5]:
def columnselect(row):
    '''Return the predicted probability of the result that actually happened.

    Args:
        row: mapping-like record (e.g. a dataframe row) with 'home_goals',
            'away_goals', 'prob_home_win', 'prob_away_win' and 'prob_draw'
    Returns:
        row['prob_home_win'] if the home side scored more, row['prob_away_win']
        if the away side scored more, otherwise row['prob_draw']
    '''
    goals_home = row['home_goals']
    goals_away = row['away_goals']

    if goals_home > goals_away:
        outcome_column = 'prob_home_win'
    elif goals_home < goals_away:
        outcome_column = 'prob_away_win'
    else:
        outcome_column = 'prob_draw'

    return row[outcome_column]

In [7]:
date_format = '%m/%d/%Y'
# Turn each 'Date' string into a datetime.date so the column can be compared
# against the date cutoffs built with the same format further down.
data['Date'] = [datetime.strptime(raw_date, date_format).date() for raw_date in data['Date']]

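The cell below prints the average log probability that the model assigns to the actual outcomes of matches played on or after September 1, 2022, using a model trained only on matches played after June 1 and before September 1, 2022.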



In [31]:
# Cutoff dates: training uses matches strictly after June 1 and strictly before
# September 1; everything from September 1 onward is held out for evaluation.
date0601 = datetime.strptime('06/01/2022', date_format).date()
date0901 = datetime.strptime('09/01/2022', date_format).date()

match_dates = data['Date']
recent_traindata = data.loc[(match_dates>date0601) & (match_dates<date0901)]
real_test = data.loc[match_dates>=date0901]

# Fit on the recent window only. (The alternative of training on every match
# before September 1 is explored in the next cell.)
real_pred = train_and_predict(recent_traindata, 0.9)

# Attach the predicted probabilities to each held-out match, pick the
# probability of the outcome that occurred, and take its log.
real_cv_data = (
    real_test
    .merge(real_pred, on=['Home', 'Away'], how='left')
    .assign(prob=lambda matches: matches.apply(columnselect, axis=1))
    .assign(log_prob=lambda matches: matches['prob'].apply(math.log))
    .loc[:, ['Date', 'Home', 'Away', 'log_prob']]
)
print(np.mean(real_cv_data['log_prob']))

-0.9991142977097732


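The cell below attempts to improve the previous model by adding older data (matches played on or before June 1) to the training set, and fails. The recent matches are repeated `rep` times so that the older matches carry progressively less weight, yet we never reach a better average log probability than the one printed above. Any data older than June is useless in our Skellam model.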

In [33]:
# The held-out matches and the full pre-September history do not depend on
# `rep`, so they are built once instead of on every iteration.
real_test = data.loc[data['Date']>=date0901]
pre_september_data = data.loc[data['Date']<date0901]

for rep in range(0, 1000, 100):
    # Stack `rep` copies of the recent window on top of the full history:
    # rep == 0 trains on the full history alone, and larger values give the
    # recent matches more weight relative to the older ones.
    weighted_traindata = pd.concat([recent_traindata] * rep + [pre_september_data])
    real_pred = train_and_predict(weighted_traindata, 0.9)

    # Calculate the log of the predicted probability for the outcome that occurred
    real_cv_data = pd.merge(real_test, real_pred, on = ['Home', 'Away'], how='left')

    real_cv_data['prob'] = real_cv_data.apply(columnselect, axis=1)
    real_cv_data['log_prob'] = real_cv_data['prob'].apply(math.log)
    real_cv_data = real_cv_data[['Date', 'Home', 'Away', 'log_prob']]
    print(np.mean(real_cv_data['log_prob']))


-1.0199872683642905
-0.99940234821033
-0.9992593228125243
-0.9992112053603819
-0.9991870628336343
-0.9991725503893171
-0.99916286418027
-0.9991559399429677
-0.9991507437449694
-0.9991467004666126
