In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.metrics import brier_score_loss
from scipy.special import logit, expit
from tqdm import tqdm
import itertools


  import pandas.util.testing as tm


In [2]:
df = pd.read_csv('hw4.csv', parse_dates = ['Date'])
df = df.sort_values('GameID')
df_prior = pd.read_csv('hw4_prior.csv')
df.head()

Unnamed: 0,Div,Date,Y,Team_Home,Team_Away,G_Home,G_Away,S_Home,S_Away,pH,pD,pA,xG_Home,xG_Away,GameID,GD_Home,GD_Away,SD_Home,SD_Away,xGD_Home,xGD_Away
0,Ligue_1,2014-08-08,14,Reims,Paris SG,2,2,9,16,0.089841,0.196675,0.713484,1.36787,2.65538,0,0,0,-7,7,-1.28751,1.28751
1,Ligue_1,2014-08-09,14,Montpellier,Bordeaux,0,1,15,7,0.386037,0.317102,0.296861,1.01756,0.750184,1,-1,1,8,-8,0.267376,-0.267376
2,Ligue_1,2014-08-09,14,Lille,Metz,0,0,14,2,0.636756,0.235893,0.127351,1.54468,0.057137,2,0,0,12,-12,1.487543,-1.487543
3,Ligue_1,2014-08-09,14,Guingamp,St Etienne,0,2,6,7,0.282701,0.312602,0.404697,0.63294,1.49532,3,-2,2,-1,1,-0.86238,0.86238
4,Ligue_1,2014-08-09,14,Evian Thonon Gaillard,Caen,0,3,10,12,0.399734,0.297363,0.302904,0.813737,1.23869,4,-3,3,-2,2,-0.424953,0.424953


# 1b.

In [3]:
result = smf.glm('GD_Home ~ I(logit(pH) - logit(pA)) - 1', data=df[df.Y < 18]).fit()
result.summary()

0,1,2,3
Dep. Variable:,GD_Home,No. Observations:,7304.0
Model:,GLM,Df Residuals:,7303.0
Model Family:,Gaussian,Df Model:,0.0
Link Function:,identity,Scale:,2.5139
Method:,IRLS,Log-Likelihood:,-13730.0
Date:,"Wed, 10 Mar 2021",Deviance:,18359.0
Time:,22:11:02,Pearson chi2:,18400.0
No. Iterations:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
I(logit(pH) - logit(pA)),0.4897,0.009,53.405,0.000,0.472,0.508


## i.

In [4]:
result = smf.glm('GD_Home ~ SD_Home - 1', data=df[df.Y < 18]).fit()
result.summary()

0,1,2,3
Dep. Variable:,GD_Home,No. Observations:,7304.0
Model:,GLM,Df Residuals:,7303.0
Model Family:,Gaussian,Df Model:,0.0
Link Function:,identity,Scale:,3.0262
Method:,IRLS,Log-Likelihood:,-14407.0
Date:,"Wed, 10 Mar 2021",Deviance:,22101.0
Time:,22:11:02,Pearson chi2:,22100.0
No. Iterations:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
SD_Home,0.0819,0.002,33.659,0.000,0.077,0.087


## ii.

In [5]:
result = smf.glm('GD_Home ~ xGD_Home - 1', data=df[df.Y < 18]).fit()
result.summary()

0,1,2,3
Dep. Variable:,GD_Home,No. Observations:,7304.0
Model:,GLM,Df Residuals:,7303.0
Model Family:,Gaussian,Df Model:,0.0
Link Function:,identity,Scale:,1.7748
Method:,IRLS,Log-Likelihood:,-12459.0
Date:,"Wed, 10 Mar 2021",Deviance:,12962.0
Time:,22:11:02,Pearson chi2:,13000.0
No. Iterations:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
xGD_Home,0.9938,0.012,84.147,0.000,0.971,1.017


# 1c.

In [6]:
def dfl_calc(df, df_prior, ratings_dict, mkt_wt, goal_wt, SD_wt, xGD_wt, y_start = 15, y_end = 18, hfa_prior = 0.3739, lmp_to_goal = 0.489739, SD_to_goal = 0.0819, xGD_to_goal = 0.9938, team_prior_wt = 15, hfa_prior_wt = 500, wt_decay = 0.97):
  for div in tqdm(sorted(df.Div.unique())) : #Process each division
    for y in sorted(df.loc[df.Y.between(y_start, y_end)].Y.unique()) : #Process each year separately
      df_dy = df.loc[(df.Y == y) & (df.Div == div)].copy()
      n_games = len(df_dy)
      teams = sorted(set(df_dy.Team_Home.unique()) | set(df_dy.Team_Away.unique())) #All teams
      n_teams = len(teams)
      team_map = {k:i for i,k in enumerate(teams)} #team -> index
      
      n_rows = 4*n_games + n_teams + 1 #n_teams team priors, 1 hfa prior, 2 rows per game
      X = np.zeros((n_rows, n_teams + 1)) 
      Y = np.zeros(n_rows) 
      wgts = goal_wt*np.ones(n_rows) #by default, set all weights to goal_wt
      
      #Setup priors
      X[:n_teams+1,:n_teams+1] = np.eye(n_teams+1) #rows for priors
      df_prior_dy = df_prior.loc[(df_prior.Y == y) & (df_prior.Div == div)].copy()
      #Setup team priors
      wgts[:n_teams] = team_prior_wt
      for i in range(len(df_prior_dy)):
        team = df_prior_dy.Team.iloc[i]
        prior = df_prior_dy.priorGD.iloc[i]
        Y[team_map[team]] = prior
      #Setup hfa prior
      Y[n_teams] = hfa_prior
      wgts[n_teams] = hfa_prior_wt
      
      #Game rows are an alternating pattern of goal differentials, and market prices
      wgts[n_teams+2::4] = mkt_wt
      wgts[n_teams+3::4] = SD_wt
      wgts[n_teams+4::4] = xGD_wt
      #Starting ratings are just priors
      ratings = Y[:n_teams]

      prev_date = None
      ratings_home = np.empty(n_games)
      ratings_away = np.empty(n_games)

      #Process every game
      for i in range(n_games) : #dataframe is sorted chronologically
        curr_date = df_dy.Date.iloc[i]
        #Refit on all strictly earlier games if first game of new date
        if prev_date is not None and curr_date > prev_date :
          rating_model = sm.WLS(Y, X, weights = wgts).fit()
          ratings = rating_model.params[:n_teams]
          ratings = ratings - np.mean(ratings) #Center ratings around 0            
          #Decay weights of all games and priors using elapsed days
          wgts[:n_teams+1+4*i] *= wt_decay**((curr_date-prev_date)/pd.Timedelta(1,unit='day'))
        prev_date = curr_date
        
        home, away = df_dy.Team_Home.iloc[i], df_dy.Team_Away.iloc[i]
        i_home, i_away = team_map[home], team_map[away]
        ratings_home[i] = ratings[i_home]
        ratings_away[i] = ratings[i_away]
        
        X[n_teams+1+4*i:n_teams+1+4*i+4] = 1.0*(np.arange(n_teams+1)==i_home)-1.0*(np.arange(n_teams+1)==i_away)
        X[n_teams+1+4*i:n_teams+1+4*i+4, -1] = 1.0 #HFA
        Y[n_teams+1+4*i] = df_dy.GD_Home.iloc[i] #Goal differential
        Y[n_teams+1+4*i+1] = lmp_to_goal * (logit(df_dy.pH.iloc[i])-logit(df_dy.pA.iloc[i])) #market info
        Y[n_teams+1+4*i+2] = SD_to_goal * (df_dy.SD_Home.iloc[i]) #SD info
        Y[n_teams+1+4*i+3] = xGD_to_goal * (df_dy.xGD_Home.iloc[i]) #xGD info

      if y == 17:
        endDate = curr_date + pd.Timedelta(1, unit='day')
        rating_model = sm.WLS(Y, X, weights=wgts).fit()
        final_ratings = rating_model.params[:n_teams]
        final_ratings -= np.mean(final_ratings)

        for team in teams:
          ratings_dict[team] = final_ratings[team_map[team]] 

      df_dy['R_Home'] = ratings_home
      df_dy['R_Away'] = ratings_away
      data.append(df_dy)
  
  df_ratings = pd.concat(data).sort_values('GameID')
  return df_ratings

In [7]:
mkt_wt = 10 #Weight of market prices
goal_wt = 1 #Weight of goal differentials
SD_wt = 1
xGD_wt = 1

In [8]:
data = []
ratings_dict = {}
df_ratings = dfl_calc(df, df_prior, ratings_dict, mkt_wt, goal_wt, SD_wt, xGD_wt)
df_ratings.head()

100%|██████████| 5/5 [00:12<00:00,  2.57s/it]


Unnamed: 0,Div,Date,Y,Team_Home,Team_Away,G_Home,G_Away,S_Home,S_Away,pH,pD,pA,xG_Home,xG_Away,GameID,GD_Home,GD_Away,SD_Home,SD_Away,xGD_Home,xGD_Away,R_Home,R_Away
1826,Ligue_1,2015-08-07,15,Lille,Paris SG,0,1,12,7,0.147474,0.259055,0.593471,0.914879,1.33457,1826,-1,1,5,-5,-0.419691,0.419691,0.005618,1.066845
1827,EPL,2015-08-08,15,Everton,Watford,2,2,10,11,0.577077,0.248362,0.174561,0.604226,0.557892,1827,0,0,-1,1,0.046334,-0.046334,-0.056671,-0.50939
1828,EPL,2015-08-08,15,Bournemouth,Aston Villa,0,1,11,7,0.50228,0.268341,0.229378,0.876106,0.782253,1828,-1,1,4,-4,0.093853,-0.093853,-0.50939,-0.610355
1829,Ligue_1,2015-08-08,15,Nice,Monaco,1,2,5,19,0.235607,0.288583,0.47581,0.459874,2.81413,1829,-1,1,-14,14,-2.354256,2.354256,-0.225083,0.559302
1830,Ligue_1,2015-08-08,15,Troyes,Ajaccio GFCO,0,0,12,6,0.480552,0.29924,0.220208,0.394458,1.05977,1830,0,0,6,-6,-0.665312,0.665312,-0.516311,-0.516311


In [9]:
df_1c = df_prior.copy()
df_1c['R_Final'] = df_prior.Team.map(ratings_dict)
df_1c[(df_1c.Y == 17) & (df_1c.Div == 'EPL')].sort_values('R_Final', ascending=False).reset_index(drop=True)

Unnamed: 0,Div,Y,Team,priorGD,R_Final
0,EPL,17,Man City,0.894973,1.452859
1,EPL,17,Liverpool,0.779622,1.002248
2,EPL,17,Tottenham,1.333306,0.940018
3,EPL,17,Chelsea,1.148744,0.794985
4,EPL,17,Man United,0.52585,0.712594
5,EPL,17,Arsenal,0.710411,0.435926
6,EPL,17,Southampton,-0.212395,-0.03573
7,EPL,17,Crystal Palace,-0.350815,-0.061418
8,EPL,17,Leicester,-0.396956,-0.22942
9,EPL,17,Newcastle,-0.549762,-0.249825


# 1d.

In [10]:
df_ratings['homeWin'] = 1*(df_ratings.GD_Home > 0)
df_ratings['RD_Home'] = df_ratings.R_Home - df_ratings.R_Away

In [11]:
train, test = df_ratings[df_ratings.Y < 18], df_ratings[df_ratings.Y == 18]

In [12]:
result = smf.logit('homeWin ~ RD_Home + 1', data=train).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.610079
         Iterations 5


0,1,2,3
Dep. Variable:,homeWin,No. Observations:,5478.0
Model:,Logit,Df Residuals:,5476.0
Method:,MLE,Df Model:,1.0
Date:,"Wed, 10 Mar 2021",Pseudo R-squ.:,0.116
Time:,22:11:15,Log-Likelihood:,-3342.0
converged:,True,LL-Null:,-3780.6
Covariance Type:,nonrobust,LLR p-value:,8.687000000000001e-193

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.1823,0.029,-6.194,0.000,-0.240,-0.125
RD_Home,1.0673,0.041,26.020,0.000,0.987,1.148


In [13]:
y_pred = result.predict(test)
brier_score_loss(test['homeWin'], y_pred)

0.2117308896755449

# 1e.

In [14]:
result = smf.logit('pH ~ RD_Home + 1', data=train).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.555208
         Iterations 5


0,1,2,3
Dep. Variable:,pH,No. Observations:,5478.0
Model:,Logit,Df Residuals:,5476.0
Method:,MLE,Df Model:,1.0
Date:,"Wed, 10 Mar 2021",Pseudo R-squ.:,0.164
Time:,22:11:15,Log-Likelihood:,-3041.4
converged:,True,LL-Null:,-3638.1
Covariance Type:,nonrobust,LLR p-value:,1.6489999999999999e-261

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.2217,0.029,-7.537,0.000,-0.279,-0.164
RD_Home,1.0465,0.041,25.704,0.000,0.967,1.126


In [15]:
y_pred = result.predict(test)
brier_score_loss(test['homeWin'], y_pred)

0.21151645081406056

# 1f.

In [16]:
mkt_wt = [10, 12, 15] #Weight of market prices
goal_wt = [1.0, 1.2, 1.4] #Weight of goal differentials
SD_wt = [1.0, 1.2, 1.4]
xGD_wt = [1.0, 1.2, 1.4]

brier_scores = {}

In [None]:
data = []
ratings_dict = {}
for item in itertools.product(mkt_wt, goal_wt, SD_wt, xGD_wt):
  MKT, GD, SD, xGD = item
  df_ratings = dfl_calc(df, df_prior, ratings_dict, MKT, GD, SD, xGD)

  df_ratings['homeWin'] = 1*(df_ratings.GD_Home > 0)
  df_ratings['RD_Home'] = df_ratings.R_Home - df_ratings.R_Away
  train, test = df_ratings[df_ratings.Y < 18], df_ratings[df_ratings.Y == 18]

  result = smf.logit('pH ~ RD_Home + 1', data=train).fit()
  y_pred = result.predict(test)
  brier_scores[item] = brier_score_loss(test['homeWin'], y_pred)



In [18]:
best_wts = min(brier_scores, key=brier_scores.get)
print(min(brier_scores.values()))
best_wts

0.2114973374302251


(15, 1.4, 1.4, 1.4)

## iv.

In [19]:
df_long = pd.wide_to_long(df, ['Team', 'G', 'S', 'xG', 'GD', 'SD', 'xGD'], 'GameID', 'isHome', '_', r'\w+').reset_index()
df_long['isHome'] =  1*(df_long['isHome'] == 'Home')

In [20]:
df_output = df_long[df_long.Y < 18].groupby(['Y', 'Team'])['GD'].mean().reset_index()
df_output['GD_Prev'] = np.NaN

for i in range(len(df_output)):
  team = df_output.loc[i, 'Team']
  year = df_output.loc[i, 'Y']

  GD_Prev = df_output.loc[(df_output.Team == team) & (df_output.Y == year - 1)]['GD'].values 
  if len(GD_Prev) > 0:
    df_output.loc[i, 'GD_Prev'] = GD_Prev

promoted_prior = df_output[(df_output.Y.between(15, 18)) & (df_output.isna().any(axis=1))].GD.mean()

In [21]:
df_output = df_long.groupby(['Y', 'Team'])['GD'].mean().reset_index()
df_output['GD_Prev'] = np.NaN

for i in range(len(df_output)):
  team = df_output.loc[i, 'Team']
  year = df_output.loc[i, 'Y']

  GD_Prev = df_output.loc[(df_output.Team == team) & (df_output.Y == year - 1)]['GD'].values 
  if len(GD_Prev) > 0:
    df_output.loc[i, 'GD_Prev'] = GD_Prev

df_output = df_output.merge(df_long[['Y', 'Div', 'Team']], on=['Y', 'Team']).drop_duplicates()

In [22]:
train = df_output[df_output.Y < 18].dropna()
result = smf.glm('GD ~ GD_Prev', data=train).fit()

npt = df_output[df_output.Y > 14].dropna()
y_pred = result.predict(npt)
npt['newPrior'] = y_pred

In [23]:
df_prior2 = df_output[df_output.Y > 14].merge(npt[['Y', 'Team', 'newPrior']], how='left').fillna(promoted_prior)
df_prior2['priorGD'] = df_prior2.newPrior - df_prior2.groupby(['Y', 'Div']).newPrior.transform('mean')

## iii.

In [24]:
mkt_wt, goal_wt, SD_wt, xGD_wt = best_wts
df_ratings = dfl_calc(df, df_prior, ratings_dict, mkt_wt, goal_wt, SD_wt, xGD_wt, y_start=15, y_end=19)
df_ratings['RD_Home'] = df_ratings.R_Home - df_ratings.R_Away
df_ratings['homeWin'] = 1*(df_ratings.GD_Home > 0)

100%|██████████| 5/5 [00:14<00:00,  3.00s/it]


In [26]:
train, test = df_ratings[df_ratings.Y < 18], df_ratings[df_ratings.Y == 18]

result = smf.logit('pH ~ RD_Home + 1', data=train).fit()
y_pred = result.predict(test)
brier_score_loss(test['homeWin'], y_pred)

Optimization terminated successfully.
         Current function value: 0.555222
         Iterations 5


0.21149671499649395

## iv.

In [27]:
df_ratings = dfl_calc(df, df_prior2[df_prior2.Y < 20], ratings_dict, mkt_wt, goal_wt, SD_wt, xGD_wt, y_start=15, y_end=20)
df_ratings['RD_Home'] = df_ratings.R_Home - df_ratings.R_Away
df_ratings['homeWin'] = 1*(df_ratings.GD_Home > 0)


100%|██████████| 5/5 [00:16<00:00,  3.39s/it]


In [28]:
df_ratings_pre = df_ratings[df_ratings.Date < '2020-05-16']
df_ratings_cov = df_ratings[df_ratings.Date >= '2020-05-16']

train, test = df_ratings_pre[df_ratings_pre.Y < 19], df_ratings_pre[df_ratings_pre.Y == 19]

result = smf.logit('pH ~ RD_Home + 1', data=train).fit()
y_pred = result.predict(test)
brier_score_loss(test['homeWin'], y_pred)

Optimization terminated successfully.
         Current function value: 0.555513
         Iterations 5


0.22166854887908394

# 1g.

In [29]:
df_ratings = dfl_calc(df, df_prior2, ratings_dict, mkt_wt, goal_wt, SD_wt, xGD_wt, y_start=15, y_end=21)
df_ratings['RD_Home'] = df_ratings.R_Home - df_ratings.R_Away
df_ratings['homeWin'] = 1*(df_ratings.GD_Home > 0)

train, test = df_ratings[df_ratings.Y < 20], df_ratings[df_ratings.Y == 20]

result = smf.logit('pH ~ RD_Home + 1', data=train).fit()
y_pred = result.predict(test)
brier_score_loss(test['homeWin'], y_pred)

100%|██████████| 5/5 [00:16<00:00,  3.35s/it]


Optimization terminated successfully.
         Current function value: 0.555566
         Iterations 5


0.20883878390720106

In [30]:
hfa_cov_prior = df_ratings_cov[df_ratings_cov.Y == 19].GD_Home.mean()

df_ratings = dfl_calc(df, df_prior2, ratings_dict, mkt_wt, goal_wt, SD_wt, xGD_wt, y_start=15, y_end=21, hfa_prior=hfa_cov_prior)
df_ratings['RD_Home'] = df_ratings.R_Home - df_ratings.R_Away
df_ratings['homeWin'] = 1*(df_ratings.GD_Home > 0)

train, test = df_ratings[df_ratings.Y < 20], df_ratings[df_ratings.Y == 20]

result = smf.logit('pH ~ RD_Home + 1', data=train).fit()
y_pred = result.predict(test)
brier_score_loss(test['homeWin'], y_pred)

100%|██████████| 5/5 [00:16<00:00,  3.39s/it]


Optimization terminated successfully.
         Current function value: 0.555579
         Iterations 5


0.20834603200007581