In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.metrics import brier_score_loss
from scipy.special import logit, expit
from tqdm import tqdm
pd.options.display.max_columns = 100

In [2]:
# Game-level data: parse the Date column as datetimes and order the rows by GameID.
df = pd.read_csv('hw4.csv', parse_dates = ['Date']).sort_values('GameID')
# Per-team prior goal-differential ratings, keyed by division (Div) and season (Y).
df_prior = pd.read_csv('hw4_prior.csv')

In [3]:
# Preview the first five games.
df.head(5)

Unnamed: 0,Div,Date,Y,Team_Home,Team_Away,G_Home,G_Away,S_Home,S_Away,pH,pD,pA,xG_Home,xG_Away,GameID,GD_Home,GD_Away,SD_Home,SD_Away,xGD_Home,xGD_Away
0,Ligue_1,2014-08-08,14,Reims,Paris SG,2,2,9,16,0.089841,0.196675,0.713484,1.36787,2.65538,0,0,0,-7,7,-1.28751,1.28751
1,Ligue_1,2014-08-09,14,Montpellier,Bordeaux,0,1,15,7,0.386037,0.317102,0.296861,1.01756,0.750184,1,-1,1,8,-8,0.267376,-0.267376
2,Ligue_1,2014-08-09,14,Lille,Metz,0,0,14,2,0.636756,0.235893,0.127351,1.54468,0.057137,2,0,0,12,-12,1.487543,-1.487543
3,Ligue_1,2014-08-09,14,Guingamp,St Etienne,0,2,6,7,0.282701,0.312602,0.404697,0.63294,1.49532,3,-2,2,-1,1,-0.86238,0.86238
4,Ligue_1,2014-08-09,14,Evian Thonon Gaillard,Caen,0,3,10,12,0.399734,0.297363,0.302904,0.813737,1.23869,4,-3,3,-2,2,-0.424953,0.424953


In [4]:
# Preview the first five team priors.
df_prior.head(5)

Unnamed: 0,Div,Y,Team,priorGD
0,Bundesliga,15,Augsburg,-0.009032
1,Bundesliga,15,Bayern Munich,1.589594
2,Bundesliga,15,Darmstadt,-0.507891
3,Bundesliga,15,Dortmund,0.119889
4,Bundesliga,15,Ein Frankfurt,-0.163738


In [5]:
# --- Prior values ---
#Prior on home field advantage for goal differential
hfa_prior = 0.3739
#Conversion factor from differences of logit market probs to goals
lmp_to_goal = 0.489739

# --- Regression weights ---
#Weight on team priors from previous season
team_prior_wt = 15
#Weight on hfa prior (strong)
hfa_prior_wt = 500
#Weight of market prices
mkt_wt = 10
#Weight of goal differentials
goal_wt = 1

# --- Time decay ---
#Decay of weights per day, approx 0.81 per week
wt_decay = 0.97

In [6]:
# Walk-forward team ratings.
#
# For every (division, season) a weighted least-squares system is built whose
# unknowns are one rating per team plus one home-field-advantage (HFA) term.
# Row layout of X / Y / wgts:
#   rows [0, n_teams)            one prior row per team   (weight team_prior_wt)
#   row  n_teams                 HFA prior row            (weight hfa_prior_wt)
#   row  n_teams+1+2*i           game i goal differential (weight goal_wt)
#   row  n_teams+1+2*i+1         game i market-implied goal differential (weight mkt_wt)
# Each game is tagged with the ratings that were current BEFORE its date
# (columns R_Home / R_Away), and only then added to the system.
#
# NOTE: X, Y and wgts are left in the notebook namespace holding the arrays of
# the last (division, season) processed.
data = []
#Seasons to rate; hoisted out of the division loop because it does not depend on div
seasons = sorted(df.loc[df.Y.between(15,18)].Y.unique())
for div in tqdm(sorted(df.Div.unique())) : #Process each division
    for y in seasons : #Process each year separately
        df_dy = df.loc[(df.Y == y) & (df.Div == div)].copy()
        n_games = len(df_dy)
        teams = sorted(set(df_dy.Team_Home.unique()) | set(df_dy.Team_Away.unique())) #All teams
        n_teams = len(teams)
        team_map = {k:i for i,k in enumerate(teams)} #team -> index

        first_game_row = n_teams + 1 #Rows before this one hold the priors
        n_rows = 2*n_games + n_teams + 1 #n_teams team priors, 1 hfa prior, 2 rows per game
        X = np.zeros((n_rows, n_teams + 1))
        Y = np.zeros(n_rows)
        wgts = goal_wt*np.ones(n_rows) #by default, set all weights to goal_wt

        #Setup priors
        X[:n_teams+1,:n_teams+1] = np.eye(n_teams+1) #rows for priors
        df_prior_dy = df_prior.loc[(df_prior.Y == y) & (df_prior.Div == div)].copy()
        #Setup team priors. A team with no row in df_prior keeps a prior of 0 (Y starts
        #at zeros) but still carries team_prior_wt. A prior for a team that plays no
        #game in this division/season raises KeyError from team_map.
        wgts[:n_teams] = team_prior_wt
        for team, prior in zip(df_prior_dy.Team, df_prior_dy.priorGD) :
            Y[team_map[team]] = prior
        #Setup hfa prior
        Y[n_teams] = hfa_prior
        wgts[n_teams] = hfa_prior_wt

        #Game rows are an alternating pattern of goal differentials, and market prices
        wgts[n_teams+2::2] = mkt_wt

        #Starting ratings are just priors (copied so they never alias Y)
        ratings = Y[:n_teams].copy()

        #Pull the per-game columns out once instead of doing .iloc lookups per game
        dates = df_dy.Date.tolist()
        home_idx = [team_map[t] for t in df_dy.Team_Home]
        away_idx = [team_map[t] for t in df_dy.Team_Away]
        goal_diff = df_dy.GD_Home.to_numpy()
        #Market info: difference of logit home/away probabilities converted to goals
        mkt_goal_diff = lmp_to_goal * (logit(df_dy.pH.to_numpy()) - logit(df_dy.pA.to_numpy()))

        prev_date = None
        ratings_home = np.empty(n_games)
        ratings_away = np.empty(n_games)

        #Process every game
        #assumes GameID order (df was sorted by it) is chronological within a season - TODO confirm
        for i in range(n_games) :
            curr_date = dates[i]
            #Refit on all strictly earlier games if first game of new date
            if prev_date is not None and curr_date > prev_date :
                #Rows of games not yet processed are all-zero in X with Y == 0,
                #so they add nothing to the weighted residual sum
                rating_model = sm.WLS(Y, X, weights = wgts).fit()
                ratings = rating_model.params[:n_teams]
                ratings = ratings - np.mean(ratings) #Center ratings around 0
                #Decay weights of all games and priors using elapsed days
                elapsed_days = (curr_date-prev_date)/pd.Timedelta(1,unit='day')
                wgts[:first_game_row+2*i] *= wt_decay**elapsed_days
            prev_date = curr_date

            #Record the pre-game ratings before this game enters the system
            i_home, i_away = home_idx[i], away_idx[i]
            ratings_home[i] = ratings[i_home]
            ratings_away[i] = ratings[i_away]

            #Both rows of the game share the same design: +1 home, -1 away, +1 HFA
            row = first_game_row + 2*i
            X[row:row+2, i_home] += 1.0
            X[row:row+2, i_away] -= 1.0
            X[row:row+2, -1] = 1.0 #HFA
            Y[row] = goal_diff[i] #Goal differential
            Y[row+1] = mkt_goal_diff[i] #market info

        df_dy['R_Home'] = ratings_home
        df_dy['R_Away'] = ratings_away
        data.append(df_dy)

df_ratings = pd.concat(data).sort_values('GameID')
df_ratings.head()

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:02<00:00,  1.77it/s]


Unnamed: 0,Div,Date,Y,Team_Home,Team_Away,G_Home,G_Away,S_Home,S_Away,pH,pD,pA,xG_Home,xG_Away,GameID,GD_Home,GD_Away,SD_Home,SD_Away,xGD_Home,xGD_Away,R_Home,R_Away
1826,Ligue_1,2015-08-07,15,Lille,Paris SG,0,1,12,7,0.147474,0.259055,0.593471,0.914879,1.33457,1826,-1,1,5,-5,-0.419691,0.419691,0.005618,1.066845
1827,EPL,2015-08-08,15,Everton,Watford,2,2,10,11,0.577077,0.248362,0.174561,0.604226,0.557892,1827,0,0,-1,1,0.046334,-0.046334,-0.056671,-0.50939
1828,EPL,2015-08-08,15,Bournemouth,Aston Villa,0,1,11,7,0.50228,0.268341,0.229378,0.876106,0.782253,1828,-1,1,4,-4,0.093853,-0.093853,-0.50939,-0.610355
1829,Ligue_1,2015-08-08,15,Nice,Monaco,1,2,5,19,0.235607,0.288583,0.47581,0.459874,2.81413,1829,-1,1,-14,14,-2.354256,2.354256,-0.225083,0.559302
1830,Ligue_1,2015-08-08,15,Troyes,Ajaccio GFCO,0,0,12,6,0.480552,0.29924,0.220208,0.394458,1.05977,1830,0,0,6,-6,-0.665312,0.665312,-0.516311,-0.516311


In [7]:
# Design matrix left over from the last (division, season) handled by the rating loop.
pd.DataFrame(data=X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
776,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
777,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
779,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,1.0


In [8]:
# Target vector left over from the last (division, season) handled by the rating loop.
pd.DataFrame(data=Y)

Unnamed: 0,0
0,0.345903
1,-0.346201
2,-0.715324
3,-0.599973
4,-0.568219
...,...
776,1.966530
777,2.000000
778,-0.551900
779,2.000000


In [9]:
# Binary target: 1.0 when the home side scored more goals, 0.0 otherwise.
df_ratings['HomeWin'] = (df_ratings.G_Home > df_ratings.G_Away).astype(float)

# Seasons 15-17 are used for fitting; season 18 is held out for scoring.
is_fit_season = df_ratings.Y.between(15,17)
is_holdout_season = df_ratings.Y == 18
df57 = df_ratings.loc[is_fit_season]
df8 = df_ratings.loc[is_holdout_season]

# Logistic regression of home win on the pre-game rating difference.
logit_model = smf.logit('HomeWin ~ I(R_Home-R_Away)', df57).fit()

# Brier score of the predicted home-win probabilities on the held-out season.
holdout_probs = logit_model.predict(df8)
print('logit:',brier_score_loss(df8.HomeWin, holdout_probs))

Optimization terminated successfully.
         Current function value: 0.610744
         Iterations 5
logit: 0.2118995831126876
