# Ratings & Market-Implied Probabilities
Amanda Kuznecov (anr431)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from scipy.stats import poisson
import random
import scipy.stats as st
from scipy.special import logit, expit
import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.metrics import brier_score_loss
from tqdm import tqdm
pd.set_option('display.max_columns', None)

In [2]:
#read in data
df = pd.read_csv('soccer18m.csv', index_col = False, 
                 parse_dates = ['Date'])

#sort by date to update ratings chronologically
df = df.sort_values('Date')

#reset index
df = df.reset_index(drop=True)

#include Game ID to differentiate games
df.loc[:,'Game_ID'] = df.index

#rename columns
df = df.rename({'HomeTeam': 'Team_Home','AwayTeam': 'Team_Away',
                'FTHG': 'FTG_Home','FTAG': 'FTG_Away',
                'HTHG': 'HTG_Home','HTAG': 'HTG_Away',
               'HS': 'S_Home','AS': 'S_Away', 'HST':'ST_Home',
               'AST':'ST_Away','home_xG':'xG_Home',
                'away_xG':'xG_Away'}, axis=1)

#### Add features from Hwk2 model:
 - Historical Average Goal Differential
 - Historical Average Expected Goal Differential
 - Historical Average Shots on Target Differential
 - Winning Streak

In [3]:
#create y (home team wins)
y = np.where(df.FTG_Home > df.FTG_Away,1,0)
df.loc[:,'y'] = y

In [4]:
#AVG GOAL DIFF and NUM_GAMES (modified to be seasonal num_games)

#create copy to analyze goal differentials
df_gd = df.copy()

#calculate goal differentials for home and away teams of each match
df_gd.loc[:,'GD_Home'] = df_gd.loc[:,'FTG_Home']-df_gd.loc[:,'FTG_Away']
df_gd.loc[:,'GD_Away'] = df_gd.loc[:,'FTG_Away']-df_gd.loc[:,'FTG_Home']

#keep only columns for goal differntial stats
df_gd = df_gd[['Date','Game_ID','Y','Team_Home','Team_Away','GD_Home','GD_Away']]

#pivot table to get team from each game on separate row, and include all goal differentials in one column
df_long = pd.wide_to_long(df_gd, ['Team','GD'], i = ['Game_ID','Y'], 
                          j = 'isHome', sep = '_', suffix = r'\w+')

#reindex to get rid of multi-index
df_long = df_long.reset_index().sort_values(['Date','Game_ID','Y']) 

#number of games previously played per team
df_long.loc[:,'Num_Games'] = df_long.groupby(['Team','Y']).cumcount()

#cumulative avg of goal differential per team for all preceding games
df_long = df_long.assign(GDcumAvg = df_long.groupby('Team')['GD'].transform(lambda x: x.expanding().mean().shift(1, fill_value = 0)))

#get home and away goal diff cumavg for each game
df_pivot = df_long.pivot(index='Game_ID', columns = 'isHome',values = ['GDcumAvg','Num_Games'])

#rename columns and drop hierarchical level
df_pivot.columns = pd.MultiIndex.from_tuples([
    ('GDcumAvg', 'GDcumAvg_Away'), ('GDcumAvg', 'GDcumAvg_Home'), ('Num_Games', 'Num_Games_Away'),('Num_Games', 'Num_Games_Home')
])

#remove top level of hierarchical column naming
df_pivot = df_pivot.droplevel(0,axis=1)

#change num games type to int
df_pivot.loc[:,'Num_Games_Home'] = df_pivot.loc[:,'Num_Games_Home'].astype(int)
df_pivot.loc[:,'Num_Games_Away'] = df_pivot.loc[:,'Num_Games_Away'].astype(int)

#merge goal diff cum avg table with df containing all info
df = df.merge(df_pivot, on = ['Game_ID'])

In [5]:
#AVG EXPECTED GOALS DIFF

df = df.sort_values(['Date','Game_ID'])

#create copy to analyze goal differentials
df_xG = df.copy()

#calculate difference in expected goals for home and away teams of each match
df_xG.loc[:,'xGDiff_Home'] = df_xG.loc[:,'xG_Home']-df_xG.loc[:,'xG_Away']
df_xG.loc[:,'xGDiff_Away'] = df_xG.loc[:,'xG_Away']-df_xG.loc[:,'xG_Home']

#keep only columns for difference in expected goals
df_xG = df_xG[['Date','Game_ID','Team_Home','Team_Away','xGDiff_Home','xGDiff_Away']]

#pivot table to get team from each game on separate row, and include all difference in xG in one column
df_long = pd.wide_to_long(df_xG, ['Team','xGDiff'], i = ['Game_ID'], 
                          j = 'isHome', sep = '_', suffix = r'\w+')

#reindex to get rid of multi-index
df_long = df_long.reset_index().sort_values(['Date','Game_ID']) 

#cumulative avg of goal differential per team for all preceding games
df_long = df_long.assign(xGDiff_cumAvg = df_long.groupby('Team')['xGDiff'].transform(lambda x: x.expanding().mean().shift(1, fill_value = 0)))

#get home and away goal diff cumavg for each game
df_pivot = df_long.pivot(index='Game_ID', columns = 'isHome',values = 'xGDiff_cumAvg')

df_pivot = df_pivot.rename(columns = {'Home':'xGDiff_cumAvg_Home','Away':'xGDiff_cumAvg_Away'})

#merge goal diff cum avg table with df containing all info
df = df.merge(df_pivot, on = ['Game_ID'])

In [6]:
#SHOTS ON TARGET

#create copy to analyze shot on target differentials
df_st = df.copy()

#calculate goal differentials for home and away teams of each match
df_st.loc[:,'STDiff_Home'] = df_st.loc[:,'ST_Home']-df_st.loc[:,'ST_Away']
df_st.loc[:,'STDiff_Away'] = df_st.loc[:,'ST_Away']-df_st.loc[:,'ST_Home']

#keep only columns for goal differntial stats
df_st = df_st[['Date','Game_ID','Team_Home','Team_Away','STDiff_Home','STDiff_Away']]

#pivot table to get team from each game on separate row, and include all shot on target differentials in one column
df_long = pd.wide_to_long(df_st, ['Team','STDiff'], i = ['Game_ID'], 
                          j = 'isHome', sep = '_', suffix = r'\w+')

#reindex to get rid of multi-index
df_long = df_long.reset_index().sort_values(['Date','Game_ID']) 

#cumulative avg of shot on target differential per team for all preceding games
df_long = df_long.assign(STDiffcumAvg = df_long.groupby('Team')['STDiff'].transform(lambda x: x.expanding().mean().shift(1, fill_value = 0)))


#get home and away shots on target diff cumavg for each game
df_pivot = df_long.pivot(index='Game_ID', columns = 'isHome',values = 'STDiffcumAvg')

df_pivot = df_pivot.rename(columns = {'Home':'STDiff_cumAvg_Home','Away':'STDiff_cumAvg_Away'})

#merge shots on target diff cum avg table with df containing all info
df = df.merge(df_pivot, on = ['Game_ID'])

In [7]:
# WIN / LOSE STREAK

def home(v):
    if v.FTG_Home > v.FTG_Away:
        v = 1
    else:
        v = 0 
    return v

df['Win_Home'] = df.apply(home, axis = 1)
df['Win_Away'] = 1-df['Win_Home']

#create copy of dataframe to analyze team wins
df_wins = df.copy()

#keep columns to calculate winning percentage
df_wins = df_wins[['Date','Game_ID','Y','Team_Home','Team_Away','Num_Games_Home','Num_Games_Away','Win_Home','Win_Away']]

#pivot table to get team from each game on separate row, and include all win flags in one column
df_long = pd.wide_to_long(df_wins, ['Team','Num_Games','Win'], i = ['Game_ID'], 
                          j = 'isHome', sep = '_', suffix = r'\w+')

#reindex to get rid of multi-index
df_long = df_long.reset_index().sort_values(['Date','Game_ID']) 

#sort by team first
df_long = df_long.sort_values(['Team','Date','Game_ID'])

#create streak grouped by wins or losses per team
df_long['Streak'] = df_long['Win'].groupby((df_long['Win']!=df_long.groupby(['Team','Y'])['Win'].shift()).cumsum()).cumcount()+1

#shift binary wins column down and set win to 0 when team has not played yet
df_long['Win_shift'] = df_long['Win'].shift(1,fill_value=0)
df_long['Win_shift'] = np.where(df_long.Num_Games == 0, 0, df_long.Win_shift)

#shift streaks column down and set streak to 0 when team has not played yet
df_long['Streak_shift'] = df_long['Streak'].shift(1,fill_value=0)
df_long['Streak_shift'] = np.where(df_long.Num_Games == 0, 0, df_long.Streak_shift)

#win streak based on num wins in a row
df_long['Win_streak'] = df_long['Win_shift']*df_long['Streak_shift']
df_long['Win_streak'] = np.where(df_long.Num_Games == 0, 0, df_long.Win_streak)

#create loss shift (opposite of win shift)
df_long['Loss_shift'] = 1-df_long['Win_shift']
df_long['Loss_shift'] = np.where(df_long.Num_Games == 0, 0, df_long.Loss_shift)

#loss streak based on num losses in a row
df_long['Loss_streak'] = df_long['Loss_shift']*df_long['Streak_shift']

#drop columns not needed
df_long = df_long.drop(columns = ['Num_Games','Win','Streak','Win_shift','Streak_shift','Loss_shift'])

#get home and away streaks for each game
df_pivot = df_long.pivot(index='Game_ID', columns = 'isHome',values = ['Win_streak','Loss_streak'])

#rename columns and drop hierarchical level
df_pivot.columns = pd.MultiIndex.from_tuples([
    ('Win_streak', 'Win_streak_Away'), ('Win_streak', 'Win_streak_Home'), 
    ('Loss_streak', 'Loss_streak_Away'),('Loss_streak', 'Loss_streak_Home')
])

#remove top level of hierarchical column naming
df_pivot = df_pivot.droplevel(0,axis=1)

#merge streaks with df containing all info
df = df.merge(df_pivot, on = ['Game_ID'])

## Question 1: Elo Ratings
### Part a

In [8]:
def elo(abs_GD, W, eloF, eloA, isHome, K=40, HFA = 100):
    G = 1 if abs_GD <= 1 else(1.5 if abs_GD == 2 else (11+abs_GD)/8)
    dr = (eloF - eloA) + (HFA if isHome else -HFA)
    We = 1/(1+10**(-dr/400))
    return eloF + K*G*(W-We)

In [9]:
#create column for absolute goal differential of game
df.loc[:,'abs_GD'] = abs(df.FTG_Home - df.FTG_Away)

In [10]:
def home_result(df):
    '''This function sets points for win, draw, loss outcome of game'''
    if df.FTG_Home > df.FTG_Away:
        val = 1
    elif df.FTG_Home < df.FTG_Away:
        val = 0
    else:
        val = 0.5
    return val

In [11]:
#create W per team
df.loc[:,'W_Home'] = df.apply(home_result, axis = 1)
df.loc[:,'W_Away'] = 1-df.W_Home

In [12]:
df_sm = df[['Date','Game_ID','Y','Div','Team_Home','Team_Away','W_Home','W_Away','abs_GD','Num_Games_Home','Num_Games_Away']]

#pivot table to get team from each game on separate row, and include all ratings in one column
df_long = pd.wide_to_long(df_sm, ['Team','W','Num_Games'], i = ['Game_ID','abs_GD','Y','Div'], 
                          j = 'isHome', sep = '_', suffix = r'\w+')

#reindex to get rid of multi-index
df_long = df_long.reset_index().sort_values(['Date','Game_ID','Y','Div']) 

df_long.loc[:,'elo'] = 1000

In [13]:
for i in tqdm(range(len(df))):
    
    Team_Home = df_long.loc[(df_long['Game_ID'] == i) & (df_long.isHome == 'Home')].Team.item()
    Team_Away = df_long.loc[(df_long['Game_ID'] == i) & (df_long.isHome == 'Away')].Team.item()
    
    abs_GD = df_long.loc[(df_long.Game_ID == i)][:1].abs_GD.item()
    
    W_Home = df_long.loc[(df_long['Game_ID'] == i) & (df_long.Team == Team_Home)].W.item()
    W_Away = df_long.loc[(df_long['Game_ID'] == i) & (df_long.Team == Team_Away)].W.item()
    
    elo_Home_old = df_long.loc[(df_long['Game_ID'] == i) & (df_long.Team == Team_Home)].elo.item()
    elo_Away_old = df_long.loc[(df_long['Game_ID'] == i) & (df_long.Team == Team_Away)].elo.item()
    
    elo_Home = elo(abs_GD, W_Home, elo_Home_old, elo_Away_old,1)
    elo_Away = elo(abs_GD, W_Away, elo_Away_old, elo_Home_old,0)
    
    Home_updates = ((df_long.Team == Team_Home) & (df_long.Game_ID >= i))
    df_long.loc[Home_updates, 'elo'] = elo_Home
    
    Away_updates = ((df_long.Team == Team_Away) & (df_long.Game_ID >= i))
    df_long.loc[Away_updates, 'elo'] = elo_Away


100%|██████████| 9130/9130 [04:27<00:00, 34.10it/s]


In [14]:
divisions = df_long.loc[:,'Div'].unique()
top_teams = pd.DataFrame()

#get top of table for each division at end of 2017 season
for div in divisions:
    
    #find last games for each team in div
    max_games = df_long.loc[df_long.Div == div]['Num_Games'].max()
    
    #filter table to include last games for each team from specific div for 2017 season
    df_scratch = df_long.loc[(df_long.Div == div) & (df_long.Y == 17) &(df_long.Num_Games == max_games)]
    df_scratch = df_scratch.sort_values('elo', ascending = False)
    df_scratch = df_scratch[:3]
    
    top_teams = pd.concat([top_teams,df_scratch])

In [15]:
top_teams = top_teams.sort_values(['Div','elo'], ascending = (True, False))
top_teams = top_teams[['Div','Team','elo']]
top_teams

Unnamed: 0,Div,Team,elo
14476,Bundesliga,Bayern Munich,1350.621424
14492,Bundesliga,Schalke 04,1159.177933
14498,Bundesliga,Hoffenheim,1142.152808
14527,EPL,Man City,1429.6594
14530,EPL,Tottenham,1283.911192
14522,EPL,Man United,1258.73246
14600,La_Liga,Barcelona,1415.462495
14557,La_Liga,Real Madrid,1306.832652
14598,La_Liga,Ath Madrid,1220.575107
14569,Ligue_1,Paris SG,1352.520538


### Part b

In some instances, it may be a good idea to temporarily use a higher value of $K$ if we would like to place more weighting on certain games. Some games played throughout the season are of more importance than others; for example, games played at the end of the season will be more important, especially for some teams who are on the cusp of relegation as they rely on winning at the end of the season to stay within the league. Although it is not the case for European football leagues, some American sports participate in playoffs. It could be beneficial to place higher weighting on playoff games or games played at the end of the season to qualify for the playoffs. These games are higher stake than other games during the regular season.

### Part c

In [16]:
#shift all elo down so they become pre-game elo
df_elo = df_long[['Game_ID','Team','isHome','elo']].copy()

#shift all elo ratings down 1 per team and set initial elo to 1000
df_elo.loc[:,'elo'] = df_elo.groupby('Team')['elo'].transform(lambda x: x.shift(1,fill_value = 1000))

#get home and away elo for each game
df_pivot = df_elo.pivot(index='Game_ID', columns = 'isHome',values = 'elo')
df_pivot = df_pivot.rename(columns = {'Home':'elo_Home','Away':'elo_Away'})

#merge elo table with df containing all info
df = df.merge(df_pivot, on = ['Game_ID'])

#create new feature for elo diff
df.loc[:,'elo_Diff'] = df.loc[:,'elo_Home']-df.loc[:,'elo_Away']

In [20]:
#build logit model (before adding elo)

train = df.loc[df.Y < 18]
test = df.loc[df.Y == 18]

y_train = train.y
X_train = train[['GDcumAvg_Home','GDcumAvg_Away','xGDiff_cumAvg_Home','xGDiff_cumAvg_Away',
                'STDiff_cumAvg_Home','STDiff_cumAvg_Away','Win_streak_Home','Win_streak_Away']]
X_train = sm.add_constant(X_train)

y_test = test.y
X_test = test[['GDcumAvg_Home','GDcumAvg_Away','xGDiff_cumAvg_Home','xGDiff_cumAvg_Away',
              'STDiff_cumAvg_Home','STDiff_cumAvg_Away','Win_streak_Home','Win_streak_Away']]
X_test = sm.add_constant(X_test)

In [21]:
result = sm.Logit(y_train, X_train).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.623577
         Iterations 5


0,1,2,3
Dep. Variable:,y,No. Observations:,7304.0
Model:,Logit,Df Residuals:,7295.0
Method:,MLE,Df Model:,8.0
Date:,"Tue, 23 Feb 2021",Pseudo R-squ.:,0.09584
Time:,10:40:26,Log-Likelihood:,-4554.6
converged:,True,LL-Null:,-5037.4
Covariance Type:,nonrobust,LLR p-value:,3.9589999999999996e-203

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.2171,0.034,-6.317,0.000,-0.285,-0.150
GDcumAvg_Home,0.1515,0.093,1.632,0.103,-0.030,0.333
GDcumAvg_Away,-0.2043,0.097,-2.109,0.035,-0.394,-0.014
xGDiff_cumAvg_Home,0.5545,0.136,4.069,0.000,0.287,0.822
xGDiff_cumAvg_Away,-0.5962,0.142,-4.193,0.000,-0.875,-0.318
STDiff_cumAvg_Home,0.0879,0.042,2.068,0.039,0.005,0.171
STDiff_cumAvg_Away,-0.0628,0.043,-1.445,0.149,-0.148,0.022
Win_streak_Home,0.0480,0.015,3.171,0.002,0.018,0.078
Win_streak_Away,-0.0139,0.015,-0.916,0.360,-0.044,0.016


In [22]:
y_pred = result.predict(X_test)
brier_score_loss(y_test, y_pred)

0.215536478836543

In [26]:
#build logit model (with new elo feature)
train = df.loc[df.Y < 18]
test = df.loc[df.Y == 18]

y_train = train.y
X_train = train[['GDcumAvg_Home','GDcumAvg_Away','xGDiff_cumAvg_Home','xGDiff_cumAvg_Away',
                'STDiff_cumAvg_Home','STDiff_cumAvg_Away','Win_streak_Home','Win_streak_Away',
                'elo_Diff']]
X_train = sm.add_constant(X_train)

y_test = test.y
X_test = test[['GDcumAvg_Home','GDcumAvg_Away','xGDiff_cumAvg_Home','xGDiff_cumAvg_Away',
              'STDiff_cumAvg_Home','STDiff_cumAvg_Away','Win_streak_Home','Win_streak_Away',
              'elo_Diff']]
X_test = sm.add_constant(X_test)

In [27]:
result = sm.Logit(y_train, X_train).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.620714
         Iterations 5


0,1,2,3
Dep. Variable:,y,No. Observations:,7304.0
Model:,Logit,Df Residuals:,7294.0
Method:,MLE,Df Model:,9.0
Date:,"Tue, 23 Feb 2021",Pseudo R-squ.:,0.09999
Time:,10:40:54,Log-Likelihood:,-4533.7
converged:,True,LL-Null:,-5037.4
Covariance Type:,nonrobust,LLR p-value:,4.312e-211

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.2218,0.035,-6.427,0.000,-0.289,-0.154
GDcumAvg_Home,-0.0539,0.096,-0.562,0.574,-0.242,0.134
GDcumAvg_Away,0.0271,0.102,0.266,0.790,-0.172,0.226
xGDiff_cumAvg_Home,0.4758,0.134,3.542,0.000,0.213,0.739
xGDiff_cumAvg_Away,-0.5391,0.140,-3.845,0.000,-0.814,-0.264
STDiff_cumAvg_Home,0.0785,0.042,1.875,0.061,-0.004,0.160
STDiff_cumAvg_Away,-0.0566,0.043,-1.320,0.187,-0.141,0.027
Win_streak_Home,0.0273,0.015,1.763,0.078,-0.003,0.058
Win_streak_Away,0.0066,0.015,0.428,0.669,-0.024,0.037


In [28]:
y_pred = result.predict(X_test)
brier_score_loss(y_test, y_pred)

0.21415653455273995

## Question 2: Market Implied Probabilities
### Part a

In [29]:
#limit to all seasons before 2018
df_pre18 = df.loc[df.Y<18]

In [30]:
#filter if underdog won
df_underdog = df_pre18.loc[((df_pre18.FTG_Home - df_pre18.FTG_Away > 0) & (df_pre18.pH < df_pre18.pA))|((df_pre18.FTG_Away - df_pre18.FTG_Home > 0) & (df_pre18.pA < df_pre18.pH))]

#create new column for p of underdog
df_underdog = df_underdog.assign(p_underdog=lambda x: x[['pH', 'pA']].min(1))
df_underdog = df_underdog.sort_values('p_underdog')
df_out = df_underdog[['Div','Y','Team_Home','Team_Away','pH','pA','FTG_Home','FTG_Away']][:7]
df_out

Unnamed: 0,Div,Y,Team_Home,Team_Away,pH,pA,FTG_Home,FTG_Away
3779,La_Liga,16,Barcelona,Alaves,0.891147,0.028831,1,2
1169,La_Liga,14,Barcelona,Malaga,0.875453,0.040021,0,1
465,La_Liga,14,Barcelona,Celta,0.861781,0.043664,0,1
3133,Bundesliga,15,Bayern Munich,Mainz,0.85692,0.044404,1,2
5714,La_Liga,17,Real Madrid,Betis,0.876513,0.048646,0,1
3593,La_Liga,15,Levante,Ath Madrid,0.052018,0.798875,2,1
1381,Bundesliga,14,Bayern Munich,M'gladbach,0.821218,0.054292,0,2


### Part b

In [31]:
#filter for teams having played less than 5 games in the season
df_less5 = df_pre18.loc[(df_pre18.Num_Games_Away < 5) & (df_pre18.Num_Games_Home <5)]

In [32]:
#Brier score when each team has fewer than 5 games played that season
brier_score_loss(df_less5.y, df_less5.pH)

0.21058076780385268

In [33]:
#Brier score on all games
brier_score_loss(df_pre18.y, df_pre18.pH)

0.2106061594649744

### Part c

In [34]:
#rename columns for easy pivot
df = df.rename({'pH': 'p_Home','pA': 'p_Away',}, axis=1)

#use columns needed for p (prob of team winning)
df_p = df[['Date','Game_ID','Y','Team_Home','Team_Away','p_Home','p_Away']]

In [35]:
#pivot table to get team from each game on separate row, and include all p in one column
df_long = pd.wide_to_long(df_p, ['Team','p'], i = ['Game_ID','Y'], 
                          j = 'isHome', sep = '_', suffix = r'\w+')

#reindex to get rid of multi-index
df_long = df_long.reset_index().sort_values(['Date','Game_ID','Y'])

#cumulative avg of prob of win per team for all preceding games of the season
df_long = df_long.assign(p_cumAvg = df_long.groupby(['Team','Y'])['p'].transform(lambda x: x.expanding().mean().shift(1, fill_value = 0.5)))


In [37]:
#get home and away p cumavg for each game
df_pivot = df_long.pivot(index='Game_ID', columns = 'isHome',values = 'p_cumAvg')
df_pivot = df_pivot.rename(columns = {'Home':'p_cumAvg_Home','Away':'p_cumAvg_Away'})

#merge p cum avg table with df containing all info
df = df.merge(df_pivot, on = ['Game_ID'])

In [38]:
#build logit model (with new p feature)

train = df.loc[df.Y < 18]
test = df.loc[df.Y == 18]

y_train = train.y
X_train = train[['GDcumAvg_Home','GDcumAvg_Away','xGDiff_cumAvg_Home','xGDiff_cumAvg_Away',
                'STDiff_cumAvg_Home','STDiff_cumAvg_Away','Win_streak_Home','Win_streak_Away',
                'p_cumAvg_Home','p_cumAvg_Away']]
X_train = sm.add_constant(X_train)

y_test = test.y
X_test = test[['GDcumAvg_Home','GDcumAvg_Away','xGDiff_cumAvg_Home','xGDiff_cumAvg_Away',
              'STDiff_cumAvg_Home','STDiff_cumAvg_Away','Win_streak_Home','Win_streak_Away',
              'p_cumAvg_Home','p_cumAvg_Away']]
X_test = sm.add_constant(X_test)

In [39]:
result = sm.Logit(y_train, X_train).fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.618720
         Iterations 5


0,1,2,3
Dep. Variable:,y,No. Observations:,7304.0
Model:,Logit,Df Residuals:,7293.0
Method:,MLE,Df Model:,10.0
Date:,"Tue, 23 Feb 2021",Pseudo R-squ.:,0.1029
Time:,10:41:32,Log-Likelihood:,-4519.1
converged:,True,LL-Null:,-5037.4
Covariance Type:,nonrobust,LLR p-value:,2.488e-216

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.2900,0.211,-1.378,0.168,-0.703,0.123
GDcumAvg_Home,0.0499,0.094,0.529,0.597,-0.135,0.235
GDcumAvg_Away,-0.0888,0.099,-0.900,0.368,-0.282,0.105
xGDiff_cumAvg_Home,0.2952,0.142,2.083,0.037,0.017,0.573
xGDiff_cumAvg_Away,-0.3751,0.146,-2.562,0.010,-0.662,-0.088
STDiff_cumAvg_Home,0.0157,0.044,0.356,0.722,-0.071,0.102
STDiff_cumAvg_Away,-0.0016,0.045,-0.037,0.971,-0.090,0.086
Win_streak_Home,0.0405,0.015,2.642,0.008,0.010,0.070
Win_streak_Away,-0.0089,0.015,-0.580,0.562,-0.039,0.021


In [40]:
y_pred = result.predict(X_test)
brier_score_loss(y_test, y_pred)

0.214648014979171