# Homework 2: Soccer Mini-Project
Amanda Kuznecov (anr431)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from scipy.stats import poisson
import random
import scipy.stats as st
from scipy.special import logit, expit
import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.metrics import brier_score_loss

In [2]:
#load the match data; 'Date' is parsed into datetimes so games can be ordered in time
df = pd.read_csv('soccer18.csv', index_col=False, parse_dates=['Date'])

#tag every match with its original row position as a unique game identifier
df['Game_ID'] = df.index

#rename to a <stat>_<Home|Away> pattern so the columns can be reshaped with pd.wide_to_long later
df = df.rename(columns={
    'HomeTeam': 'Team_Home', 'AwayTeam': 'Team_Away',
    'FTHG': 'FTG_Home', 'FTAG': 'FTG_Away',
    'HTHG': 'HTG_Home', 'HTAG': 'HTG_Away',
    'HS': 'S_Home', 'AS': 'S_Away',
    'HST': 'ST_Home', 'AST': 'ST_Away',
    'home_xG': 'xG_Home', 'away_xG': 'xG_Away',
})

#chronological order, ties broken by game ID
df = df.sort_values(by=['Date', 'Game_ID'])

## Question 1: Average Goal Differentials

### Part 1a

In [3]:
#new frame (df itself is left untouched) with the goal differential seen from each side of the match
df_gd = df.assign(
    GD_Home=df['FTG_Home'] - df['FTG_Away'],
    GD_Away=df['FTG_Away'] - df['FTG_Home'],
)

#keep only the columns needed for the goal differential stats
df_gd = df_gd.loc[:, ['Date','Game_ID','Team_Home','Team_Away','GD_Home','GD_Away']]

In [4]:
#reshape to one row per (game, team): 'isHome' records the Home/Away suffix, 'Team' and 'GD' hold the values;
#then flatten the (Game_ID, isHome) index and order the rows chronologically
df_long = (
    pd.wide_to_long(df_gd, stubnames=['Team','GD'], i=['Game_ID'],
                    j='isHome', sep='_', suffix=r'\w+')
    .reset_index()
    .sort_values(['Date','Game_ID'])
)

#per-team running mean of goal differential, shifted one game so each row only sees earlier games
#(a team's first game gets 0)
df_long['GDcumAvg'] = df_long.groupby('Team')['GD'].transform(
    lambda gd: gd.expanding().mean().shift(1, fill_value = 0)
)

#number of games each team has played before the current one
df_long['Num_Games'] = df_long.groupby('Team').cumcount()

In [5]:
#one row per game, with separate Home/Away columns for the cumulative average and the games-played count
df_pivot = df_long.pivot(index='Game_ID', columns = 'isHome',values = ['GDcumAvg','Num_Games'])

#flatten the (statistic, side) column MultiIndex into '<statistic>_<side>' names;
#the names are built from the labels actually present, so they cannot be mismatched
#if the pivot ever returns its columns in a different order
df_pivot.columns = ['_'.join(pair) for pair in df_pivot.columns]

#game counts are whole numbers: cast with astype on the frame so the dtype really changes
#(assigning through .loc[:, col] can keep the column's existing dtype in recent pandas versions)
df_pivot = df_pivot.astype({'Num_Games_Home': int, 'Num_Games_Away': int})

In [6]:
#attach the per-game cumulative goal-diff averages and game counts to the main frame, matched on Game_ID
df = pd.merge(df, df_pivot, on='Game_ID')

In [7]:
#absolute gap between the two teams' cumulative average goal differentials
df['Abs_Disparity'] = (df['GDcumAvg_Away'] - df['GDcumAvg_Home']).abs()

### Part 1a.i

In [8]:
#games from before 2018 (Y < 18)
df_pre18 = df[df['Y'] < 18]

#columns to report, then the 7 games with the largest absolute disparity
df_out = df_pre18.loc[:, ['Div','Y','Team_Home','Team_Away','GDcumAvg_Home','GDcumAvg_Away','Abs_Disparity','Num_Games_Home','Num_Games_Away']]
df_out_i = df_out.sort_values(by='Abs_Disparity', ascending=False).head(7)
df_out_i

Unnamed: 0,Div,Y,Team_Home,Team_Away,GDcumAvg_Home,GDcumAvg_Away,Abs_Disparity,Num_Games_Home,Num_Games_Away
212,Serie_A,14,Sassuolo,Sampdoria,-3.5,1.0,4.5,2,2
31,Ligue_1,14,Evian Thonon Gaillard,Paris SG,-3.5,1.0,4.5,2,2
5507,Ligue_1,17,Strasbourg,Lille,-4.0,0.078261,4.078261,1,115
145,Serie_A,14,Empoli,Roma,-2.0,2.0,4.0,1,1
101,La_Liga,14,Elche,Granada,-3.0,1.0,4.0,1,1
210,Serie_A,14,Palermo,Inter,-0.5,3.5,4.0,2,2
82,La_Liga,14,Cordoba,Celta,-2.0,2.0,4.0,1,1


### Part 1a.ii

In [9]:
#keep games where both teams had already played more than 100 games
played_over_100 = (df_out['Num_Games_Home'] > 100) & (df_out['Num_Games_Away'] > 100)
df_out = df_out[played_over_100]

#the 7 largest absolute disparities among those games
df_out_ii = df_out.sort_values(by='Abs_Disparity', ascending=False).head(7)
df_out_ii

Unnamed: 0,Div,Y,Team_Home,Team_Away,GDcumAvg_Home,GDcumAvg_Away,Abs_Disparity,Num_Games_Home,Num_Games_Away
5055,La_Liga,16,Granada,Barcelona,-0.875,2.192308,3.067308,104,104
7265,La_Liga,17,Levante,Barcelona,-0.705357,2.14,2.845357,112,150
5325,La_Liga,16,Granada,Real Madrid,-0.936937,1.9,2.836937,111,110
6762,La_Liga,17,Las Palmas,Barcelona,-0.623762,2.208633,2.832395,101,139
7151,La_Liga,17,La Coruna,Barcelona,-0.621622,2.142857,2.764479,148,147
4962,La_Liga,16,La Coruna,Barcelona,-0.519608,2.22549,2.745098,102,102
6300,La_Liga,17,Barcelona,La Coruna,2.186047,-0.527132,2.713178,129,129


### Part 1a.iii

It is noticeable that every team (except Lille) in the 7 games from Part 1a.i had previously played only 1 or 2 games. It therefore makes sense that the absolute disparities are so large: if a team played extremely poorly or extremely well in its first couple of games, its average goal differential is "artificially" high or low because it is based on only one or two games. In the 2017 match between Strasbourg and Lille, Lille had already played 115 games in Ligue 1; however, this match was only Strasbourg's second in this division. It seems Strasbourg had been promoted from Ligue 2 at the end of the 2016 season. Although Strasbourg has a history of goal differentials, it is not captured in this dataset because Ligue 2 data is not included. Since Strasbourg lost by 4 in its first match, its average goal differential is artificially low, being based entirely on that one game. Therefore, the difference between the average goal differentials of Lille and Strasbourg is very large.

### Part 1b

In [10]:
#intercept-only logit: the single regressor is a column of ones

#train set: 1 when the home team won a pre-2018 game, 0 otherwise (draws and away wins)
y_train = (df_pre18['FTG_Home'] > df_pre18['FTG_Away']).astype(int).values
X_train = np.ones(len(y_train))

#test set: the Y == 18 games, encoded the same way
test = df[df['Y'] == 18]
y_test = (test['FTG_Home'] > test['FTG_Away']).astype(int).values
X_test = np.ones(len(y_test))

In [11]:
#maximum-likelihood fit of the intercept-only logit on the training games
result = sm.Logit(endog=y_train, exog=X_train).fit()

Optimization terminated successfully.
         Current function value: 0.689679
         Iterations 3


In [12]:
#coefficient table and fit statistics for the intercept-only model
result.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,7304.0
Model:,Logit,Df Residuals:,7303.0
Method:,MLE,Df Model:,0.0
Date:,"Wed, 17 Feb 2021",Pseudo R-squ.:,4.042e-12
Time:,00:50:07,Log-Likelihood:,-5037.4
converged:,True,LL-Null:,-5037.4
Covariance Type:,nonrobust,LLR p-value:,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.1669,0.023,-7.106,0.000,-0.213,-0.121


In [13]:
#make predictions for 2018 season (predicted home-win probability from the intercept-only model)
y_pred = result.predict(X_test)

#brier score to compare predicted outcomes for 2018 vs actual
#(mean squared difference between predicted probability and the 0/1 outcome; lower is better)
brier_score_loss(y_test, y_pred)

0.2473559477379797

### Part 1c

In [14]:
#count each outcome among the pre-2018 games
home_wins = int((y_train == 1).sum())
draws = int((df_pre18['FTG_Home'] == df_pre18['FTG_Away']).sum())
away_wins = int((df_pre18['FTG_Home'] < df_pre18['FTG_Away']).sum())
total_games = df_pre18.shape[0]

In [15]:
#home-win probability implied by the intercept-only logit: the inverse logit (expit) of the fitted
#constant, read from the fitted model rather than hand-copied from the summary table
prob_home = expit(np.asarray(result.params)[0])

#empirical frequencies of the other two outcomes in the pre-2018 games
prob_away = away_wins/total_games
prob_draw = draws/total_games

print('Home Team wins: '+str(prob_home))
print('Away Team wins: '+str(prob_away))
print('Draw: '+str(prob_draw))

Home Team wins: 0.4583715872365509
Away Team wins: 0.29312705366922237
Draw: 0.24849397590361447


In [16]:
#home-field advantage expressed as the ratio of home-win to away-win probability, rounded to 2 decimals
print('Home Field Advantage: '+str(round(prob_home/prob_away,2)))

Home Field Advantage: 1.56


Once we apply the inverse logit (expit) to the intercept coefficient, which converts the log-odds into a probability, we can see that the probability of the home team winning is 45.8%, leaving the remaining 54.2% for the away team winning or a draw. The probability of the away team winning is 29.3%, meaning that the home team wins 1.56 times as often as the away team, indicating that there is in fact a home-field advantage.

### Part 1d

In [17]:
#logit on the historical average goal differentials plus an intercept
#(y_train / y_test from the intercept-only model are reused)

#train features: both teams' cumulative average goal differential, with a constant column
X_train_new = sm.add_constant(df_pre18[['GDcumAvg_Home','GDcumAvg_Away']])

#test features built the same way from the Y == 18 games
X_test_new = sm.add_constant(test[['GDcumAvg_Home','GDcumAvg_Away']])

  return ptp(axis=axis, out=out, **kwargs)


In [18]:
#fit the goal-differential logit on the training games
result_new = sm.Logit(endog=y_train, exog=X_train_new).fit()

Optimization terminated successfully.
         Current function value: 0.630677
         Iterations 5


In [19]:
#coefficient table and fit statistics for the goal-differential model
result_new.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,7304.0
Model:,Logit,Df Residuals:,7301.0
Method:,MLE,Df Model:,2.0
Date:,"Wed, 17 Feb 2021",Pseudo R-squ.:,0.08555
Time:,00:50:07,Log-Likelihood:,-4606.5
converged:,True,LL-Null:,-5037.4
Covariance Type:,nonrobust,LLR p-value:,6.932999999999999e-188

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.1791,0.025,-7.183,0.000,-0.228,-0.130
GDcumAvg_Home,0.7853,0.039,20.128,0.000,0.709,0.862
GDcumAvg_Away,-0.7619,0.040,-19.082,0.000,-0.840,-0.684


In [20]:
#make predictions for 2018 season (home-win probabilities from the goal-differential model)
y_pred_new = result_new.predict(X_test_new)
#brier score to compare predicted outcomes for 2018 vs actual (lower is better)
brier_score_loss(y_test, y_pred_new)

0.2172610107529878

## Question 2: Model Building

### (i) Historical Average of Expected Goals Differential

In [21]:
#binary target for every game: 1 if the home team won, 0 for a draw or an away win
y = (df['FTG_Home'] > df['FTG_Away']).astype(int).values
df['y'] = y

In [22]:
#re-establish chronological order (ties broken by game ID)
df = df.sort_values(by=['Date','Game_ID'])

#separate copy for the expected-goals (xG) differentials
df_xG = df.copy()

In [23]:
#expected-goals differential of each match, seen from the home side and from the away side
df_xG['xGDiff_Home'] = df_xG['xG_Home'] - df_xG['xG_Away']
df_xG['xGDiff_Away'] = df_xG['xG_Away'] - df_xG['xG_Home']

In [24]:
#narrow the frame to the identifiers and the two xG differential columns
df_xG = df_xG.loc[:, ['Date','Game_ID','Team_Home','Team_Away','xGDiff_Home','xGDiff_Away']]

In [25]:
#reshape to one row per (game, team) with all xG differentials in one column,
#then flatten the (Game_ID, isHome) index and order the rows chronologically
df_long = (
    pd.wide_to_long(df_xG, stubnames=['Team','xGDiff'], i=['Game_ID'],
                    j='isHome', sep='_', suffix=r'\w+')
    .reset_index()
    .sort_values(['Date','Game_ID'])
)

#per-team running mean of the xG differential over earlier games only (a team's first game gets 0)
df_long['xGDiff_cumAvg'] = df_long.groupby('Team')['xGDiff'].transform(
    lambda xgd: xgd.expanding().mean().shift(1, fill_value = 0)
)


In [26]:
#one row per game with the home and away cumulative xG-differential averages side by side
df_pivot = (
    df_long.pivot(index='Game_ID', columns='isHome', values='xGDiff_cumAvg')
    .rename(columns={'Home':'xGDiff_cumAvg_Home','Away':'xGDiff_cumAvg_Away'})
)

In [27]:
#attach the cumulative xG-differential averages to the main frame, matched on Game_ID
df = pd.merge(df, df_pivot, on='Game_ID')

In [28]:
#build logit model on cumulative goal-differential and xG-differential averages

#split by season: Y < 18 for fitting, Y == 18 (the 2018 season) for evaluation
train = df.loc[df.Y < 18]
test = df.loc[df.Y == 18]

#train set: target, features and a constant column for the intercept
y_train = train.y
X_train = train[['GDcumAvg_Home','GDcumAvg_Away','xGDiff_cumAvg_Home','xGDiff_cumAvg_Away']]
X_train = sm.add_constant(X_train)

#test set: same columns as the train set (the split above is reused, not recomputed)
y_test = test.y
X_test = test[['GDcumAvg_Home','GDcumAvg_Away','xGDiff_cumAvg_Home','xGDiff_cumAvg_Away']]
X_test = sm.add_constant(X_test)

  return ptp(axis=axis, out=out, **kwargs)


In [29]:
#fit the goal-diff + xG-diff logit on the training games
result = sm.Logit(endog=y_train, exog=X_train).fit()

Optimization terminated successfully.
         Current function value: 0.624789
         Iterations 5


In [30]:
#coefficient table and fit statistics for the goal-diff + xG-diff model
result.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,7304.0
Model:,Logit,Df Residuals:,7299.0
Method:,MLE,Df Model:,4.0
Date:,"Wed, 17 Feb 2021",Pseudo R-squ.:,0.09409
Time:,00:50:08,Log-Likelihood:,-4563.5
converged:,True,LL-Null:,-5037.4
Covariance Type:,nonrobust,LLR p-value:,6.955e-204

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.1770,0.025,-7.032,0.000,-0.226,-0.128
GDcumAvg_Home,0.2495,0.088,2.829,0.005,0.077,0.422
GDcumAvg_Away,-0.2537,0.092,-2.755,0.006,-0.434,-0.073
xGDiff_cumAvg_Home,0.7367,0.110,6.678,0.000,0.520,0.953
xGDiff_cumAvg_Away,-0.7129,0.115,-6.184,0.000,-0.939,-0.487


In [31]:
#make predictions for 2018 season (home-win probabilities from the goal-diff + xG-diff model)
y_pred = result.predict(X_test)

#brier score to compare predicted outcomes for 2018 vs actual (lower is better)
brier_score_loss(y_test, y_pred)

0.21603225494350248

### (ii) Rolling Win Percentage

In [32]:
#outcome flag from the home team's point of view
def home(v):
    """Return 1 if the row's FTG_Home exceeds its FTG_Away (home win), otherwise 0.

    v is a row-like object exposing FTG_Home and FTG_Away attributes,
    e.g. a DataFrame row passed by df.apply(..., axis=1).
    """
    return 1 if v.FTG_Home > v.FTG_Away else 0

In [33]:
#home-win flag for every game (1 = home team scored more goals, 0 = draw or away win);
#a vectorized comparison replaces the per-row Python call
df['Win_Home'] = (df['FTG_Home'] > df['FTG_Away']).astype('int64')

In [34]:
#outcome flag from the away team's point of view
def away(v):
    """Return 1 if the row's FTG_Away exceeds its FTG_Home (away win), otherwise 0.

    v is a row-like object exposing FTG_Home and FTG_Away attributes,
    e.g. a DataFrame row passed by df.apply(..., axis=1).
    """
    return 1 if v.FTG_Home < v.FTG_Away else 0

In [35]:
#away-win flag for every game (1 = away team scored more goals, 0 = draw or home win);
#a vectorized comparison replaces the per-row Python call
df['Win_Away'] = (df['FTG_Home'] < df['FTG_Away']).astype('int64')

In [36]:
#independent copy holding only the columns needed to compute winning percentages
df_wins = df[['Date','Game_ID','Team_Home','Team_Away','Num_Games_Home','Num_Games_Away','Win_Home','Win_Away']].copy()

In [37]:
#reshape to one row per (game, team) carrying that team's prior game count and win flag,
#then flatten the (Game_ID, isHome) index and order the rows chronologically
df_long = (
    pd.wide_to_long(df_wins, stubnames=['Team','Num_Games','Win'], i=['Game_ID'],
                    j='isHome', sep='_', suffix=r'\w+')
    .reset_index()
    .sort_values(['Date','Game_ID'])
)

#wins accumulated by each team before the current game (running total shifted down one game, 0 for the first)
df_long['Win_sum'] = df_long.groupby('Team')['Win'].transform(
    lambda wins: wins.cumsum().shift(1, fill_value = 0)
)

In [38]:
#win percentage going into a game; a team with no prior games starts at 100% (perfect record)
def win_pct(v):
    """Return v.Win_sum / v.Num_Games, or 1 when v.Num_Games is 0.

    v is a row-like object exposing Num_Games (games played so far) and
    Win_sum (wins so far) attributes.
    """
    if v.Num_Games == 0:
        return 1
    return v.Win_sum/v.Num_Games

In [39]:
#running win percentage for every team-game row (win_pct returns 1 when a team has no prior games)
df_long['Win_pct'] = df_long.apply(win_pct, axis=1)

#renumber the rows and sort on date, game id; drop=True discards the old index
#instead of inserting it as a stray 'index' column
df_long = df_long.reset_index(drop=True).sort_values(['Date','Game_ID'])

In [40]:
#one row per game with the home and away running win percentages side by side
df_pivot = (
    df_long.pivot(index='Game_ID', columns='isHome', values='Win_pct')
    .rename(columns={'Home':'Winpct_Home','Away':'Winpct_Away'})
)

In [41]:
#attach the running win percentages to the main frame, matched on Game_ID
df = pd.merge(df, df_pivot, on='Game_ID')

In [42]:
#logit on goal-diff, xG-diff and win-percentage features
#split by season: Y < 18 for fitting, Y == 18 (the 2018 season) for evaluation
train = df[df['Y'] < 18]
test = df[df['Y'] == 18]

#target and design matrix (features plus a constant column) for the training games
y_train = train['y']
X_train = sm.add_constant(
    train[['GDcumAvg_Home','GDcumAvg_Away','xGDiff_cumAvg_Home','xGDiff_cumAvg_Away','Winpct_Home','Winpct_Away']]
)

#same construction for the test games
y_test = test['y']
X_test = sm.add_constant(
    test[['GDcumAvg_Home','GDcumAvg_Away','xGDiff_cumAvg_Home','xGDiff_cumAvg_Away','Winpct_Home','Winpct_Away']]
)

  return ptp(axis=axis, out=out, **kwargs)


In [43]:
#fit the logit with the win-percentage features added
result = sm.Logit(endog=y_train, exog=X_train).fit()

Optimization terminated successfully.
         Current function value: 0.624639
         Iterations 5


In [44]:
#coefficient table and fit statistics for the model with win-percentage features
result.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,7304.0
Model:,Logit,Df Residuals:,7297.0
Method:,MLE,Df Model:,6.0
Date:,"Wed, 17 Feb 2021",Pseudo R-squ.:,0.0943
Time:,00:50:10,Log-Likelihood:,-4562.4
converged:,True,LL-Null:,-5037.4
Covariance Type:,nonrobust,LLR p-value:,5.518e-202

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.0211,0.133,-0.159,0.874,-0.281,0.239
GDcumAvg_Home,0.3470,0.114,3.057,0.002,0.125,0.569
GDcumAvg_Away,-0.2630,0.116,-2.269,0.023,-0.490,-0.036
xGDiff_cumAvg_Home,0.7375,0.110,6.686,0.000,0.521,0.954
xGDiff_cumAvg_Away,-0.7109,0.115,-6.169,0.000,-0.937,-0.485
Winpct_Home,-0.4524,0.327,-1.382,0.167,-1.094,0.189
Winpct_Away,0.0363,0.322,0.113,0.910,-0.594,0.667


In [45]:
#predicted home-win probabilities for the Y == 18 games, scored with the brier score (lower is better)
y_pred = result.predict(X_test)
brier_score_loss(y_test, y_pred)

0.21591723312493333

### (iii) Historical Average Shots on Target Differential

In [46]:
#new frame (df itself is left untouched) with the shots-on-target differential seen from each side
df_st = df.assign(
    STDiff_Home=df['ST_Home'] - df['ST_Away'],
    STDiff_Away=df['ST_Away'] - df['ST_Home'],
)

#keep only the columns needed for the shots-on-target differential stats
df_st = df_st.loc[:, ['Date','Game_ID','Team_Home','Team_Away','STDiff_Home','STDiff_Away']]

In [47]:
#reshape to one row per (game, team) with all shots-on-target differentials in one column,
#then flatten the (Game_ID, isHome) index and order the rows chronologically
df_long = (
    pd.wide_to_long(df_st, stubnames=['Team','STDiff'], i=['Game_ID'],
                    j='isHome', sep='_', suffix=r'\w+')
    .reset_index()
    .sort_values(['Date','Game_ID'])
)

#per-team running mean of the shots-on-target differential over earlier games only (first game gets 0)
df_long['STDiffcumAvg'] = df_long.groupby('Team')['STDiff'].transform(
    lambda std: std.expanding().mean().shift(1, fill_value = 0)
)

In [48]:
#one row per game with the home and away cumulative shots-on-target differential averages side by side
df_pivot = (
    df_long.pivot(index='Game_ID', columns='isHome', values='STDiffcumAvg')
    .rename(columns={'Home':'STDiff_cumAvg_Home','Away':'STDiff_cumAvg_Away'})
)

In [49]:
#attach the cumulative shots-on-target differential averages to the main frame, matched on Game_ID
df = pd.merge(df, df_pivot, on='Game_ID')

In [50]:
#logit on goal-diff, xG-diff and shots-on-target-diff features (the win-percentage columns are not used here)
#split by season: Y < 18 for fitting, Y == 18 (the 2018 season) for evaluation
train = df[df['Y'] < 18]
test = df[df['Y'] == 18]

#target and design matrix (features plus a constant column) for the training games
y_train = train['y']
X_train = sm.add_constant(
    train[['GDcumAvg_Home','GDcumAvg_Away','xGDiff_cumAvg_Home','xGDiff_cumAvg_Away',
           'STDiff_cumAvg_Home','STDiff_cumAvg_Away']]
)

#same construction for the test games
y_test = test['y']
X_test = sm.add_constant(
    test[['GDcumAvg_Home','GDcumAvg_Away','xGDiff_cumAvg_Home','xGDiff_cumAvg_Away',
          'STDiff_cumAvg_Home','STDiff_cumAvg_Away']]
)

  return ptp(axis=axis, out=out, **kwargs)


In [51]:
#fit the logit with the shots-on-target features added
result = sm.Logit(endog=y_train, exog=X_train).fit()

Optimization terminated successfully.
         Current function value: 0.624339
         Iterations 5


In [52]:
#coefficient table and fit statistics for the model with shots-on-target features
result.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,7304.0
Model:,Logit,Df Residuals:,7297.0
Method:,MLE,Df Model:,6.0
Date:,"Wed, 17 Feb 2021",Pseudo R-squ.:,0.09474
Time:,00:50:10,Log-Likelihood:,-4560.2
converged:,True,LL-Null:,-5037.4
Covariance Type:,nonrobust,LLR p-value:,6.231e-203

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.1767,0.025,-7.015,0.000,-0.226,-0.127
GDcumAvg_Home,0.1938,0.092,2.107,0.035,0.014,0.374
GDcumAvg_Away,-0.2112,0.096,-2.207,0.027,-0.399,-0.024
xGDiff_cumAvg_Home,0.5711,0.136,4.187,0.000,0.304,0.838
xGDiff_cumAvg_Away,-0.5949,0.142,-4.182,0.000,-0.874,-0.316
STDiff_cumAvg_Home,0.0878,0.043,2.065,0.039,0.004,0.171
STDiff_cumAvg_Away,-0.0646,0.044,-1.484,0.138,-0.150,0.021


In [53]:
#predicted home-win probabilities for the Y == 18 games, scored with the brier score (lower is better)
y_pred = result.predict(X_test)
brier_score_loss(y_test, y_pred)

0.2158086295964073

### (iv) Win/Lose Streak

In [54]:
#reshape df_wins to one row per (game, team) carrying that team's prior game count and win flag
df_long = pd.wide_to_long(
    df_wins,
    stubnames=['Team','Num_Games','Win'],
    i=['Game_ID'],
    j='isHome',
    sep='_',
    suffix=r'\w+',
)

In [55]:
#flatten the (Game_ID, isHome) index and order rows by team, then chronologically within each team,
#so consecutive rows belong to the same team's consecutive games
df_long = df_long.reset_index().sort_values(['Team','Date','Game_ID'])

In [56]:
#length of the current run of identical results per team, counted up to and including each game;
#a new run starts whenever the Win flag differs from the same team's previous game
#(the per-team shift yields NaN on a team's first game, which always starts a new run)
run_id = (df_long['Win'] != df_long.groupby(['Team'])['Win'].shift()).cumsum()
df_long['Streak'] = df_long['Win'].groupby(run_id).cumcount()+1

#result of each team's previous game; shifting within the team means a team's first game gets 0
#and never inherits a value from the neighbouring team's last row
df_long['Win_shift'] = df_long.groupby('Team')['Win'].shift(1, fill_value=0)

#streak length going into each game (0 when the team has not played yet)
df_long['Streak_shift'] = df_long.groupby('Team')['Streak'].shift(1, fill_value=0)

#win streak based on num wins in a row before this game (0 if the previous game was not a win)
df_long['Win_streak'] = df_long['Win_shift']*df_long['Streak_shift']

#create loss shift (opposite of win shift); Win is 0 for draws as well as losses,
#so this marks any previous game that was not a win
df_long['Loss_shift'] = np.where(df_long.Win_shift == 0,1,0)

#loss streak: number of consecutive non-winning games before this game (0 if the previous game was a win)
df_long['Loss_streak'] = df_long['Loss_shift']*df_long['Streak_shift']

In [57]:
#discard the intermediate columns, leaving only the final streak features and the identifiers
df_long = df_long.drop(['Num_Games','Win','Streak','Win_shift','Streak_shift','Loss_shift'], axis=1)

In [58]:
#one row per game, with separate Home/Away columns for the win and loss streaks
df_pivot = df_long.pivot(index='Game_ID', columns = 'isHome',values = ['Win_streak','Loss_streak'])

#flatten the (statistic, side) column MultiIndex into '<statistic>_<side>' names;
#the names are built from the labels actually present, so they cannot be mismatched
#if the pivot ever returns its columns in a different order
df_pivot.columns = ['_'.join(pair) for pair in df_pivot.columns]

In [59]:
#attach the win/loss streak features to the main frame, matched on Game_ID
df = pd.merge(df, df_pivot, on='Game_ID')

In [60]:
#logit on goal-diff, xG-diff, shots-on-target-diff and win/loss streak features
#split by season: Y < 18 for fitting, Y == 18 (the 2018 season) for evaluation
train = df[df['Y'] < 18]
test = df[df['Y'] == 18]

#target and design matrix (features plus a constant column) for the training games
y_train = train['y']
X_train = sm.add_constant(
    train[['GDcumAvg_Home','GDcumAvg_Away','xGDiff_cumAvg_Home','xGDiff_cumAvg_Away',
           'STDiff_cumAvg_Home','STDiff_cumAvg_Away','Win_streak_Home','Win_streak_Away','Loss_streak_Home','Loss_streak_Away']]
)

#same construction for the test games
y_test = test['y']
X_test = sm.add_constant(
    test[['GDcumAvg_Home','GDcumAvg_Away','xGDiff_cumAvg_Home','xGDiff_cumAvg_Away',
          'STDiff_cumAvg_Home','STDiff_cumAvg_Away','Win_streak_Home','Win_streak_Away','Loss_streak_Home','Loss_streak_Away']]
)

  return ptp(axis=axis, out=out, **kwargs)


In [61]:
#fit the logit with the streak features added
result = sm.Logit(endog=y_train, exog=X_train).fit()

Optimization terminated successfully.
         Current function value: 0.623214
         Iterations 5


In [62]:
#coefficient table and fit statistics for the model with streak features
result.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,7304.0
Model:,Logit,Df Residuals:,7293.0
Method:,MLE,Df Model:,10.0
Date:,"Wed, 17 Feb 2021",Pseudo R-squ.:,0.09637
Time:,00:50:10,Log-Likelihood:,-4552.0
converged:,True,LL-Null:,-5037.4
Covariance Type:,nonrobust,LLR p-value:,3.444e-202

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.1853,0.047,-3.915,0.000,-0.278,-0.093
GDcumAvg_Home,0.1464,0.094,1.561,0.119,-0.037,0.330
GDcumAvg_Away,-0.1672,0.098,-1.706,0.088,-0.359,0.025
xGDiff_cumAvg_Home,0.5623,0.136,4.124,0.000,0.295,0.830
xGDiff_cumAvg_Away,-0.6072,0.143,-4.260,0.000,-0.887,-0.328
STDiff_cumAvg_Home,0.0859,0.042,2.021,0.043,0.003,0.169
STDiff_cumAvg_Away,-0.0614,0.044,-1.411,0.158,-0.147,0.024
Win_streak_Home,0.0599,0.023,2.647,0.008,0.016,0.104
Win_streak_Away,-0.0342,0.022,-1.527,0.127,-0.078,0.010


In [63]:
#predicted home-win probabilities for the Y == 18 games, scored with the brier score (lower is better)
y_pred = result.predict(X_test)
brier_score_loss(y_test, y_pred)

0.21549980891234885

### (v) Favoured Team at Home

In [64]:
#show every column when a dataframe is displayed (None removes the column limit)
pd.options.display.max_columns = None

In [65]:
def fave(v):
    """Return how strongly the home team is favoured, or 0 when it is not.

    The value is v.GDcumAvg_Home - v.GDcumAvg_Away when that difference is
    positive, and 0 otherwise. v is a row-like object exposing both attributes.
    """
    margin = v.GDcumAvg_Home - v.GDcumAvg_Away
    return margin if margin > 0 else 0

In [66]:
#margin by which the home team is favoured: the home-minus-away gap in cumulative average goal
#differential where that gap is positive, 0 otherwise (vectorized instead of a per-row Python call)
gd_gap = df['GDcumAvg_Home'] - df['GDcumAvg_Away']
df['Fav_at_Home'] = gd_gap.where(gd_gap > 0, 0)

In [67]:
#logit on goal-diff, xG-diff, shots-on-target-diff, streak and favoured-at-home features
#split by season: Y < 18 for fitting, Y == 18 (the 2018 season) for evaluation
train = df[df['Y'] < 18]
test = df[df['Y'] == 18]

#target and design matrix (features plus a constant column) for the training games
y_train = train['y']
X_train = sm.add_constant(
    train[['GDcumAvg_Home','GDcumAvg_Away','xGDiff_cumAvg_Home','xGDiff_cumAvg_Away',
           'STDiff_cumAvg_Home','STDiff_cumAvg_Away','Win_streak_Home','Win_streak_Away','Loss_streak_Home','Loss_streak_Away',
           'Fav_at_Home']]
)

#same construction for the test games
y_test = test['y']
X_test = sm.add_constant(
    test[['GDcumAvg_Home','GDcumAvg_Away','xGDiff_cumAvg_Home','xGDiff_cumAvg_Away',
          'STDiff_cumAvg_Home','STDiff_cumAvg_Away','Win_streak_Home','Win_streak_Away','Loss_streak_Home','Loss_streak_Away',
          'Fav_at_Home']]
)

  return ptp(axis=axis, out=out, **kwargs)


In [68]:
#fit the logit with the favoured-at-home feature added
result = sm.Logit(endog=y_train, exog=X_train).fit()

Optimization terminated successfully.
         Current function value: 0.623100
         Iterations 5


In [69]:
#coefficient table and fit statistics for the model with the favoured-at-home feature
result.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,7304.0
Model:,Logit,Df Residuals:,7292.0
Method:,MLE,Df Model:,11.0
Date:,"Wed, 17 Feb 2021",Pseudo R-squ.:,0.09654
Time:,00:50:11,Log-Likelihood:,-4551.1
converged:,True,LL-Null:,-5037.4
Covariance Type:,nonrobust,LLR p-value:,1.526e-201

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.2199,0.054,-4.042,0.000,-0.327,-0.113
GDcumAvg_Home,0.0781,0.107,0.728,0.466,-0.132,0.288
GDcumAvg_Away,-0.1012,0.110,-0.917,0.359,-0.317,0.115
xGDiff_cumAvg_Home,0.5489,0.137,4.020,0.000,0.281,0.817
xGDiff_cumAvg_Away,-0.6190,0.143,-4.335,0.000,-0.899,-0.339
STDiff_cumAvg_Home,0.0868,0.042,2.043,0.041,0.004,0.170
STDiff_cumAvg_Away,-0.0637,0.044,-1.461,0.144,-0.149,0.022
Win_streak_Home,0.0583,0.023,2.567,0.010,0.014,0.103
Win_streak_Away,-0.0368,0.022,-1.642,0.101,-0.081,0.007


In [70]:
#predicted home-win probabilities for the Y == 18 games, scored with the brier score (lower is better)
y_pred = result.predict(X_test)
brier_score_loss(y_test, y_pred)

0.21535934007200755