In [1]:
import pandas as pd
import numpy as np
from scipy.special import logit, expit
import statsmodels.formula.api as smf
from sklearn.metrics import brier_score_loss
import warnings
# Silence every warning category for the rest of the session so cell outputs stay uncluttered.
# NOTE(review): this also hides deprecation warnings raised by pandas/statsmodels.
warnings.filterwarnings(action='ignore')
# Seeded NumPy random generator (seed 456).
rng = np.random.default_rng(456)

# 1a. For each game compute the historical average goal differentials for each team

In [2]:
# Load the match results and derive per-game helper columns in one chain:
#   * the long club name 'Evian Thonon Gaillard' is replaced by 'Evian' wherever it appears,
#   * GameID is the row's index label as read from the file (assigned before sorting),
#   * PD_H / PD_A are the full-time goal differences from the home / away side's perspective,
#   * rows end up ordered by Date.
df = (
    pd.read_csv('soccer18.csv', parse_dates=['Date'])
    .replace('Evian Thonon Gaillard', 'Evian')
    .assign(
        GameID=lambda games: games.index,
        PD_H=lambda games: games.FTHG - games.FTAG,
        PD_A=lambda games: games.FTAG - games.FTHG,
    )
    .sort_values('Date')
)

In [3]:
# Binary target: 1 when the home side scored strictly more goals than the away side,
# 0 otherwise (so draws are coded as 0).
df['homeWin'] = df.FTHG.gt(df.FTAG).astype(int)

In [4]:
# Reshape to long format: one row per (game, side), carrying the team name.
df_melt = df.melt(
    id_vars='GameID',
    value_vars=['HomeTeam', 'AwayTeam'],
    var_name='isHome',
    value_name='Team',
)
# Collapse the source column name into a one-letter side flag: 'H' (home) or 'A' (away).
df_melt['isHome'] = df_melt['isHome'].map({'HomeTeam': 'H', 'AwayTeam': 'A'})

In [5]:
# Reshape to long format: one row per (game, side), carrying that side's goal differential.
df_melt2 = df.melt(
    id_vars='GameID',
    value_vars=['PD_H', 'PD_A'],
    var_name='isHome',
    value_name='PD',
)
# Collapse the source column name into a one-letter side flag: 'H' (home) or 'A' (away).
df_melt2['isHome'] = df_melt2['isHome'].map({'PD_H': 'H', 'PD_A': 'A'})

In [6]:
# One row per (game, side) with team name, goal differential and match date,
# ordered by Date so that "earlier rows" means "earlier games".
df_merge = (
    df_melt
    .merge(df_melt2, on=['GameID', 'isHome'])
    .merge(df[['GameID', 'Date']], on='GameID')
    .sort_values('Date')
)

# Per-team running statistics over *previous* games only: the expanding
# aggregate includes the current row, so it is shifted down by one row and the
# team's first appearance is filled with 0.
# hAGD = historical average goal differential, GP = games previously played.
df_merge['hAGD'] = df_merge.groupby('Team')['PD'].transform(
    lambda past: past.expanding().mean().shift(1, fill_value=0)
)
df_merge['GP'] = df_merge.groupby('Team')['PD'].transform(
    lambda past: past.expanding().count().shift(1, fill_value=0)
)

In [7]:
# Back to one row per game: every per-side column is spread into an '<name>_A'
# and an '<name>_H' column.
df_pivot = df_merge.pivot(index='GameID', columns='isHome')
# Flatten the two-level (value, side) column labels into 'value_side' strings.
df_pivot.columns = ['_'.join(pair) for pair in df_pivot.columns]
df_pivot = df_pivot.reset_index()

# Absolute gap between the two sides' historical average goal differentials.
df_pivot['goalDisp'] = (df_pivot.hAGD_H - df_pivot.hAGD_A).abs()

# Attach the game descriptors from df and keep only the engineered features.
feature_cols = ['GameID', 'hAGD_H', 'hAGD_A', 'GP_H', 'GP_A', 'goalDisp']
descriptor_cols = ['GameID', 'Div', 'Y', 'HomeTeam', 'AwayTeam']
df_pivot = df[descriptor_cols].merge(df_pivot[feature_cols], on='GameID')

In [8]:
# Split on the Y column: rows with Y below 18 are used for training,
# rows with Y equal to 18 are held out for testing.
train = df_pivot.query('Y < 18')
test = df_pivot.query('Y == 18')

## i. Give a table containing the 7 games with the largest absolute disparity

In [9]:
# Seven training games with the largest absolute disparity, largest first.
# The column is dropped with the `columns=` keyword: passing the axis
# positionally (`drop('GameID', 1)`) is deprecated and raises a TypeError
# from pandas 2.0 onwards.
(
    train
    .sort_values('goalDisp', ascending=False)
    .head(7)
    .drop(columns='GameID')
    .reset_index(drop=True)
)

Unnamed: 0,Div,Y,HomeTeam,AwayTeam,hAGD_H,hAGD_A,GP_H,GP_A,goalDisp
0,Serie_A,14,Sassuolo,Sampdoria,-3.5,1.0,2,2,4.5
1,Ligue_1,14,Evian,Paris SG,-3.5,1.0,2,2,4.5
2,Ligue_1,17,Strasbourg,Lille,-4.0,0.078261,1,115,4.078261
3,Serie_A,14,Palermo,Inter,-0.5,3.5,2,2,4.0
4,La_Liga,14,Cordoba,Celta,-2.0,2.0,1,1,4.0
5,Serie_A,14,Empoli,Roma,-2.0,2.0,1,1,4.0
6,La_Liga,14,Elche,Granada,-3.0,1.0,1,1,4.0


## ii. Repeat the previous part restricted to games where each team had previously played at least 100 games in our dataset (that is, 100 or more)

In [10]:
# Same ranking, restricted to games where both sides had already played at
# least 100 games in the dataset.
# The column is dropped with the `columns=` keyword: passing the axis
# positionally (`drop('GameID', 1)`) is deprecated and raises a TypeError
# from pandas 2.0 onwards.
(
    train
    .loc[(train.GP_H >= 100) & (train.GP_A >= 100)]
    .sort_values('goalDisp', ascending=False)
    .head(7)
    .drop(columns='GameID')
    .reset_index(drop=True)
)

Unnamed: 0,Div,Y,HomeTeam,AwayTeam,hAGD_H,hAGD_A,GP_H,GP_A,goalDisp
0,La_Liga,16,Granada,Barcelona,-0.875,2.192308,104,104,3.067308
1,La_Liga,17,Levante,Barcelona,-0.705357,2.14,112,150,2.845357
2,La_Liga,16,Granada,Real Madrid,-0.936937,1.9,111,110,2.836937
3,La_Liga,17,Las Palmas,Barcelona,-0.623762,2.208633,101,139,2.832395
4,La_Liga,17,La Coruna,Barcelona,-0.621622,2.142857,148,147,2.764479
5,La_Liga,16,La Coruna,Barcelona,-0.519608,2.22549,102,102,2.745098
6,La_Liga,17,Barcelona,La Coruna,2.186047,-0.527132,129,129,2.713178


## iii. Almost all games in the solution to part (i) come from the 2014 season (the first season in our dataset), but one comes from the 2017 season. In a few words, explain what is special about it

For the game in 2017, Strasbourg was playing one of its first games since being relegated in the 2007-08 season. Since the data we have starts in 2014, Strasbourg had only one previous game in the data set (GP_H = 1, a 4-goal loss), which is why its historical average goal differential (-4.0), and therefore the disparity against Lille, is so extreme.


# 1b. Fit a logit model to predict the probability of the home team winning (draws count as non-wins) using only an intercept term.

In [11]:
# Left-join the two historical goal-differential features onto the full game table.
# No `on=` is given, so pandas joins on the columns the two frames share
# (GameID is one of them; presumably the only one on a first run — verify).
df = df.merge(df_pivot[['GameID', 'hAGD_H', 'hAGD_A']], how='left')

# Re-create the split from the enriched table; this rebinds the earlier train/test names.
train = df.loc[df['Y'] < 18]
test = df.loc[df['Y'] == 18]

In [12]:
# Intercept-only logistic regression for homeWin, fitted on the training rows.
null_model = smf.logit(formula='homeWin ~ 1', data=train)
result = null_model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.689679
         Iterations 3


0,1,2,3
Dep. Variable:,homeWin,No. Observations:,7304.0
Model:,Logit,Df Residuals:,7303.0
Method:,MLE,Df Model:,0.0
Date:,"Wed, 17 Feb 2021",Pseudo R-squ.:,4.042e-12
Time:,18:42:47,Log-Likelihood:,-5037.4
converged:,True,LL-Null:,-5037.4
Covariance Type:,nonrobust,LLR p-value:,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.1669,0.023,-7.106,0.000,-0.213,-0.121


## i. Report your coefficient value.

In [13]:
# The fitted model has a single parameter (the intercept); report it.
print('The coefficient is:', result.params.iloc[0])

The coefficient is: -0.16687026113323677


## ii. Report the Brier score of your out-of-sample predictions on 2018 (Y=18)

In [14]:
# Predicted home-win probabilities for the held-out rows, scored against the
# observed outcomes with the Brier score.
y_pred = result.predict(exog=test)
print('Brier Score:', brier_score_loss(test.homeWin, y_pred))

Brier Score: 0.2473559477379797


# 1c. The intercept coefficient from the previous part is negative. Does this imply there is no home field advantage? In other words, if home teams are favored, shouldn’t the intercept be positive?

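> No. The intercept is on the log-odds scale, so it is not itself a probability; applying the expit function translates it into one, and because expit maps any real number into (0, 1) the result is always a valid probability. Here expit(-0.1669) ≈ 0.458, i.e. the model predicts a home win in about 46% of games. A negative intercept only means the home-win probability is below 50%. Because draws count as non-wins, that is compatible with a home field advantage: with three possible outcomes, home teams are favored whenever they win more often than away teams do, which does not require winning more than half of all games.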

# 1d. Repeat part (b) using the intercept, and the historical average goal differentials from each team as features (three features in total)

In [15]:
# Logistic regression for homeWin with an intercept and both sides' historical
# average goal differentials, fitted on the training rows.
agd_model = smf.logit(formula='homeWin ~ hAGD_H + hAGD_A + 1', data=train)
result = agd_model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.630677
         Iterations 5


0,1,2,3
Dep. Variable:,homeWin,No. Observations:,7304.0
Model:,Logit,Df Residuals:,7301.0
Method:,MLE,Df Model:,2.0
Date:,"Wed, 17 Feb 2021",Pseudo R-squ.:,0.08555
Time:,18:42:47,Log-Likelihood:,-4606.5
converged:,True,LL-Null:,-5037.4
Covariance Type:,nonrobust,LLR p-value:,6.932999999999999e-188

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.1791,0.025,-7.183,0.000,-0.228,-0.130
hAGD_H,0.7853,0.039,20.128,0.000,0.709,0.862
hAGD_A,-0.7619,0.040,-19.082,0.000,-0.840,-0.684


In [16]:
# Report the fitted parameters as a plain array, in the order shown in the summary table.
print('The coefficients are:', result.params.to_numpy())

The coefficients are: [-0.17910355  0.78534468 -0.76193982]


In [17]:
# Score the current model's held-out predictions against the observed
# outcomes with the Brier score.
y_true = test['homeWin']
y_pred = result.predict(exog=test)
print('Brier Score:', brier_score_loss(y_true, y_pred))

Brier Score: 0.21726101075298782
