In [1]:
import numpy as np
import pandas as pd

import statsmodels.formula.api as smf
import statsmodels.api as sm
from scipy.special import logit, expit
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import brier_score_loss

import warnings
warnings.filterwarnings('ignore')
rng = np.random.default_rng(seed = 456)

  import pandas.util.testing as tm


In [2]:
def to_df(X, y):
  """Return a copy of the feature frame X with the target y attached as a column.

  The target column is named after ``y.name`` when y carries a name (e.g. a
  column selected from a DataFrame), so a formula built from that same name
  can find it in the returned frame. Unnamed targets (lists, arrays, Series
  without a name) are stored under 'homeWin'.

  Parameters:
    X: DataFrame of predictors; it is copied, never modified.
    y: target values; pandas aligns a Series to X on the index.

  Returns:
    A new DataFrame holding X's columns plus the target column.
  """
  df = X.copy()
  target_name = getattr(y, 'name', None)
  if target_name is None:
    target_name = 'homeWin'
  df[target_name] = y
  return df

def logit(X_train, y_train, X_test, y_test):
  """Fit a formula-based logistic regression and score it on held-out data.

  The formula is '<y_train.name> ~ <col1> + <col2> + ...' over every column
  of X_train, so y_train must be a named Series and the column names must be
  usable as formula terms.

  Returns the Brier score of the predicted probabilities for X_test against
  y_test.

  NOTE(review): this definition reuses the name `logit` that the import cell
  also brings in from scipy.special; after this cell runs, `logit` refers to
  this function.
  """
  predictors = ' + '.join(list(X_train.columns))
  formula = y_train.name + ' ~ ' + predictors
  train_frame = to_df(X_train, y_train)
  fitted = smf.logit(formula, data=train_frame).fit()
  return brier_score_loss(y_test, fitted.predict(X_test))


In [3]:
# Load the match table, shorten one club name, keep the original row label as
# a stable game identifier, and put the games in date order. Then add the
# absolute goal margin and placeholder Elo columns (everyone at 1000).
df = (
  pd.read_csv('soccer18m.csv', parse_dates=['Date'])
  .replace('Evian Thonon Gaillard', 'Evian')
  .assign(GameID=lambda games: games.index)
  .sort_values('Date')
  .assign(
    abs_dG=lambda games: np.abs(games.FTHG - games.FTAG),
    Elo_H=1000,
    Elo_A=1000,
  )
)
df

Unnamed: 0,Div,Date,Y,HomeTeam,AwayTeam,FTHG,FTAG,HTHG,HTAG,HS,AS,HST,AST,pH,pD,pA,home_xG,away_xG,GameID,abs_dG,Elo_H,Elo_A
5306,Ligue_1,2014-08-08,14,Reims,Paris SG,2,2,2,1,9,16,3,6,0.089841,0.196675,0.713484,1.367870,2.655380,5306,0,1000,1000
5313,Ligue_1,2014-08-09,14,Nice,Toulouse,3,2,1,2,22,11,8,3,0.342193,0.315615,0.342193,2.214810,1.563610,5313,1,1000,1000
5312,Ligue_1,2014-08-09,14,Nantes,Lens,1,0,0,0,14,5,4,2,0.478505,0.297607,0.223888,1.025000,0.167128,5312,1,1000,1000
5311,Ligue_1,2014-08-09,14,Montpellier,Bordeaux,0,1,0,1,15,7,3,3,0.386037,0.317102,0.296861,1.017560,0.750184,5311,1,1000,1000
5307,Ligue_1,2014-08-09,14,Bastia,Marseille,3,3,1,2,13,9,4,4,0.245226,0.292215,0.462559,1.396890,1.317510,5307,0,1000,1000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9080,Serie_A,2019-05-26,18,Fiorentina,Genoa,0,0,0,0,5,2,5,1,0.383114,0.400529,0.216357,0.296039,0.074391,9080,0,1000,1000
9081,Serie_A,2019-05-26,18,Inter,Empoli,2,1,0,0,20,9,15,5,0.695512,0.180703,0.123785,2.743020,1.772750,9081,1,1000,1000
9082,Serie_A,2019-05-26,18,Roma,Parma,2,1,1,0,16,9,8,5,0.803027,0.128484,0.068488,1.722810,1.089380,9082,1,1000,1000
9083,Serie_A,2019-05-26,18,Sampdoria,Juventus,2,0,0,0,10,6,3,1,0.249003,0.245249,0.505747,0.686933,0.487175,9083,2,1000,1000


In [4]:
def calc_elo(abs_dG, W, eloF, eloA, isHome, K = 40, HFA = 100):
  """Return the Elo rating change for one side of a single match.

  Parameters:
    abs_dG: absolute goal difference of the match.
    W: actual result for the side being rated (the caller passes 1 for a win,
       0.5 for a draw, 0 for a loss).
    eloF: pre-match rating of the side being rated.
    eloA: pre-match rating of its opponent.
    isHome: truthy if the rated side played at home; HFA is then added to the
       rating gap, otherwise it is subtracted.
    K: base step size of the update.
    HFA: home-field advantage expressed in rating points.

  Returns:
    K * G * (W - W_e), where G is the goal-margin multiplier and W_e the
    expected result implied by the adjusted rating gap.
  """
  # Goal-margin multiplier: 1 up to a one-goal margin, 1.5 for two goals,
  # (11 + margin) / 8 beyond that.
  if abs_dG <= 1:
    margin_mult = 1
  elif abs_dG == 2:
    margin_mult = 3/2
  else:
    margin_mult = (11+abs_dG)/8

  home_adj = HFA if isHome else -HFA
  rating_gap = (eloF - eloA) + home_adj

  # Expected result on a base-10 logistic curve with a 400-point scale.
  expected = 1 / (10**(-rating_gap/400) + 1)
  return K*margin_mult*(W - expected)

In [5]:
# Running Elo rating per team, everyone starting from the same baseline of
# 1000. Teams are collected from both the home and the away column (home teams
# first, in order of appearance) so a lookup cannot fail for a side that only
# ever shows up as the visitor. Ratings are floats from the start because the
# updates are fractional.
elos = {team: 1000.0 for team in pd.concat([df.HomeTeam, df.AwayTeam]).unique()}

# Only seasons before 18 are replayed, so `elos` ends up holding the ratings
# as they stand once season 17 is over.
df17 = df.loc[df.Y < 18].reset_index(drop=True)

# Pre-match ratings are gathered in plain lists and written back in one go:
# this avoids a label-based DataFrame lookup per cell per game and avoids
# writing fractional ratings one at a time into integer-typed columns.
elo_home, elo_away = [], []
for game in df17.itertuples(index=False):
  home, away = game.HomeTeam, game.AwayTeam
  elo_home.append(elos[home])
  elo_away.append(elos[away])

  # Result from the home side's point of view: win = 1, loss = 0, draw = 0.5.
  if game.FTHG > game.FTAG:
    W = 1
  elif game.FTAG > game.FTHG:
    W = 0
  else:
    W = 0.5

  # Zero-sum update: whatever the home side gains, the away side loses.
  P = calc_elo(game.abs_dG, W, elos[home], elos[away], True)
  elos[home] += P
  elos[away] -= P

df17['Elo_H'] = elo_home
df17['Elo_A'] = elo_away



In [6]:
# Final ratings as a two-column table (Team, Elo).
output = pd.Series(elos, name='Elo').rename_axis('Team').reset_index()

# One row per (division, team) pair, taken from the home-team column.
divs = (
  df.groupby(['Div', 'HomeTeam'])
  .count()
  .reset_index()[['Div', 'HomeTeam']]
  .rename(columns={'HomeTeam': 'Team'})
)

# Attach each team's division and fix the column order.
output = output.merge(divs, how='left')[['Div', 'Team', 'Elo']]

In [7]:
# Three highest-rated teams in each division, best first.
(
  output
  .sort_values(by=['Div', 'Elo'], ascending=[True, False])
  .groupby('Div')
  .head(3)
  .reset_index(drop=True)
)

Unnamed: 0,Div,Team,Elo
0,Bundesliga,Bayern Munich,1350.621424
1,Bundesliga,Schalke 04,1159.177933
2,Bundesliga,Hoffenheim,1142.152808
3,EPL,Man City,1429.6594
4,EPL,Tottenham,1283.911192
5,EPL,Man United,1258.73246
6,La_Liga,Barcelona,1415.462495
7,La_Liga,Real Madrid,1306.832652
8,La_Liga,Ath Madrid,1220.575107
9,Ligue_1,Paris SG,1352.520538


# 2a

In [8]:
# Start again from the raw file: same club-name fix, game identifier taken
# from the original row label, games in date order.
df = (
  pd.read_csv('soccer18m.csv', parse_dates=['Date'])
  .replace('Evian Thonon Gaillard', 'Evian')
  .assign(GameID=lambda games: games.index)
  .sort_values('Date')
)

In [9]:
# An upset win: the side with the lower of the two win probabilities (pH for
# the home side, pA for the away side) is the one that won the match.
home_upset = (df.pH < df.pA) & (df.FTHG > df.FTAG)
away_upset = (df.pA < df.pH) & (df.FTAG > df.FTHG)
df['upsetWin'] = 1*home_upset + 1*away_upset

# Upsets from seasons before 18 only.
df2 = df[df.Y.lt(18) & df.upsetWin.eq(1)]

In [10]:
# Stack the home and away win probabilities of every upset game into one
# column and keep the seven smallest values. In an upset the winner is the
# side with the smaller of pH/pA, so the lowest values point at the least
# likely winners. isHome is recoded to 1/0 here but is not carried into the
# final table.
df_melt = (
  df2
  .melt(id_vars='GameID', value_vars=['pH', 'pA'], var_name='isHome', value_name='pW')
  .assign(isHome=lambda long: np.where(long.isHome == 'pH', 1, 0))
  .sort_values('pW')
  .head(7)
)

# Look the selected games up in the full table for a readable summary.
summary_cols = ['GameID', 'Div', 'Y', 'HomeTeam', 'AwayTeam', 'pH', 'pA', 'FTHG', 'FTAG']
output = df_melt[['GameID']].merge(df[summary_cols], how='left')
output

Unnamed: 0,GameID,Div,Y,HomeTeam,AwayTeam,pH,pA,FTHG,FTAG
0,2677,La_Liga,16,Barcelona,Alaves,0.891147,0.028831,1,2
1,2128,La_Liga,14,Barcelona,Malaga,0.875453,0.040021,0,1
2,1988,La_Liga,14,Barcelona,Celta,0.861781,0.043664,0,1
3,4291,Bundesliga,15,Bayern Munich,Mainz,0.85692,0.044404,1,2
4,3081,La_Liga,17,Real Madrid,Betis,0.876513,0.048646,0,1
5,2641,La_Liga,15,Levante,Ath Madrid,0.052018,0.798875,2,1
6,4008,Bundesliga,14,Bayern Munich,M'gladbach,0.821218,0.054292,0,2


# 2b

In [None]:
# Binary target: 1 when the home side scored more goals, 0 for a draw or an
# away win.
df['homeWin'] = 1*(df.FTHG > df.FTAG)

# Games from seasons before 18, taken from the current `df` so the frame
# carries the homeWin column; the team columns keep their HomeTeam/AwayTeam
# names, which the following cells rely on.
df17 = df.loc[df.Y < 18]

In [12]:
# Long format: one row per (game, team), tagged 'H' for the home side and 'A'
# for the away side.
df_melt = pd.melt(df17, id_vars='GameID', value_vars=['HomeTeam', 'AwayTeam'], var_name='isHome', value_name='Team')
df_melt['isHome'] = np.where(df_melt.isHome =='HomeTeam', 'H', 'A')

# Bring back the game-level fields and order the rows chronologically so that
# the within-group numbering below follows the order in which games were played.
df_merge = df_melt.merge(df17[['GameID', 'Date', 'Y', 'pH', 'homeWin']], on='GameID', how='left').sort_values('Date')

# GP = games the team had already played in that season before this one.
# cumcount numbers the rows of each (season, team) group 0, 1, 2, ... directly,
# so no window aggregation over the string-valued Team column is needed; GP is
# an integer column as a result.
df_merge['GP'] = df_merge.groupby(['Y', 'Team']).cumcount()

In [13]:
# Back to one row per game: every remaining field is spread into an '_H' and
# an '_A' column (e.g. GP_H / GP_A).
df_pivot = df_merge.pivot(index='GameID', columns='isHome')
df_pivot.columns = ['%s_%s' % pair for pair in df_pivot.columns]

# Attach the two games-played counters to the full match table; the inner
# join keeps only games present on both sides.
df_pivot = df.merge(df_pivot.reset_index()[['GameID', 'GP_H', 'GP_A']], on='GameID')

# Games in which both sides had played fewer than five games that season.
df_5g = df_pivot[df_pivot.GP_H.lt(5) & df_pivot.GP_A.lt(5)]

In [14]:
# Brier score of the home-win probability pH against the homeWin outcome, on
# all df17 games and on the early-season subset.
for label, games in (
  ('Brier Score (Whole Season):\t', df17),
  ('Brier Score (less than 5GP):\t', df_5g),
):
  print(label, brier_score_loss(games.homeWin, games.pH))

Brier Score (Whole Season):	 0.2106061594649744
Brier Score (less than 5GP):	 0.21058076780385268


In [15]:
# Same comparison with a stricter cut-off: both sides had played fewer than
# four games that season.
df_4g = df_pivot[df_pivot.GP_H.lt(4) & df_pivot.GP_A.lt(4)]
brier_4g = brier_score_loss(df_4g.homeWin, df_4g.pH)
print('Brier Score (less than 4GP):\t', brier_4g)

Brier Score (less than 4GP):	 0.20838970709761645
