<a href="https://colab.research.google.com/github/abhidendukuri/Sports_Modeling/blob/HW3/HW3_1c_2c.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.metrics import brier_score_loss

import warnings
warnings.filterwarnings('ignore')
rng = np.random.default_rng(seed = 456)

  import pandas.util.testing as tm


In [2]:
def to_df(X, y):
  df = X.copy()
  df['homeWin'] = y
  return df

def calc_elo(abs_dG, W, eloF, eloA, isHome, K = 40, HFA = 100):
  G = 1 if abs_dG <= 1 else (3/2 if abs_dG == 2 else (11+abs_dG)/8)
  dr = (eloF - eloA) + (HFA if isHome else -HFA)
  W_e = 1 / (10**(-dr/400) + 1)
  return K*G*(W - W_e)

def logit(X_train, y_train, X_test, y_test):
  f = y_train.name + ' ~ ' + ' + '.join([col for col in X_train.columns])
  result = smf.logit(f, data=to_df(X_train, y_train)).fit()
  y_pred = result.predict(X_test)
  return brier_score_loss(y_test, y_pred)


# 1c

In [3]:
df = pd.read_csv('soccer18_hs.csv', parse_dates = ['Date']).drop(['Unnamed: 0', 'home_win_pct'], 1)

In [4]:
# Drop GameID and Y after train/test splits
X_train, y_train = df[df.Y < 18].drop('homeWin', 1), df[df.Y < 18].homeWin
X_test, y_test = df[df.Y == 18].drop('homeWin', 1), df[df.Y == 18].homeWin
X_train = X_train.drop(['Y'], 1).select_dtypes(include=np.number)
X_test = X_test.drop(['Y'], 1).select_dtypes(include=np.number)

lr_noElo = logit(X_train, y_train, X_test, y_test)

Optimization terminated successfully.
         Current function value: 0.623617
         Iterations 5


In [5]:
df2 = pd.read_csv('soccer18m.csv', parse_dates = ['Date'])
df2['GameID'] = df2.index
df2 = df2.sort_values('Date')
df2['abs_dG'] = np.abs(df2.FTHG - df2.FTAG)
df2['Elo_H'] = 1000
df2['Elo_A'] = 1000

In [6]:
elos = {i:1000 for i in df2.HomeTeam.unique()}

for i in range(len(df2)):
  df2.loc[i, 'Elo_H'] = elos[df2.loc[i, 'HomeTeam']]
  df2.loc[i, 'Elo_A'] = elos[df2.loc[i, 'AwayTeam']]
  W = 1 if df2.loc[i, 'FTHG'] > df2.loc[i, 'FTAG'] else (0 if df2.loc[i, 'FTAG'] > df2.loc[i, 'FTHG'] else 0.5)
  P = calc_elo(df2.loc[i, 'abs_dG'], W, df2.loc[i, 'Elo_H'], df2.loc[i, 'Elo_A'], 1)
  elos[df2.loc[i, 'HomeTeam']] += P
  elos[df2.loc[i, 'AwayTeam']] -= P

df2['EloD'] = df2.Elo_H - df2.Elo_A
df = df.merge(df2[['GameID', 'EloD']], how='left')
df_hs = df.select_dtypes(include=np.number)

In [7]:
# Drop GameID and Y after train/test splits
X_train, y_train = df_hs[df_hs.Y < 18].drop('homeWin', 1), df_hs[df_hs.Y < 18].homeWin
X_test, y_test = df_hs[df_hs.Y == 18].drop('homeWin', 1), df_hs[df_hs.Y == 18].homeWin
X_train = X_train.drop(['GameID', 'Y'], 1).select_dtypes(include=np.number)
X_test = X_test.drop(['GameID', 'Y'], 1).select_dtypes(include=np.number)

lr_elo = logit(X_train, y_train, X_test, y_test)

Optimization terminated successfully.
         Current function value: 0.621370
         Iterations 5


In [8]:
print('Brier Score Without Elo:', lr_noElo)
print('Brier Score Elo:\t', lr_elo)

Brier Score Without Elo: 0.2150296052121065
Brier Score Elo:	 0.21416889370900888


# 2c

In [9]:
df_melt = pd.melt(df2, id_vars='GameID', value_vars=['HomeTeam', 'AwayTeam'], var_name='isHome', value_name='Team')
df_melt['isHome'] = np.where(df_melt.isHome =='HomeTeam', 'H', 'A')
df_melt2 = pd.melt(df2, id_vars='GameID', value_vars=['pH', 'pA'], var_name='isHome', value_name='pW')
df_melt2['isHome'] = np.where(df_melt2.isHome =='pH', 'H', 'A')
df_merge = df_melt.merge(df_melt2, on=['GameID', 'isHome']).merge(df2[['GameID', 'Date', 'Y', 'pD']], on='GameID').sort_values('Date')
df_merge['hpW'] = df_merge.groupby(['Team']).pW.transform(lambda x : x.expanding().mean().shift(1, fill_value = 0))
df_merge['hpD'] = df_merge.groupby('Team').pD.transform(lambda x : x.expanding().mean().shift(1, fill_value = 0))

df_pivot = df_merge.pivot(index='GameID', columns='isHome')
df_pivot.columns = [f'{i}_{j}' for i, j in df_pivot.columns]
df_pivot = df_pivot.reset_index()

df_pivot = df[['GameID', 'Div', 'Y', 'HomeTeam', 'AwayTeam']].merge(df_pivot[['GameID', 'hpW_H', 'hpW_A', 'hpD_H']], on='GameID')
df_hs = pd.merge(df_hs, df_pivot[['GameID', 'hpW_H', 'hpW_A', 'hpD_H']], how='left')
df_hs = df_hs.rename(columns={'hpW_H': 'hpH', 'hpW_A': 'hpA', 'hpD_H': 'hpD'})

In [10]:
# Drop GameID and Y after train/test splits
X_train, y_train = df_hs[df_hs.Y < 18].drop('homeWin', 1), df_hs[df_hs.Y < 18].homeWin
X_test, y_test = df_hs[df_hs.Y == 18].drop('homeWin', 1), df_hs[df_hs.Y == 18].homeWin
X_train = X_train.drop(['GameID', 'Y'], 1).select_dtypes(include=np.number)
X_test = X_test.drop(['GameID', 'Y'], 1).select_dtypes(include=np.number)

lr_pW = logit(X_train, y_train, X_test, y_test)

Optimization terminated successfully.
         Current function value: 0.615825
         Iterations 5


In [11]:
# Drop GameID and Y after train/test splits
X_train, y_train = df_hs[df_hs.Y < 18].drop('homeWin', 1), df_hs[df_hs.Y < 18].homeWin
X_test, y_test = df_hs[df_hs.Y == 18].drop('homeWin', 1), df_hs[df_hs.Y == 18].homeWin

# Removed pA and pH
X_train = X_train.drop(['GameID', 'Y', 'hpA', 'hpD'], 1).select_dtypes(include=np.number)
X_test = X_test.drop(['GameID', 'Y', 'hpA', 'hpD'], 1).select_dtypes(include=np.number)

lr_pH = logit(X_train, y_train, X_test, y_test)

Optimization terminated successfully.
         Current function value: 0.618010
         Iterations 5


In [12]:
print('Brier Score with pH, pD, pA:\t', lr_pW)
print('Brier Score with only pH:\t', lr_pH)

Brier Score with pH, pD, pA:	 0.21434654476925655
Brier Score with only pH:	 0.2132951807740099
