# Objectives

* Formalize win probability model
* Estimate and predict multinomial logistic regression models
* Implement Bayes' rule

## NHL In-Game Win Probability Model

Estimate the following model:
    
$$ p(hw_g| score_{g,t} ) = \frac{p(score_{g,t} | hw_g) p(hw_g)}{ p(score_{g,t} | hw_g) p(hw_g) +  p(score_{g,t} | \bar{hw_g}) p( \bar{hw_g}) } $$

where,

* $p(hw_g| score_{g,t})$: posterior probability of a home team win given the score differential game state
* $p(hw_g)$: prior probability of a home team win
* $p(score_{g,t} |hw_g)$ and $p(score_{g,t}  | \bar{hw_g})$: inverse conditional probabilities

In [None]:
# modules
import sys
import os
import pandas
import numpy
import datetime, time

import matplotlib.pyplot as plt
%matplotlib notebook

import statsmodels.api as sm
from pylab import hist, show
import scipy
import statsmodels.api as sm

# widen the notebook display limits for rows, columns and cell text
displayLimits = (
    ("display.max_rows", 35),
    ("display.max_columns", 50),
    ("display.max_colwidth", 200),
)
for optionName, optionValue in displayLimits:
    pandas.set_option(optionName, optionValue)



In [None]:
# read the event-level file and discard the columns that are not used below
unusedColumns = ['playernumber', 'toirank', 'eventtype', 'zone', 'name', 'advantagetype', 'position']
dh = pandas.read_csv('2014Events.csv').drop(unusedColumns, axis=1)
# optional: keep only a subset of games while developing, to shorten run time
#dh = dh[dh['gamenumber']<=20200]
print(len(dh))
dh.head()

In [None]:
# seconds elapsed since the start of the game: 1200 seconds for every
# completed period plus the event's own offset
periodSeconds = 1200
dh['secStart'] = (dh['period'] - 1) * periodSeconds + dh['eventtimefromzero']
# order events by season, game and elapsed time
dh = dh.sort_values(by=['season', 'gamenumber', 'secStart'], ascending=[True, True, True])
# whole minutes elapsed (fractional part truncated)
dh['minStart'] = (dh['secStart'] / 60).astype(int)
dh.describe()

In [None]:
# in-game information relative to home team
# Vectorized column operations replace row-wise apply(): same values, but
# computed without a Python-level call per event row.
# home score is `tgoals` on rows where isthome == 1, otherwise `ogoals`
dh['homeScore'] = numpy.where(dh['isthome'] == 1, dh['tgoals'], dh['ogoals'])
# away score is `tgoals` on rows where isthome == 0, otherwise `ogoals`
dh['awayScore'] = numpy.where(dh['isthome'] == 0, dh['tgoals'], dh['ogoals'])
# home score margin (positive when the home team leads)
dh['homeSM']    = dh['homeScore'] - dh['awayScore']
# 1 when the winning team code equals the home team code, else 0
dh['homeWin']   = (dh['winteamcode'] == dh['hteamcode']).astype(int)
dh['homeSM'].value_counts()

In [None]:
# display the first rows of the event-level frame
dh.head()

Obtain game results

In [None]:
# one row per game holding the last recorded home score margin of that game
# (the `axis` keyword of groupby is deprecated; rows are grouped by default)
df = dh.groupby(['gamenumber'], as_index=False)['homeSM'].last()
df = df.rename(columns={'homeSM' : 'finalHSM'})
df.head()

In [None]:
# number of rows in the game-level frame (one per game number)
len(df)

Merge game and game-event data

In [None]:
# attach each game's final home score margin to all of its event rows
dh = dh.merge(df, on='gamenumber', how='outer')
dh.head(2)

Generate in-game information relative to home team

In [None]:
# home-relative columns computed with vectorized operations rather than
# row-wise apply(): same values without a Python-level call per event row
# home score is `tgoals` on rows where isthome == 1, otherwise `ogoals`
dh['homeScore'] = numpy.where(dh['isthome'] == 1, dh['tgoals'], dh['ogoals'])
# away score is `tgoals` on rows where isthome == 0, otherwise `ogoals`
dh['awayScore'] = numpy.where(dh['isthome'] == 0, dh['tgoals'], dh['ogoals'])
# home score margin (positive when the home team leads)
dh['homeSM']    = dh['homeScore'] - dh['awayScore']
# 1 when the winning team code equals the home team code, else 0
dh['homeWin']   = (dh['winteamcode'] == dh['hteamcode']).astype(int)
dh.head()

In [None]:
# display the first 20 rows of the event-level frame
dh.head(20)

Create game-minute observations

In [None]:
# one observation per (game, minute), taken from the start of each minute
# (the `axis` keyword of groupby is deprecated; rows are grouped by default)
# NOTE: GroupBy.first() takes the first non-null value of each column, so a
# row can combine values from several events within the same minute
dm = dh.groupby(['gamenumber', 'minStart'], as_index=False).first()
# drop minute 0 and everything after the third period
dm = dm[(dm['minStart'] != 0) & (dm['period'] <= 3)]
dm.head(20)

Calculate team win percentages by date for game specific priors

In [None]:
# empty frame that will hold the per-team win percentages
dr = pandas.DataFrame()
# one row per game: date, the two teams and the outcome flags
gameColumns = ['gamedate', 'gamenumber', 'hteamcode', 'vteamcode', 'winteamcode', 'isOTWin', 'isSOWin']
ds = dh[gameColumns].groupby(['gamenumber'], as_index=False).first()
# every team code that appears as a home team
teamList = ds['hteamcode'].unique()
teamList

For loop to calculate winning percentage prior to game

In [None]:
# Build, for every team, its game log with the wins and losses accumulated
# before each game, then stack all teams into `dr`.
# NOTE(review): assumes the rows of `ds` are in chronological order (they are
# ordered by game number) - confirm that game numbers increase with date.
teamFrames = []
for team in teamList:
    # all games the team played, at home or away
    du = ds[(ds['hteamcode'] == team) | (ds['vteamcode'] == team)].copy()
    du['team'] = team
    # shift(1) leaves the current game's result out of the running totals,
    # so the record reflects earlier games only; a team's first game is NaN
    du['wins'] = (du['winteamcode'] == team).astype(int).shift(1).cumsum()
    du['loss'] = (du['winteamcode'] != team).astype(int).shift(1).cumsum()
    teamFrames.append(du)
    print ('completed loop for ' + team)
# DataFrame.append was removed in pandas 2.0; concatenate all frames once
dr = pandas.concat(teamFrames)
# pre-game winning percentage (NaN before a team's first recorded game)
dr['winPer'] = dr['wins'] / (dr['wins'] + dr['loss'])

In [None]:
# display the first 20 rows of the per-team record frame
dr.head(20)

In [None]:
# attach the pre-game win percentage of the home and of the away team to
# every game-minute row
dr = dr[['gamenumber', 'team', 'winPer']]
recordKeys = ['gamenumber', 'team']
# home team record
dm = dm.merge(dr, left_on=['gamenumber', 'hteamcode'], right_on=recordKeys)
dm = dm.rename(columns={'winPer': 'homeWinPer'})
# away team record
dm = dm.merge(dr, left_on=['gamenumber', 'vteamcode'], right_on=recordKeys)
dm = dm.rename(columns={'winPer': 'awayWinPer'})
# the two merges leave a suffixed copy of the `team` key each; drop both
dm = dm.drop(['team_x', 'team_y'], axis=1)

dm.head(2)
# minute-level data set is now complete

## Score margin summary analysis

In [None]:
# mean home score margin at each game minute, separately for games the home
# team won (trDScLaActu) and games it did not win (faDScLaActu)
# (.mean() replaces agg({'mean': numpy.mean}): dict renaming on a grouped
# Series raises SpecificationError since pandas 1.0)
ds = pandas.DataFrame()
ds['trDScLaActu'] = dm[dm['homeWin']==1].groupby(['minStart'])['homeSM'].mean()
ds['faDScLaActu'] = dm[dm['homeWin']==0].groupby(['minStart'])['homeSM'].mean()
# keep only the minutes for which both averages exist
ds = ds.dropna(axis=0)
# store the summary under DS and free the working name
DS = ds.copy(); del ds
DS.head()

## Plot mean inverse conditionals

In [None]:
# turn the minute index of the summary into a regular column for plotting
ds = DS.reset_index()
# fallback in case the index carried no name and came out as 'index'
ds = ds.rename(columns={'index' : 'minStart'})
ds = ds[ds['minStart']!=0]

# output file for the figure
tempName = 'condLeagueScoreMarginsActu.pdf'
tempFile = tempName

# line plot: the x-values are the actual game minutes (not the positional row
# index, which would shift every point by one minute)
fig, ax0 = plt.subplots(facecolor='white')
ax0.plot(ds['minStart'], ds['trDScLaActu'], label='Actual'  , marker='', linestyle='-', linewidth=1, color='black')
# the empty label keeps the second line out of the legend
ax0.plot(ds['minStart'], ds['faDScLaActu'], label='', marker='', linestyle='-', linewidth=1, color='black')
ax0.set_title('', fontsize=8)
ax0.set_xlabel('Minutes from start of game', fontsize=8)
ax0.tick_params(axis='both', labelsize=8)
ax0.set_ylabel('Home team score-margin', fontsize=8)
ax0.set_ylim([-3, 3])
# zero reference line across the full axes width; axhline's xmin/xmax are
# axes fractions in [0, 1], so the defaults already span the whole plot
ax0.axhline(y=0, c="black", linewidth=1.0, linestyle='-', zorder=0)
legend = ax0.legend(loc='lower left', shadow=False, fontsize=8, frameon=False)
ax0.text(40,2.5, 'Home team wins', fontsize=10)
ax0.text(40,-2.5, 'Home team losses', fontsize=10)
fig.suptitle("Game progression average score-margins", fontsize=10)
plt.savefig(tempFile)
plt.show()

## Estimation procedure
* Estimate inverse conditional probabilities; p(HSM|TR)

In [None]:
# adjust score margin states: margins beyond +/-3 are folded into the +/-3
# states, leaving at most seven score states (-3 ... 3)
# (Series.clip replaces two row-wise apply() passes with the same result)
dm['homeSM'] = dm['homeSM'].clip(lower=-3, upper=3)

# create win and lose specific data sets
dw = dm[dm['homeWin']==1]
dl = dm[dm['homeWin']==0]

### Multinomial Logistic regression

In [None]:
# score-state model for games the home team won: score state explained by a
# constant and the game minute
winExog = sm.add_constant(dw['minStart'])
trMNL = sm.MNLogit(dw['homeSM'], winExog).fit()
trMNL.summary()

In [None]:
# score-state model for games the home team did not win: score state
# explained by a constant and the game minute
lossExog = sm.add_constant(dl['minStart'])
faMNL = sm.MNLogit(dl['homeSM'], lossExog).fit()
faMNL.summary()

### Predict probabilities of score states

In [None]:
# predicted score-state probabilities for every game-minute row, once under
# the home-win model (de1) and once under the home-loss model (de0)
allExog = sm.add_constant(dm['minStart'])
de1 = pandas.DataFrame(trMNL.predict(allExog))
de0 = pandas.DataFrame(faMNL.predict(allExog))

In [None]:
# element-wise ratio of the two predicted probability tables
# (home-win model over home-loss model)
de = de1.div(de0)
de.head()

In [None]:
# relabel the columns so they match the score states
# NOTE(review): assumes the columns are 0..6 and map to states -3..3 - confirm
# that every state occurs in both the win and the loss sample
stateShift = 3
de.columns = de.columns - stateShift
de.head()

Merge inverse conditional data frames

In [None]:
# put the per-state ratios next to the game-minute rows, matching on the index
dn = dm.merge(de, left_index=True, right_index=True)
dn.head(20)

In [None]:
## obtained game-minute specific inverse conditional
dm = dn.copy()
# create indicator variables: one 0/1 column per score state, built with a
# vectorized comparison instead of a row-wise apply() per column
indicatorStates = [('d3', -3), ('d2', -2), ('d1', -1), ('ti', 0), ('u1', 1), ('u2', 2), ('u3', 3)]
for indicatorName, scoreState in indicatorStates:
    dm[indicatorName] = (dm['homeSM'] == scoreState).astype(int)
dm.head(20)

In [None]:
# multiply each state's ratio by its indicator and add the products up, which
# leaves the ratio of the score state observed in that row
stateTerms = [
    dm[indicatorName] * dm[scoreState]
    for indicatorName, scoreState in
    [('d3', -3), ('d2', -2), ('d1', -1), ('ti', 0), ('u1', 1), ('u2', 2), ('u3', 3)]
]
dm['invCond'] = sum(stateTerms[1:], stateTerms[0])
dm.head()

In [None]:
# game-specific priors from the two teams' win percentages; where the value
# cannot be computed (NaN) fall back to an even game: probability 0.5, odds 1
homeRecord = dm['homeWinPer']
awayRecord = dm['awayWinPer']
dm['priorProb'] = homeRecord.div(homeRecord + awayRecord).fillna(0.5)
dm['priorOdds'] = homeRecord.div(awayRecord).fillna(1.0)
dm.head(2)

Calculate posterior odds and probabilities (Bayes Rule)

In [None]:
# Bayes' rule in odds form: posterior odds = prior odds * likelihood ratio
dm['postOdds']      =  dm['priorOdds'] * dm['invCond']
# odds -> probability
postProb            =  dm['postOdds'] / (1 + dm['postOdds'])
# infinite odds (e.g. an away win percentage of zero) would give inf/inf = NaN
# above; the matching probability is 1
dm['postProb']      =  postProb.mask(dm['postOdds'] == numpy.inf, 1.0)
dm.head(10)

Game-specific plot of in-game win probabilities

In [None]:
# choose the game to inspect and keep only its game-minute rows
selectedGame = 20801
d1 = dm[dm['gamenumber'] == selectedGame]
print(len(d1))
summaryColumns = ['gamenumber', 'minStart', 'homeSM', 'priorProb', 'postProb']
d1[summaryColumns].head()

In [None]:
# print the date and the two teams, read from the game's first row
print ("date, away team, home team")
openingRow = d1.iloc[0]
for headerField in ('gamedate', 'vteamcode', 'hteamcode'):
    print (openingRow[headerField])
d1.head()

In [None]:
# two stacked panels for the selected game: win probability on top and the
# score margin underneath, sharing the minute axis
plt.rc('axes', grid=False)

# panel rectangles passed to add_axes: [left, bottom, width, height]
width = 0.001
space = width + 0.10
left, width = 0.1, 0.8
rect1 = [left, 0.4, width, 0.5]
rect2 = [left, 0.1, width, 0.2]

fig = plt.figure(facecolor='white')
ax1 = fig.add_axes(rect1, facecolor='white')
ax2 = fig.add_axes(rect2, facecolor='white', sharex=ax1)

# small tick labels on both axes of both panels
for panel in (ax1, ax2):
    panel.tick_params(axis='y', labelsize=8)
    panel.tick_params(axis='x', labelsize=8)

# axis ranges
ax1.set_ylim([0,1])
ax2.set_ylim([-6,6])
ax1.set_xlim([0,60])

# titles and axis labels
ax1.set_title("Game progression win probabilities", fontsize=12)
ax1.set_ylabel('Home team win probability', fontsize=8)
ax2.set_title('Game state information', fontsize=10, multialignment='center')
ax2.set_ylabel('Home team score-margin', fontsize=8, multialignment='center')
ax2.set_xlabel('Minutes from start of the game.', fontsize=8)

# upper panel: posterior win probability with a dashed reference line at 0.5
ax1.plot(d1['minStart'], d1['postProb']  , marker='', linestyle='-' , linewidth=2, color='black',  label='Home team win probability')
ax1.axhline(y=.5 ,c="black",linewidth=0.25, linestyle='--', zorder=0)
# lower panel: home score margin, one dot per minute
ax2.plot(d1.minStart, d1.homeSM, '.', linewidth=0.5, color='black')

legend = ax1.legend(loc='lower left', shadow=False, fontsize=6, frameon=False)

# write the figure to disk, then display it
plt.savefig("gn500.pdf", format='pdf')
plt.show()

## complete

### Done