<a href="https://colab.research.google.com/github/ceb263/nhl/blob/main/game_predictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Imports and input data
import os
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import itertools

from sklearn.preprocessing import StandardScaler, LabelBinarizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import log_loss, roc_auc_score

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression

from keras.layers.core import Dense, Activation, Dropout
from keras.models import Sequential, load_model, Model
from keras.layers import Conv2D, MaxPooling2D, Input, Flatten, AveragePooling2D
from keras.layers.merge import concatenate

%matplotlib inline

pd.set_option('display.max_rows', 150)

In [None]:
# if this already exists, skip the data aggregation section
teamGame = pd.read_csv('teamGame.gz')
playerGame = pd.read_csv('playerGame.gz')

In [None]:
lines = pd.read_csv('lines.csv')

In [None]:
lines['Team'] = lines['Team'].replace({
    'Pittsburgh':'PIT','TampaBay':'TBL','SeattleKraken':'SEA','Vegas':'VGK','NYRangers':'NYR','Washington':'WSH','Montreal':'MTL','Toronto':'TOR',
    'Vancouver':'VAN','Edmonton':'EDM','Chicago':'CHI','Colorado':'COL','Winnipeg':'WPG','Anaheim':'ANA','Ottawa':'OTT','Buffalo':'BUF',
    'Florida':'FLA','NYIslanders':'NYI','Carolina':'CAR','Dallas':'DAL','Arizona':'ARI','Columbus':'CBJ','Detroit':'DET','Nashville':'NSH',
    'LosAngeles':'LAK','NewJersey':'NJD','Philadelphia':'PHI','Minnesota':'MIN','Boston':'BOS','St.Louis':'STL','Calgary':'CGY','SanJose':'SJS',
    'Tampa Bay':'TBL','NY Islanders':'NYI','St. Louis':'STL','NY Rangers':'NYR','San Jose':'SJS','New Jersey':'NJD','Seattle Kraken':'SEA',
    'Los Angeles':'LAK','Arizonas':'ARI','Tampa':'TBL','Phoenix':'ARI'
})
lines.to_csv('lines.csv', index=False)

# Data Aggregation

In [None]:
# read data
pbp = pd.read_csv('pbp_with_xG.gz')

In [None]:
# fix ARZ/PHX
pbp['Ev_Team'] = pbp['Ev_Team'].replace({'PHX':'ARI', 'S.J':'SJS', 'L.A':'LAK', 'T.B':'TBL'})
pbp['Home_Team'] = pbp['Home_Team'].replace({'PHX':'ARI', 'S.J':'SJS', 'L.A':'LAK', 'T.B':'TBL'})
pbp['Away_Team'] = pbp['Away_Team'].replace({'PHX':'ARI', 'S.J':'SJS', 'L.A':'LAK', 'T.B':'TBL'})

In [None]:
# calculate some stats in the disaggregated data
pbp['Goals'] = ((pbp['Event'] == 'GOAL') & (pbp['Seconds_Elapsed']!=0)).astype(np.int16)
pbp['Shootout_Goals'] = ((pbp['Event'] == 'GOAL') & (pbp['Seconds_Elapsed']==0)).astype(np.int16)
pbp['Shots'] = ((pbp['Event'].isin(['SHOT','GOAL'])) & (pbp['Seconds_Elapsed']!=0)).astype(np.int16)
pbp['ShotAttempts'] = ((pbp['Event'].isin(['SHOT','MISS','GOAL','BLOCK'])) & (pbp['Seconds_Elapsed']!=0)).astype(np.int16)
pbp['Goals_5v5'] = ((pbp['Event'] == 'GOAL') & (pbp['Strength']=='5x5')).astype(np.int16)
pbp['Shots_5v5'] = ((pbp['Event'].isin(['SHOT','GOAL'])) & (pbp['Strength']=='5x5')).astype(np.int16)
pbp['ShotAttempts_5v5'] = ((pbp['Event'].isin(['SHOT','MISS','GOAL','BLOCK'])) & (pbp['Strength']=='5x5')).astype(np.int16)
pbp['xG_5v5'] = np.nan
pbp.loc[pbp['Strength']=='5x5', 'xG_5v5'] = pbp.loc[pbp['Strength']=='5x5']['xG']
pbp['Penalties'] = (pbp['Event'] == 'PENL').astype(np.int16)
pbp['Home'] = (pbp['Ev_Team']==pbp['Home_Team']).astype(np.int16)

# create aggregate
teamGame = pbp[[
        'Game_Id','Date','Ev_Team','Season','Goals','Shootout_Goals','Shots','ShotAttempts','xG','Goals_5v5','Shots_5v5','ShotAttempts_5v5','xG_5v5','Penalties','Home'
    ]].groupby([
        'Game_Id','Date','Ev_Team','Season'
    ]).agg({
        'Goals' : 'sum',
        'Shootout_Goals' : 'sum',
        'Shots' : 'sum',
        'ShotAttempts' : 'sum',
        'xG' : 'sum',
        'Goals_5v5' : 'sum',
        'Shots_5v5' : 'sum',
        'ShotAttempts_5v5' : 'sum',
        'xG_5v5' : 'sum',
        'Penalties' : 'sum',
        'Home' : 'max'
    }).reset_index()
teamGame = teamGame.rename(columns={'Ev_Team':'Team'})
teamGame = teamGame.fillna(0)

In [None]:
del pbp

In [None]:
# add win column
df_temp = teamGame.groupby(['Game_Id','Date']).agg({'Goals':'max', 'Shootout_Goals':'max'}).reset_index()
df_temp = df_temp.rename(columns={'Goals':'winGoals', 'Shootout_Goals':'winShootoutGoals'})
teamGame = teamGame.merge(df_temp, on=['Game_Id','Date'])
del df_temp
teamGame['Win'] = ((teamGame['Goals']==teamGame['winGoals']) & (teamGame['Shootout_Goals']==teamGame['winShootoutGoals'])).astype(np.int16)
teamGame = teamGame.drop(['Shootout_Goals','winGoals','winShootoutGoals'],1)

In [None]:
teamGame = teamGame.sort_values(by=['Date','Game_Id'])

In [None]:
# add field for whether this is a playoff game or not
teamGame['DateInt'] = teamGame['Date'].str.replace('-','').astype(np.int64)
teamGame['teamGameRank'] = teamGame.groupby(['Team','Season'])['DateInt'].rank("dense")
teamGame['Playoffs'] = (teamGame['teamGameRank']>82).astype(np.int8)
teamGame.loc[(teamGame['Season']==2012)&(teamGame['teamGameRank']>48),'Playoffs'] = 1 #fix for the lockout-shortened season
teamGame.loc[(teamGame['Season']==2019)&(teamGame['Date']>'2020-03-12'),'Playoffs'] = 1 #fix for the first covid-shortened season
teamGame.loc[(teamGame['Season']==2020)&(teamGame['teamGameRank']>56),'Playoffs'] = 1 #fix for the second covid-shortened season

In [None]:
teamGame = teamGame.loc[teamGame['Team'].str.len()==3]

In [None]:
teamGame.head()

Unnamed: 0,Game_Id,Date,Team,Season,Goals,Shots,ShotAttempts,xG,Goals_5v5,Shots_5v5,ShotAttempts_5v5,xG_5v5,Penalties,Home,Win,DateInt,teamGameRank,Playoffs
0,20001,2013-01-19,PHI,2012,1,27,54,2.759272,1,16,36,1.450979,3,1,0,20130119,1.0,0
1,20001,2013-01-19,PIT,2012,3,27,53,2.997073,1,23,44,1.536232,5,0,1,20130119,1.0,0
2,20002,2013-01-19,OTT,2012,4,37,61,2.33012,3,31,51,1.973215,7,0,1,20130119,1.0,0
3,20002,2013-01-19,WPG,2012,1,28,58,2.584469,0,20,46,1.749754,3,1,0,20130119,1.0,0
4,20003,2013-01-19,CHI,2012,5,22,42,2.299323,4,15,29,1.403502,5,0,1,20130119,1.0,0


In [None]:
teamGame = teamGame.sort_values(by=['DateInt','Game_Id'])
teamGame['teamGameRank'] = teamGame.groupby(['Team','Season'])['DateInt'].rank("dense")
teamGame['teamGameRankOverall'] = teamGame.groupby('Team')['DateInt'].rank("dense")
teamGame['Game_Id'] = (teamGame['Season'].astype(str) + teamGame['Game_Id'].astype(str)).astype(np.int64)

In [None]:
# add elo ratings
# https://fivethirtyeight.com/methodology/how-our-nhl-predictions-work/
teamGame = teamGame.sort_values(by=['DateInt','Game_Id'])
teamGame['Elo'] = np.nan
teamGame.loc[teamGame['teamGameRankOverall']==1,'Elo'] = 1500
for gid in teamGame['Game_Id'].unique():
    game_df = teamGame.loc[teamGame['Game_Id']==gid].copy()
    elo_1 = game_df['Elo'].iloc[0]
    elo_2 = game_df['Elo'].iloc[1]
    if game_df['teamGameRank'].iloc[0]==1:
        elo_1 = (elo_1*0.7) + (1505*0.3)
        teamGame.loc[(teamGame['Team']==game_df['Team'].iloc[0]) & (teamGame['teamGameRankOverall']==game_df['teamGameRankOverall'].iloc[0]), 'Elo'] = elo_1
    if game_df['teamGameRank'].iloc[1]==1:
        elo_2 = (elo_2*0.7) + (1505*0.3)
        teamGame.loc[(teamGame['Team']==game_df['Team'].iloc[1]) & (teamGame['teamGameRankOverall']==game_df['teamGameRankOverall'].iloc[1]), 'Elo'] = elo_2
    p_1 = 1/(10**((elo_2-elo_1)/400)+1)
    p_2 = 1/(10**((elo_1-elo_2)/400)+1)
    win_1 = game_df['Win'].iloc[0]
    win_2 = game_df['Win'].iloc[1]
    victoryMarginMultiplier_1 = 0.6686 * np.log(np.max([np.abs(game_df['Goals'].iloc[0]-game_df['Goals'].iloc[1]),1])) + 0.8048
    victoryMarginMultiplier_2 = 0.6686 * np.log(np.max([np.abs(game_df['Goals'].iloc[1]-game_df['Goals'].iloc[0]),1])) + 0.8048
    if elo_1>elo_2 and win_1==1:
        autoAdjust_1 = 2.05/((elo_1-elo_2) * 0.001 + 2.05)
        autoAdjust_2 = 1
    elif elo_2>elo_1 and win_2==1:
        autoAdjust_2 = 2.05/((elo_1-elo_2) * 0.001 + 2.05)
        autoAdjust_1 = 1
    else:
        autoAdjust_1 = 1
        autoAdjust_2 = 1
    favMultiplier_1 = win_1 - p_1
    favMultiplier_2 = win_2 - p_2
    new_elo_1 = elo_1 + (6*victoryMarginMultiplier_1*autoAdjust_1*favMultiplier_1)
    new_elo_2 = elo_2 + (6*victoryMarginMultiplier_2*autoAdjust_2*favMultiplier_2)
    teamGame.loc[(teamGame['Team']==game_df['Team'].iloc[0]) & (teamGame['teamGameRankOverall']==game_df['teamGameRankOverall'].iloc[0]+1), 'Elo'] = new_elo_1
    teamGame.loc[(teamGame['Team']==game_df['Team'].iloc[1]) & (teamGame['teamGameRankOverall']==game_df['teamGameRankOverall'].iloc[1]+1), 'Elo'] = new_elo_2

In [None]:
teamGame.tail()

Unnamed: 0,Game_Id,Date,Team,Season,Goals,Shots,ShotAttempts,xG,Goals_5v5,Shots_5v5,ShotAttempts_5v5,xG_5v5,Penalties,Home,Win,DateInt,Playoffs,Elo,teamGameRankOverall,teamGameRank
21913,202030413,2021-07-02,T.B,2020,6,30,48,3.494258,5,29,47,3.443062,1,0,1,20210702,1,1619.307462,779.0,76.0
21914,202030414,2021-07-05,MTL,2020,3,21,42,1.798614,3,14,33,1.235122,9,1,1,20210705,1,1500.216799,737.0,76.0
21915,202030414,2021-07-05,T.B,2020,2,34,70,3.891478,2,23,50,2.710742,6,0,0,20210705,1,1622.626713,780.0,77.0
21916,202030415,2021-07-07,MTL,2020,0,22,42,1.813529,0,21,39,1.519602,4,0,0,20210707,1,1503.448316,738.0,77.0
21917,202030415,2021-07-07,T.B,2020,1,30,47,2.513422,1,22,38,1.853019,4,1,1,20210707,1,1619.395196,781.0,78.0


In [None]:
teamGame.to_csv('teamGame.gz', compression='gzip', index=False)

# Feature Calculation

In [None]:
#playerGame['Game_Id'] = (playerGame['Season'].astype(str) + playerGame['Game_Id'].astype(str)).astype(np.int64)

In [None]:
playerGame = playerGame.sort_values(by=['Date','Team'])
playerGame['GameScore_last8'] = playerGame.groupby(['Player','PlayerID','Position'])[['GameScore']].transform(lambda x: x.rolling(window=8).mean()).fillna(0.)
playerGame['GameScore_last16'] = playerGame.groupby(['Player','PlayerID','Position'])[['GameScore']].transform(lambda x: x.rolling(window=16).mean()).fillna(0.)
playerGame['GameScore_last32'] = playerGame.groupby(['Player','PlayerID','Position'])[['GameScore']].transform(lambda x: x.rolling(window=32).mean()).fillna(0.)
playerGame['GameScore_last64'] = playerGame.groupby(['Player','PlayerID','Position'])[['GameScore']].transform(lambda x: x.rolling(window=64).mean()).fillna(0.)

In [None]:
playerTeamGame = playerGame.groupby(['Date','Team']).agg({
    'GameScore' : ['mean','median','max','std'],
    'GameScore_last8' : ['mean','median','max','std'],
    'GameScore_last16' : ['mean','median','max','std'],
    'GameScore_last32' : ['mean','median','max','std'],
    'GameScore_last64' : ['mean','median','max','std']
}).reset_index()
playerTeamGame.columns = ['Date','Team','GameScore_mean','GameScore_median','GameScore_max','GameScore_std',
                          'GameScore_last8_mean','GameScore_last8_median','GameScore_last8_max','GameScore_last8_std',
                          'GameScore_last16_mean','GameScore_last16_median','GameScore_last16_max','GameScore_last16_std',
                          'GameScore_last32_mean','GameScore_last32_median','GameScore_last32_max','GameScore_last32_std',
                          'GameScore_last64_mean','GameScore_last64_median','GameScore_last64_max','GameScore_last64_std'
                          ]
teamGame = teamGame.merge(playerTeamGame, on=['Date','Team'], how='left')

playerTeamGame = playerGame.loc[playerGame['Position']=='F'].groupby(['Date','Team']).agg({
    'GameScore' : ['mean','median','max','std'],
    'GameScore_last8' : ['mean','median','max','std'],
    'GameScore_last16' : ['mean','median','max','std'],
    'GameScore_last32' : ['mean','median','max','std'],
    'GameScore_last64' : ['mean','median','max','std']
}).reset_index()
playerTeamGame.columns = ['Date','Team','GameScoreF_mean','GameScoreF_median','GameScoreF_max','GameScoreF_std',
                          'GameScoreF_last8_mean','GameScoreF_last8_median','GameScoreF_last8_max','GameScoreF_last8_std',
                          'GameScoreF_last16_mean','GameScoreF_last16_median','GameScoreF_last16_max','GameScoreF_last16_std',
                          'GameScoreF_last32_mean','GameScoreF_last32_median','GameScoreF_last32_max','GameScoreF_last32_std',
                          'GameScoreF_last64_mean','GameScoreF_last64_median','GameScoreF_last64_max','GameScoreF_last64_std'
                          ]
teamGame = teamGame.merge(playerTeamGame, on=['Date','Team'], how='left')

playerTeamGame = playerGame.loc[playerGame['Position']=='D'].groupby(['Date','Team']).agg({
    'GameScore' : ['mean','median','max','std'],
    'GameScore_last8' : ['mean','median','max','std'],
    'GameScore_last16' : ['mean','median','max','std'],
    'GameScore_last32' : ['mean','median','max','std'],
    'GameScore_last64' : ['mean','median','max','std']
}).reset_index()
playerTeamGame.columns = ['Date','Team','GameScoreD_mean','GameScoreD_median','GameScoreD_max','GameScoreD_std',
                          'GameScoreD_last8_mean','GameScoreD_last8_median','GameScoreD_last8_max','GameScoreD_last8_std',
                          'GameScoreD_last16_mean','GameScoreD_last16_median','GameScoreD_last16_max','GameScoreD_last16_std',
                          'GameScoreD_last32_mean','GameScoreD_last32_median','GameScoreD_last32_max','GameScoreD_last32_std',
                          'GameScoreD_last64_mean','GameScoreD_last64_median','GameScoreD_last64_max','GameScoreD_last64_std'
                          ]
teamGame = teamGame.merge(playerTeamGame, on=['Date','Team'], how='left')

In [None]:
def _add_lag(df, cols, lag, groupCol):
    new_cols = [col+'_last'+str(lag) for col in cols]
    df[new_cols] = df.groupby(groupCol)[cols].transform(lambda x: x.rolling(window=lag).mean())
    df[new_cols] = df.groupby(groupCol)[new_cols].shift(1)

    return df

In [None]:
# add some lag features (rolling averages of past games)
train_df = _add_lag(teamGame.loc[teamGame['Playoffs']==0].copy(),
                    ['Goals','Shots','ShotAttempts','xG','Goals_5v5','Shots_5v5','ShotAttempts_5v5','xG_5v5',
                     'GameScore_mean','GameScore_median','GameScore_max','GameScore_std'], 8, 'Team')
train_df = _add_lag(train_df, ['Goals','Shots','ShotAttempts','xG','Goals_5v5','Shots_5v5','ShotAttempts_5v5','xG_5v5',
                     'GameScore_mean','GameScore_median','GameScore_max','GameScore_std'], 16, 'Team')
train_df = _add_lag(train_df, ['Goals','Shots','ShotAttempts','xG','Goals_5v5','Shots_5v5','ShotAttempts_5v5','xG_5v5',
                     'GameScore_mean','GameScore_median','GameScore_max','GameScore_std'], 32, 'Team')
train_df = _add_lag(train_df, ['Goals','Shots','ShotAttempts','xG','Goals_5v5','Shots_5v5','ShotAttempts_5v5','xG_5v5',
                     'GameScore_mean','GameScore_median','GameScore_max','GameScore_std'], 64, 'Team')

In [None]:
# shift GameScore columns
shift_cols = ['GameScore_last8_mean','GameScore_last8_median','GameScore_last8_max','GameScore_last8_std',
          'GameScore_last16_mean','GameScore_last16_median','GameScore_last16_max','GameScore_last16_std',
          'GameScore_last32_mean','GameScore_last32_median','GameScore_last32_max','GameScore_last32_std',
          'GameScore_last64_mean','GameScore_last64_median','GameScore_last64_max','GameScore_last64_std',
          'GameScoreF_last8_mean','GameScoreF_last8_median','GameScoreF_last8_max','GameScoreF_last8_std',
          'GameScoreF_last16_mean','GameScoreF_last16_median','GameScoreF_last16_max','GameScoreF_last16_std',
          'GameScoreF_last32_mean','GameScoreF_last32_median','GameScoreF_last32_max','GameScoreF_last32_std',
          'GameScoreF_last64_mean','GameScoreF_last64_median','GameScoreF_last64_max','GameScoreF_last64_std',
          'GameScoreD_last8_mean','GameScoreD_last8_median','GameScoreD_last8_max','GameScoreD_last8_std',
          'GameScoreD_last16_mean','GameScoreD_last16_median','GameScoreD_last16_max','GameScoreD_last16_std',
          'GameScoreD_last32_mean','GameScoreD_last32_median','GameScoreD_last32_max','GameScoreD_last32_std',
          'GameScoreD_last64_mean','GameScoreD_last64_median','GameScoreD_last64_max','GameScoreD_last64_std'
]
train_df[shift_cols] = train_df.groupby('Team')[shift_cols].shift(1)

In [None]:
def _add_player_lag(df, playerGame, lag):
    playerGame[['GameScore_last'+str(lag)]] = playerGame.groupby('PlayerID')[['GameScore']].fillna(0.).transform(lambda x: x.rolling(window=lag).mean())
    playerGame[['GameScore_last'+str(lag)]] = playerGame.groupby('PlayerID')[['GameScore_last'+str(lag)]].shift(1).fillna(0.)

    playerGame = playerGame.groupby(['Game_Id','Team']).agg({
        'GameScore_last'+str(lag) : ['mean','median','max','std']
    }).reset_index()
    playerGame.columns = ['Game_Id','Team','GameScore_mean_last'+str(lag),'GameScore_median_last'+str(lag),'GameScore_max_last'+str(lag),'GameScore_std_last'+str(lag)]
    df = df.merge(playerGame, how='left', on=['Game_Id','Team'])

    return df

In [None]:
#train_df = _add_player_lag(train_df, playerGame.loc[playerGame['Playoffs']==0].copy(), 8)
#train_df = _add_player_lag(train_df, playerGame.loc[playerGame['Playoffs']==0].copy(), 16)
#train_df = _add_player_lag(train_df, playerGame.loc[playerGame['Playoffs']==0].copy(), 32)
#train_df = _add_player_lag(train_df, playerGame.loc[playerGame['Playoffs']==0].copy(), 64)

In [None]:
# do a self-join to get features for the opposing team
train_df['teamGameVal'] = train_df.groupby(['Game_Id','Season','Date'])['xG'].rank("dense").replace(2, -1)
train_df_opp = train_df.copy()
train_df_opp['teamGameVal'] = train_df_opp['teamGameVal'] * -1
train_df_opp = train_df_opp.drop(['Win','DateInt','Playoffs','Home','teamGameRankOverall','teamGameRank'],1)
train_df_opp.columns = ['Game_Id','Date','Opp','Season',
                        'GoalsAgainst','ShotsAgainst','ShotAttemptsAgainst','xGAgainst',
                        'GoalsAgainst_5v5','ShotsAgainst_5v5','ShotAttemptsAgainst_5v5','xGAgainst_5v5','PenaltiesAgainst',
                        'OppElo','OppGameScore_mean','OppGameScore_median','OppGameScore_max','OppGameScore_std',
                        'OppGameScore_last8_mean','OppGameScore_last8_median','OppGameScore_last8_max','OppGameScore_last8_std',
                        'OppGameScore_last16_mean','OppGameScore_last16_median','OppGameScore_last16_max','OppGameScore_last16_std',
                        'OppGameScore_last32_mean','OppGameScore_last32_median','OppGameScore_last32_max','OppGameScore_last32_std',
                        'OppGameScore_last64_mean','OppGameScore_last64_median','OppGameScore_last64_max','OppGameScore_last64_std',
                        'OppGameScoreF_mean','OppGameScoreF_median','OppGameScoreF_max','OppGameScoreF_std',
                        'OppGameScoreF_last8_mean','OppGameScoreF_last8_median','OppGameScoreF_last8_max','OppGameScoreF_last8_std',
                        'OppGameScoreF_last16_mean','OppGameScoreF_last16_median','OppGameScoreF_last16_max','OppGameScoreF_last16_std',
                        'OppGameScoreF_last32_mean','OppGameScoreF_last32_median','OppGameScoreF_last32_max','OppGameScoreF_last32_std',
                        'OppGameScoreF_last64_mean','OppGameScoreF_last64_median','OppGameScoreF_last64_max','OppGameScoreF_last64_std',
                        'OppGameScoreD_mean','OppGameScoreD_median','OppGameScoreD_max','OppGameScoreD_std',
                        'OppGameScoreD_last8_mean','OppGameScoreD_last8_median','OppGameScoreD_last8_max','OppGameScoreD_last8_std',
                        'OppGameScoreD_last16_mean','OppGameScoreD_last16_median','OppGameScoreD_last16_max','OppGameScoreD_last16_std',
                        'OppGameScoreD_last32_mean','OppGameScoreD_last32_median','OppGameScoreD_last32_max','OppGameScoreD_last32_std',
                        'OppGameScoreD_last64_mean','OppGameScoreD_last64_median','OppGameScoreD_last64_max','OppGameScoreD_last64_std',
                        'OppGoals_last8','OppShots_last8','OppShotAttempts_last8','OppxG_last8',
                        'OppGoals_5v5_last8','OppShots_5v5_last8','OppShotAttempts_5v5_last8','OppxG_5v5_last8',
                        'OppGoals_last16','OppShots_last16','OppShotAttempts_last16','OppxG_last16',
                        'OppGoals_5v5_last16','OppShots_5v5_last16','OppShotAttempts_5v5_last16','OppxG_5v5_last16',
                        'OppGoals_last32','OppShots_last32','OppShotAttempts_last32','OppxG_last32',
                        'OppGoals_5v5_last32','OppShots_5v5_last32','OppShotAttempts_5v5_last32','OppxG_5v5_last32',
                        'OppGoals_last64','OppShots_last64','OppShotAttempts_last64','OppxG_last64',
                        'OppGoals_5v5_last64','OppShots_5v5_last64','OppShotAttempts_5v5_last64','OppxG_5v5_last64',
                        'OppGameScore_mean_last8','OppGameScore_median_last8','OppGameScore_max_last8','OppGameScore_std_last8',
                        'OppGameScore_mean_last16','OppGameScore_median_last16','OppGameScore_max_last16','OppGameScore_std_last16',
                        'OppGameScore_mean_last32','OppGameScore_median_last32','OppGameScore_max_last32','OppGameScore_std_last32',
                        'OppGameScore_mean_last64','OppGameScore_median_last64','OppGameScore_max_last64','OppGameScore_std_last64',
                        'teamGameVal'
                        ]
train_df = train_df.merge(train_df_opp, on=['Game_Id','Date','Season','teamGameVal'])
train_df = train_df.drop('teamGameVal',1)
train_df['EloDiff'] = train_df['Elo'] - train_df['OppElo']
train_df['EloDiff_538adj'] = (train_df['Elo'] - train_df['OppElo'] + (train_df['Home']*50) + ((train_df['Home']-1)*50))*(train_df['Playoffs']*0.25+1)

In [None]:
# add lag features for allowed stats
train_df = _add_lag(train_df, ['GoalsAgainst','ShotsAgainst','ShotAttemptsAgainst','xGAgainst',
                               'GoalsAgainst_5v5','ShotsAgainst_5v5','ShotAttemptsAgainst_5v5','xGAgainst_5v5'], 8, 'Team')
train_df = _add_lag(train_df, ['GoalsAgainst','ShotsAgainst','ShotAttemptsAgainst','xGAgainst',
                               'GoalsAgainst_5v5','ShotsAgainst_5v5','ShotAttemptsAgainst_5v5','xGAgainst_5v5'], 16, 'Team')
train_df = _add_lag(train_df, ['GoalsAgainst','ShotsAgainst','ShotAttemptsAgainst','xGAgainst',
                               'GoalsAgainst_5v5','ShotsAgainst_5v5','ShotAttemptsAgainst_5v5','xGAgainst_5v5'], 32, 'Team')
train_df = _add_lag(train_df, ['GoalsAgainst','ShotsAgainst','ShotAttemptsAgainst','xGAgainst',
                               'GoalsAgainst_5v5','ShotsAgainst_5v5','ShotAttemptsAgainst_5v5','xGAgainst_5v5'], 64, 'Team')

In [None]:
train_df['LastGame'] = train_df.groupby('Team')['Date'].shift(1)
train_df['RestDays'] = (pd.to_datetime(train_df['Date']) - pd.to_datetime(train_df['LastGame']))/np.timedelta64(1,'D')

In [None]:
train_df = train_df.merge(lines[['Season','Date','Team','Open','Close']], how='left', on=['Season','Date','Team'])

In [None]:
train_df['OpenImpliedPct'] = 0.
train_df['Open'] = train_df['Open'].astype(float)
train_df.loc[train_df['Open']<0, 'OpenImpliedPct'] = train_df.loc[train_df['Open']<0, 'Open']/(train_df.loc[train_df['Open']<0, 'Open']-100)
train_df.loc[train_df['Open']>0, 'OpenImpliedPct'] = 100/(100+train_df.loc[train_df['Open']>0, 'Open'])

train_df['CloseImpliedPct'] = 0.
train_df['Close'] = train_df['Close'].astype(float)
train_df.loc[train_df['Close']<0, 'CloseImpliedPct'] = train_df.loc[train_df['Close']<0, 'Close']/(train_df.loc[train_df['Close']<0, 'Close']-100)
train_df.loc[train_df['Close']>0, 'CloseImpliedPct'] = 100/(100+train_df.loc[train_df['Close']>0, 'Close'])

In [None]:
df_all = train_df.copy(deep=True)

#Current Model

In [None]:
features_set = ['Elo', 'GameScore_last8_mean',
       'GameScore_last8_median', 'GameScore_last8_max', 'GameScore_last8_std',
       'GameScore_last16_mean', 'GameScore_last16_median',
       'GameScore_last16_max', 'GameScore_last16_std', 'GameScore_last32_mean',
       'GameScore_last32_median', 'GameScore_last32_max',
       'GameScore_last32_std', 'GameScore_last64_mean',
       'GameScore_last64_median', 'GameScore_last64_max',
       'GameScore_last64_std', 'GameScoreF_last8_mean',
       'GameScoreF_last8_median', 'GameScoreF_last8_max',
       'GameScoreF_last8_std', 'GameScoreF_last16_mean',
       'GameScoreF_last16_median','GameScoreF_last16_max', 'GameScoreF_last16_std',
       'GameScoreF_last32_mean', 'GameScoreF_last32_median',
       'GameScoreF_last32_max', 'GameScoreF_last32_std',
       'GameScoreF_last64_mean', 'GameScoreF_last64_median',
       'GameScoreF_last64_max', 'GameScoreF_last64_std',
       'GameScoreD_last8_mean', 'GameScoreD_last8_median',
       'GameScoreD_last8_max', 'GameScoreD_last8_std',
       'GameScoreD_last16_mean', 'GameScoreD_last16_median',
       'GameScoreD_last16_max', 'GameScoreD_last16_std',
       'GameScoreD_last32_mean', 'GameScoreD_last32_median',
       'GameScoreD_last32_max', 'GameScoreD_last32_std',
       'GameScoreD_last64_mean', 'GameScoreD_last64_median',
       'GameScoreD_last64_max', 'GameScoreD_last64_std', 'Goals_last8',
       'Shots_last8', 'ShotAttempts_last8', 'xG_last8', 'Goals_5v5_last8',
       'Shots_5v5_last8', 'ShotAttempts_5v5_last8', 'xG_5v5_last8',
       'GameScore_mean_last8', 'GameScore_median_last8', 'GameScore_max_last8',
       'GameScore_std_last8', 'Goals_last16', 'Shots_last16',
       'ShotAttempts_last16', 'xG_last16', 'Goals_5v5_last16',
       'Shots_5v5_last16', 'ShotAttempts_5v5_last16', 'xG_5v5_last16',
       'GameScore_mean_last16', 'GameScore_median_last16',
       'GameScore_max_last16', 'GameScore_std_last16', 'Goals_last32',
       'Shots_last32', 'ShotAttempts_last32', 'xG_last32', 'Goals_5v5_last32',
       'Shots_5v5_last32', 'ShotAttempts_5v5_last32', 'xG_5v5_last32',
       'GameScore_mean_last32', 'GameScore_median_last32',
       'GameScore_max_last32', 'GameScore_std_last32', 'Goals_last64',
       'Shots_last64', 'ShotAttempts_last64', 'xG_last64', 'Goals_5v5_last64',
       'Shots_5v5_last64', 'ShotAttempts_5v5_last64', 'xG_5v5_last64',
       'GameScore_mean_last64', 'GameScore_median_last64',
       'GameScore_max_last64', 'GameScore_std_last64', 'OppElo', 
       'OppGameScore_last8_mean', 'OppGameScore_last8_median',
       'OppGameScore_last8_max', 'OppGameScore_last8_std',
       'OppGameScore_last16_mean', 'OppGameScore_last16_median',
       'OppGameScore_last16_max', 'OppGameScore_last16_std', 'OppGameScore_last32_mean',
       'OppGameScore_last32_median', 'OppGameScore_last32_max',
       'OppGameScore_last32_std', 'OppGameScore_last64_mean',
       'OppGameScore_last64_median', 'OppGameScore_last64_max',
       'OppGameScore_last64_std', 'OppGameScoreF_last8_mean',
       'OppGameScoreF_last8_median', 'OppGameScoreF_last8_max',
       'OppGameScoreF_last8_std', 'OppGameScoreF_last16_mean',
       'OppGameScoreF_last16_median', 'OppGameScoreF_last16_max',
       'OppGameScoreF_last16_std', 'OppGameScoreF_last32_mean',
       'OppGameScoreF_last32_median', 'OppGameScoreF_last32_max',
       'OppGameScoreF_last32_std', 'OppGameScoreF_last64_mean',
       'OppGameScoreF_last64_median', 'OppGameScoreF_last64_max',
       'OppGameScoreF_last64_std', 'OppGameScoreD_last8_mean', 'OppGameScoreD_last8_median',
       'OppGameScoreD_last8_max', 'OppGameScoreD_last8_std',
       'OppGameScoreD_last16_mean', 'OppGameScoreD_last16_median',
       'OppGameScoreD_last16_max', 'OppGameScoreD_last16_std',
       'OppGameScoreD_last32_mean', 'OppGameScoreD_last32_median',
       'OppGameScoreD_last32_max', 'OppGameScoreD_last32_std',
       'OppGameScoreD_last64_mean', 'OppGameScoreD_last64_median',
       'OppGameScoreD_last64_max', 'OppGameScoreD_last64_std',
       'OppGoals_last8', 'OppShots_last8', 'OppShotAttempts_last8', 'OppxG_last8',
       'OppGoals_5v5_last8', 'OppShots_5v5_last8', 'OppShotAttempts_5v5_last8',
       'OppxG_5v5_last8', 'OppGoals_last16', 'OppShots_last16',
       'OppShotAttempts_last16', 'OppxG_last16', 'OppGoals_5v5_last16',
       'OppShots_5v5_last16', 'OppShotAttempts_5v5_last16', 'OppxG_5v5_last16',
       'OppGoals_last32', 'OppShots_last32', 'OppShotAttempts_last32',
       'OppxG_last32', 'OppGoals_5v5_last32', 'OppShots_5v5_last32',
       'OppShotAttempts_5v5_last32', 'OppxG_5v5_last32', 'OppGoals_last64',
       'OppShots_last64', 'OppShotAttempts_last64', 'OppxG_last64',
       'OppGoals_5v5_last64', 'OppShots_5v5_last64',
       'OppShotAttempts_5v5_last64', 'OppxG_5v5_last64',
       'OppGameScore_mean_last8', 'OppGameScore_median_last8',
       'OppGameScore_max_last8', 'OppGameScore_std_last8',
       'OppGameScore_mean_last16', 'OppGameScore_median_last16',
       'OppGameScore_max_last16', 'OppGameScore_std_last16',
       'OppGameScore_mean_last32', 'OppGameScore_median_last32',
       'OppGameScore_max_last32', 'OppGameScore_std_last32',
       'OppGameScore_mean_last64', 'OppGameScore_median_last64',
       'OppGameScore_max_last64', 'OppGameScore_std_last64', 'EloDiff',
       'EloDiff_538adj', 'GoalsAgainst_last8', 'ShotsAgainst_last8', 'ShotAttemptsAgainst_last8', 'xGAgainst_last8',
       'GoalsAgainst_5v5_last8', 'ShotsAgainst_5v5_last8',
       'ShotAttemptsAgainst_5v5_last8', 'xGAgainst_5v5_last8',
       'GoalsAgainst_last16', 'ShotsAgainst_last16',
       'ShotAttemptsAgainst_last16', 'xGAgainst_last16',
       'GoalsAgainst_5v5_last16', 'ShotsAgainst_5v5_last16',
       'ShotAttemptsAgainst_5v5_last16', 'xGAgainst_5v5_last16',
       'GoalsAgainst_last32', 'ShotsAgainst_last32',
       'ShotAttemptsAgainst_last32', 'xGAgainst_last32',
       'GoalsAgainst_5v5_last32', 'ShotsAgainst_5v5_last32',
       'ShotAttemptsAgainst_5v5_last32', 'xGAgainst_5v5_last32',
       'GoalsAgainst_last64', 'ShotsAgainst_last64',
       'ShotAttemptsAgainst_last64', 'xGAgainst_last64',
       'GoalsAgainst_5v5_last64', 'ShotsAgainst_5v5_last64',
       'ShotAttemptsAgainst_5v5_last64', 'xGAgainst_5v5_last64',
       'RestDays', 'Close', 'CloseImpliedPct'
       ]

In [None]:
train_df = df_all.loc[df_all['Home']==1]
curFeatures_logloss = 0.6733682725936084
best_logloss = 0.6733682725936084

curFeatures = ['Open','OpenImpliedPct']

improving = True

while improving:
    improving = False
    for c in features_set:
        if c not in curFeatures:
            features = curFeatures+[c]
            train_df_loop = train_df.loc[~train_df[features].isnull().max(1)]

            [X_train_df, X_test_df, y_train_df, y_test_df] = train_test_split(train_df_loop, train_df_loop[['Win']], test_size=0.3, random_state=26)
            y_train = y_train_df['Win'].values
            y_test = y_test_df['Win'].values

            X_train = X_train_df[features].values
            X_test = X_test_df[features].values

            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            X = np.concatenate((X_train, X_test))
            y = np.concatenate((y_train, y_test))

            model_lr = LogisticRegression(max_iter=1000)
            model_lr.fit(X_train, y_train)
            preds_lr = model_lr.predict_proba(X_test)[:,1]
            logloss = log_loss(y_test, preds_lr)

            if logloss<curFeatures_logloss:
                improving=True
                if logloss<best_logloss:
                    best_feature = c
                    best_logloss = logloss

    if improving:
        curFeatures.append(best_feature)
        curFeatures_logloss = best_logloss

    print (curFeatures)
    print (curFeatures_logloss)

['Open', 'OpenImpliedPct', 'ShotAttempts_5v5_last8']
0.6658297293367441
['Open', 'OpenImpliedPct', 'ShotAttempts_5v5_last8', 'OppShotAttempts_5v5_last16']
0.6629198075747901
['Open', 'OpenImpliedPct', 'ShotAttempts_5v5_last8', 'OppShotAttempts_5v5_last16', 'CloseImpliedPct']
0.6607783389027913
['Open', 'OpenImpliedPct', 'ShotAttempts_5v5_last8', 'OppShotAttempts_5v5_last16', 'CloseImpliedPct', 'GameScoreF_last32_median']
0.660327161456636
['Open', 'OpenImpliedPct', 'ShotAttempts_5v5_last8', 'OppShotAttempts_5v5_last16', 'CloseImpliedPct', 'GameScoreF_last32_median', 'ShotsAgainst_5v5_last8']
0.6600429241821676
['Open', 'OpenImpliedPct', 'ShotAttempts_5v5_last8', 'OppShotAttempts_5v5_last16', 'CloseImpliedPct', 'GameScoreF_last32_median', 'ShotsAgainst_5v5_last8', 'GoalsAgainst_5v5_last8']
0.6597906406997897
['Open', 'OpenImpliedPct', 'ShotAttempts_5v5_last8', 'OppShotAttempts_5v5_last16', 'CloseImpliedPct', 'GameScoreF_last32_median', 'ShotsAgainst_5v5_last8', 'GoalsAgainst_5v5_last8',

In [None]:
curFeatures = ['ShotAttempts_5v5_last8', 'OppShotAttempts_5v5_last16', 'CloseImpliedPct', 'GameScoreF_last32_median', 'ShotsAgainst_5v5_last8', 'GoalsAgainst_5v5_last8', 'GameScoreD_last16_mean', 'OppShotAttempts_last32', 'ShotsAgainst_last8', 'EloDiff', 'OppShotAttempts_5v5_last8', 'ShotAttempts_last8', 'GameScore_last8_max', 'GameScoreF_last8_max', 'GameScoreD_last8_max', 'RestDays', 'GameScore_last64_median', 'GameScore_last64_std', 'GameScoreF_last64_std', 'GameScoreF_last16_mean', 'OppShots_5v5_last8', 'OppShots_last32', 'EloDiff_538adj']

In [None]:
curFeatures_logloss = 0.6580631981225172
best_logloss = 0.6580631981225172

#curFeatures = ['Open','OpenImpliedPct']

removed = []

improving = True

while improving:
    improving = False
    for c in curFeatures:
        if c not in removed:
            features = ['Open','OpenImpliedPct'] + curFeatures.copy()
            features.remove(c)
            train_df_loop = train_df.loc[~train_df[features].isnull().max(1)]

            [X_train_df, X_test_df, y_train_df, y_test_df] = train_test_split(train_df_loop, train_df_loop[['Win']], test_size=0.3, random_state=26)
            y_train = y_train_df['Win'].values
            y_test = y_test_df['Win'].values

            X_train = X_train_df[features].values
            X_test = X_test_df[features].values

            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            X = np.concatenate((X_train, X_test))
            y = np.concatenate((y_train, y_test))

            model_lr = LogisticRegression(max_iter=1000)
            model_lr.fit(X_train, y_train)
            preds_lr = model_lr.predict_proba(X_test)[:,1]
            logloss = log_loss(y_test, preds_lr)

            if logloss<curFeatures_logloss:
                improving=True
                if logloss<best_logloss:
                    worst_feature = c
                    best_logloss = logloss

    if improving:
        curFeatures.remove(worst_feature)
        curFeatures_logloss = best_logloss

    print (curFeatures)
    print (curFeatures_logloss)

curFeatures = ['Open','OpenImpliedPct'] + curFeatures

['Open', 'ShotAttempts_5v5_last8', 'OppShotAttempts_5v5_last16', 'GameScoreF_last32_median', 'ShotsAgainst_5v5_last8', 'GoalsAgainst_5v5_last8', 'OppShotAttempts_last32', 'ShotsAgainst_last8', 'EloDiff', 'OppShotAttempts_5v5_last8', 'ShotAttempts_last8', 'GameScore_last8_max', 'GameScoreF_last8_max', 'GameScoreD_last8_max', 'RestDays', 'GameScore_last64_median', 'GameScore_last64_std', 'GameScoreF_last64_std', 'GameScoreF_last16_mean', 'OppShots_5v5_last8', 'OppShots_last32', 'EloDiff_538adj', 'CloseImpliedPct']
0.657972967059018
['ShotAttempts_5v5_last8', 'OppShotAttempts_5v5_last16', 'GameScoreF_last32_median', 'ShotsAgainst_5v5_last8', 'GoalsAgainst_5v5_last8', 'OppShotAttempts_last32', 'ShotsAgainst_last8', 'EloDiff', 'OppShotAttempts_5v5_last8', 'ShotAttempts_last8', 'GameScore_last8_max', 'GameScoreF_last8_max', 'GameScoreD_last8_max', 'RestDays', 'GameScore_last64_median', 'GameScore_last64_std', 'GameScoreF_last64_std', 'GameScoreF_last16_mean', 'OppShots_5v5_last8', 'OppShots_

In [None]:
curFeatures

['Open',
 'OpenImpliedPct',
 'ShotAttempts_5v5_last8',
 'OppShotAttempts_5v5_last16',
 'GameScoreF_last32_median',
 'ShotsAgainst_5v5_last8',
 'GoalsAgainst_5v5_last8',
 'OppShotAttempts_last32',
 'ShotsAgainst_last8',
 'EloDiff',
 'OppShotAttempts_5v5_last8',
 'ShotAttempts_last8',
 'GameScore_last8_max',
 'GameScoreF_last8_max',
 'GameScoreD_last8_max',
 'RestDays',
 'GameScore_last64_median',
 'GameScore_last64_std',
 'GameScoreF_last64_std',
 'GameScoreF_last16_mean',
 'OppShots_5v5_last8',
 'OppShots_last32',
 'EloDiff_538adj',
 'CloseImpliedPct']

In [None]:
# remove rows without enough prior data
#train_df = train_df.loc[~((train_df['xG_last64'].isnull()) | (train_df['OppxG_last64'].isnull()))]
#train_df = df_all.loc[(df_all['Home']==1)&(df_all['Season']<2021)]
train_df = df_all.loc[df_all['Home']==1]
features = ['Open','OpenImpliedPct']
#features = curFeatures


train_df = train_df.loc[~train_df[features].isnull().max(1)]

# train/test split
[X_train_df, X_test_df, y_train_df, y_test_df] = train_test_split(train_df, train_df[['Win']], test_size=0.3, random_state=26)
y_train = y_train_df['Win'].values
y_test = y_test_df['Win'].values


#X_train_df = train_df.loc[train_df['Season']<2020].copy()
#X_test_df = train_df.loc[train_df['Season'].isin([2020])].copy()
#y_train = train_df.loc[train_df['Season']<2020]['Win'].values
#y_test = train_df.loc[train_df['Season'].isin([2020])]['Win'].values

X_train = X_train_df[features].values
X_test = X_test_df[features].values

# apply scaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X = np.concatenate((X_train, X_test))
y = np.concatenate((y_train, y_test))

In [None]:
model_lr = LogisticRegression(max_iter=1000)
model_lr.fit(X_train, y_train)
preds_lr = model_lr.predict_proba(X_test)[:,1]
print ('LogLoss score: {}'.format(str(log_loss(y_test, preds_lr))))
print ('AUC score: {}'.format(str(roc_auc_score(y_test, preds_lr))))
#V3 0.665534621181465, train_test_split
#v4 0.6558728278825735, train pre-2020, test on 2020
#v4 0.6720558119637386, train_test_split, with all through dec 21 2021

LogLoss score: 0.6733682725936084
AUC score: 0.6054777083550505


In [None]:
adj_preds_lr = ((preds_lr - 0.5) * 1.28) + 0.5
print ('LogLoss score: {}'.format(str(log_loss(y_test, adj_preds_lr))))
print ('AUC score: {}'.format(str(roc_auc_score(y_test, adj_preds_lr))))
#0.6679878762459884

LogLoss score: 0.6772394272790156
AUC score: 0.6054777083550505


In [None]:
X_test_df['winPct'] = preds_lr
X_test_df['binned_winPct'] = np.round(X_test_df['winPct'], 1)
X_test_df[['binned_winPct','winPct','Win']]\
    .groupby(['binned_winPct']).agg({'winPct':['mean'],'Win':['count','mean']})

Unnamed: 0_level_0,winPct,Win,Win
Unnamed: 0_level_1,mean,count,mean
binned_winPct,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0.2,0.234207,5,0.6
0.3,0.317444,79,0.303797
0.4,0.412217,436,0.394495
0.5,0.500988,1119,0.499553
0.6,0.595013,1008,0.628968
0.7,0.686583,386,0.735751
0.8,0.775797,31,0.967742


In [None]:
df_2021 = df_all.loc[(df_all['Home']==1)&(df_all['Season']==2021)]
df_2021 = df_2021.loc[~df_2021[features].isnull().max(1)]
y_2021 = df_2021['Win'].values
X_2021 = scaler.transform(df_2021[features].values)

In [None]:
preds_2021 = model_lr.predict_proba(X_2021)[:,1]
print ('LogLoss score: {}'.format(str(log_loss(y_2021, preds_2021))))
print ('AUC score: {}'.format(str(roc_auc_score(y_2021, preds_2021))))

LogLoss score: 0.6452322068652986
AUC score: 0.6620987125263178


In [None]:
full_train = train_df.loc[train_df['Season']<2021][features].values
full_y = train_df.loc[train_df['Season']<2021]['Win'].values
scaler = StandardScaler()
full_train = scaler.fit_transform(full_train)
model = LogisticRegression(max_iter=2000)
model.fit(full_train, full_y)
preds_lr = model.predict_proba(scaler.transform(X_test_df[features].values))[:,1]
print ('LogLoss score: {}'.format(str(log_loss(y_test, preds_lr))))
print ('AUC score: {}'.format(str(roc_auc_score(y_test, preds_lr))))

LogLoss score: 0.6729674466071449
AUC score: 0.6061133443656834


In [None]:
preds_2021 = model.predict_proba(X_2021)[:,1]
print ('LogLoss score: {}'.format(str(log_loss(y_2021, preds_2021))))
print ('AUC score: {}'.format(str(roc_auc_score(y_2021, preds_2021))))

LogLoss score: 0.6472410805787999
AUC score: 0.6616329116282537


In [None]:
df_2021['winPct'] = preds_2021
df_2021['OppWinPct'] = 1 - df_2021['winPct']

In [None]:
pickle.dump(model, open('lr.pkl', 'wb'))
pickle.dump(scaler, open('scaler.pkl', 'wb'))

#CNN of Game Scores

In [None]:
df_gameScores = train_df[['Team','Date','Opp','Season','Win']]

In [None]:
#get game scores by game and position
temp_df = playerGame.loc[playerGame['Position']=='F'][['Team','Date','GameScore']].groupby(['Team','Date'])['GameScore'].apply(np.array).reset_index()
temp_df['count'] = temp_df['GameScore'].apply(lambda x: len(x))
temp_df['GameScore'] = temp_df['GameScore'].apply(lambda x: np.sort(x))
temp_df = temp_df.loc[temp_df['count']>=1]
temp_df.loc[temp_df['count']>12, 'GameScore'] = temp_df.loc[temp_df['count']>12, 'GameScore'].apply(lambda x: x[:12])
temp_df['GameScore'] = temp_df['GameScore'].apply(lambda x: np.pad(x, (0,12-len(x)))[:12])
temp_df = temp_df.drop('count',1)

#get previous 32 games of game scores
temp_df['GameScoresF_last32'] = np.nan
for t in temp_df['Team'].unique():
    X_gameScores = []
    team_df = temp_df.loc[temp_df['Team']==t,:]
    if len(team_df.index)>32:
        for i in range(32,len(team_df.index)):
            X_gameScores.append(np.stack(team_df.iloc[i-32:i, :].loc[:,'GameScore'].values))
        temp_df.loc[temp_df['Team']==t,'GameScoresF_last32'] = [np.nan]*32 + X_gameScores

#join to original data
temp_df = temp_df[['Team','Date','GameScoresF_last32']].copy()
df_gameScores = df_gameScores.merge(temp_df, on=['Team','Date'], how='left')
temp_df.columns = ['Opp','Date','OppGameScoresF_last32']
df_gameScores = df_gameScores.merge(temp_df, on=['Opp','Date'], how='left')

  return array(a, dtype, copy=False, order=order)
  dtype, _ = maybe_promote(np.array(value).dtype)
  arr_value = np.array(value)


In [None]:
#get game scores by game and position
temp_df = playerGame.loc[playerGame['Position']=='D'][['Team','Date','GameScore']].groupby(['Team','Date'])['GameScore'].apply(np.array).reset_index()
temp_df['count'] = temp_df['GameScore'].apply(lambda x: len(x))
temp_df['GameScore'] = temp_df['GameScore'].apply(lambda x: np.sort(x))
temp_df = temp_df.loc[temp_df['count']>=1]
temp_df.loc[temp_df['count']>6, 'GameScore'] = temp_df.loc[temp_df['count']>6, 'GameScore'].apply(lambda x: x[:6])
temp_df['GameScore'] = temp_df['GameScore'].apply(lambda x: np.pad(x, (0,6-len(x)))[:6])
temp_df = temp_df.drop('count',1)

#get previous 32 games of game scores
temp_df['GameScoresD_last32'] = np.nan
for t in temp_df['Team'].unique():
    X_gameScores = []
    team_df = temp_df.loc[temp_df['Team']==t,:]
    if len(team_df.index)>32:
        for i in range(32,len(team_df.index)):
            X_gameScores.append(np.stack(team_df.iloc[i-32:i, :].loc[:,'GameScore'].values))
        temp_df.loc[temp_df['Team']==t,'GameScoresD_last32'] = [np.nan]*32 + X_gameScores

#join to original data
temp_df = temp_df[['Team','Date','GameScoresD_last32']].copy()
df_gameScores = df_gameScores.merge(temp_df, on=['Team','Date'], how='left')
temp_df.columns = ['Opp','Date','OppGameScoresD_last32']
df_gameScores = df_gameScores.merge(temp_df, on=['Opp','Date'], how='left')

  return array(a, dtype, copy=False, order=order)
  dtype, _ = maybe_promote(np.array(value).dtype)
  arr_value = np.array(value)


In [None]:
features_nn = ['GameScoresF_last32','GameScoresD_last32','OppGameScoresF_last32','OppGameScoresD_last32']
train_df_temp = df_gameScores.loc[~df_gameScores[features_nn].isnull().max(1)]

X_F_train = np.stack(train_df_temp.loc[train_df_temp['Season']<2019]['GameScoresF_last32'].values)
X_D_train = np.stack(train_df_temp.loc[train_df_temp['Season']<2019]['GameScoresD_last32'].values)
X_F_Opp_train = np.stack(train_df_temp.loc[train_df_temp['Season']<2019]['OppGameScoresF_last32'].values)
X_D_Opp_train = np.stack(train_df_temp.loc[train_df_temp['Season']<2019]['OppGameScoresD_last32'].values)
y_nn_train = train_df_temp.loc[train_df_temp['Season']<2019]['Win'].values

#X_F_train = np.concatenate([np.expand_dims(X_F_train, 3), np.expand_dims(X_F_Opp_train, 3)], axis=3)
#X_D_train = np.concatenate([np.expand_dims(X_D_train, 3), np.expand_dims(X_D_Opp_train, 3)], axis=3)

X_F_test = np.stack(train_df_temp.loc[train_df_temp['Season'].isin([2019,2020])]['GameScoresF_last32'].values)
X_D_test = np.stack(train_df_temp.loc[train_df_temp['Season'].isin([2019,2020])]['GameScoresD_last32'].values)
X_F_Opp_test = np.stack(train_df_temp.loc[train_df_temp['Season'].isin([2019,2020])]['OppGameScoresF_last32'].values)
X_D_Opp_test = np.stack(train_df_temp.loc[train_df_temp['Season'].isin([2019,2020])]['OppGameScoresD_last32'].values)
y_nn_test = train_df_temp.loc[train_df_temp['Season'].isin([2019,2020])]['Win'].values

#X_F_test = np.concatenate([np.expand_dims(X_F_test, 3), np.expand_dims(X_F_Opp_test, 3)], axis=3)
#X_D_test = np.concatenate([np.expand_dims(X_D_test, 3), np.expand_dims(X_D_Opp_test, 3)], axis=3)

In [None]:
def create_model_nn(F_dim, D_dim):
    #forward channel
    inputsF = Input(shape=F_dim, name='F')
    conv1 = Conv2D(1, kernel_size=(8,1), activation='relu')(inputsF)
    pool1 = AveragePooling2D(pool_size=(4,1))(conv1)
    drop1 = Dropout(0.25)(pool1)
    conv2 = Conv2D(1, kernel_size=(2,2), activation='relu')(drop1)
    pool2 = AveragePooling2D(pool_size=(4,2))(conv2)
    drop2 = Dropout(0.25)(pool2)
    flatF = Flatten()(drop2)

    #defence channel
    inputsD = Input(shape=D_dim, name='D')
    conv3 = Conv2D(1, kernel_size=(8,1), activation='relu')(inputsD)
    pool3 = AveragePooling2D(pool_size=(4,1))(conv3)
    drop3 = Dropout(0.25)(pool3)
    conv4 = Conv2D(1, kernel_size=(2,2), activation='relu')(drop3)
    pool4 = AveragePooling2D(pool_size=(4,2))(conv4)
    drop4 = Dropout(0.25)(pool4)
    flatD = Flatten()(drop4)

    #forward channel opponents
    inputsF_opp = Input(shape=F_dim, name='F_opp')
    conv1_opp = Conv2D(1, kernel_size=(8,1), activation='relu')(inputsF_opp)
    pool1_opp = AveragePooling2D(pool_size=(4,1))(conv1_opp)
    drop1_opp = Dropout(0.25)(pool1_opp)
    conv2_opp = Conv2D(1, kernel_size=(2,2), activation='relu')(drop1_opp)
    pool2_opp = AveragePooling2D(pool_size=(4,2))(conv2_opp)
    drop2_opp = Dropout(0.25)(pool2_opp)
    flatF_opp = Flatten()(drop2_opp)

    #defence channel opponents
    inputsD_opp = Input(shape=D_dim, name='D_opp')
    conv3_opp = Conv2D(1, kernel_size=(8,1), activation='relu')(inputsD_opp)
    pool3_opp = AveragePooling2D(pool_size=(4,1))(conv3_opp)
    drop3_opp = Dropout(0.25)(pool3_opp)
    conv4_opp = Conv2D(1, kernel_size=(2,2), activation='relu')(drop3_opp)
    pool4_opp = AveragePooling2D(pool_size=(4,2))(conv4_opp)
    drop4_opp = Dropout(0.25)(pool4_opp)
    flatD_opp = Flatten()(drop4_opp)

    #merge
    merged = concatenate([flatF,flatD,flatF_opp,flatD_opp])

    #final layer
    dense1 = Dense(8, activation='relu')(merged)
    dense2 = Dense(8, activation='relu')(dense1)
    outputs = Dense(1, activation='sigmoid')(dense1)

    #compile
    model = Model(inputs=[inputsF, inputsD, inputsF_opp, inputsD_opp], outputs=outputs)
    model.compile(loss='binary_crossentropy')

    return model

In [None]:
model_nn = create_model_nn((32,12,1), (32,6,1))
model_nn.fit([X_F_train,X_D_train,X_F_Opp_train,X_D_Opp_train], y_nn_train, verbose=0, epochs=100, batch_size=256)
preds_nn = model_nn.predict([X_F_test,X_D_test,X_F_Opp_test,X_D_Opp_test])[:,0]
print ('LogLoss score: {}'.format(str(log_loss(y_nn_test, np.clip(preds_nn, a_min=10e-5, a_max = 1-10e-5)))))
print ('AUC score: {}'.format(str(roc_auc_score(y_nn_test, np.clip(preds_nn, a_min=10e-5, a_max = 1-10e-5)))))

LogLoss score: 0.6836144541210107
AUC score: 0.5792617829328774


In [None]:
X_F_meta = np.stack(train_df_temp.loc[train_df_temp['Season']>=2019]['GameScoresF_last32'].values)
X_D_meta = np.stack(train_df_temp.loc[train_df_temp['Season']>=2019]['GameScoresD_last32'].values)
X_F_Opp_meta = np.stack(train_df_temp.loc[train_df_temp['Season']>=2019]['OppGameScoresF_last32'].values)
X_D_Opp_meta = np.stack(train_df_temp.loc[train_df_temp['Season']>=2019]['OppGameScoresD_last32'].values)

preds_nn_meta = model_nn.predict([X_F_meta,X_D_meta,X_F_Opp_meta,X_D_Opp_meta])[:,0]

df_nn_meta_preds = train_df_temp.loc[train_df_temp['Season']>=2019][['Date','Team']]
df_nn_meta_preds['preds_nn_meta'] = preds_nn_meta

#Ensemble

In [None]:
train_df = df_all.loc[df_all['Home']==1]

In [None]:
features_elo = ['EloDiff','EloDiff_538adj','Elo','OppElo']

# train/test split
train_df_temp = train_df.loc[~train_df[features_elo].isnull().max(1)]
X_train = train_df_temp.loc[train_df_temp['Season']<2019][features_elo].values
X_test = train_df_temp.loc[train_df_temp['Season'].isin([2019,2020])][features_elo].values
y_train = train_df_temp.loc[train_df_temp['Season']<2019]['Win'].values
y_test = train_df_temp.loc[train_df_temp['Season'].isin([2019,2020])]['Win'].values

# apply scaler
scaler_elo = StandardScaler()
X_train = scaler_elo.fit_transform(X_train)
X_test = scaler_elo.transform(X_test)

lr_elo = LogisticRegression(max_iter=1000)
lr_elo.fit(X_train, y_train)
preds_lr_elo = lr_elo.predict_proba(X_test)[:,1]
print ('LogLoss score: {}'.format(str(log_loss(y_test, preds_lr_elo))))
print ('AUC score: {}'.format(str(roc_auc_score(y_test, preds_lr_elo))))
#0.6711898384613472

LogLoss score: 0.6711898384613472
AUC score: 0.6158138320898373


In [None]:
features_last8 = []
for c in train_df.columns:
    if 'last8' in c:
        features_last8.append(c)

# train/test split
train_df_temp = train_df.loc[~train_df[features_last8].isnull().max(1)]
X_train = train_df_temp.loc[train_df_temp['Season']<2019][features_last8].values
X_test = train_df_temp.loc[train_df_temp['Season'].isin([2019,2020])][features_last8].values
y_train = train_df_temp.loc[train_df_temp['Season']<2019]['Win'].values
y_test = train_df_temp.loc[train_df_temp['Season'].isin([2019,2020])]['Win'].values

# apply scaler
scaler_xgb_last8 = StandardScaler()
X_train = scaler_xgb_last8.fit_transform(X_train)
X_test = scaler_xgb_last8.transform(X_test)

xgb_last8 = GradientBoostingClassifier(max_depth=1)
xgb_last8.fit(X_train, y_train)
preds_xgb_last8 = xgb_last8.predict_proba(X_test)[:,1]
print ('LogLoss score: {}'.format(str(log_loss(y_test, preds_xgb_last8))))
print ('AUC score: {}'.format(str(roc_auc_score(y_test, preds_xgb_last8))))
#0.6804475734308898

LogLoss score: 0.6804475734308898
AUC score: 0.5814696264861463


In [None]:
features_last16 = []
for c in train_df.columns:
    if 'last16' in c:
        features_last16.append(c)

# train/test split
train_df_temp = train_df.loc[~train_df[features_last16].isnull().max(1)]
X_train = train_df_temp.loc[train_df_temp['Season']<2019][features_last16].values
X_test = train_df_temp.loc[train_df_temp['Season'].isin([2019,2020])][features_last16].values
y_train = train_df_temp.loc[train_df_temp['Season']<2019]['Win'].values
y_test = train_df_temp.loc[train_df_temp['Season'].isin([2019,2020])]['Win'].values

# apply scaler
scaler_xgb_last16 = StandardScaler()
X_train = scaler_xgb_last16.fit_transform(X_train)
X_test = scaler_xgb_last16.transform(X_test)

xgb_last16 = GradientBoostingClassifier(max_depth=1)
xgb_last16.fit(X_train, y_train)
preds_xgb_last16 = xgb_last16.predict_proba(X_test)[:,1]
print ('LogLoss score: {}'.format(str(log_loss(y_test, preds_xgb_last16))))
print ('AUC score: {}'.format(str(roc_auc_score(y_test, preds_xgb_last16))))
#0.6766852089235553

LogLoss score: 0.6766852089235553
AUC score: 0.5928751084312758


In [None]:
features_last32 = []
for c in train_df.columns:
    if 'last32' in c:
        features_last32.append(c)

# train/test split
train_df_temp = train_df.loc[~train_df[features_last32].isnull().max(1)]
X_train = train_df_temp.loc[train_df_temp['Season']<2019][features_last32].values
X_test = train_df_temp.loc[train_df_temp['Season'].isin([2019,2020])][features_last32].values
y_train = train_df_temp.loc[train_df_temp['Season']<2019]['Win'].values
y_test = train_df_temp.loc[train_df_temp['Season'].isin([2019,2020])]['Win'].values

# apply scaler
scaler_xgb_last32 = StandardScaler()
X_train = scaler_xgb_last32.fit_transform(X_train)
X_test = scaler_xgb_last32.transform(X_test)

xgb_last32 = GradientBoostingClassifier(max_depth=1)
xgb_last32.fit(X_train, y_train)
preds_xgb_last32 = xgb_last32.predict_proba(X_test)[:,1]
print ('LogLoss score: {}'.format(str(log_loss(y_test, preds_xgb_last32))))
print ('AUC score: {}'.format(str(roc_auc_score(y_test, preds_xgb_last32))))
#0.6770797861560187

LogLoss score: 0.6770797861560187
AUC score: 0.5941231311550694


In [None]:
features_last64 = []
for c in train_df.columns:
    if 'last64' in c:
        features_last64.append(c)

# train/test split
train_df_temp = train_df.loc[~train_df[features_last64].isnull().max(1)]
X_train = train_df_temp.loc[train_df_temp['Season']<2019][features_last64].values
X_test = train_df_temp.loc[train_df_temp['Season'].isin([2019,2020])][features_last64].values
y_train = train_df_temp.loc[train_df_temp['Season']<2019]['Win'].values
y_test = train_df_temp.loc[train_df_temp['Season'].isin([2019,2020])]['Win'].values

# apply scaler
scaler_xgb_last64 = StandardScaler()
X_train = scaler_xgb_last64.fit_transform(X_train)
X_test = scaler_xgb_last64.transform(X_test)

xgb_last64 = GradientBoostingClassifier(max_depth=1)
xgb_last64.fit(X_train, y_train)
preds_xgb_last64 = xgb_last64.predict_proba(X_test)[:,1]
print ('LogLoss score: {}'.format(str(log_loss(y_test, preds_xgb_last64))))
print ('AUC score: {}'.format(str(roc_auc_score(y_test, preds_xgb_last64))))
#0.6756582208556394

LogLoss score: 0.6756582208556394
AUC score: 0.603926594152365


In [None]:
features_lines = ['Open','OpenImpliedPct']

# train/test split
train_df_temp = train_df.loc[~train_df[features_lines].isnull().max(1)]
X_train = train_df_temp.loc[train_df_temp['Season']<2021][features_lines].values
X_test = train_df_temp.loc[train_df_temp['Season'].isin([2021])][features_lines].values
y_train = train_df_temp.loc[train_df_temp['Season']<2021]['Win'].values
y_test = train_df_temp.loc[train_df_temp['Season'].isin([2021])]['Win'].values

# apply scaler
scaler_line = StandardScaler()
X_train = scaler_line.fit_transform(X_train)
X_test = scaler_line.transform(X_test)

lr_line = LogisticRegression(max_iter=1000)
lr_line.fit(X_train, y_train)
preds_lr_line = lr_line.predict_proba(X_test)[:,1]
print ('LogLoss score: {}'.format(str(log_loss(y_test, preds_lr_line))))
print ('AUC score: {}'.format(str(roc_auc_score(y_test, preds_lr_line))))
#0.6706709685264426

LogLoss score: 0.6474150197825156
AUC score: 0.6616329116282537


In [None]:
pickle.dump(lr_line, open('lr_line.pkl', 'wb'))
pickle.dump(scaler_line, open('scaler_line.pkl', 'wb'))

In [None]:
#combine models
features = features_elo+features_last8+features_last16+features_last32+features_last64+features_lines
train_df_temp = train_df.loc[~train_df[features].isnull().max(1)]
X_meta_df = train_df_temp.loc[train_df_temp['Season']>=2019]

preds_elo = lr_elo.predict_proba(scaler_elo.transform(X_meta_df[features_elo].values))[:,1]
preds_last8 = xgb_last8.predict_proba(scaler_xgb_last8.transform(X_meta_df[features_last8].values))[:,1]
preds_last16 = xgb_last16.predict_proba(scaler_xgb_last16.transform(X_meta_df[features_last16].values))[:,1]
preds_last32 = xgb_last32.predict_proba(scaler_xgb_last32.transform(X_meta_df[features_last32].values))[:,1]
preds_last64 = xgb_last64.predict_proba(scaler_xgb_last64.transform(X_meta_df[features_last64].values))[:,1]
preds_line = lr_line.predict_proba(scaler_line.transform(X_meta_df[features_lines].values))[:,1]
temp_df = X_meta_df.merge(df_nn_meta_preds, on=['Date','Team'], how='left')
preds_nn = temp_df['preds_nn_meta'].values

X_meta_df = X_meta_df[['Game_Id','Date','Team','Season','Opp','Win','Elo','OppElo','OpenImpliedPct']]
X_meta_df['winPct_elo'] = preds_elo
X_meta_df['winPct_last8'] = preds_last8
X_meta_df['winPct_last16'] = preds_last16
X_meta_df['winPct_last32'] = preds_last32
X_meta_df['winPct_last64'] = preds_last64
X_meta_df['winPct_line'] = preds_line
X_meta_df['winPct_player_nn'] = preds_nn

X_meta_df = X_meta_df.loc[~X_meta_df[['winPct_elo','winPct_last8','winPct_last16','winPct_last32','winPct_last64','winPct_line',
                                     'winPct_player_nn']].isnull().max(1)].copy()
y_meta = X_meta_df.loc[X_meta_df['Season']==2019]['Win'].values
y_meta_test = X_meta_df.loc[X_meta_df['Season']==2020]['Win'].values

X_meta = X_meta_df.loc[X_meta_df['Season']==2019][['winPct_elo','winPct_last8','winPct_last16','winPct_last32','winPct_last64','winPct_line',
                                                   'winPct_player_nn']].values
X_meta_test = X_meta_df.loc[X_meta_df['Season']==2020][['winPct_elo','winPct_last8','winPct_last16','winPct_last32','winPct_last64','winPct_line',
                                                        'winPct_player_nn']].values
X_meta_2021_test = X_meta_df.loc[X_meta_df['Season']==2021][['winPct_elo','winPct_last8','winPct_last16','winPct_last32','winPct_last64','winPct_line',
                                                             'winPct_player_nn']].values

In [None]:
X_meta_df = X_meta_df.loc[~X_meta_df[['winPct_elo','winPct_line']].isnull().max(1)].copy()
y_meta = X_meta_df.loc[X_meta_df['Season']==2019]['Win'].values
y_meta_test = X_meta_df.loc[X_meta_df['Season']==2020]['Win'].values

X_meta = X_meta_df.loc[X_meta_df['Season']==2019][['winPct_elo','winPct_line']].values
X_meta_test = X_meta_df.loc[X_meta_df['Season']==2020][['winPct_elo','winPct_line']].values
X_meta_2021_test = X_meta_df.loc[X_meta_df['Season']==2021][['winPct_elo','winPct_line']].values

In [None]:
lr_meta = LogisticRegression(max_iter=1000)
lr_meta.fit(X_meta, y_meta)
preds_meta = lr_meta.predict_proba(X_meta_test)[:,1]
print ('LogLoss score: {}'.format(str(log_loss(y_meta_test, preds_meta))))
print ('AUC score: {}'.format(str(roc_auc_score(y_meta_test, preds_meta))))
#0.6616456184956504

LogLoss score: 0.6607597339829103
AUC score: 0.650942265795207


In [None]:
#baseline - just based on game lines
X_2021 = X_meta_df.loc[X_meta_df['Season']==2021]
print ('LogLoss score: {}'.format(str(log_loss(X_2021['Win'].values, X_2021['winPct_line'].values))))
print ('AUC score: {}'.format(str(roc_auc_score(X_2021['Win'].values, X_2021['winPct_line'].values))))
#0.6491894740554831

LogLoss score: 0.6405089875200644
AUC score: 0.6760801144492131


In [None]:
X_2021 = X_meta_df.loc[X_meta_df['Season']==2021]
#X_2021['winPct'] = lr_meta.predict_proba(X_meta_2021_test)[:,1]
#X_2021['winPct'] = X_2021['winPct_line'].copy() #1.1091120390813323!!!!
X_2021['winPct'] = (X_2021['winPct_line'] + X_2021['winPct_elo'])/2
X_2021['OppWinPct'] = 1 - X_2021['winPct']
print ('LogLoss score: {}'.format(str(log_loss(X_2021['Win'].values, X_2021['winPct'].values))))
print ('AUC score: {}'.format(str(roc_auc_score(X_2021['Win'].values, X_2021['winPct'].values))))
#0.6405089875200644

LogLoss score: 0.643678850036709
AUC score: 0.6714735336194564


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


#2021 Lines Test

In [None]:
lines = pd.read_csv('lines.csv')

In [None]:
lines.columns

Index(['Season', 'Year', 'Date', 'Team', 'Open', 'Close'], dtype='object')

In [None]:
lines = lines.merge(df_all[['Date','Team','Win']], how='left', on=['Date','Team'])

In [None]:
lines['ImpliedPct'] = 0.
lines.loc[lines['Close']<0, 'ImpliedPct'] = lines.loc[lines['Close']<0, 'Close']/(lines.loc[lines['Close']<0, 'Close']-100)
lines.loc[lines['Close']>0, 'ImpliedPct'] = 100/(100+lines.loc[lines['Close']>0, 'Close'])

In [None]:
def _calc_rel_win(lines, X_df, pct_col, pct_col_opp, line_col):
    #combine game lines with predictions
    home = X_df[['Date','Team',pct_col]]
    away = X_df[['Date','Opp',pct_col_opp]]
    away.columns = ['Date','Team',pct_col]
    tmp_df = pd.concat([home, away], ignore_index=True)
    tmp_df = lines.merge(tmp_df, how='inner', on=['Date','Team'])

    #find all games/teams that would have been bet on
    tmp_df = tmp_df.loc[tmp_df[pct_col]>tmp_df['ImpliedPct'], :]

    #calculate ratio of win to bet, assuming a win
    tmp_df['GainRatio'] = 0.
    tmp_df.loc[tmp_df[line_col]<0, 'GainRatio'] = -100/tmp_df.loc[tmp_df[line_col]<0, line_col]
    tmp_df.loc[tmp_df[line_col]>0, 'GainRatio'] = tmp_df.loc[tmp_df[line_col]>0, line_col]/100.

    #calculate bet amount
    tmp_df['NormBet'] = tmp_df[pct_col]-((1-tmp_df[pct_col])/tmp_df['GainRatio'])

    #try capping bets
    #tmp_df.loc[tmp_df['NormBet']>0.1, 'NormBet'] = 0.1

    #calculate net win
    tmp_df['NetWin'] = 0.
    tmp_df.loc[tmp_df['Win']==1, 'NetWin'] = tmp_df.loc[tmp_df['Win']==1, 'NormBet']*tmp_df.loc[tmp_df['Win']==1, 'GainRatio']
    tmp_df.loc[tmp_df['Win']==0, 'NetWin'] = -tmp_df.loc[tmp_df['Win']==0, 'NormBet']

    return tmp_df

In [None]:
df = _calc_rel_win(lines, df_2021, 'winPct','OppWinPct', 'Close')
print (df['NetWin'].sum())

2.2971860905901904


In [None]:
df.loc[df['Date']=='2021-10-12']

Unnamed: 0,Season,Year,Date,Team,Open,Close,Win,ImpliedPct,winPct,GainRatio,NormBet,NetWin
1,2021,2021,2021-10-12,TBL,-140.0,-250,0.0,0.714286,0.72828,0.4,0.048981,-0.048981


# Training

In [None]:
# elo benchmark
preds_elo = 1/(10**(-X_test_df['EloDiff']/400)+1)
print ('LogLoss score: {}'.format(str(log_loss(y_test, preds_elo))))
print ('AUC score: {}'.format(str(roc_auc_score(y_test, preds_elo))))

LogLoss score: 0.6763902260468215
AUC score: 0.604360911576526


In [None]:
preds_elo = 1/(10**(-X_test_df['EloDiff_538adj']/400)+1)
print ('LogLoss score: {}'.format(str(log_loss(y_test, preds_elo))))
print ('AUC score: {}'.format(str(roc_auc_score(y_test, preds_elo))))

LogLoss score: 0.6740140773905731
AUC score: 0.604360911576526


In [None]:
#model_test = GradientBoostingClassifier(max_depth=1, min_samples_leaf=1000, subsample=0.7)
model_test = LogisticRegression(max_iter=1000)

param_dist = {
    'solver' : ['liblinear'],
    'penalty' : ['l2','l1']
}
#0.675321
random_search = GridSearchCV(model_test, param_dist, scoring=['neg_log_loss','roc_auc'], refit='neg_log_loss', cv=3, return_train_score=True)
random_search.fit(X_train, y_train)

report_cols = ['mean_test_neg_log_loss','mean_test_roc_auc']+['param_'+param for param in param_dist]
report = pd.DataFrame(random_search.cv_results_)[report_cols].sort_values(by='mean_test_neg_log_loss', ascending=False)
report

Unnamed: 0,mean_test_neg_log_loss,mean_test_roc_auc,param_solver,param_penalty
1,-0.675167,0.594961,liblinear,l1
0,-0.675321,0.594539,liblinear,l2


In [None]:
model_gb = GradientBoostingClassifier(max_depth=1)
model_gb.fit(X_train, y_train)
preds_gb = model_gb.predict_proba(X_test)[:,1]
print ('LogLoss score: {}'.format(str(log_loss(y_test, preds_gb))))
print ('AUC score: {}'.format(str(roc_auc_score(y_test, preds_gb))))
#.6764, max_depth=1, min_samples_leaf=1000, subsample=0.5

LogLoss score: 0.6756183132835901
AUC score: 0.6059693384984285


In [None]:
gb_features = pd.DataFrame()
gb_features['feature'] = features
gb_features['importance'] = model_gb.feature_importances_
gb_features.sort_values(by='importance', ascending=False).head(10)

Unnamed: 0,feature,importance
1,EloDiff,0.683369
0,Home,0.158023
11,OppxG_last32,0.030737
2,ShotAttempts_last8,0.022946
5,ShotAttempts_last16,0.021557
14,Goals_last64,0.019695
22,GameScore_median_last64,0.014531
23,GameScore_std_last16,0.008643
17,OppShotAttempts_last64,0.007674
7,OppShotAttempts_last16,0.007091


In [None]:
model_lr = LogisticRegression(max_iter=1000)
model_lr.fit(X_train, y_train)
preds_lr = model_lr.predict_proba(X_test)[:,1]
print ('LogLoss score: {}'.format(str(log_loss(y_test, preds_lr))))
print ('AUC score: {}'.format(str(roc_auc_score(y_test, preds_lr))))

LogLoss score: 0.6787630660770344
AUC score: 0.5890738935171417


In [None]:
model_rf = RandomForestClassifier()
model_rf.fit(X_train, y_train)
preds_rf = model_rf.predict_proba(X_test)[:,1]
print ('LogLoss score: {}'.format(str(log_loss(y_test, preds_rf))))
print ('AUC score: {}'.format(str(roc_auc_score(y_test, preds_rf))))

LogLoss score: 0.6912985663090124
AUC score: 0.5654997410556647


In [None]:
def create_model_nn(input_dim):
    model = Sequential()
    model.add(Dense(256, input_dim=input_dim))
    model.add(Activation('relu'))
    model.add(Dropout(0.7))
    model.add(Dense(64))
    model.add(Activation('relu'))
    model.add(Dropout(0.4))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dropout(0.2))
    model.add(Dense(4))
    model.add(Activation('relu'))
    #model.add(Dropout(0.2))

    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    model.compile(loss='binary_crossentropy')
    return model

In [None]:
model_nn = create_model_nn(X_train.shape[1])
model_nn.fit(X_train, y_train, verbose=0, epochs=100, batch_size=512)
preds_nn = model_nn.predict(X_test)[:,0]
print ('LogLoss score: {}'.format(str(log_loss(y_test, np.clip(preds_nn, a_min=10e-5, a_max = 1-10e-5)))))
print ('AUC score: {}'.format(str(roc_auc_score(y_test, np.clip(preds_nn, a_min=10e-5, a_max = 1-10e-5)))))

NameError: ignored