In [None]:
# Reading in packages
import pandas as pd
from SQLCode import DatabaseConnection
from SQLCode import DatabaseCredentials as DBC
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import cross_val_score

In [None]:
# Opening connection
creds = DBC.DataBaseCredentials()
conn = DatabaseConnection.sql_connection(creds.server, creds.database, creds.user, creds.password)
connection = conn.open()
cursor = connection.cursor()

In [None]:
# Getting the live_feed data
live_feed = pd.read_sql_query("select * from live_feed", connection)

In [None]:
# Getting the seasons data
seasons = pd.read_sql_query("select * from schedules", connection)

In [None]:
# Filtering to regular seasons games and 20102011 onwards (when live data started)
seasonsFiltered = seasons[seasons['seasonID'] >= 20102011]
seasonsFiltered = seasonsFiltered[seasonsFiltered['gameType'] == 'R']

In [None]:
# Getting and filtering the raw data
rawData = pd.merge(live_feed,seasons, how='right',on='gameID')
rawData = rawData[rawData['seasonID'] >= 20102011]
rawData = rawData[rawData['gameType'] == 'R']

### Creating Initial Data Sets

In [7]:
# Filtering to the rows with the required data (goals and shots)
statsRaw = rawData[((rawData['eventTypeID'] == 'GOAL') & (rawData['playerType'] == 'Scorer')) |
               ((rawData['eventTypeID'] == 'SHOT') & (rawData['playerType'] == 'Shooter'))]

In [8]:
# Getting counts of the "for" stats.  
statsFor = pd.DataFrame(statsRaw.groupby(['gameID','teamID','eventTypeID']).size()).reset_index()
statsFor = statsFor.rename({0:'count'}, axis=1)

In [9]:
# Pivoting so each stat gets its own column
statsFor = pd.pivot_table(data=statsFor, values='count',index=['gameID','teamID'], columns='eventTypeID').reset_index()

In [10]:
# Defing the stats "against" table
statsAgainst = statsRaw[['gameID','eventTypeID']]

In [11]:
# Getting the "against" team
statsAgainst['teamID'] = statsRaw.apply(lambda row: row['homeTeamID'] if row['teamID'] != row['homeTeamID'] else row['awayTeamID'],axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  statsAgainst['teamID'] = statsRaw.apply(lambda row: row['homeTeamID'] if row['teamID'] != row['homeTeamID'] else row['awayTeamID'],axis=1)


In [12]:
# Getting the stats counts
statsAgainst = pd.DataFrame(statsAgainst.groupby(['gameID','teamID','eventTypeID']).size()).reset_index()
statsAgainst = statsAgainst.rename({0:'count'}, axis=1)

In [13]:
# Pivoting so each stat gets its own column
statsAgainst = pd.pivot_table(data=statsAgainst, values='count',index=['gameID','teamID'], columns='eventTypeID').reset_index()

### Create complete schedule table

In [14]:
# Expanding the schedule into a row for each game an individual team played
seasonsExpanded = pd.concat([seasonsFiltered[['seasonID','gameID','gameType','homeTeamID']].rename({'homeTeamID':'teamID'},axis=1),
          seasonsFiltered[['seasonID','gameID','gameType','awayTeamID']].rename({'awayTeamID':'teamID'},axis=1)])

### Extending data set for every game

In [15]:
# Merging the stats for and against into a single table.  
# Notice the left join on seasonsExpanded to ensure we aren't losing any data
stats = pd.merge(seasonsExpanded, 
                 statsFor, 
                 how='left', 
                 left_on=['gameID','teamID'],
                 right_on=['gameID','teamID'])
stats = stats.rename({'GOAL':'goalsFor','SHOT':'shotsFor'},axis=1)
stats = pd.merge(stats , 
                 statsAgainst, 
                 how='left', 
                 left_on=['gameID','teamID'],
                 right_on=['gameID','teamID'])
stats = stats.rename({'GOAL':'goalsAgainst','SHOT':'shotsAgainst'},axis=1)

In [16]:
# Setting no stats (i.e. NA) to 0
stats['goalsFor'] = stats['goalsFor'].apply(lambda x: 0 if pd.isna(x) else x)
stats['goalsAgainst'] = stats['goalsAgainst'].apply(lambda x: 0 if pd.isna(x) else x)
stats['shotsAgainst'] = stats['shotsAgainst'].apply(lambda x: 0 if pd.isna(x) else x)
stats['shotsFor'] = stats['shotsFor'].apply(lambda x: 0 if pd.isna(x) else x)

In [17]:
# Sorting the values
stats = stats.sort_values(['seasonID','gameID'])

# Grabbing this now to use in "Extracting the Game Outcome"
statsSimple = stats

### Creating the Statistics

In [18]:
# Creating a game number for every game per team
stats['gameNumber'] = stats.groupby(['seasonID','teamID']).cumcount()+1

In [19]:
# Getting the rolling totals for each stat
stats['goalsAgainstTotal'] = stats.groupby(['seasonID','teamID'])['goalsAgainst'].cumsum()
stats['goalsForTotal'] = stats.groupby(['seasonID','teamID'])['goalsFor'].cumsum()
stats['shotsForTotal'] = stats.groupby(['seasonID','teamID'])['shotsFor'].cumsum()
stats['shotsAgainstTotal'] = stats.groupby(['seasonID','teamID'])['shotsAgainst'].cumsum()

In [20]:
# Creating a game number for each team/season
stats['gameNumber'] = stats.groupby(['seasonID','teamID']).cumcount()+1

In [21]:
# Getting the rolling totals per game for each stat
stats['goalsAgainstPerGame'] = stats['goalsForTotal']/stats['gameNumber']
stats['goalsForPerGame'] = stats['goalsAgainstTotal']/stats['gameNumber']
stats['shotsAgainstPerGame'] = stats['shotsAgainstTotal']/stats['gameNumber']
stats['shotsForPerGame'] = stats['shotsForTotal']/stats['gameNumber']

In [22]:
# Getting the rolling differentials
stats['goalDifferential'] = stats['goalsForTotal'] - stats['goalsAgainstTotal']
stats['shotDifferential'] = stats['shotsForTotal'] - stats['shotsAgainstTotal']

In [23]:
# Determing whether a team won/lossed and creating a rolling win percentage
stats['winLoss'] = stats.apply(lambda row: 1 if row['goalsFor'] > row['goalsAgainst'] else 0,axis=1)
stats['winLossTotal'] = stats.groupby(['seasonID','teamID'])['winLoss'].cumsum()
stats['winningPercentage'] = stats['winLossTotal']/stats['gameNumber']

In [24]:
statsComplete = stats.drop(['goalsFor',
                            'goalsAgainst', 
                            'shotsAgainst', 
                            'shotsFor',
                            'gameNumber',
                            'goalsForTotal',
                            'goalsAgainstTotal',
                            'shotsAgainstTotal',
                            'shotsForTotal',
                            'winLoss',
                            'winLossTotal'], axis=1)

### Adding Q Values

In [25]:
# Getting the seasons data
# boxscores = pd.read_sql_query("select gameID, teamID,playerID from box_scores where timeOnIce is not null and scratched=0", connection)

In [26]:
# Merging boxscores with seasons to get the seasonID
# boxscores = pd.merge(seasonsFiltered[['seasonID','gameID']], boxscores, how='inner')

In [27]:
# Sorting to make sure the row count is correct
# boxscores = boxscores.sort_values(['seasonID','gameID'])

In [28]:
# # Getting the previous game for each game/player combo
# previousGame = []
# for index, row in boxscores.iterrows():
#     if index % 10000 == 0:
#         print(int((index/len(boxscores))*100),'%')
#     preGame = boxscores[(boxscores['gameID'] < row['gameID']) & (boxscores['playerID'] == row['playerID'])]['gameID'].values
#     if len(preGame) == 0:
#         previousGame.append(np.nan)
#     else:
#         previousGame.append(max(preGame))

In [29]:
# seasonsList = seasonsFiltered['seasonID'].unique()
# previousSeasons = []
# for index, row in boxscores.iterrows():
#     if index % 10000 == 0:
#         print(int((index/len(boxscores))*100),'%')
#     season = seasonsList[seasonsList < row['seasonID']]
#     if len(season) == 0:
#         previousSeasons.append(np.nan)
#     else:
#         previousSeasons.append(max(season))
    
# #     break
    

In [30]:
# boxscores['previousSeason'] = previousSeasons
# boxscores['previousSeason'] = boxscores['previousSeason'].fillna(20092010)
# boxscores['previousSeason'] = boxscores['previousSeason'].astype(int)

In [31]:
# Getting the game number if the player played in the game
# boxscores['gameNumber'] = boxscores.groupby(['seasonID','playerID']).cumcount()+1

In [32]:
boxscores = pd.read_csv('boxscores.csv',index_col=0)
# boxscores.to_csv('boxscores.csv')

In [33]:
# Reading in the q values
qValues = pd.read_csv('deep_rl_results2.csv',index_col=0)

  mask |= (ar1 == a)


In [34]:
# Summing up the values by game
qValues = qValues[['gameID',
                   'playerID',
                   'value']].groupby(['gameID',
                                         'playerID']).sum('value').reset_index()

In [35]:
# Merging with boxscores
qValues = pd.merge(boxscores, qValues, on=['gameID','playerID'],how='left')

In [36]:
# Filling NA values (i.e. games where a player didn't show up)
qValues['value'] = qValues['value'].fillna(0)

In [37]:
qValues = qValues.sort_values(['seasonID','gameID'])

In [38]:
# Summing up the q values by player/season/game
qValues['cumulativeValue'] = qValues.groupby(['seasonID','playerID'])['value'].cumsum(axis=0)

In [39]:
# Getting the q value per game
qValues['averageValue'] = qValues['cumulativeValue']/qValues['gameNumber']

In [40]:
qValues[qValues['playerID'] == 8478402]

Unnamed: 0,seasonID,gameID,teamID,playerID,gameNumber,previousGameID,previousSeason,value,cumulativeValue,averageValue
215472,20152016,2015020008,22,8478402,1,,20142015,0.562524,0.562524,0.562524
216083,20152016,2015020024,22,8478402,2,2.015020e+09,20142015,0.215907,0.778431,0.389215
216786,20152016,2015020043,22,8478402,3,2.015020e+09,20142015,0.224142,1.002572,0.334191
217301,20152016,2015020056,22,8478402,4,2.015020e+09,20142015,0.000000,1.002572,0.250643
217948,20152016,2015020073,22,8478402,5,2.015020e+09,20142015,0.499726,1.502298,0.300460
...,...,...,...,...,...,...,...,...,...,...
477795,20202021,2020020789,22,8478402,52,2.020021e+09,20192020,0.263599,32.484847,0.624709
478478,20202021,2020020807,22,8478402,53,2.020021e+09,20192020,0.334360,32.819208,0.619230
479297,20202021,2020020829,22,8478402,54,2.020021e+09,20192020,0.717232,33.536440,0.621045
479866,20202021,2020020844,22,8478402,55,2.020021e+09,20192020,0.000000,33.536440,0.609753


In [41]:
# Getting the average q value per game by season
qSeasonValues = qValues[['seasonID','averageValue']].groupby('seasonID').mean().reset_index()
# Renaming the column
qSeasonValues = qSeasonValues.rename({'averageValue':'averageValueSeason'},axis=1)

In [42]:
# Merging boxscores with qValues to get the qValues up to the last game
boxScoresComplete = pd.merge(boxscores, 
         qValues[['gameID','playerID','averageValue']],
         left_on=['previousGameID','playerID'],
         right_on=['gameID','playerID'], 
         how='left',
         suffixes=('', '_y'))

In [43]:
# Getting the qValues by season
boxScoresComplete = pd.merge(boxScoresComplete,
         qSeasonValues, 
         left_on='previousSeason',
         right_on='seasonID',
         how='inner',
         suffixes=('', '_y'))

In [44]:
# Dropping useless columns
boxScoresComplete = boxScoresComplete.drop(['gameID_y','seasonID_y'],axis=1)

In [45]:
# setting the averageValue 
boxScoresComplete['averageValue'] = np.where(boxScoresComplete['averageValue'].isna(),boxScoresComplete['averageValueSeason'],boxScoresComplete['averageValue'])

In [46]:
boxScoresComplete = boxScoresComplete[['seasonID',
                   'gameID',
                   'teamID',
                   'averageValue']].groupby(['seasonID','gameID','teamID']).sum().reset_index()

In [47]:
statsComplete = pd.merge(statsComplete, boxScoresComplete)

In [48]:
statsComplete

Unnamed: 0,seasonID,gameID,gameType,teamID,goalsAgainstPerGame,goalsForPerGame,shotsAgainstPerGame,shotsForPerGame,goalDifferential,shotDifferential,winningPercentage,averageValue
0,20112012,2011020001,R,6,1.000000,2.000000,27.000000,22.000000,-1.0,-5.0,0.000000,8.729403
1,20112012,2011020001,R,4,2.000000,1.000000,22.000000,27.000000,1.0,5.0,1.000000,8.572032
2,20112012,2011020002,R,10,2.000000,0.000000,32.000000,16.000000,2.0,-16.0,1.000000,9.271077
3,20112012,2011020002,R,8,0.000000,2.000000,16.000000,32.000000,-2.0,16.0,0.000000,8.494263
4,20112012,2011020003,R,23,3.000000,5.000000,25.000000,35.000000,-2.0,10.0,0.000000,7.946408
...,...,...,...,...,...,...,...,...,...,...,...,...
22743,20202021,2020020866,R,19,3.071429,3.089286,26.964286,26.125000,-1.0,-47.0,0.482143,6.981281
22744,20202021,2020020867,R,26,2.589286,3.071429,28.160714,25.875000,-27.0,-128.0,0.375000,7.060109
22745,20202021,2020020867,R,21,3.535714,2.410714,23.053571,31.071429,63.0,449.0,0.696429,7.659978
22746,20202021,2020020868,R,28,2.803571,3.642857,28.696429,27.660714,-47.0,-58.0,0.375000,6.332358


### Extracing the game Outcome

In [49]:
# Getting goals for/against columns
gameOutcome = pd.merge(seasonsFiltered,
         statsSimple[['seasonID','gameID','teamID','goalsFor','goalsAgainst']], 
         how='inner',
         left_on=['seasonID','gameID','homeTeamID'],
         right_on=['seasonID','gameID','teamID'])

In [50]:
# Determing if the home team won
gameOutcome['homeTeamWin'] = gameOutcome.apply(lambda row: 1 if row['goalsFor'] > row['goalsAgainst'] else 0,axis=1)

In [51]:
# Getting rid of useless columns
gameOutcome = gameOutcome.drop(['gameType', 
                  'gameDate', 
                  'homeTeamID',
                  'awayTeamID',
                  'teamID','seasonID','goalsFor','goalsAgainst'],axis=1)

### Creating Model Input

In [52]:
# Determining if the team is the home or away, this is needed to join properly again on seasonsFiltered below
statsComplete = pd.merge(statsComplete,seasonsFiltered[['gameID','homeTeamID']], 
                 how='left',
                 left_on=['gameID','teamID'],
                 right_on=['gameID','homeTeamID'])
statsComplete['isHome'] = np.where(pd.isna(statsComplete['homeTeamID']), 0, 1)
statsComplete = statsComplete.drop(['homeTeamID'],axis=1)

In [53]:
# Determing the game number for each time by season
statsComplete = statsComplete.sort_values(['seasonID','gameID'])
statsComplete['gameNumber'] = statsComplete.groupby(['seasonID','teamID','isHome']).cumcount()+1
statsComplete['gameNumber'] += 1

In [54]:
# Sorting and then getting the home and away teams game numbers (used to join on below)
seasonsFiltered = seasonsFiltered.sort_values(['seasonID','gameID'])
seasonsFiltered['homeTeamGameNumber'] = seasonsFiltered.groupby(['seasonID','homeTeamID']).cumcount()+1
seasonsFiltered['awayTeamGameNumber'] = seasonsFiltered.groupby(['seasonID','awayTeamID']).cumcount()+1

In [55]:
# Creating the model input with the home team data
modelInput = pd.merge(seasonsFiltered, 
            statsComplete[statsComplete['isHome'] == 1], 
            how='inner', 
            left_on=['seasonID', 'homeTeamID', 'homeTeamGameNumber'],
            right_on=['seasonID','teamID','gameNumber'])

In [56]:
# Dropping extra columns
modelInput = modelInput.drop(['gameID_x', 
                 'gameType_x',
                 'homeTeamGameNumber',
                 'teamID',
                 'gameNumber'],
                axis=1)

In [57]:
# Renaming some columns to keep things clean
modelInput = modelInput.rename({"gameID_y":"gameID", "gameType_y":"gameType"}, axis=1)

In [58]:
# adding the away team data to the model input
modelInput = pd.merge(modelInput, 
            statsComplete[statsComplete['isHome'] == 0], 
            how='inner', 
            left_on=['seasonID', 'awayTeamID', 'awayTeamGameNumber'],
            right_on=['seasonID','teamID','gameNumber'], suffixes=('Home','Away'))

In [59]:
pd.set_option("display.max_columns", None)
modelInput

Unnamed: 0,seasonID,gameDate,homeTeamID,awayTeamID,awayTeamGameNumber,gameIDHome,gameTypeHome,goalsAgainstPerGameHome,goalsForPerGameHome,shotsAgainstPerGameHome,shotsForPerGameHome,goalDifferentialHome,shotDifferentialHome,winningPercentageHome,averageValueHome,isHomeHome,gameIDAway,gameTypeAway,teamID,goalsAgainstPerGameAway,goalsForPerGameAway,shotsAgainstPerGameAway,shotsForPerGameAway,goalDifferentialAway,shotDifferentialAway,winningPercentageAway,averageValueAway,isHomeAway,gameNumber
0,20112012,2011-10-08,6,14,2,2011020001,R,1.000000,2.000000,27.000000,22.000000,-1.0,-5.0,0.000000,8.729403,1,2011020004,R,14,5.000000,1.000000,32.000000,29.000000,4.0,-3.0,1.000000,9.401039,0,2
1,20112012,2011-10-08,10,9,2,2011020002,R,2.000000,0.000000,32.000000,16.000000,2.0,-16.0,1.000000,9.271077,1,2011020005,R,9,3.000000,5.000000,34.000000,29.000000,-2.0,-5.0,0.000000,8.103880,0,2
2,20112012,2011-10-10,1,12,2,2011020012,R,0.000000,3.000000,26.000000,20.000000,-3.0,-6.0,0.000000,7.760454,1,2011020013,R,12,2.000000,4.500000,30.500000,30.000000,-5.0,-1.0,0.000000,12.252236,0,2
3,20112012,2011-10-10,15,14,3,2011020013,R,4.000000,3.000000,28.000000,32.000000,1.0,4.0,1.000000,8.883380,1,2011020010,R,14,3.000000,2.500000,35.000000,27.000000,1.0,-16.0,0.500000,10.845768,0,3
4,20112012,2011-10-10,25,27,2,2011020008,R,2.000000,1.000000,37.000000,31.000000,1.0,-6.0,1.000000,8.092287,1,2011020021,R,27,3.000000,6.000000,46.000000,26.000000,-3.0,-20.0,0.000000,8.156502,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10961,20202021,2021-05-18,23,20,28,2020020844,R,2.709091,3.418182,30.127273,26.745455,-39.0,-186.0,0.400000,6.705435,1,2020020807,R,20,2.849057,2.924528,25.830189,27.150943,-4.0,70.0,0.471698,7.067978,0,28
10962,20202021,2021-05-08,30,24,28,2020020850,R,3.254545,2.890909,27.854545,25.272727,20.0,-142.0,0.618182,7.661603,1,2020020850,R,24,2.272727,3.236364,27.672727,24.763636,-53.0,-160.0,0.309091,7.395856,0,28
10963,20202021,2021-05-08,54,19,28,2020020851,R,3.418182,2.254545,25.290909,29.400000,64.0,226.0,0.709091,7.326037,1,2020020851,R,19,3.109091,3.072727,27.000000,26.236364,2.0,-42.0,0.490909,6.960778,0,28
10964,20202021,2021-05-08,26,21,28,2020020852,R,2.600000,3.072727,28.181818,26.054545,-26.0,-117.0,0.381818,6.816798,1,2020020852,R,21,3.545455,2.418182,23.181818,31.145455,62.0,438.0,0.690909,7.655238,0,28


In [60]:
# Dropping columns that are no longer needed
modelInput = modelInput.drop(['awayTeamGameNumber',
                              'gameIDHome',
                              'gameTypeHome',
                              'teamID',
                              'gameNumber',
                              'isHomeAway'],axis=1)

In [61]:
# Renaming these to get rid of the suffix that was put on it on the last join
modelInput = modelInput.rename({'gameIDAway':'gameID','gameTypeAway':'gameType'},axis=1)

In [62]:
modelInput = pd.merge(modelInput, gameOutcome, on='gameID')

## Creating the Model

### Normalizing the Data and creating train/test data

In [63]:
# Creating the x and y data
x = modelInput.drop(['seasonID',
                     'gameDate',
                     'homeTeamID',
                     'awayTeamID',
                     'gameID',
                     'gameType',
#                      'shotsAgainstPerGameHome', 
#                      'shotsForPerGameHome', 
#                      'shotDifferentialHome',
#                      'shotsAgainstPerGameAway', 
#                      'shotsForPerGameAway',
#                      'shotDifferentialAway',
                     'homeTeamWin'],axis=1)
y = modelInput[['homeTeamWin']]

In [64]:
# Normalizing the data
scaler = preprocessing.StandardScaler().fit(x)
xScaled = scaler.transform(x)

In [65]:
xTrain, xTest, yTrain, yTest = train_test_split(xScaled, y, test_size=0.2,random_state=109) # 70% training and 20% test

### Support Vector Machine

In [66]:
clf = svm.SVC(kernel='linear',gamma='auto') # Linear Kernel

In [67]:
clf.fit(xTrain, yTrain['homeTeamWin'].values)

SVC(gamma='auto', kernel='linear')

In [68]:
yPred = clf.predict(xTrain)
print("Train Accuracy:",metrics.accuracy_score(yTrain, yPred))
yPred = clf.predict(xTest)
print("Test Accuracy:",metrics.accuracy_score(yTest, yPred))

Train Accuracy: 0.6114911080711354
Test Accuracy: 0.6043755697356427
