In [47]:
# Reading in packages
import pandas as pd
from SQLCode import DatabaseConnection
from SQLCode import DatabaseCredentials as DBC
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import cross_val_score

In [2]:
# Opening connection
creds = DBC.DataBaseCredentials()
conn = DatabaseConnection.sql_connection(creds.server, creds.database, creds.user, creds.password)
connection = conn.open()
cursor = connection.cursor()

In [3]:
# Getting the live_feed data
live_feed = pd.read_sql_query("select * from live_feed", connection)

In [4]:
# Getting the seasons data
seasons = pd.read_sql_query("select * from schedules", connection)

In [88]:
# Filtering to regular seasons games and 20102011 onwards (when live data started)
seasonsFiltered = seasons[seasons['seasonID'] >= 20102011]
seasonsFiltered = seasonsFiltered[seasonsFiltered['gameType'] == 'R']

In [6]:
# Getting and filtering the raw data
rawData = pd.merge(live_feed,seasons, how='right',on='gameID')
rawData = rawData[rawData['seasonID'] >= 20102011]
rawData = rawData[rawData['gameType'] == 'R']

### Creating Initial Data Sets

In [7]:
# Filtering to the rows with the required data (goals and shots)
statsRaw = rawData[((rawData['eventTypeID'] == 'GOAL') & (rawData['playerType'] == 'Scorer')) |
               ((rawData['eventTypeID'] == 'SHOT') & (rawData['playerType'] == 'Shooter'))]

In [8]:
# Getting counts of the "for" stats.  
statsFor = pd.DataFrame(statsRaw.groupby(['gameID','teamID','eventTypeID']).size()).reset_index()
statsFor = statsFor.rename({0:'count'}, axis=1)

In [9]:
# Pivoting so each stat gets its own column
statsFor = pd.pivot_table(data=statsFor, values='count',index=['gameID','teamID'], columns='eventTypeID').reset_index()

In [10]:
# Defing the stats "against" table
statsAgainst = statsRaw[['gameID','eventTypeID']]

In [11]:
# Getting the "against" team
statsAgainst['teamID'] = statsRaw.apply(lambda row: row['homeTeamID'] if row['teamID'] != row['homeTeamID'] else row['awayTeamID'],axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  statsAgainst['teamID'] = statsRaw.apply(lambda row: row['homeTeamID'] if row['teamID'] != row['homeTeamID'] else row['awayTeamID'],axis=1)


In [12]:
# Getting the stats counts
statsAgainst = pd.DataFrame(statsAgainst.groupby(['gameID','teamID','eventTypeID']).size()).reset_index()
statsAgainst = statsAgainst.rename({0:'count'}, axis=1)

In [13]:
# Pivoting so each stat gets its own column
statsAgainst = pd.pivot_table(data=statsAgainst, values='count',index=['gameID','teamID'], columns='eventTypeID').reset_index()

### Create complete schedule table

In [14]:
# Expanding the schedule into a row for each game an individual team played
seasonsExpanded = pd.concat([seasonsFiltered[['seasonID','gameID','gameType','homeTeamID']].rename({'homeTeamID':'teamID'},axis=1),
          seasonsFiltered[['seasonID','gameID','gameType','awayTeamID']].rename({'awayTeamID':'teamID'},axis=1)])

### Extending data set for every game

In [15]:
# Merging the stats for and against into a single table.  
# Notice the left join on seasonsExpanded to ensure we aren't losing any data
stats = pd.merge(seasonsExpanded, 
                 statsFor, 
                 how='left', 
                 left_on=['gameID','teamID'],
                 right_on=['gameID','teamID'])
stats = stats.rename({'GOAL':'goalsFor','SHOT':'shotsFor'},axis=1)
stats = pd.merge(stats , 
                 statsAgainst, 
                 how='left', 
                 left_on=['gameID','teamID'],
                 right_on=['gameID','teamID'])
stats = stats.rename({'GOAL':'goalsAgainst','SHOT':'shotsAgainst'},axis=1)

In [16]:
# Setting no stats (i.e. NA) to 0
stats['goalsFor'] = stats['goalsFor'].apply(lambda x: 0 if pd.isna(x) else x)
stats['goalsAgainst'] = stats['goalsAgainst'].apply(lambda x: 0 if pd.isna(x) else x)
stats['shotsAgainst'] = stats['shotsAgainst'].apply(lambda x: 0 if pd.isna(x) else x)
stats['shotsFor'] = stats['shotsFor'].apply(lambda x: 0 if pd.isna(x) else x)

In [17]:
# Sorting the values
stats = stats.sort_values(['seasonID','gameID'])

# Grabbing this now to use in "Extracting the Game Outcome"
statsSimple = stats

### Creating the Statistics

In [18]:
# Creating a game number for every game per team
stats['gameNumber'] = stats.groupby(['seasonID','teamID']).cumcount()+1

In [19]:
# Getting the rolling totals for each stat
stats['goalsAgainstTotal'] = stats.groupby(['seasonID','teamID'])['goalsAgainst'].cumsum()
stats['goalsForTotal'] = stats.groupby(['seasonID','teamID'])['goalsFor'].cumsum()
stats['shotsForTotal'] = stats.groupby(['seasonID','teamID'])['shotsFor'].cumsum()
stats['shotsAgainstTotal'] = stats.groupby(['seasonID','teamID'])['shotsAgainst'].cumsum()

In [20]:
# Creating a game number for each team/season
stats['gameNumber'] = stats.groupby(['seasonID','teamID']).cumcount()+1

In [21]:
# Getting the rolling totals per game for each stat
stats['goalsAgainstPerGame'] = stats['goalsForTotal']/stats['gameNumber']
stats['goalsForPerGame'] = stats['goalsAgainstTotal']/stats['gameNumber']
stats['shotsAgainstPerGame'] = stats['shotsAgainstTotal']/stats['gameNumber']
stats['shotsForPerGame'] = stats['shotsForTotal']/stats['gameNumber']

In [22]:
# Getting the rolling differentials
stats['goalDifferential'] = stats['goalsForTotal'] - stats['goalsAgainstTotal']
stats['shotDifferential'] = stats['shotsForTotal'] - stats['shotsAgainstTotal']

In [23]:
# Determing whether a team won/lossed and creating a rolling win percentage
stats['winLoss'] = stats.apply(lambda row: 1 if row['goalsFor'] > row['goalsAgainst'] else 0,axis=1)
stats['winLossTotal'] = stats.groupby(['seasonID','teamID'])['winLoss'].cumsum()
stats['winningPercentage'] = stats['winLossTotal']/stats['gameNumber']

In [89]:
statsComplete = stats.drop(['goalsFor',
                            'goalsAgainst', 
                            'shotsAgainst', 
                            'shotsFor',
                            'gameNumber',
                            'goalsForTotal',
                            'goalsAgainstTotal',
                            'shotsAgainstTotal',
                            'shotsForTotal',
                            'winLoss',
                            'winLossTotal'], axis=1)

### Extracing the game Outcome

In [90]:
# Getting goals for/against columns
gameOutcome = pd.merge(seasonsFiltered,
         statsSimple[['seasonID','gameID','teamID','goalsFor','goalsAgainst']], 
         how='inner',
         left_on=['seasonID','gameID','homeTeamID'],
         right_on=['seasonID','gameID','teamID'])

In [91]:
# Determing if the home team won
gameOutcome['homeTeamWin'] = gameOutcome.apply(lambda row: 1 if row['goalsFor'] > row['goalsAgainst'] else 0,axis=1)

In [92]:
# Getting rid of useless columns
gameOutcome = gameOutcome.drop(['gameType', 
                  'gameDate', 
                  'homeTeamID',
                  'awayTeamID',
                  'teamID','seasonID','goalsFor','goalsAgainst'],axis=1)

### Creating Model Input

In [93]:
# Determining if the team is the home or away, this is needed to join properly again on seasonsFiltered below
statsComplete = pd.merge(statsComplete,seasonsFiltered[['gameID','homeTeamID']], 
                 how='left',
                 left_on=['gameID','teamID'],
                 right_on=['gameID','homeTeamID'])
statsComplete['isHome'] = np.where(pd.isna(statsComplete['homeTeamID']), 0, 1)
statsComplete = statsComplete.drop(['homeTeamID'],axis=1)

In [94]:
# Determing the game number for each time by season
statsComplete = statsComplete.sort_values(['seasonID','gameID'])
statsComplete['gameNumber'] = statsComplete.groupby(['seasonID','teamID','isHome']).cumcount()+1
statsComplete['gameNumber'] += 1

In [95]:
# Sorting and then getting the home and away teams game numbers (used to join on below)
seasonsFiltered = seasonsFiltered.sort_values(['seasonID','gameID'])
seasonsFiltered['homeTeamGameNumber'] = seasonsFiltered.groupby(['seasonID','homeTeamID']).cumcount()+1
seasonsFiltered['awayTeamGameNumber'] = seasonsFiltered.groupby(['seasonID','awayTeamID']).cumcount()+1

In [97]:
# Creating the model input with the home team data
modelInput = pd.merge(seasonsFiltered, 
            statsComplete[statsComplete['isHome'] == 1], 
            how='inner', 
            left_on=['seasonID', 'homeTeamID', 'homeTeamGameNumber'],
            right_on=['seasonID','teamID','gameNumber'])

In [98]:
# Dropping extra columns
modelInput = modelInput.drop(['gameID_x', 
                 'gameType_x',
                 'homeTeamGameNumber',
                 'teamID',
                 'gameNumber'],
                axis=1)

In [99]:
# Renaming some columns to keep things clean
modelInput = modelInput.rename({"gameID_y":"gameID", "gameType_y":"gameType"}, axis=1)

In [100]:
# adding the away team data to the model input
modelInput = pd.merge(modelInput, 
            statsComplete[statsComplete['isHome'] == 0], 
            how='inner', 
            left_on=['seasonID', 'awayTeamID', 'awayTeamGameNumber'],
            right_on=['seasonID','teamID','gameNumber'], suffixes=('Home','Away'))

In [102]:
pd.set_option("display.max_columns", None)
modelInput

Unnamed: 0,seasonID,gameDate,homeTeamID,awayTeamID,awayTeamGameNumber,gameIDHome,gameTypeHome,goalsAgainstPerGameHome,goalsForPerGameHome,shotsAgainstPerGameHome,shotsForPerGameHome,goalDifferentialHome,shotDifferentialHome,winningPercentageHome,isHomeHome,gameIDAway,gameTypeAway,teamID,goalsAgainstPerGameAway,goalsForPerGameAway,shotsAgainstPerGameAway,shotsForPerGameAway,goalDifferentialAway,shotDifferentialAway,winningPercentageAway,isHomeAway,gameNumber
0,20102011,2010-10-09,5,8,2,2010020002,R,2.000000,3.000000,24.000000,29.000000,-1.0,5.0,0.000000,1,2010020001,R,8,2.000000,3.000000,21.000000,26.000000,-1.0,5.0,0.000000,0,2
1,20102011,2010-10-11,2,3,2,2010020015,R,4.000000,5.000000,20.000000,44.000000,-1.0,24.0,0.000000,1,2010020013,R,3,6.000000,3.000000,33.000000,22.000000,3.0,-11.0,1.000000,0,2
2,20102011,2010-10-11,19,24,3,2010020019,R,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,1,2010020020,R,24,0.500000,4.000000,42.000000,28.500000,-7.0,-27.0,0.000000,0,3
3,20102011,2010-10-11,7,16,2,2010020013,R,2.500000,3.500000,23.500000,33.500000,-2.0,20.0,0.500000,1,2010020004,R,16,3.000000,4.000000,37.000000,35.000000,-1.0,-2.0,0.000000,0,2
4,20102011,2010-10-11,15,9,2,2010020017,R,4.500000,3.000000,29.000000,25.500000,3.0,-7.0,0.500000,1,2010020014,R,9,1.000000,3.500000,33.500000,21.000000,-5.0,-25.0,0.000000,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12153,20202021,2021-05-18,23,20,28,2020020844,R,2.709091,3.418182,30.127273,26.745455,-39.0,-186.0,0.400000,1,2020020807,R,20,2.849057,2.924528,25.830189,27.150943,-4.0,70.0,0.471698,0,28
12154,20202021,2021-05-08,30,24,28,2020020850,R,3.254545,2.890909,27.854545,25.272727,20.0,-142.0,0.618182,1,2020020850,R,24,2.272727,3.236364,27.672727,24.763636,-53.0,-160.0,0.309091,0,28
12155,20202021,2021-05-08,54,19,28,2020020851,R,3.418182,2.254545,25.290909,29.400000,64.0,226.0,0.709091,1,2020020851,R,19,3.109091,3.072727,27.000000,26.236364,2.0,-42.0,0.490909,0,28
12156,20202021,2021-05-08,26,21,28,2020020852,R,2.600000,3.072727,28.181818,26.054545,-26.0,-117.0,0.381818,1,2020020852,R,21,3.545455,2.418182,23.181818,31.145455,62.0,438.0,0.690909,0,28


In [103]:
# Dropping columns that are no longer needed
modelInput = modelInput.drop(['awayTeamGameNumber',
                              'gameIDHome',
                              'gameTypeHome',
                              'teamID',
                              'gameNumber',
                              'isHomeAway'],axis=1)

In [104]:
# Renaming these to get rid of the suffix that was put on it on the last join
modelInput = modelInput.rename({'gameIDAway':'gameID','gameTypeAway':'gameType'},axis=1)

In [105]:
modelInput = pd.merge(modelInput, gameOutcome, on='gameID')

## Creating the Model

### Normalizing the Data and creating train/test data

In [106]:
# Creating the x and y data
x = modelInput.drop(['seasonID',
                     'gameDate',
                     'homeTeamID',
                     'awayTeamID',
                     'gameID',
                     'gameType',
                     'shotsAgainstPerGameHome', 
                     'shotsForPerGameHome', 
                     'shotDifferentialHome',
                     'shotsAgainstPerGameAway', 
                     'shotsForPerGameAway',
                     'shotDifferentialAway',
                     'homeTeamWin'],axis=1)
y = modelInput[['homeTeamWin']]

In [107]:
x

Unnamed: 0,goalsAgainstPerGameHome,goalsForPerGameHome,goalDifferentialHome,winningPercentageHome,isHomeHome,goalsAgainstPerGameAway,goalsForPerGameAway,goalDifferentialAway,winningPercentageAway
0,2.000000,3.000000,-1.0,0.000000,1,2.000000,3.000000,-1.0,0.000000
1,4.000000,5.000000,-1.0,0.000000,1,6.000000,3.000000,3.0,1.000000
2,0.000000,0.000000,0.0,0.000000,1,0.500000,4.000000,-7.0,0.000000
3,2.500000,3.500000,-2.0,0.500000,1,3.000000,4.000000,-1.0,0.000000
4,4.500000,3.000000,3.0,0.500000,1,1.000000,3.500000,-5.0,0.000000
...,...,...,...,...,...,...,...,...,...
12153,2.709091,3.418182,-39.0,0.400000,1,2.849057,2.924528,-4.0,0.471698
12154,3.254545,2.890909,20.0,0.618182,1,2.272727,3.236364,-53.0,0.309091
12155,3.418182,2.254545,64.0,0.709091,1,3.109091,3.072727,2.0,0.490909
12156,2.600000,3.072727,-26.0,0.381818,1,3.545455,2.418182,62.0,0.690909


In [108]:
# Normalizing the data
scaler = preprocessing.StandardScaler().fit(x)
xScaled = scaler.transform(x)

In [109]:
xTrain, xTest, yTrain, yTest = train_test_split(xScaled, y, test_size=0.2,random_state=109) # 70% training and 20% test

### Support Vector Machine

In [129]:
clf = svm.SVC(kernel='linear',gamma='auto') # Linear Kernel

In [130]:
clf.fit(xTrain, yTrain['homeTeamWin'].values)

SVC(C=10, gamma='auto', kernel='linear')

In [131]:
yPred = clf.predict(xTrain)
print("Train Accuracy:",metrics.accuracy_score(yTrain, yPred))
yPred = clf.predict(xTest)
print("Test Accuracy:",metrics.accuracy_score(yTest, yPred))

Train Accuracy: 0.6102200287888135
Test Accuracy: 0.6069078947368421
