In [1]:
# Reading in packages
import pandas as pd
from SQLCode import DatabaseConnection
from SQLCode import DatabaseCredentials as DBC
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import cross_val_score

In [None]:
# Opening connection
# The credentials object comes from the project-local SQLCode.DatabaseCredentials module.
creds = DBC.DataBaseCredentials()
conn = DatabaseConnection.sql_connection(creds.server, creds.database, creds.user, creds.password)
# `connection` is the handle handed to pd.read_sql_query when the tables are loaded.
connection = conn.open()
# NOTE(review): neither `connection` nor `cursor` is closed in the cells shown here --
# confirm that cleanup happens elsewhere.
cursor = connection.cursor()

In [None]:
# Loading every row of the live_feed table into a DataFrame
live_feed = pd.read_sql_query(sql="select * from live_feed", con=connection)

In [None]:
# Loading the schedules table; it is referred to as the "seasons" data from here on
seasons = pd.read_sql_query(sql="select * from schedules", con=connection)

In [None]:
# Keeping only regular-season games (gameType 'R') from season 20102011 onwards
# (when live data started)
seasonsFiltered = seasons.loc[
    lambda games: (games['seasonID'] >= 20102011) & (games['gameType'] == 'R')
]

In [None]:
# Joining the live_feed events onto the schedule (a right join, so every schedule row
# survives the merge), then keeping regular-season rows from 20102011 onwards
rawData = (
    live_feed
    .merge(seasons, how='right', on='gameID')
    .loc[lambda events: (events['seasonID'] >= 20102011) & (events['gameType'] == 'R')]
)

### Creating Initial Data Sets

In [None]:
# Keeping only the rows that feed the stats: GOAL events for the 'Scorer'
# and SHOT events for the 'Shooter'
isScoredGoal = (rawData['eventTypeID'] == 'GOAL') & (rawData['playerType'] == 'Scorer')
isTakenShot = (rawData['eventTypeID'] == 'SHOT') & (rawData['playerType'] == 'Shooter')
statsRaw = rawData[isScoredGoal | isTakenShot]

In [None]:
# Counting the rows per game, team and event type -- these are the "for" stats.
statsFor = (
    statsRaw
    .groupby(['gameID','teamID','eventTypeID'])
    .size()
    .reset_index(name='count')
)

In [None]:
# Reshaping from one row per (game, team, event type) to one row per (game, team),
# with a column of counts for each event type
statsFor = statsFor.pivot_table(
    values='count',
    index=['gameID','teamID'],
    columns='eventTypeID',
).reset_index()

In [None]:
# Defining the stats "against" table.
# .copy() makes this an independent frame: a teamID column is assigned to it right
# after, and writing into a bare column slice of statsRaw raises pandas'
# SettingWithCopyWarning.
statsAgainst = statsRaw[['gameID','eventTypeID']].copy()

In [None]:
# Getting the "against" team, i.e. the opponent of the team the event is recorded for:
# the home team when the event's team is not the home team, otherwise the away team.
# Done as one vectorized np.where instead of a Python lambda applied row by row.
statsAgainst['teamID'] = np.where(statsRaw['teamID'] != statsRaw['homeTeamID'],
                                  statsRaw['homeTeamID'],
                                  statsRaw['awayTeamID'])

In [None]:
# Counting the rows per game, (against) team and event type
statsAgainst = (
    statsAgainst
    .groupby(['gameID','teamID','eventTypeID'])
    .size()
    .reset_index(name='count')
)

In [None]:
# Reshaping to one row per (game, team) with a column of counts for each event type
statsAgainst = statsAgainst.pivot_table(
    values='count',
    index=['gameID','teamID'],
    columns='eventTypeID',
).reset_index()

### Create complete schedule table

In [None]:
# Expanding the schedule so every game appears twice: once with the home team as
# teamID and once with the away team as teamID (home rows first, then away rows)
seasonsExpanded = pd.concat([
    seasonsFiltered[['seasonID','gameID','gameType',sideColumn]].rename(columns={sideColumn:'teamID'})
    for sideColumn in ('homeTeamID','awayTeamID')
])

### Extending data set for every game

In [None]:
# Attaching the "for" and then the "against" counts to every scheduled team-game.
# Both joins are left joins from seasonsExpanded, so a team-game without a matching
# stats row is kept (its stat columns come back as NaN).
# The GOAL/SHOT columns are renamed after each join so the two sets do not collide.
stats = (
    seasonsExpanded
    .merge(statsFor, how='left', on=['gameID','teamID'])
    .rename(columns={'GOAL':'goalsFor','SHOT':'shotsFor'})
    .merge(statsAgainst, how='left', on=['gameID','teamID'])
    .rename(columns={'GOAL':'goalsAgainst','SHOT':'shotsAgainst'})
)

In [None]:
# Setting no stats (i.e. NA) to 0.
# fillna does this in one vectorized pass per column instead of calling a Python
# lambda on every single value.
countColumns = ['goalsFor','goalsAgainst','shotsAgainst','shotsFor']
stats[countColumns] = stats[countColumns].fillna(0)

In [None]:
# Sorting the values
stats = stats.sort_values(['seasonID','gameID'])

# Grabbing this now to use in "Extracting the Game Outcome".
# .copy() takes a real snapshot: columns are later added to `stats` in place, and a
# plain `statsSimple = stats` would only be a second name for that same frame.
statsSimple = stats.copy()

### Creating the Statistics

In [None]:
# Numbering each team's games within a season (1, 2, 3, ...) in the current row order
stats['gameNumber'] = stats.groupby(['seasonID','teamID']).cumcount().add(1)

In [None]:
# Getting the running (cumulative) total of each stat per team and season;
# each result is stored as '<stat>Total'
for statName in ['goalsAgainst','goalsFor','shotsFor','shotsAgainst']:
    stats[statName + 'Total'] = stats.groupby(['seasonID','teamID'])[statName].cumsum()

In [None]:
# Game number per team and season (1-based running count of that team's rows)
teamSeasonGroups = stats.groupby(['seasonID','teamID'])
stats['gameNumber'] = teamSeasonGroups.cumcount() + 1

In [None]:
# Getting the rolling per-game average of each stat: cumulative total / games played so far.
# Every "For"/"Against" column is derived from the total of the same name
# (the goals columns previously read from each other's totals).
stats['goalsAgainstPerGame'] = stats['goalsAgainstTotal']/stats['gameNumber']
stats['goalsForPerGame'] = stats['goalsForTotal']/stats['gameNumber']
stats['shotsAgainstPerGame'] = stats['shotsAgainstTotal']/stats['gameNumber']
stats['shotsForPerGame'] = stats['shotsForTotal']/stats['gameNumber']

In [None]:
# Rolling differentials: cumulative "for" minus cumulative "against"
stats['goalDifferential'] = stats['goalsForTotal'].sub(stats['goalsAgainstTotal'])
stats['shotDifferential'] = stats['shotsForTotal'].sub(stats['shotsAgainstTotal'])

In [None]:
# Determining whether a team won (1) or not (0) and creating a rolling win percentage.
# The comparison is vectorized instead of a Python lambda applied row by row.
# NOTE(review): a game with goalsFor == goalsAgainst counts as a non-win for both
# teams -- confirm how games decided outside the recorded GOAL events should be treated.
stats['winLoss'] = (stats['goalsFor'] > stats['goalsAgainst']).astype('int64')
stats['winLossTotal'] = stats.groupby(['seasonID','teamID'])['winLoss'].cumsum()
stats['winningPercentage'] = stats['winLossTotal']/stats['gameNumber']

In [None]:
# Dropping the per-game counts, the running totals and the helper columns, which
# leaves the identifying columns plus the per-game averages, differentials and
# winning percentage
statsComplete = stats.drop(columns=[
    'goalsFor', 'goalsAgainst', 'shotsAgainst', 'shotsFor',
    'gameNumber',
    'goalsForTotal', 'goalsAgainstTotal', 'shotsAgainstTotal', 'shotsForTotal',
    'winLoss', 'winLossTotal',
])

### Extracting the Game Outcome

In [None]:
# Attaching the home team's goals for/against to every scheduled game
# (the join matches the schedule's homeTeamID against the stats row's teamID)
homeTeamGoals = statsSimple[['seasonID','gameID','teamID','goalsFor','goalsAgainst']]
gameOutcome = seasonsFiltered.merge(
    homeTeamGoals,
    how='inner',
    left_on=['seasonID','gameID','homeTeamID'],
    right_on=['seasonID','gameID','teamID'],
)

In [None]:
# Determining if the home team won: 1 when it scored more goals than it conceded, else 0.
# The comparison is vectorized instead of a Python lambda applied row by row.
gameOutcome['homeTeamWin'] = (gameOutcome['goalsFor'] > gameOutcome['goalsAgainst']).astype('int64')

In [None]:
# Dropping the columns that are not needed to look the outcome up by gameID
gameOutcome = gameOutcome.drop(columns=[
    'gameType', 'gameDate',
    'homeTeamID', 'awayTeamID', 'teamID',
    'seasonID',
    'goalsFor', 'goalsAgainst',
])

### Creating Model Input

In [None]:
# Flagging whether each stats row belongs to the home team. The left join only finds
# a homeTeamID when the row's teamID is the game's home team, so a missing value
# means the row is the away team. The helper column is removed again afterwards.
homeTeamLookup = seasonsFiltered[['gameID','homeTeamID']]
statsComplete = statsComplete.merge(
    homeTeamLookup,
    how='left',
    left_on=['gameID','teamID'],
    right_on=['gameID','homeTeamID'],
)
statsComplete['isHome'] = statsComplete['homeTeamID'].notna().astype(int)
statsComplete = statsComplete.drop(columns='homeTeamID')

In [None]:
# Numbering each team's home games and away games separately within a season.
# The numbering starts at 2 (1-based count plus one more), so a row carries the number
# of the team's NEXT home/away game rather than the game the stats were recorded in.
statsComplete = statsComplete.sort_values(['seasonID','gameID'])
statsComplete['gameNumber'] = statsComplete.groupby(['seasonID','teamID','isHome']).cumcount() + 2

In [None]:
# Sorting the schedule, then numbering (from 1) each team's home games and each team's
# away games within a season; these numbers are the join keys for the stats rows
seasonsFiltered = seasonsFiltered.sort_values(['seasonID','gameID'])
for side in ('home','away'):
    seasonsFiltered[side + 'TeamGameNumber'] = (
        seasonsFiltered.groupby(['seasonID', side + 'TeamID']).cumcount() + 1
    )

In [None]:
# Starting the model input: each scheduled game joined to the home team's stats row
# whose gameNumber equals the game's homeTeamGameNumber. Columns present on both sides
# (gameID, gameType) come back with pandas' default _x (schedule) / _y (stats) suffixes.
homeTeamStats = statsComplete[statsComplete['isHome'] == 1]
modelInput = seasonsFiltered.merge(
    homeTeamStats,
    how='inner',
    left_on=['seasonID', 'homeTeamID', 'homeTeamGameNumber'],
    right_on=['seasonID','teamID','gameNumber'],
)

In [None]:
# Dropping extra columns
modelInput = modelInput.drop(['gameID_x', 
                 'gameType_x',
                 'homeTeamGameNumber',
                 'teamID',
                 'gameNumber'],
                axis=1)

In [None]:
# Renaming some columns to keep things clean
modelInput = modelInput.rename({"gameID_y":"gameID", "gameType_y":"gameType"}, axis=1)

In [None]:
# adding the away team data to the model input
modelInput = pd.merge(modelInput, 
            statsComplete[statsComplete['isHome'] == 0], 
            how='inner', 
            left_on=['seasonID', 'awayTeamID', 'awayTeamGameNumber'],
            right_on=['seasonID','teamID','gameNumber'], suffixes=('Home','Away'))

In [None]:
pd.set_option("display.max_columns", None)
modelInput

In [None]:
# Dropping columns that are no longer needed
modelInput = modelInput.drop(['awayTeamGameNumber',
                              'gameIDHome',
                              'gameTypeHome',
                              'teamID',
                              'gameNumber',
                              'isHomeAway'],axis=1)

In [None]:
# Renaming these to get rid of the suffix that was put on it on the last join
modelInput = modelInput.rename({'gameIDAway':'gameID','gameTypeAway':'gameType'},axis=1)

In [None]:
# Attaching the game outcome columns (including homeTeamWin) to each row by gameID
modelInput = modelInput.merge(gameOutcome, on='gameID')

## Creating the Model

### Normalizing the Data and creating train/test data

In [None]:
# Creating the x and y data: x is every column except the identifiers and the label,
# y is the label kept as a one-column frame.
# To train without the shot features, also list shotsAgainstPerGameHome,
# shotsForPerGameHome, shotDifferentialHome, shotsAgainstPerGameAway,
# shotsForPerGameAway and shotDifferentialAway below.
nonFeatureColumns = [
    'seasonID',
    'gameDate',
    'homeTeamID',
    'awayTeamID',
    'gameID',
    'gameType',
    'homeTeamWin',
]
x = modelInput.drop(columns=nonFeatureColumns)
y = modelInput[['homeTeamWin']]

In [None]:
# Displaying the feature matrix for a visual check
x

In [None]:
# Normalizing the data
scaler = preprocessing.StandardScaler().fit(x)
xScaled = scaler.transform(x)

In [None]:
xTrain, xTest, yTrain, yTest = train_test_split(xScaled, y, test_size=0.2,random_state=109) # 70% training and 20% test

### Support Vector Machine

In [None]:
# Support vector classifier with a linear kernel.
# NOTE(review): gamma is documented as the coefficient for the rbf/poly/sigmoid
# kernels, so it presumably has no effect here -- confirm before relying on it.
clf = svm.SVC(
    kernel='linear',
    gamma='auto',
)

In [None]:
# Training on the scaled training features; the label column is passed as a flat array
clf.fit(xTrain, yTrain['homeTeamWin'].to_numpy())

In [None]:
# Accuracy on the training split, then on the held-out test split
# (yPred ends up holding the test-set predictions)
yPred = clf.predict(xTrain)
trainAccuracy = metrics.accuracy_score(yTrain, yPred)
print("Train Accuracy:", trainAccuracy)

yPred = clf.predict(xTest)
testAccuracy = metrics.accuracy_score(yTest, yPred)
print("Test Accuracy:", testAccuracy)