In [240]:
# Reading in packages
import pandas as pd
from SQLCode import DatabaseConnection
from SQLCode import DatabaseCredentials as DBC

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import cross_val_score

In [2]:
# Opening connection
creds = DBC.DataBaseCredentials()
conn = DatabaseConnection.sql_connection(creds.server, creds.database, creds.user, creds.password)
connection = conn.open()
cursor = connection.cursor()

In [3]:
# Getting the live_feed data
live_feed = pd.read_sql_query("select * from live_feed", connection)

In [4]:
# Getting the seasons data
seasons = pd.read_sql_query("select * from schedules", connection)

In [None]:
def create_model_input(seasons, live_feed):
    # Filtering to regular seasons games and 20102011 onwards (when live data started)
    seasonsFiltered = seasons[seasons['seasonID'] >= 20102011]
    seasonsFiltered = seasonsFiltered[seasonsFiltered['gameType'] == 'R']
    
    # Getting and filtering the raw data
    rawData = pd.merge(live_feed,seasons, how='inner',on='gameID')
    rawData = rawData[rawData['seasonID'] >= 20102011]
    rawData = rawData[rawData['gameType'] == 'R']
    
    # Filtering to goals only
    goals = rawData[(rawData['eventTypeID'] == 'GOAL') & (rawData['playerType'] == 'Scorer')]
    
    # Getting the number of goals each team scored in each game (if they scored a goal)
    goalsForPerGame = pd.DataFrame(goals[['gameID','teamID']].groupby(['gameID','teamID']).size()).reset_index()
    goalsForPerGame = goalsForPerGame.rename({0:'goalsFor'}, axis=1)
    
    

In [90]:
# Filtering to regular seasons games and 20102011 onwards (when live data started)
seasonsFiltered = seasons[seasons['seasonID'] >= 20102011]
seasonsFiltered = seasonsFiltered[seasonsFiltered['gameType'] == 'R']

In [6]:
# Getting and filtering the raw data
rawData = pd.merge(live_feed,seasons, how='inner',on='gameID')
rawData = rawData[rawData['seasonID'] >= 20102011]
rawData = rawData[rawData['gameType'] == 'R']

## Goal Related Data

In [7]:
# Filtering to goals only
goals = rawData[(rawData['eventTypeID'] == 'GOAL') & (rawData['playerType'] == 'Scorer')]

In [8]:
# Getting the number of goals each team scored in each game (if they scored a goal)
goalsForPerGame = pd.DataFrame(goals[['gameID','teamID']].groupby(['gameID','teamID']).size()).reset_index()
goalsForPerGame = goalsForPerGame.rename({0:'goalsFor'}, axis=1)

In [80]:
# Getting the number of goals each team gave up in each game (if they gave up any)
goalsAgainstPerGame = goals[['gameID','seasonID']]
goalsAgainstPerGame['teamID'] = goals.apply(lambda row: row['homeTeamID'] if row['teamID'] != row['homeTeamID'] else row['awayTeamID'],axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  goalsAgainstPerGame['teamID'] = goals.apply(lambda row: row['homeTeamID'] if row['teamID'] != row['homeTeamID'] else row['awayTeamID'],axis=1)


## Shot Related Data

In [82]:
goalsAgainstPerGame = pd.DataFrame(goalsAgainstPerGame[['gameID','teamID']].groupby(['gameID','teamID']).size()).reset_index()
goalsAgainstPerGame = goalsAgainstPerGame.rename({0:'goalsAgainst'}, axis=1)

In [299]:
shotsAgainstPerGame[['gameID','teamID']].groupby(['gameID','teamID']).size()

gameID      teamID
2010020001  8         42
            10        52
2010020002  4         58
            5         48
2010020003  12        52
                      ..
2020020866  54        40
2020020867  21        32
            26        54
2020020868  28        54
            53        88
Length: 25156, dtype: int64

In [92]:
# Expanding the schedule into a row for each game an individual team played
seasonsExpanded = pd.concat([seasonsFiltered[['seasonID','gameID','gameType','homeTeamID']].rename({'homeTeamID':'teamID'},axis=1),
          seasonsFiltered[['seasonID','gameID','gameType','awayTeamID']].rename({'awayTeamID':'teamID'},axis=1)])

In [121]:
# Merging the goals for and against data into a single table. Not the left join to ensure no games are lost.
teamData = pd.merge(seasonsExpanded, goalsForPerGame, how='left', left_on=['gameID','teamID'],right_on=['gameID','teamID'])
teamData = pd.merge(teamData, goalsAgainstPerGame, how='left', left_on=['gameID','teamID'],right_on=['gameID','teamID'])

# Setting shutouts (i.e. NA) to 0
teamData['goalsFor'] = teamData['goalsFor'].apply(lambda x: 0 if pd.isna(x) else x)
teamData['goalsAgainst'] = teamData['goalsAgainst'].apply(lambda x: 0 if pd.isna(x) else x)

In [122]:
# Creating a game number for each team/season
teamData['gameNumber'] = teamData.groupby(['seasonID','teamID']).cumcount()+1

In [123]:
# Totalling the number of goals for and against as a rolling sum
teamData['goalsForTotal'] = teamData.groupby(['seasonID','teamID'])['goalsFor'].cumsum()
teamData['goalsAgainstTotal'] = teamData.groupby(['seasonID','teamID'])['goalsAgainst'].cumsum()

In [124]:
# Getting the rolling goals for and against per game
teamData['goalsForPerGame'] = teamData['goalsForTotal']/teamData['gameNumber']
teamData['goalsAgainstPerGame'] = teamData['goalsAgainstTotal']/teamData['gameNumber']

In [125]:
# Getting the rolling goal differential
teamData['goalDifferential'] = teamData['goalsForTotal'] - teamData['goalsAgainstTotal']

In [126]:
# Determing whether a team won/lossed and creating a rolling win percentage
teamData['winLoss'] = teamData.apply(lambda row: 1 if row['goalsFor'] > row['goalsAgainst'] else 0,axis=1)
teamData['winLossTotal'] = teamData.groupby(['seasonID','teamID'])['winLoss'].cumsum()
teamData['winningPercentage'] = teamData['winLossTotal']/teamData['gameNumber']

In [127]:
# Dropping extra columns to keep things clean
teamDataCondensed = teamData.drop(['goalsFor',
                        'goalsAgainst',
                        'gameNumber',
                        'goalsForTotal',
                        'goalsAgainstTotal',
                        'winLoss',
                        'winLossTotal'], axis=1)

## Shot Related Data

## Extracing the game Outcome

In [171]:
gameOutcome = pd.merge(seasonsFiltered,
         teamData, 
         how='inner',
         left_on=['seasonID','gameID','homeTeamID'],
         right_on=['seasonID','gameID','teamID'])

In [179]:
gameOutcome = gameOutcome.drop(['gameType_x', 
                  'gameDate', 
                  'homeTeamID',
                  'awayTeamID', 
                  'homeTeamGameNumber',
                  'awayTeamGameNumber', 
                  'gameType_y',
                  'teamID', 
                  'gameNumber', 
                  'goalsForTotal',
                  'goalsAgainstTotal', 
                  'goalsForPerGame', 
                  'goalsAgainstPerGame',
                  'goalDifferential',
                  'winLoss', 
                  'winLossTotal', 
                  'winningPercentage'],axis=1)

In [183]:
gameOutcome['homeTeamWin'] = gameOutcome.apply(lambda row: 1 if row['goalsFor'] > row['goalsAgainst'] else 0,axis=1)

## Creating Model Input

In [128]:
teamDataCondensed

Unnamed: 0,seasonID,gameID,gameType,teamID,goalsForPerGame,goalsAgainstPerGame,goalDifferential,winningPercentage
0,20102011,2010020001,R,10,3.000000,2.000000,1.0,1.000000
1,20102011,2010020002,R,5,2.000000,3.000000,-1.0,0.000000
2,20102011,2010020003,R,30,3.000000,4.000000,-1.0,0.000000
3,20102011,2010020004,R,21,4.000000,3.000000,1.0,1.000000
4,20102011,2010020005,R,22,4.000000,0.000000,4.0,1.000000
...,...,...,...,...,...,...,...,...
25203,20202021,2020020864,R,20,2.821429,2.910714,-5.0,0.464286
25204,20202021,2020020865,R,24,2.285714,3.250000,-54.0,0.303571
25205,20202021,2020020866,R,19,3.071429,3.089286,-1.0,0.482143
25206,20202021,2020020867,R,21,3.535714,2.410714,63.0,0.696429


In [118]:
# # Merging team Data condensed with seasons filtered to determine of the team was home or away
# teamDataCondensed = pd.merge(teamDataCondensed, seasonsFiltered[['gameID','homeTeamID']], how='left', left_on=['gameID','teamID'], right_on=['gameID','homeTeamID'])
# # Creating a home/away indicator
# teamDataCondensed['isHomeTeam'] = teamDataCondensed['homeTeamID'].apply(lambda x: 0 if pd.isna(x) else 1)
# # Dropping useless column
# teamDataCondensed = teamDataCondensed.drop(['homeTeamID'],axis=1)

In [134]:
# Determing the game number for each time by season
teamDataCondensed = teamDataCondensed.sort_values(['seasonID','gameID'])
teamDataCondensed['gameNumber'] = teamDataCondensed.groupby(['seasonID','teamID']).cumcount()+1
teamDataCondensed['gameNumber'] += 1

In [139]:
# Sorting and then getting the home and away teams game numbers (used to join on below)
seasonsFiltered = seasonsFiltered.sort_values(['seasonID','gameID'])
seasonsFiltered['homeTeamGameNumber'] = seasonsFiltered.groupby(['seasonID','homeTeamID']).cumcount()+1
seasonsFiltered['awayTeamGameNumber'] = seasonsFiltered.groupby(['seasonID','awayTeamID']).cumcount()+1

In [155]:
# Creating the model input with the home team data
modelInput = pd.merge(seasonsFiltered, 
            teamDataCondensed, 
            how='inner', 
            left_on=['seasonID', 'homeTeamID', 'homeTeamGameNumber'],
            right_on=['seasonID','teamID','gameNumber'])

In [156]:
# Dropping extra columns
modelInput = modelInput.drop(['gameID_x', 
                 'gameType_x',
                 'homeTeamGameNumber',
                 'teamID'],
                axis=1)

In [157]:
# Renaming columns appropriately for the home team (to differentiate from the away team later)
modelInput = modelInput.rename({'gameID_y':'gameID',
                   'gameType_y':'gameType',
                   'goalsForPerGame':'goalsForPerGameHome',
                   'goalsAgainstPerGame':'goalsAgainstPerGameHome', 
                   'goalDifferential':'goalDifferentialHome', 
                   'winningPercentage':'winningPercentageHome'},axis=1)

In [158]:
# adding the away team data to the model input
modelInput = pd.merge(modelInput, 
            teamDataCondensed, 
            how='inner', 
            left_on=['seasonID', 'awayTeamID', 'awayTeamGameNumber'],
            right_on=['seasonID','teamID','gameNumber'])

In [160]:
# Dropping extra columns
modelInput = modelInput.drop(['gameID_x', 
                 'gameType_x',
                 'awayTeamGameNumber',
                 'teamID',
                 'gameNumber_x',
                'gameNumber_y'],
                axis=1)

In [165]:
# Renaming columns to differentiate from the home team
modelInput = modelInput.rename({'gameID_y':'gameID',
                   'gameType_y':'gameType', 
                   'goalsForPerGame':'goalsForPerGameAway',
                   'goalsAgainstPerGame':'goalsAgainstPerGameAway',
                   'goalDifferential':'goalDifferentialAway', 
                   'winningPercentage':'winningPercentageAway'},
                 axis=1)

In [192]:
# Merging the game output
modelInput = pd.merge(modelInput, 
         gameOutcome, 
         how='inner',
         on='gameID',
         suffixes = ("","_y")).drop(['seasonID_y','goalsFor','goalsAgainst'],axis=1)

In [193]:
modelInput

Unnamed: 0,seasonID,gameDate,homeTeamID,awayTeamID,goalsForPerGameHome,goalsAgainstPerGameHome,goalDifferentialHome,winningPercentageHome,gameID,gameType,goalsForPerGameAway,goalsAgainstPerGameAway,goalDifferentialAway,winningPercentageAway,homeTeamWin
0,20102011,2010-10-09,5,8,2.000000,3.000000,-1.0,0.000000,2010020001,R,2.761905,2.380952,16.0,0.571429,1
1,20102011,2010-10-11,2,3,4.000000,5.000000,-1.0,0.000000,2010020013,R,3.119048,2.452381,28.0,0.476190,0
2,20102011,2010-10-20,11,7,4.000000,2.000000,2.0,1.000000,2010020013,R,3.000000,6.000000,-3.0,0.000000,0
3,20102011,2010-10-11,19,24,0.000000,0.000000,0.0,0.000000,2010020020,R,2.883721,2.604651,12.0,0.581395,1
4,20102011,2010-10-23,25,18,3.093023,2.627907,20.0,0.558140,2010020020,R,4.000000,1.000000,3.0,1.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12153,20202021,2021-05-08,5,7,4.000000,2.785714,17.0,0.785714,2020020416,R,1.769231,3.692308,-25.0,0.153846,0
12154,20202021,2021-05-10,16,25,3.846154,3.153846,9.0,0.538462,2020020409,R,3.466667,2.666667,12.0,0.400000,1
12155,20202021,2021-05-08,18,12,2.853659,2.804878,2.0,0.560976,2020020442,R,3.441860,2.465116,42.0,0.697674,0
12156,20202021,2021-05-08,10,8,3.428571,2.500000,13.0,0.642857,2020020439,R,3.000000,2.977273,1.0,0.454545,1


## Creating the Model

In [197]:
# Creating the x and y data
x = modelInput[['goalsForPerGameHome', 
                     'goalsAgainstPerGameHome', 
                     'goalDifferentialHome',
                     'winningPercentageHome',
                     'goalsForPerGameAway',
                     'goalsAgainstPerGameAway',
                     'goalDifferentialAway',
                     'winningPercentageAway']]
y = modelInput[['homeTeamWin']]

In [202]:
# Normalizing the data
scaler = preprocessing.StandardScaler().fit(x)
xScaled = scaler.transform(x)

In [203]:
xTrain, xTest, yTrain, yTest = train_test_split(xScaled, y, test_size=0.2,random_state=109) # 70% training and 20% test

### Support Vector Machine

In [262]:
clf = svm.SVC(kernel='sigmoid',gamma='auto') # Linear Kernel

In [263]:
clf.fit(xTrain, yTrain)

  return f(*args, **kwargs)


SVC(gamma='auto', kernel='sigmoid')

In [264]:
yPred = clf.predict(xTest)

In [265]:
print("Accuracy:",metrics.accuracy_score(yTest, yPred))

Accuracy: 0.5394736842105263


In [241]:
clf = svm.SVC(kernel='linear', C=1, random_state=42)
scores = cross_val_score(clf, xScaled, y, cv=5)

  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


In [242]:
scores

array([0.62294408, 0.6328125 , 0.59786184, 0.61620732, 0.6137392 ])