In [46]:
from SQLCode import DatabaseConnection
from SQLCode import DatabaseCredentials as DBC
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix  

In [47]:
# Opening connection
creds = DBC.DataBaseCredentials()
conn = DatabaseConnection.sql_connection(creds.server, creds.database, creds.user, creds.password)
connection = conn.open()
cursor = connection.cursor()

# Loading Data

In [48]:
GIM_VALUES = pd.read_sql_query("select * from stage_hockey.gim_values_consolidated",
                                  connection)

In [49]:
GIM_VALUES = GIM_VALUES.sort_values(['seasonID','gameID']) 

In [50]:
TEAM_STATS = pd.read_sql_query("select * from stage_hockey.game_prediction_team_stats",
                                  connection)

In [51]:
POSITION_AVERAGES = pd.read_sql_query("select * from stage_hockey.gim_position_averages_per_season",
                                  connection)

In [52]:
# Getting each players position
POSITIONS = pd.read_sql_query("""
select playerID,
             primaryPositionCode,
             row_number() over (partition by playerID order by date desc ) as 'ROW_NUM'
      from plays_position
      where primaryPositionCode is not null
""", connection)
POSITIONS = POSITIONS[POSITIONS['ROW_NUM'] == 1].drop(['ROW_NUM'], axis=1)

In [53]:
BOXSCORES = pd.read_sql_query("""
select bs.* from stage_hockey.boxscores bs
inner join schedules s on bs.gameID = s.gameID
where seasonID >= 20142015 and gameType in ('R','P')""",
                                  connection)

In [54]:
SCHEDULES = pd.read_sql_query("""
select s.*,
       ROW_NUMBER() over (PARTITION BY HOME_TEAMS.seasonID, HOME_TEAMS.teamID ORDER BY HOME_TEAMS.gameID) as 'homeTeamGameNumber',
        ROW_NUMBER() over (PARTITION BY AWAY_TEAMS.seasonID, AWAY_TEAMS.teamID ORDER BY AWAY_TEAMS.gameID) as 'awayTeamGameNumber'
from schedules s
inner join
    (
        select s.seasonID,
               gameID,
               homeTeamID as 'teamID',
               s.gameDate,
               s.gameType
        from schedules s
        where seasonID>=20142015 and
              gameType in ('R','P')
        union
        select seasonID,
               gameID,
               awayTeamID as 'teamID',
               s.gameDate,
               s.gameType
        from schedules s
        where seasonID>=20142015 and
              gameType in ('R','P')
     ) HOME_TEAMS ON HOME_TEAMS.teamID = s.homeTeamID and HOME_TEAMS.gameID = s.gameID
inner join
    (
        select s.seasonID,
               gameID,
               homeTeamID as 'teamID',
               s.gameDate,
               s.gameType
        from schedules s
        where seasonID>=20142015 and
              gameType in ('R','P')
        union
        select seasonID,
               gameID,
               awayTeamID as 'teamID',
               s.gameDate,
               s.gameType
        from schedules s
        where seasonID>=20142015 and
              gameType in ('R','P')
     ) AWAY_TEAMS ON AWAY_TEAMS.teamID = s.homeTeamID and AWAY_TEAMS.gameID = s.gameID
""",
                                  connection)

In [55]:
seasonMapping = pd.read_sql_query("select * from season_to_next_season_mapping",
                                  connection)

In [56]:
gameOutcome = pd.read_sql_query("""
select GOALS.gameID,
       homeTeamID,
       IFNULL(sum(case when teamID=homeTeamID then numGoals end), 0) as 'homeTeamGoals',
       awayTeamID,
       IFNULL(sum(case when teamID=awayTeamID then numGoals end), 0) as 'awayTeamGoals'
from
     (
         select count(*) as 'numGoals',
                teamID,
                s.homeTeamID,
                s.awayTeamID,
                s.gameID
         from live_feed lf
                  inner join schedules s on lf.gameID = s.gameID and lf.teamID
         where s.seasonID >= 20142015
           and eventTypeID = 'GOAL'
           and playerType = 'Scorer'
         group by lf.gameID, lf.teamID
     ) GOALS
group by gameID;
""",
                                  connection)

# Players Last Game

In [57]:
GIM = []
for index, player in BOXSCORES.iterrows():
    if index % 10000 == 0:
        print(int((index/len(BOXSCORES))*100),'%')
    previousGame = GIM_VALUES[(GIM_VALUES['gameID'] < player['gameID']) & (GIM_VALUES['playerID'] == player['playerID'])]
    
    if len(previousGame) == 0:
        GIM.append(np.nan)
    else:
        GIM.append(previousGame.iloc[-1,9])
BOXSCORES['GIM_VALUE'] = GIM

BOXSCORES = pd.merge(BOXSCORES, POSITIONS)
BOXSCORES = pd.merge(BOXSCORES,POSITION_AVERAGES)
BOXSCORES['GIM_VALUE'] = np.where(BOXSCORES['GIM_VALUE'].isna(),BOXSCORES['gimMean'], BOXSCORES['GIM_VALUE'])
BOXSCORES.to_csv('Data/boxscoreAndGIM_P.csv')
# BOXSCORES  = pd.read_csv('Data/boxscoreAndGIM_P.csv')

0 %
2 %
5 %
8 %
11 %
14 %
17 %
20 %
23 %
26 %
29 %
32 %
35 %
38 %
41 %
44 %
47 %
50 %
53 %
56 %
59 %
62 %
65 %
68 %
71 %
74 %
77 %
80 %
83 %
86 %
89 %
91 %
94 %
97 %


In [13]:
BOXSCORES

Unnamed: 0,teamID,playerID,scratched,gameID,GIM_VALUE,primaryPositionCode,seasonID,gimMean
0,8,8467334,0,2014020001,0.105070,C,20102011,0.230152
1,8,8467334,0,2014020001,0.105070,C,20112012,0.224092
2,8,8467334,0,2014020001,0.105070,C,20122013,0.228974
3,8,8467334,0,2014020001,0.105070,C,20132014,0.233796
4,8,8467334,0,2014020001,0.105070,C,20142015,0.221565
...,...,...,...,...,...,...,...,...
3706533,7,8475920,0,2020020858,0.004342,G,20162017,0.005489
3706534,7,8475920,0,2020020858,0.004342,G,20172018,0.005927
3706535,7,8475920,0,2020020858,0.004342,G,20182019,0.006618
3706536,7,8475920,0,2020020858,0.004342,G,20192020,0.005003


# Teams Last Game

In [58]:
last_game = TEAM_STATS[['seasonID','teamID','gameNumber']].groupby(['seasonID','teamID']).max('gameNumber').reset_index()

In [59]:
seasonAverages = pd.merge(TEAM_STATS, last_game , how='inner')

In [60]:
seasonAverages = seasonAverages.drop(['gameDate','gameType','gameID','teamID','gameNumber'],axis=1)

In [61]:
seasonAverages = seasonAverages.groupby('seasonID').mean().reset_index()

In [62]:
seasonAverages = pd.merge(seasonAverages, seasonMapping, left_on=['seasonID'],right_on=['previousSeasonID'],suffixes=('', '_next')).drop(['previousSeasonID'],axis=1)

In [63]:
seasonAverages

Unnamed: 0,seasonID,shotsForPerGame,goalsForPerGame,shotsAgainstPerGame,goalsAgainstPerGame,winningPercentage,shotDifferential,goalDifferential,seasonID_next
0,20102011,27.874573,2.84924,27.94943,2.87612,0.494833,0.0,0.0,20112012
1,20112012,27.309363,2.796997,27.358173,2.816203,0.496113,0.0,0.0,20122013
2,20122013,26.861273,2.765923,27.026983,2.813113,0.490207,0.0,0.0,20132014
3,20132014,27.587077,2.829033,27.70128,2.855997,0.494813,0.0,0.0,20142015
4,20142015,27.513037,2.78518,27.585227,2.8118,0.49424,0.0,0.0,20152016
5,20152016,27.20629,2.74543,27.28306,2.768497,0.4952,0.0,0.0,20162017
6,20162017,27.70199,2.788097,27.72601,2.812043,0.495213,0.0,0.0,20172018
7,20172018,29.151135,3.004794,29.151735,3.033152,0.494032,0.0,0.0,20182019
8,20182019,28.56811,3.030623,28.677171,3.053758,0.495855,0.0,0.0,20192020
9,20192020,28.547642,3.027648,28.589181,3.054826,0.494987,0.0,0.0,20202021


In [64]:
seasonAverages = seasonAverages[['seasonID_next','shotsForPerGame', 'goalsForPerGame', 'shotsAgainstPerGame',
       'goalsAgainstPerGame', 'winningPercentage', 'shotDifferential',
       'goalDifferential']]

In [65]:
TEAM_STATS

Unnamed: 0,seasonID,gameDate,gameType,gameID,teamID,shotsForPerGame,goalsForPerGame,shotsAgainstPerGame,goalsAgainstPerGame,winningPercentage,shotDifferential,goalDifferential,gameNumber
0,20102011,2010-10-07 23:00:00,R,2010020001,8,26.0000,2.0000,21.0000,3.0000,0.0000,5.0,-1.0,1
1,20102011,2010-10-07 23:00:00,R,2010020001,10,21.0000,3.0000,26.0000,2.0000,1.0000,-5.0,1.0,1
2,20102011,2010-10-07 23:00:00,R,2010020002,4,24.0000,3.0000,29.0000,2.0000,1.0000,-5.0,1.0,1
3,20102011,2010-10-07 23:00:00,R,2010020002,5,29.0000,2.0000,24.0000,3.0000,0.0000,5.0,-1.0,1
4,20102011,2010-10-07 16:00:00,R,2010020003,12,27.0000,4.0000,26.0000,3.0000,1.0000,1.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
27133,20202021,2021-07-03 00:00:00,P,2020030413,14,26.7792,3.3377,26.7662,2.5065,0.6623,1.0,64.0,77
27134,20202021,2021-07-06 00:00:00,P,2020030414,8,28.0390,2.7532,26.1948,2.9091,0.4805,142.0,-12.0,77
27135,20202021,2021-07-06 00:00:00,P,2020030414,14,26.8462,3.3205,26.6538,2.5128,0.6538,15.0,63.0,78
27136,20202021,2021-07-08 00:00:00,P,2020030415,8,27.9615,2.7179,26.2308,2.8846,0.4744,135.0,-13.0,78


In [66]:
teamPredictors = []
for index, game in SCHEDULES.iterrows():
    if index % 1000 == 0:
        print(int((index/len(SCHEDULES))*100),'%')
    if game['gameType'] == 'R':
        homeTeam = TEAM_STATS[(TEAM_STATS['gameID'] < game['gameID']) & (TEAM_STATS['teamID'] == game['homeTeamID']) &(TEAM_STATS['gameType'] == 'R')]
        awayTeam = TEAM_STATS[(TEAM_STATS['gameID'] < game['gameID']) & (TEAM_STATS['teamID'] == game['awayTeamID']) &(TEAM_STATS['gameType'] == 'R')]
    else:
        homeTeam = TEAM_STATS[(TEAM_STATS['gameID'] < game['gameID']) & (TEAM_STATS['teamID'] == game['homeTeamID'])]
        awayTeam = TEAM_STATS[(TEAM_STATS['gameID'] < game['gameID']) & (TEAM_STATS['teamID'] == game['awayTeamID'])]


    if len(homeTeam) == 0:
        homeTeam = seasonAverages[seasonAverages['seasonID_next'] == game['seasonID']].values[0][1:]
    else:
        homeTeam = homeTeam.iloc[-1:].values[0][5:12]
    if len(awayTeam) == 0:
        awayTeam = seasonAverages[seasonAverages['seasonID_next'] == game['seasonID']].values[0][1:]
    else:
        awayTeam = awayTeam.iloc[-1:].values[0][5:12]
    homeTeam = np.append(homeTeam, game['homeTeamGameNumber'])
    awayTeam = np.append(awayTeam, game['awayTeamGameNumber'])
    
    teamPredictors.append(list(homeTeam) + list(awayTeam))  
 

0 %
11 %
22 %
33 %
45 %
56 %
67 %
79 %
90 %


In [67]:
modelInput = pd.DataFrame(teamPredictors, columns=['home_shotsForPerGame', 
                      'home_goalsForPerGame',
                      'home_shotsAgainstPerGame',
                      'home_goalsAgainstPerGame', 
                      'home_winningPercentage', 
                      'home_shotDifferential',
                      'home_goalDifferential',
                      'home_gameNumber',
                      'away_shotsForPerGame', 
                      'away_goalsForPerGame',
                      'away_shotsAgainstPerGame',
                      'away_goalsAgainstPerGame', 
                      'away_winningPercentage', 
                      'away_shotDifferential',
                      'away_goalDifferential',
                      'away_gameNumber'])

In [68]:
modelInput['gameID'] = SCHEDULES['gameID']
modelInput['homeTeamID'] = SCHEDULES['homeTeamID']
modelInput['awayTeamID'] = SCHEDULES['awayTeamID']

In [69]:
teamGIM = BOXSCORES[['teamID','gameID','GIM_VALUE']].groupby(['teamID','gameID']).sum().reset_index()

In [70]:
modelInput = pd.merge(modelInput,teamGIM,how='inner',left_on=['gameID','homeTeamID'],right_on=['gameID','teamID'],suffixes=('', '_home'))

In [71]:
modelInput = modelInput.drop(['teamID'],axis=1).rename({'GIM_VALUE':'home_GIM_VALUE'},axis=1)

In [72]:
modelInput = pd.merge(modelInput,teamGIM,how='inner',left_on=['gameID','awayTeamID'],right_on=['gameID','teamID'],suffixes=('', '_away'))

In [73]:
modelInput = modelInput.drop(['teamID'],axis=1).rename({'GIM_VALUE':'away_GIM_VALUE'},axis=1)

In [74]:
modelInput = modelInput[['gameID', 'homeTeamID', 'awayTeamID','home_shotsForPerGame', 'home_goalsForPerGame',
       'home_shotsAgainstPerGame', 'home_goalsAgainstPerGame',
       'home_winningPercentage', 'home_shotDifferential',
       'home_goalDifferential', 'home_gameNumber', 'home_GIM_VALUE','away_shotsForPerGame',
       'away_goalsForPerGame', 'away_shotsAgainstPerGame',
       'away_goalsAgainstPerGame', 'away_winningPercentage',
       'away_shotDifferential', 'away_goalDifferential', 'away_gameNumber',
       'away_GIM_VALUE']]

In [75]:
is_NaN = modelInput.isnull()

row_has_NaN = is_NaN.any(axis=1)

rows_with_NaN = modelInput[row_has_NaN]
rows_with_NaN

Unnamed: 0,gameID,homeTeamID,awayTeamID,home_shotsForPerGame,home_goalsForPerGame,home_shotsAgainstPerGame,home_goalsAgainstPerGame,home_winningPercentage,home_shotDifferential,home_goalDifferential,...,home_GIM_VALUE,away_shotsForPerGame,away_goalsForPerGame,away_shotsAgainstPerGame,away_goalsAgainstPerGame,away_winningPercentage,away_shotDifferential,away_goalDifferential,away_gameNumber,away_GIM_VALUE


In [76]:
gameOutcome['winner'] = np.where(gameOutcome['homeTeamGoals'] > gameOutcome['awayTeamGoals'],1,0)

In [77]:
modelInput = pd.merge(modelInput,gameOutcome[['gameID','winner']])

In [78]:
conn.close()

In [79]:
modelInput.to_csv('Data/gameOutComeModelInput_P.csv')

In [None]:
modelInput = pd.read_csv('Data/gameOutComeModelInput_P.csv',index_col=0)

In [None]:
modelInput = modelInput[(modelInput['away_gameNumber'] >= 20) & (modelInput['home_gameNumber'] >= 20) ]

# Modelling

In [81]:
x = modelInput.drop(['gameID','homeTeamID','awayTeamID','winner'],axis=1)
y = modelInput['winner']

In [82]:
# Normalizing the data
scaler = preprocessing.StandardScaler().fit(x)
xScaled = scaler.transform(x)

In [None]:
from pickle import dump
dump(scaler, open('game_out_prediction_scaler.pkl', 'wb'))

In [83]:
xTrain, xTest, yTrain, yTest = train_test_split(xScaled, y, test_size=0.2,random_state=109) # 70% training and 20% test

## SVM

In [None]:
clf = svm.SVC(kernel='linear',gamma=1, C=0.1)
clf.fit(xTrain, yTrain.values)
yPred = clf.predict(xTrain)
print("Train Accuracy:",metrics.accuracy_score(yTrain, yPred))
yPred = clf.predict(xTest)
print("Test Accuracy:",metrics.accuracy_score(yTest, yPred))

In [84]:
clf = svm.SVC(kernel='sigmoid',gamma=0.001, C=10)
clf.fit(xTrain, yTrain.values)
yPred = clf.predict(xTrain)
print("Train Accuracy:",metrics.accuracy_score(yTrain, yPred))
yPred = clf.predict(xTest)
print("Test Accuracy:",metrics.accuracy_score(yTest, yPred))

Train Accuracy: 0.5766237441630112
Test Accuracy: 0.5631013016411998


In [85]:
modelInputPreds = clf.predict(xScaled)

In [94]:
modelInput[modelInput['gameID'] ==2014020465]

Unnamed: 0,gameID,homeTeamID,awayTeamID,home_shotsForPerGame,home_goalsForPerGame,home_shotsAgainstPerGame,home_goalsAgainstPerGame,home_winningPercentage,home_shotDifferential,home_goalDifferential,...,away_goalsForPerGame,away_shotsAgainstPerGame,away_goalsAgainstPerGame,away_winningPercentage,away_shotDifferential,away_goalDifferential,away_gameNumber,away_GIM_VALUE,winner,predictedWinner
1202,2014020465,30,6,29.4828,2.9655,23.5172,2.6552,0.5517,173.0,9.0,...,2.5806,27.2581,2.6452,0.4839,23.0,-2.0,14.0,40.029576,0,1


In [86]:
modelInput['predictedWinner'] = modelInputPreds

In [80]:
modelInput[modelInput['gameID'] == 2020030415]

Unnamed: 0,gameID,homeTeamID,awayTeamID,home_shotsForPerGame,home_goalsForPerGame,home_shotsAgainstPerGame,home_goalsAgainstPerGame,home_winningPercentage,home_shotDifferential,home_goalDifferential,...,away_shotsForPerGame,away_goalsForPerGame,away_shotsAgainstPerGame,away_goalsAgainstPerGame,away_winningPercentage,away_shotDifferential,away_goalDifferential,away_gameNumber,away_GIM_VALUE,winner
8297,2020030415,14,8,26.8462,3.3205,26.6538,2.5128,0.6538,15.0,63.0,...,28.039,2.7532,26.1948,2.9091,0.4805,142.0,-12.0,40.0,33.388916,1


In [87]:
modelInput[['gameID','homeTeamID','awayTeamID','predictedWinner']].to_csv('game_outcome_predictions.csv')

In [None]:
dump(clf, open('game_outcome_predictino_svm.sav', 'wb'))

In [None]:
clf = svm.SVC(kernel='rbf',gamma=0.01, C=1)
clf.fit(xTrain, yTrain.values)
yPred = clf.predict(xTrain)
print("Train Accuracy:",metrics.accuracy_score(yTrain, yPred))
yPred = clf.predict(xTest)
print("Test Accuracy:",metrics.accuracy_score(yTest, yPred))

In [None]:
# clf = svm.SVC(kernel='poly',gamma=0.01, C=1)
# clf.fit(xTrain, yTrain.values)
# yPred = clf.predict(xTrain)
# print("Train Accuracy:",metrics.accuracy_score(yTrain, yPred))
# yPred = clf.predict(xTest)
# print("Test Accuracy:",metrics.accuracy_score(yTest, yPred))

## Random Forest

In [None]:
# make predictions using random forest for classification
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

In [None]:
# define the model
model = RandomForestClassifier(n_estimators = 75, 
                               criterion="entropy",
                               max_depth=75,
                               min_samples_split=0.1,
                               min_samples_leaf=0.05)
# fit the model on the whole dataset
model.fit(xTrain, yTrain)

yPred = model.predict(xTrain)
print("Train Accuracy:",metrics.accuracy_score(yTrain, yPred))
yPred = model.predict(xTest)
print("Test Accuracy:",metrics.accuracy_score(yTest, yPred))

In [None]:
# define the model
model = RandomForestClassifier(n_estimators = 50, 
                               criterion="gini",
                               max_depth=75,
                               min_samples_split=0.01,
                              min_samples_leaf=0.05)
# fit the model on the whole dataset
model.fit(xTrain, yTrain)

yPred = model.predict(xTrain)
print("Train Accuracy:",metrics.accuracy_score(yTrain, yPred))
yPred = model.predict(xTest)
print("Test Accuracy:",metrics.accuracy_score(yTest, yPred))

## XGBoost

In [None]:
from xgboost import XGBClassifier
import xgboost as xgb
import matplotlib.pyplot as plt

In [None]:
param_grid = {'scale_pos_weight':[1],
                          'learning_rate':[0.001,],  
                          'colsample_bytree' : [0.1,.2,.3,0.4,0.5,0.6,0.7,0.8,0.9,1],
                          'subsample' : [1,],
                          'objective':['binary:logistic'], 
                          'n_estimators':[1000,1500], 
                          'reg_alpha' : [0.3],
                          'max_depth':[4], 
                          'gamma':[0,1,5,10],'eval_metric':["error"]}

In [None]:
grid = GridSearchCV(XGBClassifier(), param_grid, refit=True, verbose=1,n_jobs=-1,cv=2)
grid.fit(xTrain, yTrain)

In [None]:
print(grid.best_estimator_)

In [None]:
model = XGBClassifier(silent=True, 
                      scale_pos_weight=1,
                      learning_rate=0.001,  
                      colsample_bytree = 0.4,
                      subsample = 1,
                      objective='binary:logistic', 
                      n_estimators=1000, 
                      reg_alpha = 0.3,
                      max_depth=4, 
                      gamma=10)

eval_metric = ["error"]

# fit the model
model.fit(xTrain, yTrain)

In [None]:
yPred = model.predict(xTrain)
print("Train Accuracy:",metrics.accuracy_score(yTrain, yPred))
yPred = model.predict(xTest)
print("Test Accuracy:",metrics.accuracy_score(yTest, yPred))

In [None]:
# retrieve performance metrics
results = model.evals_result()

In [None]:
# plot learning curves
plt.plot(results['validation_0']['error'], label='train')
plt.plot(results['validation_1']['error'], label='test')
# show the legend
plt.legend()
# show the plot
plt.show()

In [None]:
plt.figure(figsize=(20,15))
xgb.plot_importance(model, ax=plt.gca())

In [None]:
modelInput.columns