In [3]:
import pandas as pd
from SQLCode import DatabaseConnection
from SQLCode import DatabaseCredentials as DBC
import numpy as np

In [4]:
# Opening connection
creds = DBC.DataBaseCredentials()
conn = DatabaseConnection.sql_connection(creds.server, creds.database, creds.user, creds.password)
connection = conn.open()
cursor = connection.cursor()

In [5]:
# Getting the live_feed data
liveFeed = pd.read_sql_query("select * from live_feed where gameID >= 20102011", connection)

In [6]:
# Getting the seasons data
seasons = pd.read_sql_query("select * from schedules", connection)

In [7]:
boxscores = pd.read_sql_query("select gameID, teamID, playerID from box_scores where gameID >= 2010010001", connection)

In [8]:
# Filtering to regular seasons games and 20102011 onwards (when live data started)
seasonsFiltered = seasons[seasons['seasonID'] >= 20102011]
seasonsFiltered = seasonsFiltered[seasonsFiltered['gameType'] == 'R']

In [9]:
# Getting and filtering the raw data
rawData = pd.merge(liveFeed,seasons, how='right',on='gameID')
rawData = rawData[rawData['seasonID'] >= 20102011]
# rawData = rawData[rawData['gameType'] == 'R']

In [10]:
# Merging box score data
rawData = pd.merge(rawData, 
                   boxscores, 
                   how='left', 
                   left_on=['gameID', 'playerID'], 
                   right_on=['gameID', 'playerID'],
                   suffixes=('', '_box'))

In [43]:
# Filtering to test game
rawData = rawData[rawData['gameID'] == 2020030312]

In [44]:
# Getting only the desired events
rawDataFiltered = rawData[rawData['eventTypeID'].isin(['FACEOFF',
                             'SHOT',
                             'MISSED_SHOT',
                             'BLOCKED_SHOT',
                             'TAKEAWAY',
                             'GIVEAWAY',
                             'HIT',
                             'GOAL',
                             'PERIOD_START',
                             'PERIOD_END',
                             'EARLY_INT_START',
                             'PENALTY',
                             'STOP',
                             'SHOOTOUT_COMPLETE',
                             'GAME_END',
                             'EARLY_INT_END',
                             'EARLY_INT_END'])]
# rawDataFiltered = rawDataFiltered[rawDataFiltered['playerType'].isin(['NULL', 
#                                    'Winner', 
#                                    'Loser', 
#                                    'Hitter', 
#                                    'PlayerID','Shooter','Blocker','Unknown','Scorer','PenaltyOn','DrewBy'])]
# Sorting
rawDataFiltered = rawDataFiltered.sort_values(by=['gameID', 'periodNum','periodTime'])

In [45]:
def team_type(teamID, homeTeamID, awayTeamID):
    if teamID == homeTeamID:
        return 'HOME'
    elif teamID == awayTeamID:
        return 'AWAY'
    else:
        return np.nan

def zone(xCoord, teamType, periodNum):
    if teamType == np.nan:
        return 'NEUTRAL'
    if (xCoord >= -25) & (xCoord < 25):
        return 'NEUTRAL'
    else:
        if int(periodNum)%2 == 1:
            if (xCoord > 25) & (teamType == 'AWAY'):
                return 'OFFENSIVE'
            elif (xCoord < -25) & (teamType == 'AWAY'):
                return 'DEFENSIVE'
            elif (xCoord > 25) & (teamType == 'HOME'):
                return 'DEFENSIVE'
            elif (xCoord < -25) & (teamType == 'HOME'):
                return 'OFFENSIVE'
        else:
            if (xCoord > 25) & (teamType == 'AWAY'):
                return 'DEFENSIVE'
            elif (xCoord < -25) & (teamType == 'AWAY'):
                return 'OFFENSIVE'
            elif (xCoord > 25) & (teamType == 'HOME'):
                return 'OFFENSIVE'
            elif (xCoord < -25) & (teamType == 'HOME'):
                return 'DEFENSIVE'

In [46]:
rawDataFiltered['zone'] = rawDataFiltered.apply(lambda row: zone(row['xCoordinate'],
                                       team_type(row['teamID'], 
                                                 row['homeTeamID'], 
                                                 row['awayTeamID']), row['periodNum']) ,axis=1)

In [47]:
rawDataFiltered

Unnamed: 0,eventID,eventSubID,gameID,event,eventCode,eventTypeID,eventDescription,secondaryType,periodNum,periodTime,...,strength,gameWinningGoal,emptyNetGoal,seasonID,gameType,gameDate,homeTeamID,awayTeamID,teamID_box,zone
7687157,8.0,0.0,2020030312,Period Start,VGK8,PERIOD_START,Period Start,,1,0 days 00:00:00,...,,,,20202021,P,2021-06-16,54,8,,
7687223,51.0,0.0,2020030312,Faceoff,VGK51,FACEOFF,Phillip Danault faceoff won against William Ka...,,1,0 days 00:00:00,...,,,,20202021,P,2021-06-16,54,8,8.0,NEUTRAL
7687224,51.0,1.0,2020030312,Faceoff,VGK51,FACEOFF,Phillip Danault faceoff won against William Ka...,,1,0 days 00:00:00,...,,,,20202021,P,2021-06-16,54,8,54.0,NEUTRAL
7687158,9.0,0.0,2020030312,Stoppage,VGK9,STOP,Icing,,1,0 days 00:11:00,...,,,,20202021,P,2021-06-16,54,8,,
7687225,52.0,0.0,2020030312,Faceoff,VGK52,FACEOFF,Phillip Danault faceoff won against William Ka...,,1,0 days 00:11:00,...,,,,20202021,P,2021-06-16,54,8,8.0,OFFENSIVE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7687786,776.0,1.0,2020030312,Missed Shot,VGK776,MISSED_SHOT,Alec Martinez Wide of Net Carey Price,,3,0 days 19:49:00,...,,,,20202021,P,2021-06-16,54,8,8.0,OFFENSIVE
7687787,777.0,0.0,2020030312,Blocked Shot,VGK777,BLOCKED_SHOT,Alex Pietrangelo shot blocked shot by Shea Weber,,3,0 days 19:56:00,...,,,,20202021,P,2021-06-16,54,8,8.0,DEFENSIVE
7687788,777.0,1.0,2020030312,Blocked Shot,VGK777,BLOCKED_SHOT,Alex Pietrangelo shot blocked shot by Shea Weber,,3,0 days 19:56:00,...,,,,20202021,P,2021-06-16,54,8,54.0,DEFENSIVE
7687817,821.0,0.0,2020030312,Period End,VGK821,PERIOD_END,End of 3rd Period,,3,0 days 20:00:00,...,,,,20202021,P,2021-06-16,54,8,,


In [48]:
rawDataFiltered[np.where(pd.isna(rawDataFiltered['teamID']), 
         True, 
         np.where(rawDataFiltered['homeTeamID'] == rawDataFiltered['teamID_box'], True,False))]

Unnamed: 0,eventID,eventSubID,gameID,event,eventCode,eventTypeID,eventDescription,secondaryType,periodNum,periodTime,...,strength,gameWinningGoal,emptyNetGoal,seasonID,gameType,gameDate,homeTeamID,awayTeamID,teamID_box,zone
7687157,8.0,0.0,2020030312,Period Start,VGK8,PERIOD_START,Period Start,,1,0 days 00:00:00,...,,,,20202021,P,2021-06-16,54,8,,
7687224,51.0,1.0,2020030312,Faceoff,VGK51,FACEOFF,Phillip Danault faceoff won against William Ka...,,1,0 days 00:00:00,...,,,,20202021,P,2021-06-16,54,8,54.0,NEUTRAL
7687158,9.0,0.0,2020030312,Stoppage,VGK9,STOP,Icing,,1,0 days 00:11:00,...,,,,20202021,P,2021-06-16,54,8,,
7687226,52.0,1.0,2020030312,Faceoff,VGK52,FACEOFF,Phillip Danault faceoff won against William Ka...,,1,0 days 00:11:00,...,,,,20202021,P,2021-06-16,54,8,54.0,OFFENSIVE
7687227,53.0,0.0,2020030312,Blocked Shot,VGK53,BLOCKED_SHOT,Brendan Gallagher shot blocked shot by Alec Ma...,,1,0 days 00:16:00,...,,,,20202021,P,2021-06-16,54,8,54.0,DEFENSIVE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7687816,820.0,0.0,2020030312,Stoppage,VGK820,STOP,Puck in Crowd,,3,0 days 19:39:00,...,,,,20202021,P,2021-06-16,54,8,,
7687785,776.0,0.0,2020030312,Missed Shot,VGK776,MISSED_SHOT,Alec Martinez Wide of Net Carey Price,,3,0 days 19:49:00,...,,,,20202021,P,2021-06-16,54,8,54.0,OFFENSIVE
7687788,777.0,1.0,2020030312,Blocked Shot,VGK777,BLOCKED_SHOT,Alex Pietrangelo shot blocked shot by Shea Weber,,3,0 days 19:56:00,...,,,,20202021,P,2021-06-16,54,8,54.0,DEFENSIVE
7687817,821.0,0.0,2020030312,Period End,VGK821,PERIOD_END,End of 3rd Period,,3,0 days 20:00:00,...,,,,20202021,P,2021-06-16,54,8,,


In [49]:
rawDataFilteredHome = rawDataFiltered[np.where(pd.isna(rawDataFiltered['teamID']), 
         True, 
         np.where(rawDataFiltered['teamID'] == rawDataFiltered['teamID_box'], True,False))]

rawDataFilteredAway = rawDataFiltered[np.where(pd.isna(rawDataFiltered['teamID']), 
         True, 
         np.where(rawDataFiltered['teamID'] == rawDataFiltered['teamID_box'], True,False))]

In [40]:
sequences = []
actionEvents = ['FACEOFF',
                'SHOT',
                'MISSED_SHOT',
                'BLOCKED_SHOT',
                'TAKEAWAY',
                'GIVEAWAY',
                'HIT',
                'GOAL']
startEndEvents = ['PERIOD_START',
                  'PERIOD_END',
                  'EARLY_INT_START',
                  'PENALTY',
                  'STOP',
                  'SHOOTOUT_COMPLETE',
                  'GAME_END',
                  'EARLY_INT_END',
                  'EARLY_INT_END']
sequenceNum = 0
eventNum = 0
for index, row in rawDataFilteredHome.iterrows():
    sequences.append([row['gameID'], 
                      row['periodNum'], 
                      sequenceNum, 
                      eventNum, 
                      row['eventTypeID'], 
                      team_type(row['teamID'], row['homeTeamID'], row['awayTeamID']), 
                      row['zone']])
    if row['eventTypeID'] in actionEvents:
        eventNum += 1
    else:
        sequenceNum += 1
        eventNum = 0
sequenceDataHome = pd.DataFrame(sequences, columns=['gameID', 'periodNum', 'sequenceNum',' eventNum','event', 'team', 'zone'])

In [41]:
sequences = []
actionEvents = ['FACEOFF',
                'SHOT',
                'MISSED_SHOT',
                'BLOCKED_SHOT',
                'TAKEAWAY',
                'GIVEAWAY',
                'HIT',
                'GOAL']
startEndEvents = ['PERIOD_START',
                  'PERIOD_END',
                  'EARLY_INT_START',
                  'PENALTY',
                  'STOP',
                  'SHOOTOUT_COMPLETE',
                  'GAME_END',
                  'EARLY_INT_END',
                  'EARLY_INT_END']
sequenceNum = 0
eventNum = 0
for index, row in rawDataFilteredAway.iterrows():
    sequences.append([row['gameID'], 
                      row['periodNum'], 
                      sequenceNum, 
                      eventNum, 
                      row['eventTypeID'], 
                      team_type(row['teamID'], row['homeTeamID'], row['awayTeamID']), 
                      row['zone']])
    if row['eventTypeID'] in actionEvents:
        eventNum += 1
    else:
        sequenceNum += 1
        eventNum = 0
sequenceDataAway = pd.DataFrame(sequences, columns=['gameID', 'periodNum', 'sequenceNum',' eventNum','event', 'team', 'zone'])

In [39]:
sequenceDataAway

Unnamed: 0,gameID,periodNum,sequenceNum,eventNum,event,event.1,zone
0,2020030312,1,0,0,PERIOD_START,,
1,2020030312,1,1,0,FACEOFF,AWAY,NEUTRAL
2,2020030312,1,1,1,STOP,,
3,2020030312,1,2,0,FACEOFF,AWAY,OFFENSIVE
4,2020030312,1,2,1,BLOCKED_SHOT,,
...,...,...,...,...,...,...,...
374,2020030312,3,57,3,STOP,,
375,2020030312,3,58,0,MISSED_SHOT,,
376,2020030312,3,58,1,BLOCKED_SHOT,AWAY,DEFENSIVE
377,2020030312,3,58,2,PERIOD_END,,
