In [30]:
import pandas as pd
from SQLCode import DatabaseConnection
from SQLCode import DatabaseCredentials as DBC
import numpy as np


from sklearn.feature_extraction.text import CountVectorizer

from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch

In [31]:
# Open the database connection that every query cell below reads from.
# Credentials are supplied by the project's DBC helper rather than being written here.
creds = DBC.DataBaseCredentials()
conn = DatabaseConnection.sql_connection(
    creds.server,
    creds.database,
    creds.user,
    creds.password,
)
connection = conn.open()
# NOTE(review): neither `cursor` nor `connection` is closed in the visible cells.
cursor = connection.cursor()

In [32]:
# Load every live-feed row recorded for the single game being modelled.
liveFeed = pd.read_sql_query(
    sql="select * from live_feed where gameID = 2020030312",
    con=connection,
)

In [33]:
# Load the full schedules table; it is joined onto the live feed by gameID further down.
seasons = pd.read_sql_query(
    sql="select * from schedules",
    con=connection,
)

In [34]:
# Load the (gameID, teamID, playerID) box-score rows for the same game as the live feed.
boxscores = pd.read_sql_query(
    sql="select gameID, teamID, playerID from box_scores where gameID = 2020030312",
    con=connection,
)

# Creating Model Input

In [35]:
# Keep only regular-season games ('R') from season 20102011 onwards (when live data started).
# NOTE(review): seasonsFiltered is not referenced again in the visible cells; the merge
# below uses the unfiltered `seasons` frame.
seasonsFiltered = seasons.loc[
    (seasons['seasonID'] >= 20102011) & (seasons['gameType'] == 'R')
]

In [36]:
# Attach the schedule row to every live-feed event, then keep only the rows and
# columns needed to build the model input.
rawDataColumns = ['eventID',
                  'gameID',
                  'eventTypeID',
                  'eventDescription',
                  'periodNum',
                  'periodTime',
                  'xCoordinate',
                  'yCoordinate',
                  'teamID', 'homeTeamID', 'awayTeamID', 'playerID', 'penaltyMinutes']

rawData = liveFeed.merge(seasons, how='left', on='gameID')
# Season 20102011 onwards, and only rows whose eventSubID is 0.
# (The regular-season filter `gameType == 'R'` is intentionally left disabled here.)
rawData = rawData.loc[
    (rawData['seasonID'] >= 20102011) & (rawData['eventSubID'] == 0),
    rawDataColumns,
]

In [37]:
# Left-join the box-score rows onto the events by (gameID, playerID).
# Box-score columns that clash with existing ones (teamID) get the '_box' suffix.
# NOTE: this cell overwrites rawData, so re-running it without re-running the cell
# above joins the box scores a second time.
playerKeys = ['gameID', 'playerID']
rawData = rawData.merge(
    boxscores,
    how='left',
    left_on=playerKeys,
    right_on=playerKeys,
    suffixes=('', '_box'),
)

In [38]:
# Event types kept for modelling.
# NOTE(review): 'EARLY_INT_END' appears twice; the repeat has no effect on isin()
# and may stand in for a different event type that was intended.
desiredEvents = ['FACEOFF',
                 'SHOT',
                 'MISSED_SHOT',
                 'BLOCKED_SHOT',
                 'TAKEAWAY',
                 'GIVEAWAY',
                 'HIT',
                 'GOAL',
                 'PERIOD_END',
                 'EARLY_INT_START',
                 'PENALTY',
                 'STOP',
                 'EARLY_INT_END',
                 'EARLY_INT_END']

# Disabled filter on the player's role in the event, kept for reference:
# rawDataFiltered = rawDataFiltered[rawDataFiltered['playerType'].isin(['NULL',
#                                    'Winner',
#                                    'Loser',
#                                    'Hitter',
#                                    'PlayerID','Shooter','Blocker','Unknown','Scorer','PenaltyOn','DrewBy'])]

# Keep the desired events and order them by game, then by event id.
rawDataFiltered = (
    rawData
    .loc[rawData['eventTypeID'].isin(desiredEvents)]
    .sort_values(by=['gameID', 'eventID'])
)

In [39]:
# periodNum seems to be read in as a string, so convert it to a number first,
# then keep only the first three periods for consistency.
rawDataFiltered = (
    rawDataFiltered
    .assign(periodNum=pd.to_numeric(rawDataFiltered['periodNum']))
    .loc[lambda events: events['periodNum'] <= 3]
)

In [40]:
def team_type(teamID, homeTeamID, awayTeamID):
    """Classify an event's team relative to the game it belongs to.

    Returns 'HOME' when teamID equals homeTeamID, 'AWAY' when it equals
    awayTeamID (the home side is checked first), and np.nan when it matches
    neither.
    """
    for label, sideID in (('HOME', homeTeamID), ('AWAY', awayTeamID)):
        if teamID == sideID:
            return label
    return np.nan

def coordinate_normalization(xCoord, teamType, periodNum):
    """Mirror an event's x-coordinate depending on which team it belongs to.

    Parameters
    ----------
    xCoord : number
        Raw x-coordinate of the event.
    teamType : 'HOME', 'AWAY' or missing (np.nan / None)
        Output of team_type for the event.
    periodNum : number or numeric string
        Period of the event. Currently has no influence on the result (see note).

    Returns
    -------
    None when teamType is missing; xCoord unchanged for 'AWAY'; -1 * xCoord
    for any other team type.
    """
    # `teamType == np.nan` is always False because NaN never compares equal to
    # itself, so the missing-team case must be detected with pd.isna.
    if pd.isna(teamType):
        return None

    # NOTE(review): the previous implementation branched on period parity but both
    # branches were identical, so the period never changed the outcome. That
    # behaviour is kept here; confirm whether even periods were meant to flip the
    # other way (teams switch ends between periods).
    if teamType == 'AWAY':
        return xCoord
    return -1 * xCoord
            
def elapsed_seconds(periodNum, periodTime):
    """Game clock position of an event, counted from the start of the game.

    Adds 20 * 60 for every completed period to the period clock, where the
    period clock is pd.Timedelta(periodTime).total_seconds() divided by 60.

    NOTE(review): the division by 60 yields seconds only if an 'MM:SS' clock
    string is parsed by pandas as hours:minutes - confirm the format of
    periodTime coming from the database.
    """
    clock = pd.Timedelta(periodTime)
    completedPeriods = int(periodNum) - 1
    return completedPeriods * 20 * 60 + clock.total_seconds() / 60

class Queue:
    """Minimal FIFO queue backed by a Python list.

    New items are inserted at the front of the list and the oldest item is
    taken from the back, so the list is ordered newest -> oldest.

    CITATION: https://runestone.academy/runestone/books/published/pythonds/BasicDS/ImplementingaQueueinPython.html
    """

    def __init__(self):
        # Underlying storage; index 0 holds the most recently enqueued item.
        self.queue = list()

    def isEmpty(self):
        """Return True when the queue holds no items."""
        return len(self.queue) == 0

    def enqueue(self, item):
        """Add item as the newest element."""
        self.queue.insert(0, item)

    def dequeue(self):
        """Remove and return the oldest element (IndexError when empty)."""
        oldest = self.queue.pop()
        return oldest

    def size(self):
        """Return the number of queued items."""
        return len(self.queue)

    def get_queue(self):
        """Return the underlying list itself (not a copy), newest item first."""
        return self.queue

    def exchange(self, oldItem, newItem):
        """Replace the first element equal to oldItem with newItem."""
        position = self.queue.index(oldItem)
        self.queue[position] = newItem

    def remove(self, item):
        """Remove the first element equal to item (ValueError when absent)."""
        self.queue.remove(item)

In [41]:
def _row_team_type(row):
    """Row adapter: classify one event row as HOME / AWAY / NaN via team_type."""
    return team_type(row['teamID'], row['homeTeamID'], row['awayTeamID'])


# Label every event with the side (home or away) of the team it belongs to.
rawDataFiltered['teamType'] = rawDataFiltered.apply(_row_team_type, axis=1)

In [42]:
def _row_normalized_x(row):
    """Row adapter: normalise one event row's x-coordinate via coordinate_normalization."""
    return coordinate_normalization(row['xCoordinate'], row['teamType'], row['periodNum'])


# Overwrite xCoordinate with its normalised value.
# NOTE: the column is replaced in place, so running this cell twice applies the
# normalisation twice.
rawDataFiltered['xCoordinate'] = rawDataFiltered.apply(_row_normalized_x, axis=1)

In [43]:
def _row_elapsed_seconds(row):
    """Row adapter: game-clock position of one event row via elapsed_seconds."""
    return elapsed_seconds(row['periodNum'], row['periodTime'])


# Add the game-clock position of every event.
rawDataFiltered['secondsElapsed'] = rawDataFiltered.apply(_row_elapsed_seconds, axis=1)

In [44]:
# Walk the events in order and turn them into model rows.
#
# Each emitted row carries the running game context (goal difference, manpower
# difference, period), the sequence it belongs to, its position in that sequence,
# the event type, the acting side and the event's time / coordinates.
#
# Sign convention used by the context (as implemented below):
#   * a penalty taken by HOME adds +1 to manpowerDiff, any other team adds -1;
#   * a goal by HOME subtracts 1 from goalDiff, any other team adds 1.
#
# Sequences: an event listed in actionEvents extends the current sequence; any
# other event starts a new one.
sequences = []
actionEvents = ['FACEOFF',
                'SHOT',
                'MISSED_SHOT',
                'BLOCKED_SHOT',
                'TAKEAWAY',
                'GIVEAWAY',
                'HIT',
                'GOAL']
# NOTE(review): 'EARLY_INT_END' is listed twice; harmless for membership tests.
startEndEvents = ['PERIOD_START',
                  'PERIOD_END',
                  'EARLY_INT_START',
                  'PENALTY',
                  'STOP',
                  'SHOOTOUT_COMPLETE',
                  'GAME_END',
                  'EARLY_INT_END',
                  'EARLY_INT_END']

sequenceNum = 0
eventNum = 0
context = {'goalDiff':0, 'manpowerDiff':0, 'periodNum':1}
penaltyQueue = Queue()
for index, row in rawDataFiltered.iterrows():
    # Coarse progress report (keyed on the frame's index label)
    if index % 100000 == 0:
        print((index/len(rawDataFiltered))*100, '%')
    # Computing if respective team is home or away
    teamType = team_type(row['teamID'], row['homeTeamID'], row['awayTeamID'])

    # Catching penalties to update the context
    if row['eventTypeID'] == 'PENALTY':
        penaltyMinutes = int(row['penaltyMinutes'])
        # 10-minute penalties are skipped: they do not change the manpower context here
        if penaltyMinutes != 10:
            # Getting the end time of the penalty (in seconds)
            penaltyStart = elapsed_seconds(row['periodNum'], row['periodTime'])
            penaltyEnd = penaltyStart + penaltyMinutes * 60

            # Determining who took the penalty to update the context
            if teamType == 'HOME':
                context['manpowerDiff'] += 1
            else:
                context['manpowerDiff'] += -1

            # Enqueuing the penalty; penaltyLength is kept in MINUTES,
            # penaltyStart / penaltyEnd are on the elapsed_seconds clock
            penaltyQueue.enqueue({'team': teamType,
                                  'penaltyStart': penaltyStart,
                                  'penaltyEnd': penaltyEnd,
                                  'penaltyLength': penaltyMinutes})
    else:
        # If there currently is a penalty in the penalty queue
        if penaltyQueue.size() > 0:
            # Determining the time of the action
            actionTime = elapsed_seconds(row['periodNum'], row['periodTime'])

            # Iterating oldest -> newest over a snapshot, because expired
            # penalties are removed from the live queue inside the loop
            for penalty in list(reversed(penaltyQueue.get_queue())):
                # If the action occured after the penalty ended, we update the context and pop the penalty
                if penalty['penaltyEnd'] < actionTime:
                    # Popping the penalty
                    penaltyQueue.remove(penalty)

                    # Updating the context (undo the change made when it was enqueued)
                    if penalty['team'] == 'HOME':
                        context['manpowerDiff'] -= 1
                    else:
                        context['manpowerDiff'] -= -1

        # If the event type is a goal
        if row['eventTypeID'] == 'GOAL':

            # Injecting the shot before the goal, using the pre-goal context
            sequences.append([row['gameID'],
                              context['goalDiff'],
                              context['manpowerDiff'],
                              context['periodNum'],
                              sequenceNum,
                              eventNum,
                              'SHOT',  # action type
                              teamType,  # Home/Away
                              row['secondsElapsed'],
                              row['xCoordinate'],
                              row['yCoordinate']])
            # Incrementing the event number
            eventNum += 1

            # Updating the context
            if teamType == 'HOME':
                context['goalDiff'] -= 1
            else:
                context['goalDiff'] -= -1

            # Defining home/away flags to only pop off the minimum number of penalties
            FLAGS = {'HOME':True, 'AWAY':True}

            # Determining the time of the action
            actionTime = elapsed_seconds(row['periodNum'], row['periodTime'])

            # If there currently is a penalty in the penalty queue
            if penaltyQueue.size() > 0:

                # Iterating oldest -> newest over a snapshot (the queue is edited inside the loop)
                for penalty in list(reversed(penaltyQueue.get_queue())):

                    # If the penalty is a 5 minute major, the player must serve the full 5 minutes (no change needed)
                    # If the penalty is over it would have been popped in the above if statement
                    if penalty['penaltyLength'] == 5:
                        continue

                    # Making sure its not a shorthanded goal and we haven't already popped a penalty for this goal/team.
                    # .get() keeps a penalty with a missing team from raising KeyError.
                    if penalty['team'] != teamType and FLAGS.get(penalty['team'], False):

                        # The goal wipes out one 2-minute minor. penaltyLength is stored
                        # in minutes, so subtract 2 (not 120) and convert to seconds only
                        # when computing the new end time.
                        remainingMinutes = penalty['penaltyLength'] - 2

                        if remainingMinutes <= 0:

                            # Popping the penalty off
                            penaltyQueue.remove(penalty)

                            # Updating the context
                            if penalty['team'] == 'HOME':
                                context['manpowerDiff'] -= 1
                            else:
                                context['manpowerDiff'] -= -1
                        else:

                            # Creating the updated penalty: the remaining minutes start at the goal
                            newPenalty = dict(penalty)
                            newPenalty['penaltyStart'] = actionTime
                            newPenalty['penaltyLength'] = remainingMinutes
                            newPenalty['penaltyEnd'] = actionTime + remainingMinutes * 60

                            # replacing the old penalty info with the new one
                            penaltyQueue.exchange(penalty, newPenalty)

                        # Only one penalty per team may be affected by a single goal
                        # (this was a no-op `==` comparison before)
                        FLAGS[penalty['team']] = False

    # Updating the context if needed
    if row['periodNum'] != context['periodNum']:
        context['periodNum'] = row['periodNum']

    # Adding in the next sequency: every non start/end event, plus penalties
    if (row['eventTypeID'] not in startEndEvents) or (row['eventTypeID'] == 'PENALTY'):
        sequences.append([row['gameID'],
                          context['goalDiff'],
                          context['manpowerDiff'],
                          context['periodNum'],
                          sequenceNum,
                          eventNum,
                          row['eventTypeID'],  # action type
                          teamType,  # Home/Away
                          row['secondsElapsed'],
                          row['xCoordinate'],
                          row['yCoordinate']])
    # Action events extend the current sequence; everything else starts a new one
    if row['eventTypeID'] in actionEvents:
        eventNum += 1
    else:
        sequenceNum += 1
        eventNum = 0

sequenceData = pd.DataFrame(sequences,
                            columns=['gameID',
                                     'goalDiff',
                                     'manpowerDiff',
                                     'periodNum',
                                     'sequenceNum',
                                     'eventNum',
                                     'event',
                                     'team',
                                     'secondsElapsed',
                                     'xCoord',
                                     'yCoord'])

In [45]:
# Turn the event-name column into indicator columns: learn the vocabulary of
# event names, then expand every row into a dense count array.
eventVectorizer = CountVectorizer().fit(sequenceData['event'])
vectorizedEvents = eventVectorizer.transform(sequenceData['event']).toarray()

In [46]:
# Vocabulary terms in alphabetical order, used as the names of the vectorized columns
actions = sorted(eventVectorizer.vocabulary_)

# Append the vectorized event columns and drop the raw 'event' column they replace
eventColumns = pd.DataFrame(data=vectorizedEvents, columns=actions)
sequenceData = pd.concat([sequenceData, eventColumns], axis=1).drop(['event'], axis=1)

In [47]:
# Events that belong to neither side get the placeholder team 'neither'
sequenceData = sequenceData.assign(team=sequenceData['team'].fillna(value='neither'))

In [48]:
# Same encoding as for the events, applied to the team column
teamVectorizer = CountVectorizer().fit(sequenceData['team'])
vectorizedTeams = teamVectorizer.transform(sequenceData['team']).toarray()

In [49]:
# Vocabulary terms in alphabetical order, used as the names of the vectorized columns
teams = sorted(teamVectorizer.vocabulary_)

# Append the vectorized team columns and drop the raw 'team' column they replace
teamColumns = pd.DataFrame(data=vectorizedTeams, columns=teams)
sequenceData = pd.concat([sequenceData, teamColumns], axis=1).drop(['team'], axis=1)

In [50]:
# Independent copy of the sequence data without the context columns; it is
# joined back onto sequenceData in the next cell.
contextColumns = ['goalDiff', 'manpowerDiff', 'periodNum']
sequenceDataTMinusOne = sequenceData.drop(contextColumns, axis=1).copy(deep=True)

# Decrementing the event number so that, when joined on eventNum, each event
# lines up with the event that FOLLOWS it in the same sequence.
# NOTE(review): despite the 'TMinusOne' name, the joined columns therefore hold
# the next event's values.
sequenceDataTMinusOne['eventNum'] = sequenceDataTMinusOne['eventNum'] - 1

In [51]:
# Left-join every event with its shifted counterpart from the same game and
# sequence; columns coming from the shifted copy get the '_TMinusOne' suffix.
# Events with no counterpart keep NaN in those columns.
mergeKeys = ['gameID', 'sequenceNum', 'eventNum']
sequenceDataComplete = sequenceData.merge(
    right=sequenceDataTMinusOne,
    how='left',
    left_on=mergeKeys,
    right_on=mergeKeys,
    suffixes=('', '_TMinusOne'),
)

In [52]:
# Fix the column order the model code relies on:
# identifiers, context, the event's own features, then the same features
# taken from the joined '_TMinusOne' copy.
identifierColumns = ['gameID', 'sequenceNum', 'eventNum']
stateColumns = ['goalDiff', 'manpowerDiff', 'periodNum']
featureColumns = ['secondsElapsed', 'xCoord', 'yCoord',
                  'blocked_shot', 'faceoff', 'giveaway', 'goal', 'hit',
                  'missed_shot', 'penalty', 'shot', 'takeaway',
                  'away', 'home']
shiftedFeatureColumns = [name + '_TMinusOne' for name in featureColumns]

sequenceDataComplete = sequenceDataComplete[
    identifierColumns + stateColumns + featureColumns + shiftedFeatureColumns
]

In [77]:
sequenceDataComplete.columns

Index(['gameID', 'sequenceNum', 'eventNum', 'goalDiff', 'manpowerDiff',
       'periodNum', 'secondsElapsed', 'xCoord', 'yCoord', 'blocked_shot',
       'faceoff', 'giveaway', 'goal', 'hit', 'missed_shot', 'penalty', 'shot',
       'takeaway', 'away', 'home', 'secondsElapsed_TMinusOne',
       'xCoord_TMinusOne', 'yCoord_TMinusOne', 'blocked_shot_TMinusOne',
       'faceoff_TMinusOne', 'giveaway_TMinusOne', 'goal_TMinusOne',
       'hit_TMinusOne', 'missed_shot_TMinusOne', 'penalty_TMinusOne',
       'shot_TMinusOne', 'takeaway_TMinusOne', 'away_TMinusOne',
       'home_TMinusOne'],
      dtype='object')

# Model Creation
## Model Definition

In [121]:
def get_device():
    """Return the torch device to compute on.

    'cuda:0' when CUDA is available, otherwise 'cpu'.
    """
    deviceName = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    return torch.device(deviceName)

def df_to_tensor(df):
    """Convert a DataFrame's values into a float tensor on the device chosen by get_device().

    The frame's `.values` array is wrapped with torch.from_numpy, cast with
    `.float()` and then moved to the target device.
    """
    target = get_device()
    asTensor = torch.from_numpy(df.values)
    return asTensor.float().to(target)

def custom_loss(modelInput, model, i):
    """Mean squared temporal-difference style error for step ``i`` of one sequence.

    Parameters
    ----------
    modelInput : torch.Tensor
        Events of a single sequence, indexed as (event, batch, feature). The
        feature axis follows the column order of sequenceDataComplete
        (34 columns: ids 0-2, context 3-5, current event 6-19, joined
        '_TMinusOne' event 20-33).
    model : callable
        Maps a (events, batch, 17) tensor to the three output values.
    i : int
        Index of the event being evaluated; events 0..i are fed to the model.

    Returns
    -------
    torch.Tensor
        Scalar: mean of the squared elements of R + model(next) - model(current),
        or of R + model(current) when the joined columns contain NaN (no
        following event in the sequence).
    """
    # Positions on the feature axis (see the column listing of sequenceDataComplete)
    GOAL_COL = 12
    AWAY_COL = 18
    HOME_COL = 19
    # Context columns plus the 14 joined '_TMinusOne' feature columns
    NEXT_EVENT_COLS = [3, 4, 5] + list(range(20, 34))

    # Context + the event's own features, for every event up to and including i
    t = modelInput[0:i+1, :, 3:20]
    tPlusOne = modelInput[0:i+1, :, NEXT_EVENT_COLS]

    # Create the reward on the same device as the data it is combined with
    device = modelInput.device

    # The goal indicator is column 12; column 2 is eventNum, which is what the
    # goal test used to read.
    isGoal = modelInput[i, :, GOAL_COL] == 1
    scoredByHome = isGoal & (modelInput[i, :, HOME_COL] == 1)
    scoredByAway = isGoal & (modelInput[i, :, AWAY_COL] == 1)

    if bool(scoredByHome.all()):
        # Goal scored by the home team
        R = torch.tensor([1.0, 0.0, 0.0], device=device)
    elif bool(scoredByAway.all()):
        # Goal scored by the away team
        R = torch.tensor([0.0, 1.0, 0.0], device=device)
    else:
        # No one scored
        R = torch.tensor([0.0, 0.0, 1.0], device=device)

    if bool(torch.isnan(tPlusOne).any()):
        # No complete following event: only the current estimate is used
        loss = R + model(t)
    else:
        loss = R + model(tPlusOne) - model(t)

    return torch.mean(torch.square(loss))

In [122]:
class DQN(nn.Module):
    """LSTM followed by four ReLU hidden layers and a 3-way softmax output.

    The constructor arguments default to the sizes the notebook was written
    with, so ``DQN()`` builds the same architecture as before.

    Parameters
    ----------
    inputSize : int
        Number of features per event fed to the LSTM.
    numLSTMNodes : int
        Hidden size of the LSTM.
    numLSTMLayers : int
        Number of stacked LSTM layers.
    hiddenSize : int
        Width of each of the four fully connected hidden layers.
    outputSize : int
        Number of output values (softmax-normalised).
    """

    def __init__(self, inputSize=17, numLSTMNodes=1000, numLSTMLayers=1,
                 hiddenSize=1000, outputSize=3):
        super().__init__()
        self.inputSize = inputSize
        self.numLSTMNodes = numLSTMNodes
        self.numLSTMLayers = numLSTMLayers

        self.lstmLayer = nn.LSTM(input_size=self.inputSize,
                                 hidden_size=self.numLSTMNodes,
                                 num_layers=self.numLSTMLayers,
                                 bias=True,
                                 dropout=0,
                                 batch_first=True)
        self.hidden1 = nn.Linear(in_features=self.numLSTMNodes, out_features=hiddenSize)
        self.hidden2 = nn.Linear(in_features=hiddenSize, out_features=hiddenSize)
        self.hidden3 = nn.Linear(in_features=hiddenSize, out_features=hiddenSize)
        self.hidden4 = nn.Linear(in_features=hiddenSize, out_features=hiddenSize)
        self.output = nn.Linear(in_features=hiddenSize, out_features=outputSize)

    def forward(self, modelInput):
        """Run the events of one sequence through the network.

        Parameters
        ----------
        modelInput : torch.Tensor
            Iterated along its first axis; every slice is flattened to a
            (1, 1, inputSize) LSTM step.

        Returns
        -------
        torch.Tensor
            Shape (1, 1, outputSize); softmax over the last axis, computed
            from the LSTM output of the final step only.

        Raises
        ------
        ValueError
            If modelInput contains no events.
        """
        if modelInput.shape[0] == 0:
            raise ValueError('modelInput must contain at least one event')

        # Initial hidden / cell state drawn from a standard normal, created on
        # the input's device so the model also runs without CUDA.
        # NOTE(review): a random initial state makes the output differ between
        # calls on identical input; zeros are the usual choice - confirm intent.
        stateShape = (self.numLSTMLayers, 1, self.numLSTMNodes)
        hidden = (
            torch.randn(stateShape, device=modelInput.device, dtype=modelInput.dtype),
            torch.randn(stateShape, device=modelInput.device, dtype=modelInput.dtype),
        )

        # Feed the events one at a time, carrying the LSTM state forward
        for sequence in modelInput:
            out, hidden = self.lstmLayer(sequence.reshape(1, 1, -1), hidden)

        # Only the output of the last step goes through the dense layers
        t = F.relu(out)
        t = F.relu(self.hidden1(t))
        t = F.relu(self.hidden2(t))
        t = F.relu(self.hidden3(t))
        t = F.relu(self.hidden4(t))
        t = F.softmax(self.output(t), dim=2)
        return t

In [55]:
# Convert the complete sequence table to a float tensor on the selected device
# (GPU when available, otherwise CPU)
dfGPU = df_to_tensor(df=sequenceDataComplete)

In [57]:
# Insert a singleton middle axis: one row per event, 34 features per row.
# Resulting shape: (number of events, 1, 34)
dfGPU = dfGPU.reshape(-1, 1, 34)

In [109]:
# Stale inspection cell: `network` is only created in the training cell further
# down, so evaluating this on a fresh kernel (Restart & Run All) raises NameError.
# Re-enable it after the training cell has run if the parameters need inspecting.
# network.parameters()

<generator object Module.parameters at 0x0000016A092EA5F0>

In [125]:
# Build the network on the selected device and train it one event at a time.
network = DQN()
device = get_device()
network.to(device)

optimizer = optim.Adam(network.parameters(), lr=0.0001)

# Each Episode (one episode per game)
for gameID in sequenceDataComplete['gameID'].unique():
    # Select the game's rows with a mask computed on the DataFrame. dfGPU was built
    # from sequenceDataComplete row by row, so the mask lines up positionally.
    # Comparing ids on the float32 tensor is avoided: float32 cannot tell
    # neighbouring 10-digit game ids apart.
    gameRows = (sequenceDataComplete['gameID'] == gameID).values
    gameData = sequenceDataComplete[gameRows]
    gameDataGPU = dfGPU[torch.from_numpy(gameRows).to(dfGPU.device)]

    # Only walk the sequences that actually belong to this game
    gameSequenceNums = gameData['sequenceNum'].values
    for sequenceNum in gameData['sequenceNum'].unique():
        sequenceRows = torch.from_numpy(gameSequenceNums == sequenceNum).to(dfGPU.device)
        modelInput = gameDataGPU[sequenceRows]

        # One optimisation step per event; step i sees events 0..i of the sequence
        for i in range(modelInput.shape[0]):
            # calculate the loss
            loss = custom_loss(modelInput, network, i)

            # zero gradients
            optimizer.zero_grad()

            # perform backprop and update weights
            loss.backward()
            optimizer.step()

        

        