# LSTM and GRU Implementations

This notebook implements two recurrent neural network (RNN) models, an LSTM and a GRU, and trains them on our training data. Each training sequence is the frame-by-frame tracking data of one individual player on a given play.

In [0]:
# Run the model prep notebook in this session.
# NOTE(review): its contents are not shown here; confirm which names it defines for the cells below.
%run /Workspace/Repos/anthony.m.quagliata@vanderbilt.edu/NFL-Capstone/03-Models/Model_prep

In [0]:
# Run the Model_Evaluation_Functions notebook in this session.
# NOTE(review): presumably provides shared evaluation helpers; none are called by name in the cells shown here - confirm.
%run /Workspace/Repos/anthony.m.quagliata@vanderbilt.edu/NFL-Capstone/03-Models/Model_Evaluation_Functions

In [0]:
#import libraries
import pandas as pd
import numpy as np
import torch
import warnings
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import time

# Silence every Python warning for the rest of the session (one call is enough;
# the filter stays installed until it is reset).
warnings.filterwarnings("ignore")
# Never truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [0]:
# Root folder that holds every CSV read below
directory = "/dbfs/mnt/nfl/"

# Reference tables
games = pd.read_csv(directory + "games.csv")
players = pd.read_csv(directory + "players.csv")
plays = pd.read_csv(directory + "plays.csv")
tackles = pd.read_csv(directory + "tackles.csv")

# Modelling samples
train = pd.read_csv(directory + "train_sample.csv")
val = pd.read_csv(directory + "val_sample.csv")

In [0]:
class GRUNet(nn.Module):
    """Stacked GRU that emits one sigmoid probability per time step.

    Parameters
    ----------
    input_dim : int
        Number of features per time step.
    hidden_dim : int
        Size of the GRU hidden state.
    output_dim : int
        Accepted so the constructor signature matches ``LSTMNet``; it is not
        used, because the linear head always maps a time step to one value.
    n_layers : int
        Number of stacked GRU layers.
    drop_prob : float, default 0.2
        Dropout probability passed to ``nn.GRU`` (applied between layers).
    """

    def __init__(self, input_dim, hidden_dim, output_dim, n_layers, drop_prob=0.2):
        super(GRUNet, self).__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        self.gru = nn.GRU(input_dim, hidden_dim, n_layers, batch_first=True, dropout=drop_prob)
        self.fc = nn.Linear(hidden_dim, 1)
        self.relu = nn.ReLU()  # not used in forward; kept so the module's attributes are unchanged
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, h):
        """Run the GRU over ``x`` and score every time step.

        Parameters
        ----------
        x : torch.Tensor
            Batch-first input of shape ``(batch, seq_len, input_dim)``.
        h : torch.Tensor or None
            Initial hidden state of shape ``(n_layers, batch, hidden_dim)``;
            ``None`` lets ``nn.GRU`` start from zeros.

        Returns
        -------
        out : torch.Tensor
            Probabilities (sigmoid already applied) of shape
            ``(1, batch, seq_len)``.
        h : torch.Tensor
            Final hidden state.
        """
        out, h = self.gru(x, h)
        out = self.sigmoid(self.fc(out))
        # (batch, seq_len, 1) -> (1, batch, seq_len). The sizes come from the
        # input itself instead of a hard-coded 77 x 142, so any batch size or
        # sequence length works while the 77 x 142 case keeps its old shape.
        out = out.view(-1, x.size(0), x.size(1))
        return out, h

    def init_hidden(self, batch_size):
        """Return a zero hidden state of shape ``(n_layers, batch_size, hidden_dim)``.

        The tensor is created with the dtype and device of the model's own
        parameters, so it follows the model wherever it has been moved.
        """
        reference = next(self.parameters())
        return torch.zeros(
            self.n_layers,
            batch_size,
            self.hidden_dim,
            dtype=reference.dtype,
            device=reference.device,
        )

class LSTMNet(nn.Module):
    """Stacked LSTM that predicts ``output_dim`` sigmoid probabilities per sequence.

    Only the LSTM output at the last time step is used; it is passed through
    ReLU, a linear layer and a sigmoid.

    Parameters
    ----------
    input_dim : int
        Number of features per time step.
    hidden_dim : int
        Size of the LSTM hidden and cell states.
    output_dim : int
        Number of values predicted for each sequence.
    n_layers : int
        Number of stacked LSTM layers.
    drop_prob : float, default 0.2
        Dropout probability passed to ``nn.LSTM`` (applied between layers).
    """

    def __init__(self, input_dim, hidden_dim, output_dim, n_layers, drop_prob=0.2):
        super(LSTMNet, self).__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        self.lstm = nn.LSTM(input_dim, hidden_dim, n_layers, batch_first=True, dropout=drop_prob)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, h):
        """Run the LSTM over ``x`` and score the sequence from its last step.

        Parameters
        ----------
        x : torch.Tensor
            Batch-first input of shape ``(batch, seq_len, input_dim)``.
        h : tuple of torch.Tensor or None
            Initial ``(hidden, cell)`` states, each of shape
            ``(n_layers, batch, hidden_dim)``; ``None`` starts from zeros.

        Returns
        -------
        out : torch.Tensor
            Probabilities (sigmoid already applied) of shape
            ``(batch, output_dim)``.
        h : tuple of torch.Tensor
            Final ``(hidden, cell)`` states.
        """
        out, h = self.lstm(x, h)
        # Keep only the representation of the final time step.
        out = self.fc(self.relu(out[:, -1]))
        out = self.sigmoid(out)
        return out, h

    def init_hidden(self, batch_size):
        """Return zero ``(hidden, cell)`` states, each ``(n_layers, batch_size, hidden_dim)``.

        Both tensors are created with the dtype and device of the model's own
        parameters, so they follow the model wherever it has been moved.
        """
        reference = next(self.parameters())
        shape = (self.n_layers, batch_size, self.hidden_dim)
        return (
            torch.zeros(shape, dtype=reference.dtype, device=reference.device),
            torch.zeros(shape, dtype=reference.dtype, device=reference.device),
        )

In [0]:
def train_nn(train_loader, learn_rate = 0.01,batch_size=1000, hidden_dim=256, EPOCHS=5, model_type="GRU", num_classes = 2):
    """Train a GRU or LSTM binary classifier with a class-weighted log loss.

    Relies on the notebook-level names ``device``, ``GRUNet`` and ``LSTMNet``.

    Parameters
    ----------
    train_loader : torch.utils.data.DataLoader
        Yields ``(features, labels)`` batches. Features are indexed as
        ``(batch, seq_len, n_features)`` and labels as ``(batch, n_outputs)``
        with integer class ids in ``[0, num_classes)``.
    learn_rate : float
        Learning rate for Adam.
    batch_size : int
        Batch size used to size the initial hidden state; it has to match the
        batch size produced by ``train_loader``.
    hidden_dim : int
        Hidden size of the recurrent layers.
    EPOCHS : int
        Number of passes over ``train_loader``.
    model_type : str
        ``"GRU"`` builds a ``GRUNet``; any other value builds an ``LSTMNet``.
    num_classes : int
        Number of label classes counted for the class weights. The weight of
        class 1 is used as the positive-class weight in the loss.

    Returns
    -------
    torch.nn.Module
        The trained model (left in training mode).

    Raises
    ------
    ZeroDivisionError
        If one of the ``num_classes`` classes never occurs in the labels.
    """
    # ---- Class weights -------------------------------------------------
    # Count every label value in one vectorised pass per batch. The loader
    # carries no padding mask, so any zero-padded label positions are counted
    # as class 0.
    counts = torch.zeros(num_classes, dtype=torch.long)
    for _, labels in train_loader:
        counts += torch.bincount(labels.reshape(-1).long().cpu(), minlength=num_classes)
    class_counts = counts.tolist()

    total_samples = sum(class_counts)
    class_weights = [total_samples/(num_classes*count) for count in class_counts]
    class_weights = torch.tensor(class_weights)

    print("Calculated training weights")
    print(class_weights)

    # ---- Model ---------------------------------------------------------
    # Peek at a single batch to read the feature and output widths.
    sample_x, sample_y = next(iter(train_loader))
    input_dim = sample_x.shape[2]
    output_dim = sample_y.shape[1]
    n_layers = 2
    if model_type == "GRU":
        model = GRUNet(input_dim, hidden_dim, output_dim, n_layers)
    else:
        model = LSTMNet(input_dim, hidden_dim, output_dim, n_layers)
    model.to(device)

    # ---- Loss and optimizer ---------------------------------------------
    # Both networks already end in a sigmoid, so the loss must be a plain
    # binary cross-entropy on probabilities; BCEWithLogitsLoss would apply a
    # second sigmoid. The positive-class weight lives on the same device as
    # the model outputs.
    pos_weight = class_weights[1].to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learn_rate)

    model.train()
    print("Starting Training of {} model".format(model_type))
    epoch_times = []
    # ---- Training loop ---------------------------------------------------
    for epoch in range(1,EPOCHS+1):
        start_time = time.time()
        h = model.init_hidden(batch_size)
        avg_loss = 0.
        counter = 0
        for x, label in train_loader:
            counter += 1
            # Carry the hidden state across batches but cut the graph so that
            # back-propagation stays inside the current batch.
            if model_type == "GRU":
                h = h.detach()
            else:
                h = tuple(e.detach() for e in h)
            optimizer.zero_grad()

            out, h = model(x.to(device).float(), h)
            target = label.to(device).float()
            # Align the prediction layout with the labels (drops the leading
            # singleton axis the GRU adds; works for a batch of one as well).
            out = out.reshape(target.shape)
            # Per-element weight: pos_weight where the label is 1, otherwise 1.
            # For 0/1 labels this matches the pos_weight weighting of
            # BCEWithLogitsLoss, applied to probabilities instead of logits.
            elem_weight = 1.0 + (pos_weight - 1.0) * target
            loss = F.binary_cross_entropy(out, target, weight=elem_weight)
            loss.backward()
            optimizer.step()
            avg_loss += loss.item()
            if counter%200 == 0:
                print("Epoch {}......Step: {}/{}....... Average Loss for Epoch: {}".format(epoch, counter, len(train_loader), avg_loss/counter))
        current_time = time.time()
        print("Epoch {}/{} Done, Total Loss: {}".format(epoch, EPOCHS, avg_loss/len(train_loader)))
        print("Total Time Elapsed: {} seconds".format(str(current_time-start_time)))
        epoch_times.append(current_time-start_time)
    print("Total Training Time: {} seconds".format(str(sum(epoch_times))))
    return model

def evaluate(model, test_loader):
    """Score ``model`` on ``test_loader`` and report the average log loss.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(features, None)``; must return ``(probabilities,
        hidden)`` where the probabilities already went through a sigmoid and
        hold one value per label element of the batch.
    test_loader : torch.utils.data.DataLoader
        Yields ``(features, labels)`` batches.

    Returns
    -------
    avg_log_loss : float
        Mean over batches of the per-batch binary cross-entropy.
    outputs : list of numpy.ndarray
        Predicted probabilities, one array per batch, shaped like that
        batch's labels.
    """
    model.eval()
    # Feed the inputs to wherever the model's parameters live.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    outputs = []
    targets = []
    start_time = time.time()
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for x, label in test_loader:
            out, _ = model(x.to(run_device).float(), None)
            # Align the prediction layout with the labels (drops the leading
            # singleton axis the GRU adds; works for a batch of one as well).
            out = out.reshape(label.shape)
            outputs.append(out.cpu().numpy())
            targets.append(label.numpy())

    print("Evaluation Time: {}".format(str(time.time()-start_time)))

    # The model outputs are probabilities, so the log loss is a plain BCELoss;
    # BCEWithLogitsLoss would apply a second sigmoid and distort the score.
    criterion = nn.BCELoss()
    log_losses = []
    for batch_probs, batch_targets in zip(outputs, targets):
        labels = torch.from_numpy(batch_targets).float()
        log_loss_value = criterion(torch.from_numpy(batch_probs), labels)
        log_losses.append(log_loss_value.item())

    avg_log_loss = np.mean(log_losses)
    print("Average Log Loss: {}".format(avg_log_loss))
    return avg_log_loss,outputs

In [0]:
def data_tensors_rnn_3d(data, target, is_synthetic = False):
    """Turn frame-level tracking rows into padded per-player sequence tensors.

    One sequence is built for every (gamePlayId, nflId) pair, ordered by
    frameId and zero-padded to 142 time steps.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``gameId``, ``playId``, ``gamePlayId``, ``nflId``,
        ``frameId``, ``tackle_multiple`` and ``tackle_single``; every other
        column is treated as a feature. The frame passed in is not modified.
    target : str
        ``"tackle_multiple"`` or ``"tackle_single"``; the other one is dropped.
    is_synthetic : bool, default False
        When True, rows that repeat an earlier (gamePlayId, frameId, nflId)
        combination get ``'.1'`` appended to ``gamePlayId`` so they form their
        own sequences.

    Returns
    -------
    x : torch.Tensor
        float32 features, ``(n_sequences, 142, n_features)``.
    y : torch.Tensor
        int64 labels, ``(n_sequences, 142)``; padded steps are 0.
    mask : torch.Tensor
        int64, ``(n_sequences, 142)``; 1 for real frames, 0 for padding.
    id_data : pandas.DataFrame
        ``n_sequences * 142`` rows with ``gamePlayId``, ``nflId`` and
        ``frameId``, in the same order as the flattened tensors. ``frameId`` is
        the sequence's first frameId plus the step offset, so it is
        extrapolated for padded steps.

    Raises
    ------
    ValueError
        If ``target`` is not one of the two tackle columns, or if a sequence
        is longer than 142 frames.

    Notes
    -----
    Numeric features are standardised with a ``StandardScaler`` fitted on the
    frame passed in, so every call uses its own scaling statistics.
    """
    if is_synthetic:
        # Work on a copy so the caller's frame keeps its original gamePlayId.
        data = data.copy()
        # Second and later occurrences of an identical (play, frame, player) key
        duplicates_mask = data.duplicated(subset=['gamePlayId', 'frameId', 'nflId'], keep='first')
        data.loc[duplicates_mask, 'gamePlayId'] += '.1'

    # ---- Preprocessing ---------------------------------------------------
    # Whatever tackle column is not the target gets removed.
    target_variables = ["tackle_multiple", "tackle_single"]
    target_variables.remove(target)

    data = data.sort_values(['gameId','playId','nflId','frameId'],ascending = [True, True, True, True])

    df = data.drop(["gameId","playId"], axis = 1)
    df = df.drop(target_variables, axis = 1)

    # Separate numerical and categorical variables
    numerical_vars = df.select_dtypes(include=['int64', 'float64']).columns
    categorical_vars = df.select_dtypes(include=['object']).columns

    # Identifier columns and the target are neither scaled nor encoded
    exclude_scaling = ['nflId', 'frameId', target]  # might need to change this depending on added variables
    exclude_ohe = ['gamePlayId']

    scaled_cols = numerical_vars.difference(exclude_scaling)
    scaler = StandardScaler()
    df[scaled_cols] = scaler.fit_transform(df[scaled_cols])

    # One-hot encode categorical variables
    df = pd.get_dummies(df, columns=categorical_vars.difference(exclude_ohe), drop_first=True)

    # ---- Sequence building -------------------------------------------------
    # Fixed padded sequence length (the original note says the longest
    # sequence in the data has 140 rows).
    max_rows_per_frame = 142

    # Grouping keys plus the actual target are removed from the features.
    # Using `target` here (not a fixed column name) keeps the label out of the
    # features and avoids dropping a column that no longer exists.
    cols_to_drop = ['gamePlayId', 'nflId', 'frameId', target]

    all_player_features = []
    all_player_labels = []
    all_player_masks = []

    # Identifier columns aligned with the flattened (sequence, step) layout
    gamePlayId_list = []
    nflId_list = []
    frameId_list = []

    for play_id, play_data in df.groupby('gamePlayId'):
        # One sequence per player within the play
        for player_id, player_data in play_data.groupby('nflId'):
            features = player_data.drop(cols_to_drop, axis=1).values
            labels = player_data[target].values

            # Identifiers of the first row of this player's sequence
            gamePlayId = player_data['gamePlayId'].values[0]
            nflId = player_data['nflId'].values[0]
            frameId = player_data['frameId'].values[0]

            frame_length = len(features)
            if frame_length > max_rows_per_frame:
                raise ValueError(
                    "Sequence for gamePlayId={} nflId={} has {} frames; the maximum supported is {}".format(
                        gamePlayId, nflId, frame_length, max_rows_per_frame))
            pad_rows = max_rows_per_frame - frame_length

            # Zero-pad features and labels at the end of the sequence
            padded_features = np.pad(features, ((0, pad_rows), (0, 0)), mode='constant', constant_values=0)
            padded_labels = np.pad(labels, (0, pad_rows), mode='constant', constant_values=0)

            # 1 marks real frames, 0 marks padding
            mask = np.zeros(max_rows_per_frame)
            mask[:frame_length] = 1

            all_player_features.append(padded_features)
            all_player_labels.append(padded_labels)
            all_player_masks.append(mask)
            gamePlayId_list.extend([gamePlayId] * max_rows_per_frame)
            nflId_list.extend([nflId] * max_rows_per_frame)
            frameId_list.extend([frameId + i for i in range(max_rows_per_frame)])

    # Convert to NumPy arrays
    x = np.array(all_player_features, dtype=np.float32)
    y = np.array(all_player_labels, dtype=np.int64)
    mask_array = np.array(all_player_masks, dtype=np.int64)
    id_data = pd.DataFrame({'gamePlayId': gamePlayId_list, 'nflId': nflId_list, 'frameId': frameId_list})

    return torch.from_numpy(x), torch.from_numpy(y), torch.from_numpy(mask_array), id_data

In [0]:
# True when PyTorch can see a CUDA GPU, False otherwise
is_cuda = torch.cuda.is_available()

# Use the GPU when there is one, else fall back to the CPU. Later cells refer to this `device` variable.
device = torch.device("cuda" if is_cuda else "cpu")

In [0]:
# Preview the first rows of the training sample as a sanity check on the load
display(train.head())

In [0]:
# Build the padded per-player sequence tensors (features, labels, padding mask)
# and the matching id table from the training sample, with tackle_single as the target
x, y, mask, id_data = data_tensors_rnn_3d(train, "tackle_single")

In [0]:
# Shapes of the feature, label and mask tensors, one per line
print(x.shape, y.shape, mask.shape, sep="\n")

In [0]:
# Number of player sequences per batch; the same value is handed to train_nn,
# which uses it to size the initial hidden state
batch_size = 77

# Pair every feature sequence with its label sequence
train_data = TensorDataset(x, y)

# Keep the original row order and drop the final partial batch so that every
# batch holds exactly batch_size sequences
train_loader = DataLoader(
    train_data,
    batch_size=batch_size,
    shuffle=False,
    drop_last=True,
)

In [0]:
# Train the GRU for 3 epochs; batch_size is passed through so the initial hidden state matches the loader's batches
gru_model = train_nn(train_loader, batch_size = batch_size, EPOCHS = 3, model_type="GRU")
# LSTM run is currently disabled; uncomment the next line to train it with train_nn's default EPOCHS
#Lstm_model = train_nn(train_loader, batch_size = batch_size, model_type="LSTM")

In [0]:
# Score the GRU on the training loader (the same data it was fitted on);
# returns the average log loss and the list of per-batch predictions
avg_log_loss_gru,outputs_gru = evaluate(gru_model, train_loader)

In [0]:
# Example flattened mask as a tensor
flattened_mask = mask.view(-1)
# Create a flattened mask as a list of True and False values
flattened_mask = (flattened_mask == 1).tolist()

In [0]:
#Flattening those GRU values to get outputs with right values
flattened_values_gru = np.concatenate(outputs_gru).ravel()
# Create a new DataFrame from the flattened array
df_flattened_gru = pd.DataFrame(flattened_values_gru, columns=['model_probs_GRU'])
#concatenate the probs with the id data
pred_df_gru = pd.concat([id_data, df_flattened_gru], axis=1)
pred_df_gru = pred_df_gru[flattened_mask]

In [0]:
# Show the filtered GRU predictions together with their id columns
display(pred_df_gru)