# Model Preparation
The following notebook contains the necessary functions to prepare data into tensors for proper neural network use given the architecture of our model.

## data_tensors_cnn_4d

The data_tensors_cnn_4d function performs the following operations on the data to prep for the model:
1. Remove unwanted target variables, nflId, gameId, and playId
2. Scale numerical data, excluding the gamePlayId, frameId, and binary values
3. One-hot encode all categorical variables 
4. Group plays and frames into data "images" of player data on a given frame
5. Pad arrays for consistent tensor datatypes 
6. Create mask array with a 1 to identify frames that were actual data and 0 to identify padded values

In [0]:
def data_tensors_cnn_4d(data, target, is_synthetic=False,
                        max_frames_per_play=140, max_rows_per_frame=11):
    """Build padded 4-D tensors (play x frame x player x feature) for the CNN.

    Steps performed:
      1. Drop ``gameId``, ``playId`` and every target column except ``target``.
      2. Standardise int64/float64 columns (zero mean, unit population
         standard deviation), leaving ``frameId``, ``nflId``, ``home`` and
         ``target`` untouched.
      3. One-hot encode object columns (``drop_first=True``) except
         ``gamePlayId``.
      4. Group rows by ``gamePlayId`` and then ``frameId``; each frame becomes
         a (player, feature) slab, zero-padded to ``max_rows_per_frame`` rows,
         and each play is zero-padded to ``max_frames_per_play`` frames.

    Args:
        data: DataFrame holding ``gameId``, ``playId``, ``gamePlayId``,
            ``frameId``, ``nflId``, the four ``tackle_*`` target columns and
            the feature columns.
        target: Which target column supplies the labels. Must be one of
            ``tackle_binary_single``, ``tackle_binary_all``,
            ``tackle_nonbinary_all`` or ``tackle_nonbinary_single``.
        is_synthetic: When True, rows that repeat an earlier
            (gamePlayId, frameId, nflId) key get ``'.1'`` appended to their
            ``gamePlayId`` so the duplicated play is treated as its own play.
            NOTE: this rewrites ``data['gamePlayId']`` in place, i.e. the
            caller's DataFrame is modified.
        max_frames_per_play: Number of frame slots per play (default 140, the
            maximum observed in the project data).
        max_rows_per_frame: Number of player slots per frame (default 11).

    Returns:
        Tuple ``(x, y, mask, id_data)``:
          * ``x``: float32 tensor, shape
            (plays, max_frames_per_play, max_rows_per_frame, features).
          * ``y``: int64 tensor of labels, shape
            (plays, max_frames_per_play, max_rows_per_frame).
          * ``mask``: int64 tensor, same shape as ``y``; 1 for real rows and
            0 for padding.
          * ``id_data``: DataFrame with one row per slot of ``y`` in flattened
            (C) order, with columns ``gamePlayId``, ``nflId`` and ``frameId``.
            Padded player slots have ``nflId == -1``; padded frames
            additionally have ``frameId == -1``.

    Raises:
        ValueError: If ``target`` is not a known target column, or a play /
            frame holds more rows than the configured maximum sizes.
        KeyError: If a required column is missing from ``data``.
    """
    import numpy as np
    import pandas as pd
    import torch

    target_variables = ["tackle_binary_single", "tackle_binary_all",
                        "tackle_nonbinary_all", "tackle_nonbinary_single"]
    if target not in target_variables:
        raise ValueError(
            f"target must be one of {target_variables}, got {target!r}"
        )

    if is_synthetic:
        # Every occurrence after the first of a (play, frame, player) key
        # belongs to the synthetic copy; rename its play so it groups apart.
        duplicates_mask = data.duplicated(subset=['gamePlayId', 'frameId', 'nflId'], keep='first')
        data.loc[duplicates_mask, 'gamePlayId'] += '.1'

    # Remove the raw ids and the targets we are not predicting.
    other_targets = [name for name in target_variables if name != target]
    df = data.drop(["gameId", "playId"] + other_targets, axis=1)

    # Separate numerical and categorical variables.
    numerical_vars = df.select_dtypes(include=['int64', 'float64']).columns
    categorical_vars = df.select_dtypes(include=['object']).columns

    # Ids, the binary 'home' flag and the label must not be rescaled.
    exclude_scaling = ['frameId', 'nflId', 'home', target]
    exclude_ohe = ['gamePlayId']

    # Standardise with the population standard deviation (ddof=0); columns
    # that are (numerically) constant are only centred, never divided by ~0.
    scale_cols = numerical_vars.difference(exclude_scaling)
    if len(scale_cols) > 0:
        values = df[scale_cols].astype('float64')
        center = values.mean()
        spread = values.std(ddof=0)
        tolerance = 10 * np.finfo(np.float64).eps * center.abs().clip(lower=1.0)
        spread = spread.where(spread > tolerance, 1.0)
        df[scale_cols] = (values - center) / spread

    # One-hot encode categorical variables.
    ohe_cols = categorical_vars.difference(exclude_ohe)
    if len(ohe_cols) > 0:
        df = pd.get_dummies(df, columns=ohe_cols, drop_first=True)

    # Everything that is not an id or the label is a model feature.
    non_feature_cols = {'gamePlayId', 'frameId', 'nflId', target}
    feature_cols = [col for col in df.columns if col not in non_feature_cols]

    plays_grouped = df.groupby('gamePlayId')
    n_plays = plays_grouped.ngroups
    slot_shape = (n_plays, max_frames_per_play, max_rows_per_frame)

    # Preallocate zero-filled outputs: anything not overwritten is padding.
    pad_id = -1
    x = np.zeros(slot_shape + (len(feature_cols),), dtype=np.float32)
    y = np.zeros(slot_shape, dtype=np.int64)
    mask_array = np.zeros(slot_shape, dtype=np.int64)
    nfl_ids = np.full(slot_shape, pad_id, dtype=df['nflId'].to_numpy().dtype)
    frame_ids = np.full(slot_shape, pad_id, dtype=df['frameId'].to_numpy().dtype)
    play_keys = []

    for play_idx, (play_id, play_data) in enumerate(plays_grouped):
        play_keys.append(play_id)
        frames_grouped = play_data.groupby('frameId')
        if frames_grouped.ngroups > max_frames_per_play:
            raise ValueError(
                f"play {play_id!r} has {frames_grouped.ngroups} frames, "
                f"more than max_frames_per_play={max_frames_per_play}"
            )

        for frame_idx, (frame_id, frame_data) in enumerate(frames_grouped):
            n_rows = len(frame_data)
            if n_rows > max_rows_per_frame:
                raise ValueError(
                    f"play {play_id!r} frame {frame_id!r} has {n_rows} rows, "
                    f"more than max_rows_per_frame={max_rows_per_frame}"
                )

            x[play_idx, frame_idx, :n_rows] = frame_data[feature_cols].to_numpy(dtype=np.float32)
            y[play_idx, frame_idx, :n_rows] = frame_data[target].to_numpy()
            mask_array[play_idx, frame_idx, :n_rows] = 1
            nfl_ids[play_idx, frame_idx, :n_rows] = frame_data['nflId'].to_numpy()
            # Every slot of a real frame (padded players included) keeps the
            # frame's id; only entirely padded frames stay at pad_id.
            frame_ids[play_idx, frame_idx, :] = frame_id

    # One id row per tensor slot, in the same order as y.reshape(-1).
    slots_per_play = max_frames_per_play * max_rows_per_frame
    id_data = pd.DataFrame({
        'gamePlayId': np.repeat(np.asarray(play_keys, dtype=object), slots_per_play),
        'nflId': nfl_ids.reshape(-1),
        'frameId': frame_ids.reshape(-1),
    })

    return torch.from_numpy(x), torch.from_numpy(y), torch.from_numpy(mask_array), id_data


In [0]:
#import pandas as pd
#train_samp = pd.read_csv("../Data/train_sample.csv")

In [0]:
#x, y, mask, id_data = data_tensors_cnn_4d(train_samp, "tackle_binary_single")

In [0]:
#print(x.shape)
#print(y.shape)
#print(mask.shape)
#display(id_data.head())

## data_tensors_rnn_3d

In [0]:
def data_tensors_rnn_3d(data, target, is_synthetic=False, max_seq_len=140):
    """Build padded 3-D tensors (player-sequence x frame x feature) for the RNN.

    Steps performed:
      1. Sort rows by ``gameId``, ``playId``, ``nflId``, ``frameId``.
      2. Drop ``gameId``, ``playId`` and the target column that is not
         ``target``.
      3. Standardise int64/float64 columns (zero mean, unit population
         standard deviation), leaving ``nflId``, ``home``, ``frameId`` and
         ``target`` untouched.
      4. One-hot encode object columns (``drop_first=True``) except
         ``gamePlayId``.
      5. Turn each (gamePlayId, nflId) pair into one sequence of frames,
         zero-padded to ``max_seq_len`` rows.

    Args:
        data: DataFrame holding ``gameId``, ``playId``, ``gamePlayId``,
            ``frameId``, ``nflId``, ``tackle_single``, ``tackle_multiple`` and
            the feature columns.
        target: Label column, either ``"tackle_single"`` or
            ``"tackle_multiple"``.
        is_synthetic: When True, rows that repeat an earlier
            (gamePlayId, frameId, nflId) key get ``'.1'`` appended to their
            ``gamePlayId`` so the duplicated play is treated as its own play.
            NOTE: this rewrites ``data['gamePlayId']`` in place, i.e. the
            caller's DataFrame is modified.
        max_seq_len: Number of frame slots per player sequence (default 140,
            the maximum observed in the project data).

    Returns:
        Tuple ``(x, y, mask, id_data)``:
          * ``x``: float32 tensor, shape (sequences, max_seq_len, features).
          * ``y``: int64 tensor of labels, shape (sequences, max_seq_len).
          * ``mask``: int64 tensor, same shape as ``y``; 1 for real rows and
            0 for padding.
          * ``id_data``: DataFrame with one row per slot of ``y`` in flattened
            (C) order, with columns ``gamePlayId``, ``nflId`` and ``frameId``.
            Observed rows carry their real ``frameId``; padded rows continue
            counting up from the last observed frame.

    Raises:
        ValueError: If ``target`` is not a known target column, or a player
            has more than ``max_seq_len`` rows within one play.
        KeyError: If a required column is missing from ``data``.
    """
    import numpy as np
    import pandas as pd
    import torch

    target_variables = ["tackle_multiple", "tackle_single"]
    if target not in target_variables:
        raise ValueError(
            f"target must be one of {target_variables}, got {target!r}"
        )

    if is_synthetic:
        # Every occurrence after the first of a (play, frame, player) key
        # belongs to the synthetic copy; rename its play so it groups apart.
        duplicates_mask = data.duplicated(subset=['gamePlayId', 'frameId', 'nflId'], keep='first')
        data.loc[duplicates_mask, 'gamePlayId'] += '.1'

    # Sorting by frame inside each player makes every sequence chronological.
    data = data.sort_values(['gameId', 'playId', 'nflId', 'frameId'],
                            ascending=[True, True, True, True])

    # Remove the raw ids and the target we are not predicting.
    other_targets = [name for name in target_variables if name != target]
    df = data.drop(["gameId", "playId"] + other_targets, axis=1)

    # Separate numerical and categorical variables.
    numerical_vars = df.select_dtypes(include=['int64', 'float64']).columns
    categorical_vars = df.select_dtypes(include=['object']).columns

    # Ids, the binary 'home' flag and the label must not be rescaled.
    exclude_scaling = ['nflId', 'home', 'frameId', target]
    exclude_ohe = ['gamePlayId']

    # Standardise with the population standard deviation (ddof=0); columns
    # that are (numerically) constant are only centred, never divided by ~0.
    scale_cols = numerical_vars.difference(exclude_scaling)
    if len(scale_cols) > 0:
        values = df[scale_cols].astype('float64')
        center = values.mean()
        spread = values.std(ddof=0)
        tolerance = 10 * np.finfo(np.float64).eps * center.abs().clip(lower=1.0)
        spread = spread.where(spread > tolerance, 1.0)
        df[scale_cols] = (values - center) / spread

    # One-hot encode categorical variables.
    ohe_cols = categorical_vars.difference(exclude_ohe)
    if len(ohe_cols) > 0:
        df = pd.get_dummies(df, columns=ohe_cols, drop_first=True)

    # Only columns still present in df are excluded here; the unused target
    # was already dropped above, so it must not be dropped a second time.
    non_feature_cols = {'gamePlayId', 'nflId', 'frameId', target}
    feature_cols = [col for col in df.columns if col not in non_feature_cols]

    # One group per player per play, ordered by play and then player.
    sequences = df.groupby(['gamePlayId', 'nflId'])
    n_sequences = sequences.ngroups

    # Preallocate zero-filled outputs: anything not overwritten is padding.
    x = np.zeros((n_sequences, max_seq_len, len(feature_cols)), dtype=np.float32)
    y = np.zeros((n_sequences, max_seq_len), dtype=np.int64)
    mask_array = np.zeros((n_sequences, max_seq_len), dtype=np.int64)
    frame_ids = np.zeros((n_sequences, max_seq_len), dtype=df['frameId'].to_numpy().dtype)
    play_keys = []
    player_keys = []

    for seq_idx, ((play_id, player_id), player_data) in enumerate(sequences):
        n_rows = len(player_data)
        if n_rows > max_seq_len:
            raise ValueError(
                f"player {player_id!r} in play {play_id!r} has {n_rows} rows, "
                f"more than max_seq_len={max_seq_len}"
            )

        x[seq_idx, :n_rows] = player_data[feature_cols].to_numpy(dtype=np.float32)
        y[seq_idx, :n_rows] = player_data[target].to_numpy()
        mask_array[seq_idx, :n_rows] = 1

        # Real frame ids for observed rows; padded rows keep counting upward
        # so every slot of a sequence still has a distinct frameId.
        observed_frames = player_data['frameId'].to_numpy()
        frame_ids[seq_idx, :n_rows] = observed_frames
        frame_ids[seq_idx, n_rows:] = observed_frames[-1] + np.arange(1, max_seq_len - n_rows + 1)

        play_keys.append(play_id)
        player_keys.append(player_id)

    # One id row per tensor slot, in the same order as y.reshape(-1).
    id_data = pd.DataFrame({
        'gamePlayId': np.repeat(np.asarray(play_keys, dtype=object), max_seq_len),
        'nflId': np.repeat(np.asarray(player_keys), max_seq_len),
        'frameId': frame_ids.reshape(-1),
    })

    return torch.from_numpy(x), torch.from_numpy(y), torch.from_numpy(mask_array), id_data

In [0]:
#import pandas as pd
#train_samp = pd.read_csv("../Data/train_sample.csv")

In [0]:
#x, y, mask, id_data = data_tensors_rnn_3d(train_samp, "tackle_single")

In [0]:
#print(x.shape)
#print(y.shape)
#print(mask.shape)
#print(id_data.shape)
#display(id_data.head())

In [0]:
# #import pandas as pd
# train_samp = pd.read_csv("../Data/train_sample.csv")
# x, y, mask, id_data = data_tensors_cnn_4d(train_samp, "tackle_binary_single")

In [0]:
# print(x.shape)
# print(y.shape)
# print(mask.shape)
# print(id_data.shape)
# display(id_data.head())