# CNN-LSTM TimeDistributed Tackle Probability Model

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Import a small sample of the data
df = pd.read_csv("sample_data.csv")
df = df.drop(columns='Unnamed: 0')
df.head()

Unnamed: 0,gameId,playId,nflId,frameId,x,y,unitDir,unitO,force,home,...,defendersInTheBox,offenseFormation,absoluteYardlineNumber,timeSinceStart,surface,inside_outside,presnapDefScoreDiff,weight,position,gamePlayId
0,2022091000.0,56.0,38577.0,6.0,41.89,28.74,87.71,79.47,288.2,1,...,6.0,SHOTGUN,85,0,turf,inside,0,242,ILB,2022090800.056.0
1,2022091000.0,56.0,41239.0,6.0,27.85,29.96,247.65,276.16,364.0,1,...,6.0,SHOTGUN,85,0,turf,inside,0,280,DT,2022090800.056.0
2,2022091000.0,56.0,42816.0,6.0,49.38,7.66,8.33,61.57,346.254545,1,...,6.0,SHOTGUN,85,0,turf,inside,0,184,CB,2022090800.056.0
3,2022091000.0,56.0,43294.0,6.0,41.85,37.85,268.5,230.96,116.290909,1,...,6.0,SHOTGUN,85,0,turf,inside,0,208,CB,2022090800.056.0
4,2022091000.0,56.0,43298.0,6.0,27.89,33.14,293.53,249.12,241.090909,1,...,6.0,SHOTGUN,85,0,turf,inside,0,240,DE,2022090800.056.0


In [14]:
#data_tensors: function to convert data into tensors for use in pytorch
#input: data and taget variable
#returns 3 tensors: x, y, and mask tensors
def data_tensors(data, target):
    #import libraries
    import pandas as pd
    import numpy as np
    import torch
    from torch.nn.utils.rnn import pad_sequence
    import torch.nn.functional as F
    from sklearn.preprocessing import StandardScaler
    from sklearn.preprocessing import OneHotEncoder


    #Preprocess data correctly
    #target variables
    target_variables = ["tackle_binary_single","tackle_binary_all", "tackle_nonbinary_all", "tackle_nonbinary_single"]

    #determine target variables to remove
    target_variables.remove(target)

    # remove unwanted variables 
    df = data.drop(["gameId","playId","nflId"], axis = 1)
    df = df.drop(target_variables, axis = 1)

    # Separate numerical and categorical variables
    numerical_vars = df.select_dtypes(include=['int64', 'float64']).columns
    categorical_vars = df.select_dtypes(include=['object']).columns

    # Define variables to exclude
    exclude_scaling = ['gameId', 'frameId', 'home'] #might need to change this depending on added variables
    exclude_scaling.append(target)
    exclude_ohe = ['gamePlayId']

    # Scale numerical variables using StandardScaler, excluding variables
    scaler = StandardScaler()
    df[numerical_vars.difference(exclude_scaling)] = scaler.fit_transform(df[numerical_vars.difference(exclude_scaling)])

    # One-hot encode categorical variables
    df = pd.get_dummies(df, columns=categorical_vars.difference(exclude_ohe), drop_first=True)

    ############################################################################################
    #Group data into correct array format

    # Group data by 'gamePlayId'
    plays_grouped = df.groupby('gamePlayId')

    # Determine the maximum number of frames per play and rows per frame
    max_frames_per_play = 140 #We found that the max number of frames per play in our data if 140
    max_rows_per_frame = 11 #Set this to 11 players just a precaution, but all frames have ll players on a given frame in our data
    num_feature_cols = df.shape[1] - 3 #number of columns - we are droping gamePlayId, frameId, and target

    #Columns to drop in loop
    cols_to_drop = ['gamePlayId', 'frameId', 'tackle_binary_single']

    # Initialize lists for all plays' features, labels, and masks
    all_play_features = []
    all_play_labels = []
    all_play_masks = []

    for play_id, play_data in plays_grouped:
        # Group by 'frameId' within each play
        frames_grouped = play_data.groupby('frameId')
        
        # Initialize lists for all frames within a play (features, labels, and masks)
        play_features = []
        play_labels = []
        play_masks = []

        for frame_id, frame_data in frames_grouped:
            #drop grouping variables and target variable
            features = frame_data.drop(cols_to_drop, axis=1).values #remove the grouping variables
            labels = frame_data['tackle_binary_single'].values

            # Calculate current frame length
            frame_length = len(features)

            # Pad each frame's features and labels to have the same number of rows
            padded_features = np.pad(features, ((0, max_rows_per_frame - frame_length), (0, 0)), mode='constant', constant_values=0)
            padded_labels = np.pad(labels, (0, max_rows_per_frame - frame_length), mode='constant', constant_values=0)
            
            # Create mask for the current frame
            mask = np.ones(max_rows_per_frame)
            mask[:frame_length] = 1  # Actual data
            mask[frame_length:] = 0  # Padded data

            play_features.append(padded_features)
            play_labels.append(padded_labels)
            play_masks.append(mask)

        frames_to_pad = max_frames_per_play - len(play_features)
        play_features += [np.zeros((max_rows_per_frame, num_feature_cols)) for _ in range(frames_to_pad)]
        play_labels += [np.zeros((max_rows_per_frame,)) for _ in range(frames_to_pad)]
        play_masks += [np.zeros((max_rows_per_frame,)) for _ in range(frames_to_pad)]

        all_play_features.append(play_features)
        all_play_labels.append(play_labels)
        all_play_masks.append(play_masks)

    # Convert to NumPy arrays
    x = np.array(all_play_features, dtype=np.float32)
    y = np.array(all_play_labels, dtype=np.float32)
    mask_array = np.array(all_play_masks, dtype=np.float32)


    # Convert to PyTorch tensors and return them
    return torch.from_numpy(x), torch.from_numpy(y), torch.from_numpy(mask_array)


In [18]:
# Prepare & split the data 
target_variable = 'tackle_binary_single'
x_data, y_data, mask_data = data_tensors(df, target_variable)
split_ratio = 0.8

# Total number of plays
total_plays = x_data.size(0)

# Calculate the number of plays in the training set
num_train_plays = int(total_plays * split_ratio)

# Splitting the tensors into training and testing sets
x_train = x_data[:num_train_plays]
y_train = y_data[:num_train_plays]
mask_train = mask_data[:num_train_plays]

x_test = x_data[num_train_plays:]
y_test = y_data[num_train_plays:]
mask_test = mask_data[num_train_plays:]

# Print shapes to verify
print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)
print("x_test shape:", x_test.shape)
print("y_test shape:", y_test.shape)


x_train shape: torch.Size([11, 140, 11, 77])
y_train shape: torch.Size([11, 140, 11])
x_test shape: torch.Size([3, 140, 11, 77])
y_test shape: torch.Size([3, 140, 11])


In [29]:
# Step 1: Define and Test Model with Conv2D Layer Only
class CNNModelTest(nn.Module):
    def __init__(self, num_players, num_features):
        super(CNNModelTest, self).__init__()
        self.conv2d = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=(3, 3), stride=1, padding='same')

    def forward(self, x):
        batch_size, timesteps, players, features = x.size()
        x = x.view(batch_size * timesteps, 1, players, features)  # Adding channel dimension
        x = F.relu(self.conv2d(x))
        return x

# Define the model
model_test = CNNModelTest(num_players=11, num_features=77)

# Forward pass with test data
x_sample = x_train[:1]  # Using only the first play for testing
output = model_test(x_sample)
print("Output shape after Conv2D:", output.shape)


Output shape after Conv2D: torch.Size([140, 32, 11, 77])


In [30]:
# Step 2: Add LSTM Layer
class CNNLSTMModelTest(nn.Module):
    def __init__(self, num_players, num_features, hidden_size):
        super(CNNLSTMModelTest, self).__init__()
        self.conv2d = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=(3, 3), stride=1, padding='same')
        self.hidden_size = hidden_size

        # Placeholder for LSTM input size
        self.lstm_input_size = None  
        self.lstm = nn.LSTM(input_size=1, hidden_size=hidden_size, batch_first=True)  # Initialize with placeholder input size

    def forward(self, x):
        batch_size, timesteps, players, features = x.size()
        x = x.view(batch_size * timesteps, 1, players, features)
        x = F.relu(self.conv2d(x))

        # Dynamically calculate LSTM input size based on Conv2D output size
        if self.lstm_input_size is None:
            _, C, H, W = x.size()
            self.lstm_input_size = C * H * W
            self.lstm = nn.LSTM(input_size=self.lstm_input_size, hidden_size=self.hidden_size, batch_first=True)

        # Reshape for LSTM
        x = x.view(batch_size, timesteps, -1)
        lstm_out, (hn, cn) = self.lstm(x)
        return lstm_out

# Instantiate the model
model_test = CNNLSTMModelTest(num_players=11, num_features=77, hidden_size=50)

# Forward pass with test data
output = model_test(x_sample)
print("Output shape after LSTM:", output.shape)



Output shape after LSTM: torch.Size([1, 140, 50])


In [31]:
# Step 3: Adding the Fully Connected Layer (TimeDistributed equivalent)
class CNNLSTMModel(nn.Module):
    def __init__(self, num_players, num_features, hidden_size, num_classes):
        super(CNNLSTMModel, self).__init__()
        self.conv2d = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=(3, 3), stride=1, padding='same')
        self.hidden_size = hidden_size
        self.lstm_input_size = None
        self.lstm = nn.LSTM(input_size=1, hidden_size=hidden_size, batch_first=True)  # Placeholder input size
        self.fc = nn.Linear(hidden_size, num_players * num_classes)

    def forward(self, x):
        batch_size, timesteps, players, features = x.size()
        x = x.view(batch_size * timesteps, 1, players, features)
        x = F.relu(self.conv2d(x))

        if self.lstm_input_size is None:
            _, C, H, W = x.size()
            self.lstm_input_size = C * H * W
            self.lstm = nn.LSTM(input_size=self.lstm_input_size, hidden_size=self.hidden_size, batch_first=True)

        x = x.view(batch_size, timesteps, -1)
        lstm_out, (hn, cn) = self.lstm(x)

        # Apply the fully connected layer
        out = self.fc(lstm_out)
        out = out.view(batch_size, timesteps, players, -1)
        out = torch.sigmoid(out)

        return out

# Define and test the full model
model = CNNLSTMModel(num_players=11, num_features=77, hidden_size=50, num_classes=1)
output = model(x_sample)
print("Output shape after fully connected layer:", output.shape)


Output shape after fully connected layer: torch.Size([1, 140, 11, 1])


In [32]:
# Loss function and optimizer
import torch.optim as optim

# Loss function
criterion = nn.BCEWithLogitsLoss()

# Optimizer (You can choose optimizers like Adam, SGD, etc.)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [38]:
# Training Loop
num_epochs = 20

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    running_loss = 0.0

    # Assuming x_train and y_train are lists of tensors
    for inputs, labels in zip(x_train, y_train):
        # Add a batch dimension to inputs and labels
        inputs = inputs.unsqueeze(0)  # Shape: [1, 140, 11, 77]
        labels = labels.unsqueeze(0)  # Shape: [1, 140, 11]

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels.unsqueeze(-1))  # Add the channel dimension to labels

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Print statistics
    print(f'Epoch {epoch + 1}, Loss: {running_loss / len(x_train)}')

print('Finished Training')


Epoch 1, Loss: 0.7069582451473583
Epoch 2, Loss: 0.7052704420956698
Epoch 3, Loss: 0.7038454846902327
Epoch 4, Loss: 0.702070106159557
Epoch 5, Loss: 0.7009985175999728
Epoch 6, Loss: 0.7002423026344993
Epoch 7, Loss: 0.6995979655872692
Epoch 8, Loss: 0.6988450559702787
Epoch 9, Loss: 0.6983056122606451
Epoch 10, Loss: 0.6979116255586798
Epoch 11, Loss: 0.697566731409593
Epoch 12, Loss: 0.6972624876282432
Epoch 13, Loss: 0.6969924135641619
Epoch 14, Loss: 0.6967510851946744
Epoch 15, Loss: 0.6965342976830222
Epoch 16, Loss: 0.6963327418674122
Epoch 17, Loss: 0.6961363174698569
Epoch 18, Loss: 0.69589825110002
Epoch 19, Loss: 0.6957297162576155
Epoch 20, Loss: 0.6955983747135509
Finished Training


In [39]:
# Evaluate the model
model.eval()  # Set the model to evaluation mode
test_loss = 0.0
num_batches = 0

with torch.no_grad():  # No need to track gradients during evaluation
    for inputs, labels in zip(x_test, y_test):
        # Add a batch dimension to inputs and labels
        inputs = inputs.unsqueeze(0)  # Shape: [1, 140, 11, 77]
        labels = labels.unsqueeze(0)  # Shape: [1, 140, 11]

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels.unsqueeze(-1))  # Add the channel dimension to labels for BCELoss

        test_loss += loss.item()
        num_batches += 1

average_test_loss = test_loss / num_batches
print(f'Average Test Loss: {average_test_loss}')


Average Test Loss: 0.6955373287200928


In [43]:
# Calculating Precision, Recall, and F1 Score
from sklearn.metrics import precision_score, recall_score, f1_score

# Variables to hold predictions and true labels
all_predictions = []
all_labels = []

# Evaluate model
model.eval()
with torch.no_grad():
    for inputs, labels in zip(x_test, y_test):
        inputs = inputs.unsqueeze(0)  # Add batch dimension
        labels = labels.unsqueeze(0)  # Add batch dimension
        outputs = model(inputs)
        all_predictions.append(outputs.view(-1).cpu().numpy())
        all_labels.append(labels.view(-1).cpu().numpy())

# Flatten the lists
all_predictions = np.hstack(all_predictions)
all_labels = np.hstack(all_labels)

# Calculate metrics
# precision = precision_score(all_labels, all_predictions)
# recall = recall_score(all_labels, all_predictions)
# f1 = f1_score(all_labels, all_predictions)

# print(f'Precision: {precision}, Recall: {recall}, F1 Score: {f1}')


In [44]:
all_predictions

array([0.0172578 , 0.01510831, 0.01476363, ..., 0.0055955 , 0.0048889 ,
       0.00438096], dtype=float32)