In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from datetime import datetime
import math
import statsmodels.api as sm
import warnings
from IPython.core.interactiveshell import InteractiveShell
from scipy import stats
import multiprocessing as mp

from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras

# Use the "fork" start method for multiprocessing.
# force=True keeps this cell safe to re-run: without it, a second call in the
# same kernel raises RuntimeError("context has already been set").
# NOTE(review): "fork" is only offered on POSIX platforms - confirm the
# notebook is never run on Windows.
mp.set_start_method("fork", force=True)


# Silence every warning for the rest of the session.
warnings.simplefilter('ignore')

# Have IPython display the value of every top-level expression in a cell,
# not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
#Load the tracking CSV into a DataFrame
tracking_csv = '20231031_AllGames_wAdvancedTracking.csv'
df = pd.read_csv(tracking_csv)

In [3]:
#Drop Columns that Can Lead to Memorization
leaky_columns = ['gameId','playId','event','tackle_x','tackle_y','playTime']
df = df.drop(columns = leaky_columns)

#One-Hot Encode "Position": one float indicator column per distinct value,
#then remove the original categorical column
position_dummies = pd.get_dummies(df['position'], dtype=float)
df = df.join(position_dummies).drop(columns = ['position'])

#Collapse every playerType other than 'Offense' / 'ballCarrier' into 'Defense'
is_defender = ~df['playerType'].isin(['Offense', 'ballCarrier'])
df.loc[is_defender, 'playerType'] = 'Defense'

#One-Hot Encode "playerType" the same way
player_type_dummies = pd.get_dummies(df['playerType'], dtype=float)
df = df.join(player_type_dummies).drop(columns = ['playerType'])

In [4]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# NOTE(review): both scalers are fit on the full DataFrame, i.e. before the
# train/test split performed later in the notebook, so test-set statistics
# influence the scaling. Confirm this is acceptable for the analysis.

##Normalize the Data (MINMAX)
# Each fitted scaler is kept under its own name so the same transform can be
# re-applied (or inverted) later; previously the min-max scaler was
# overwritten by the standard scaler and lost.
minmax_columns = ['x', 'y', 's', 'a', 'dis', 'o', 'dir','distance_to_ballCarrier']
minmax_scaler = MinMaxScaler()
df[minmax_columns] = minmax_scaler.fit_transform(df[minmax_columns])

##Normalize the Data (STANDARD)
standard_columns = ['age', 'height','weight']
standard_scaler = StandardScaler()
df[standard_columns] = standard_scaler.fit_transform(df[standard_columns])

# Backward-compatible aliases: these names end up bound to the same kind of
# objects as before (the standard-scaled column list and its scaler).
columns_to_normalize = standard_columns
scaler = standard_scaler

In [5]:
#Fix the column order: identifiers and label first, then the feature columns
id_and_label_columns = ['key', 'nflId', 'frameId', 'tackle']
tracking_columns = ['x', 'y', 's', 'a', 'dis', 'o', 'dir']
player_columns = ['height', 'weight', 'age', 'distance_to_ballCarrier']
position_columns = ['C', 'CB', 'DB', 'DE', 'DT', 'FB', 'FS', 'G', 'ILB', 'MLB',
                    'NT', 'OLB', 'QB', 'RB', 'SS', 'T', 'TE', 'WR']
player_type_columns = ['Defense', 'Offense', 'ballCarrier']

ordered_columns = (id_and_label_columns + tracking_columns + player_columns
                   + position_columns + player_type_columns)
df = df[ordered_columns]

In [6]:
def _play_to_arrays(play_data, max_frames, num_players, first_feature_col):
    """Convert the rows of one play into fixed-size feature and label arrays.

    Parameters
    ----------
    play_data : pandas.DataFrame
        All rows belonging to a single play. Must contain 'frameId', 'nflId'
        and 'tackle' columns; every column from position ``first_feature_col``
        onward is treated as a numeric feature.
    max_frames : int
        Number of frame slots to allocate; frame ``f`` is written to slot
        ``f - 1``.
    num_players : int
        Number of rows every frame is required to have.
    first_feature_col : int
        Positional index of the first feature column.

    Returns
    -------
    (frames, tackle_labels)
        ``frames`` has shape (max_frames, num_players, n_features) and
        ``tackle_labels`` has shape (max_frames, num_players). Slots of frames
        that do not occur in the play stay zero.

    Raises
    ------
    ValueError
        If any frame does not have exactly ``num_players`` rows. (Assigning a
        single-row frame used to broadcast that row into every player slot
        without any error.)
    """
    # Same ordering as before: frames ascending, players ordered by nflId.
    play_data = play_data.sort_values(by = ['frameId', 'nflId'])
    num_features = play_data.shape[1] - first_feature_col

    # groupby sorts its keys, so the index is the ascending list of frame ids.
    rows_per_frame = play_data.groupby('frameId').size()
    malformed = rows_per_frame[rows_per_frame != num_players]
    if not malformed.empty:
        raise ValueError(
            f"Expected {num_players} rows per frame, but frameId(s) "
            f"{malformed.index.tolist()} have {malformed.tolist()} rows."
        )

    frame_slots = rows_per_frame.index.to_numpy() - 1
    num_frames = len(frame_slots)

    frames = np.zeros((max_frames, num_players, num_features))
    tackle_labels = np.zeros((max_frames, num_players))

    # The rows are sorted by frame and every frame has num_players rows, so a
    # plain reshape yields one (num_players, num_features) block per frame.
    frames[frame_slots] = (play_data.iloc[:, first_feature_col:]
                           .to_numpy(dtype = float)
                           .reshape(num_frames, num_players, num_features))
    tackle_labels[frame_slots] = (play_data['tackle']
                                  .to_numpy(dtype = float)
                                  .reshape(num_frames, num_players))
    return frames, tackle_labels


#List of Unique Plays
keys = df['key'].unique()

#Max Number of Frames in One Play
max_frames = df['frameId'].max()

#Number of Players per Frame
num_players = 22

#Positional index of the first feature column; the columns before it are the
#identifiers and the label (key, nflId, frameId, tackle)
first_feature_col = 4

#Empty arrays for Input Data (X) and target data (y)
X = []
y = []

#Split the frame into plays once instead of re-filtering the whole DataFrame
#for every key. sort=False yields plays in order of first appearance, which is
#the order of `keys`. Assumes 'key' has no missing values (groupby skips them).
for i, (key, play_data) in enumerate(df.groupby('key', sort = False), start = 1):
    frames, tackle_labels = _play_to_arrays(play_data, max_frames,
                                            num_players, first_feature_col)
    X.append(frames)
    y.append(tackle_labels)

    if i % 100 == 0:
        print(i, f' Keys Completed of {len(keys)}.')

100  Keys Completed of 10041.
200  Keys Completed of 10041.
300  Keys Completed of 10041.
400  Keys Completed of 10041.
500  Keys Completed of 10041.
600  Keys Completed of 10041.
700  Keys Completed of 10041.
800  Keys Completed of 10041.
900  Keys Completed of 10041.
1000  Keys Completed of 10041.
1100  Keys Completed of 10041.
1200  Keys Completed of 10041.
1300  Keys Completed of 10041.
1400  Keys Completed of 10041.
1500  Keys Completed of 10041.
1600  Keys Completed of 10041.
1700  Keys Completed of 10041.
1800  Keys Completed of 10041.
1900  Keys Completed of 10041.
2000  Keys Completed of 10041.
2100  Keys Completed of 10041.
2200  Keys Completed of 10041.
2300  Keys Completed of 10041.
2400  Keys Completed of 10041.
2500  Keys Completed of 10041.
2600  Keys Completed of 10041.
2700  Keys Completed of 10041.
2800  Keys Completed of 10041.
2900  Keys Completed of 10041.
3000  Keys Completed of 10041.
3100  Keys Completed of 10041.
3200  Keys Completed of 10041.
3300  Keys Comple

In [7]:
#Turn the per-play lists into single arrays (one leading entry per play)
#Shapes noted by the author for this dataset are given on each line
X_reshaped = np.asarray(X) #(10041, 127, 22, 32)
y_reshaped = np.asarray(y) #(10041, 127, 22)

In [8]:
#Persist both arrays to a single .npz archive under the names 'X' and 'y'
arrays_to_save = {'X': X_reshaped, 'y': y_reshaped}
np.savez('data.npz', **arrays_to_save)

In [325]:
#Hold out 20% of the plays for testing (fixed seed for reproducibility)
split_parts = train_test_split(X_reshaped, y_reshaped,
                               test_size = 0.2, random_state = 42)
X_train, X_test, y_train, y_test = split_parts

#Keep only the labels stored in the first frame slot of each play
#NOTE(review): y_test is not reduced the same way here - confirm it is
#sliced before being compared with the model's per-player output
y_train = y_train[:, 0]

In [327]:
#Per-play input: 127 frames x 22 players x 32 values per player
input_shape = (127, 22, 32)
num_players = 22

#Layer stack for a 2D CNN (the frame/player grid is treated like an image)
cnn_layers = [
    #Input
    keras.layers.Input(shape = input_shape),

    #Two convolution + max-pooling stages
    keras.layers.Conv2D(64, (3,3), activation = 'relu'),
    keras.layers.MaxPooling2D((2,2)),
    keras.layers.Conv2D(127, (3,3), activation = 'relu'),
    keras.layers.MaxPooling2D((2,2)),

    #Flatten, then a dense head with dropout
    keras.layers.Flatten(),
    keras.layers.Dense(256, activation = 'relu'),
    keras.layers.Dropout(0.5),

    #One sigmoid output per player
    keras.layers.Dense(num_players, activation = 'sigmoid'),
]

#Create the Sequential model from the layer stack
model = keras.models.Sequential(cnn_layers)

#Compile the model
model.compile(loss = 'binary_crossentropy',
              optimizer = 'adam',
              metrics = ['accuracy'])

#Summarize the model
model.summary()

Model: "sequential_41"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_10 (Conv2D)          (None, 125, 20, 64)       18496     
                                                                 
 max_pooling2d_10 (MaxPoolin  (None, 62, 10, 64)       0         
 g2D)                                                            
                                                                 
 conv2d_11 (Conv2D)          (None, 60, 8, 127)        73279     
                                                                 
 max_pooling2d_11 (MaxPoolin  (None, 30, 4, 127)       0         
 g2D)                                                            
                                                                 
 flatten_5 (Flatten)         (None, 15240)             0         
                                                                 
 dense_87 (Dense)            (None, 256)             

In [None]:
#Callbacks: save the best full model to disk, and stop training once
#val_accuracy has not improved for 2 epochs
checkpoint_cb = keras.callbacks.ModelCheckpoint("DNN_model.h5",
                                                save_best_only=True,
                                                save_weights_only=False)
early_stop_cb = keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=2)

#Fit the Model (20% of the training data is held out for validation)
history = model.fit(X_train, y_train,
                    batch_size = 32,
                    epochs = 10,
                    validation_split = 0.2,
                    callbacks=[checkpoint_cb, early_stop_cb])

In [320]:
#Sanity check: display the shape of the training inputs
X_train.shape

(8032, 127, 22, 32)

In [326]:
#Sanity check: display the shape of the training labels
y_train.shape

(8032, 22)