# NFL Big Data Bowl 2024

## Attempting to predict direction and pass/run of the play based on pre-snap information.

## Importing data-processing and modeling libraries

In [412]:
import re
import itertools

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf


In [413]:
# Relative path of the directory holding the input CSV files. The trailing
# slash matters: file paths are built from it by plain string concatenation.
DATA_ROOT = '../data/'

In [414]:
# Load the plays table that every later cell derives its features from.
plays_csv_path = DATA_ROOT + 'plays.csv'
plays = pd.read_csv(plays_csv_path)

## Data cleanup and splitting

In [424]:
# Direction phrases that can appear in a play description: a pass location
# ("pass short left"), a run gap ("right tackle"), or "up the middle".
# Compiled once so mapping it over the whole column does not re-parse it.
_PLAY_DIRECTION_PATTERN = re.compile(
    '(pass (?:short|deep) (?:left|middle|right))|((?:left|right) (?:guard|tackle|end))|(up the middle)'
)


def playDescriptionToDirection(play_description):
    """Extract the play-direction phrase from a free-text play description.

    Args:
        play_description: Text of the play description. Non-string values
            (for example ``None`` or a float ``NaN`` standing in for a
            missing value) are treated as having no direction.

    Returns:
        The first matching phrase (e.g. ``'pass deep right'``, ``'left end'``
        or ``'up the middle'``), or ``'UNKNOWN'`` when nothing matches.
    """
    # re.search raises TypeError on non-string input, so a single missing
    # description would abort the whole column mapping. Map such values to
    # the same sentinel used for "no match" instead.
    if not isinstance(play_description, str):
        return 'UNKNOWN'
    match = _PLAY_DIRECTION_PATTERN.search(play_description)
    return match.group(0) if match else 'UNKNOWN'

In [431]:
# Derive the target label and a long-yardage flag from the raw plays table.
plays['playDirection'] = plays['playDescription'].map(playDescriptionToDirection)
plays['overTenYards'] = plays['yardsToGo'] > 10

numeric_feature_names = ['yardsToGo', 'defendersInTheBox', 'absoluteYardlineNumber']
categoric_feature_names = ['offenseFormation', 'down']
binary_feature_names = ['overTenYards']
target_name = 'playDirection'

# Keep only the model columns and drop rows with any missing value.
df = plays[numeric_feature_names + categoric_feature_names + binary_feature_names + [target_name]].copy()
df = df.dropna()

# Drop plays whose description contained no recognizable direction phrase;
# playDescriptionToDirection labels those 'UNKNOWN'.
unknown_direction_mask = df[target_name] == 'UNKNOWN'
fumble_mask = unknown_direction_mask  # legacy name, kept in case other cells reference it
df = df[~unknown_direction_mask]

# Set first-down plays aside and model only the remaining downs.
first_down_mask = df['down'] == 1
first_downs = df[first_down_mask]
df = df[~first_down_mask]

# Disabled: undersample every class to the size of the rarest one.
# df_group = df.groupby(target_name)
# df_balanced = df_group.apply(lambda x: x.sample(df_group.size().min()).reset_index(drop=True))
# df = df_balanced

# One-hot encode categoric features.
df = pd.get_dummies(df, columns=categoric_feature_names)

# Shuffle the rows with a fixed seed. Later cells split the frame by position,
# so an unseeded shuffle would change the train/hold-out membership (and every
# reported score) on each run.
df = df.sample(frac=1.0, random_state=1).reset_index(drop=True)

# Separate the label: `target_raw` keeps the string labels (used by the
# scikit-learn models), `target` is the one-hot version (used by the network).
target = df.pop(target_name)
target_raw = target.copy()
target = pd.get_dummies(target)

input_dim = len(df.columns)
output_dim = len(target.columns)

In [429]:
# Hold out the last 20% of the already-shuffled rows as x_val / y_val, then
# divide the remaining 80% into train and test partitions.
holdout_start = int(len(df) * 0.8)
x, x_val = df.iloc[:holdout_start], df.iloc[holdout_start:]
y, y_val = target.iloc[:holdout_start], target.iloc[holdout_start:]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=1)

# Convert every partition to a float32 NumPy array.
x_train, x_test, x_val, y_train, y_test, y_val = (
    np.asarray(part).astype('float32')
    for part in (x_train, x_test, x_val, y_train, y_test, y_val)
)

## TensorFlow Neural Net Approach

In [435]:
# Hyperparameters for the neural-network run in this cell.
# Number of training epochs passed to model.fit.
epochs = 20
# Learning rate handed to the Adam optimizer.
learning_rate = 0.01
# Mini-batch size passed to model.fit.
batch_size = 64
# Number of hidden Dense layers requested from make_sequential_net.
layer_count = 5

def make_sequential_net(layers, epochs, batch_size, learning_rate):
  """Build, train, and evaluate a fully connected softmax classifier.

  Reads the notebook-level ``input_dim``, ``output_dim``, ``x_train``,
  ``y_train``, ``x_val`` and ``y_val``.

  Args:
    layers: Number of hidden ReLU layers to stack.
    epochs: Number of training epochs.
    batch_size: Mini-batch size passed to ``fit``.
    learning_rate: Learning rate for the Adam optimizer.

  Returns:
    A ``(model, acc)`` tuple: the trained model and the accuracy reported by
    ``model.evaluate`` on ``x_val`` / ``y_val``.
  """
  # Hidden width heuristic: input_size * (2/3) + output_size. The expression
  # is a float, and a layer width must be a whole number, so truncate it
  # explicitly instead of relying on Keras to coerce it.
  hidden_units = int(input_dim * (2 / 3) + output_dim)

  model = tf.keras.Sequential()
  model.add(tf.keras.layers.Input(shape=(input_dim,)))
  # Honor the `layers` argument; the loop previously read the global
  # `layer_count`, so the parameter had no effect.
  for _ in range(layers):
    model.add(tf.keras.layers.Dense(hidden_units, activation='relu'))
  model.add(tf.keras.layers.Dense(output_dim, activation='softmax'))

  opt = tf.keras.optimizers.legacy.Adam(learning_rate=learning_rate, decay=1e-5)

  # The targets are one-hot vectors over mutually exclusive classes and the
  # output layer is a softmax, so the matching loss is categorical
  # cross-entropy. Binary cross-entropy would score every output unit as an
  # independent yes/no problem.
  model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
  model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size)
  loss, acc = model.evaluate(x_val, y_val)
  return model, acc

# Train the network with the hyperparameters defined above; `acc` is the
# accuracy measured on the x_val / y_val hold-out.
model, acc = make_sequential_net(layer_count, epochs, batch_size, learning_rate)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## SKLearn Decision Tree Classifier

In [432]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
# Size limits for the decision-tree baseline.
depth = 10
leaf = 20
# random_state is fixed (same seed as the train/test split) so that ties
# between equally good candidate splits are broken the same way on every run
# and the reported score is reproducible.
clf = DecisionTreeClassifier(max_depth=depth, max_leaf_nodes=leaf, random_state=1)
# Mean of the classifier's default score (accuracy) over 10 CV folds, using
# the string labels in target_raw.
score = np.mean(cross_val_score(clf, df, target_raw, cv=10))
print(score)

0.22640281026097933


## SKLearn Random Forest Approach

In [433]:
from sklearn.ensemble import RandomForestClassifier

In [434]:
# A random forest bootstraps rows and subsamples features, so without a fixed
# random_state the cross-validation score changes on every run. The seed
# matches the one used for the train/test split.
# Note: a tree limited to 20 leaves cannot be deeper than 19 levels, so
# max_leaf_nodes is the binding constraint here, not max_depth=70.
clf = RandomForestClassifier(max_depth=70, max_leaf_nodes=20, random_state=1)
# Mean of the classifier's default score (accuracy) over 10 CV folds.
score = np.mean(cross_val_score(clf, df, target_raw, cv=10))
print(score)

0.23218528418160894
