# NFL Big Data Bowl 2024

## Attempting to predict direction and pass/run of the play based on pre-snap information.

## Importing libraries

In [335]:
import re

import pandas as pd
import numpy as np

from xgboost import XGBClassifier


In [336]:
DATA_ROOT = '../data/'

In [337]:
plays = pd.read_csv(DATA_ROOT + 'plays.csv')

## Data cleanup and splitting

In [338]:
def playDescriptionToDirection(play_description):
    """Classify a play as 'PASS' or 'RUN' from its free-text description.

    The description is searched for the first occurrence of one of:
      * 'pass (short|deep) (left|middle|right)'  -> 'PASS'
      * '(left|right) (guard|tackle|end)'        -> 'RUN'
      * 'up the middle'                          -> 'RUN'

    Args:
        play_description: The play description text. Non-string values
            (e.g. None, or the NaN pandas uses for a missing cell) are accepted.

    Returns:
        'PASS', 'RUN', or 'UNKNOWN' when the description is not a string or
        contains none of the phrases above.
    """
    # re.search raises TypeError on non-strings; a missing description simply
    # cannot be classified.
    if not isinstance(play_description, str):
        return 'UNKNOWN'

    match = re.search('(pass (?:short|deep) (?:left|middle|right))|((?:left|right) (?:guard|tackle|end))|(up the middle)', play_description)
    if match is None:
        return 'UNKNOWN'

    # Only the pass alternative of the pattern can contain 'deep' or 'short';
    # every other alternative describes a run.
    matched_phrase = match.group(0)
    if 'deep' in matched_phrase or 'short' in matched_phrase:
        return 'PASS'
    return 'RUN'

In [363]:
from sklearn.preprocessing import OrdinalEncoder

# Fixed seed for the row shuffle below, so re-running the notebook reproduces
# the same row order (and therefore the same downstream splits).
SHUFFLE_SEED = 42

# Add the PASS/RUN label parsed from the description, and a flag for plays
# with 10 or more yards to go.
plays['playDirection'] = plays['playDescription'].map(playDescriptionToDirection)
plays['TenPlusYards'] = plays['yardsToGo'] >= 10

# Define the features and target.
numerical_feature_names = ['yardsToGo', 'defendersInTheBox', 'absoluteYardlineNumber']
categorical_feature_names = ['offenseFormation', 'down']
binary_feature_names = ['TenPlusYards']
target_name = 'playDirection'

df = plays[numerical_feature_names + categorical_feature_names + binary_feature_names + [target_name]].copy()

# Drop plays whose description matched none of the pass/run phrases
# (labelled 'UNKNOWN'). Per the original note these are fumbled-ball plays
# and occur only 6 times.
fumble_mask = df['playDirection'] == 'UNKNOWN'
# .copy() so the column assignments below write to an independent frame
# rather than to a filtered slice.
df = df[~fumble_mask].copy()

# One-hot encode the categorical features; make the binary flag an explicit bool.
df[categorical_feature_names] = df[categorical_feature_names].astype('category')
df[binary_feature_names] = df[binary_feature_names].astype('bool')
df = pd.get_dummies(df, columns=categorical_feature_names)

# Encode the string labels as numeric codes, and recover the label string for
# each code so reports can show names instead of numbers.
encoder = OrdinalEncoder()
df[target_name] = encoder.fit_transform(df[target_name].values.reshape(-1, 1)).ravel()
uniques = df[target_name].nunique()
target_strings = encoder.inverse_transform(np.arange(uniques).reshape(-1, 1)).flatten()
print(target_strings)

# Shuffle the data (seeded, see SHUFFLE_SEED).
cols = list(df.columns)
df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Split the label column off; df now holds features only.
target = df.pop(target_name)
print(target.shape)
print(df.shape)

['PASS' 'RUN']
(12480,)
(12480, 15)


In [364]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/validation partition is the same on every run.
SPLIT_SEED = 42

# Split into train, validation, and test sets.
# The last 20% of rows (already shuffled in the preparation cell) are held out
# as the test set; df and target are positionally aligned, so one cut index
# serves both.
split_at = int(len(df) * 0.8)
x, x_test = df[:split_at], df[split_at:]
y, y_test = target[:split_at], target[split_at:]

# 20% of the remaining rows become the validation set.
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=SPLIT_SEED)

print(x_train.describe())
print(y_train.describe())

         yardsToGo  defendersInTheBox  absoluteYardlineNumber
count  7987.000000        7983.000000             7987.000000
mean      8.459246           6.405612               60.292100
std       3.879081           1.006906               23.807211
min       1.000000           2.000000               11.000000
25%       6.000000           6.000000               41.000000
50%      10.000000           6.000000               61.000000
75%      10.000000           7.000000               80.000000
max      38.000000          11.000000              109.000000
count    7987.000000
mean        0.542632
std         0.498210
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: playDirection, dtype: float64


In [408]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

def get_class_report(n_estimators, max_depth, learning_rate):
    """Fit an XGBoost classifier and evaluate it on the test split.

    Reads the notebook-level splits `x_train`/`y_train`, `x_val`/`y_val` and
    `x_test`/`y_test`, plus `target_strings` for the report's class names.

    Args:
        n_estimators: Passed to XGBClassifier as `n_estimators`.
        max_depth: Passed to XGBClassifier as `max_depth`.
        learning_rate: Passed to XGBClassifier as `learning_rate`.

    Returns:
        A tuple `(preds, report)`: the predictions for `x_test` and the
        text produced by `classification_report` for them.
    """
    classifier = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_jobs=12,
        early_stopping_rounds=10,
        subsample=.2,
        colsample_bytree=1,
    )
    pipeline = Pipeline(steps=[('model', classifier)])

    # The 'model__' prefix routes these fit arguments to the pipeline's
    # 'model' step; the validation split is supplied as its eval_set.
    fit_kwargs = {
        'model__eval_set': [(x_val, y_val)],
        'model__verbose': False,
    }
    pipeline.fit(x_train, y_train, **fit_kwargs)

    preds = pipeline.predict(x_test)
    report = classification_report(
        y_test,
        preds,
        zero_division=0,
        target_names=target_strings,
    )
    return preds, report

In [407]:
preds, report = get_class_report(100, 10, 0.01)
print(report)

# Map the encoded predictions and true labels back to their label strings.
preds_str = encoder.inverse_transform(preds.reshape(-1,1)).flatten()
y_test_str = encoder.inverse_transform(y_test.values.reshape(-1,1)).flatten()

# Colour-coded "predicted -- actual" preview: green when they agree, red when
# they differ. Only the first PREVIEW_ROWS rows are printed rather than one
# line per test row; the classification report above is the full summary.
PREVIEW_ROWS = 40
ANSI_GREEN, ANSI_RED, ANSI_RESET = '\033[92m', '\033[91m', '\033[0m'
for (pred, actual) in zip(preds_str[:PREVIEW_ROWS], y_test_str[:PREVIEW_ROWS]):
    colour = ANSI_GREEN if pred == actual else ANSI_RED
    # Reset at the end of each line so the colour does not leak into
    # whatever is printed next.
    print(f'{colour} {pred} -- {actual}{ANSI_RESET}')

              precision    recall  f1-score   support

        PASS       0.70      0.65      0.68      1094
         RUN       0.74      0.79      0.76      1402

    accuracy                           0.73      2496
   macro avg       0.72      0.72      0.72      2496
weighted avg       0.73      0.73      0.73      2496

[92m PASS -- PASS
[92m RUN -- RUN
[92m RUN -- RUN
[91m RUN -- PASS
[92m PASS -- PASS
[92m RUN -- RUN
[91m PASS -- RUN
[91m PASS -- RUN
[92m RUN -- RUN
[92m RUN -- RUN
[92m RUN -- RUN
[91m PASS -- RUN
[91m RUN -- PASS
[92m RUN -- RUN
[92m RUN -- RUN
[91m RUN -- PASS
[92m RUN -- RUN
[92m PASS -- PASS
[92m RUN -- RUN
[92m RUN -- RUN
[92m PASS -- PASS
[92m RUN -- RUN
[91m PASS -- RUN
[92m RUN -- RUN
[91m PASS -- RUN
[92m PASS -- PASS
[91m RUN -- PASS
[92m PASS -- PASS
[92m RUN -- RUN
[92m RUN -- RUN
[92m RUN -- RUN
[92m RUN -- RUN
[92m RUN -- RUN
[92m RUN -- RUN
[92m PASS -- PASS
[91m RUN -- PASS
[92m PASS -- PASS
[91m RUN -- PASS
