In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from datetime import datetime
import math
import statsmodels.api as sm
import warnings
from IPython.core.interactiveshell import InteractiveShell
from scipy import stats
import multiprocessing as mp
# Worker processes are forked so they inherit the notebook's module-level
# DataFrames instead of re-importing them. force=True keeps this cell
# re-runnable: without it a second call raises RuntimeError because the start
# method has already been fixed for this process.
mp.set_start_method("fork", force=True)


# Show the value of every top-level expression in a cell, not only the last one
# (this is why a bare `df.shape` in the middle of a cell produces output).
InteractiveShell.ast_node_interactivity = "all"
# Silences ALL Python warnings for the rest of the session, including pandas
# deprecation and chained-assignment warnings.
warnings.filterwarnings('ignore')

### Functions

In [32]:
def playAnimator(data, GAMEID, PLAYID):
    """Build an animated Plotly scatter of player positions for one play.

    Parameters
    ----------
    data : pandas.DataFrame
        Tracking rows to animate. The frame is plotted as given (it is NOT
        filtered by GAMEID/PLAYID here) and must contain the columns 'x', 'y',
        'playerType', 'frameId' and 'distanceFromCarrier'.
    GAMEID, PLAYID
        Identifiers matched against ``plays.gameId`` / ``plays.playId`` in the
        module-level ``plays`` table to look up 'absoluteYardlineNumber' and
        'yardsToGo' for the two dashed marker lines.

    Returns
    -------
    The Plotly figure, with a 100 ms frame duration and no transition.

    Raises
    ------
    ValueError
        If no row of ``plays`` matches (GAMEID, PLAYID).
    """
    # Marker colour per playerType value.
    color_mapping = {
        'Offense': 'cornsilk',
        'Defense': 'lightblue',
        'football': '#825736',
        'Tackle': 'red',
        'MissedTackle': 'gold',
        'ballCarrier': 'lime',
        'assist': 'darkorange',
        'forcedFumble': 'fuchsia',
    }

    play_data = plays[(plays.gameId == GAMEID) & (plays.playId == PLAYID)]
    if play_data.empty:
        raise ValueError(f"No play found in plays for gameId={GAMEID}, playId={PLAYID}")
    # Read scalars from the matched row; calling int() on a whole Series is deprecated.
    play_row = play_data.iloc[0]
    yardline = int(play_row['absoluteYardlineNumber'])
    yards_to_go = int(play_row['yardsToGo'])

    fig = px.scatter(data, x='x', y='y', range_x=[0, 120], range_y=[0, 53.3],
                     color='playerType', animation_frame='frameId',
                     hover_name='distanceFromCarrier',
                     color_discrete_map=color_mapping, width=1000, height=565)

    # Grey bands over x in [0, 10] and [110, 120] (presumably the end zones).
    fig = fig.add_vrect(x0=0, x1=10, fillcolor='gray')
    fig = fig.add_vrect(x0=110, x1=120, fillcolor='gray')
    # Blue dashed line at the play's absolute yardline, yellow dashed line yardsToGo further along +x.
    # NOTE(review): the yellow line assumes the offense moves toward increasing x - confirm for plays going the other way.
    fig = fig.add_vline(x=yardline, line_dash="dash", line_color="blue", opacity=0.5)
    fig = fig.add_vline(x=yardline + yards_to_go, line_dash="dash", line_color="yellow", opacity=0.5)

    fig = fig.update_layout(plot_bgcolor='#567D46')
    fig = fig.update_yaxes(visible=False)
    fig = fig.update_layout(xaxis=dict(tick0=0, dtick=10))

    fig = fig.update_traces(marker=dict(size=12, line=dict(width=2, color='black'), opacity=0.9),
                            selector=dict(mode='markers'))

    # Set frame duration to 100ms
    fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 100

    # Set transition duration between frames to 0ms
    fig.layout.updatemenus[0].buttons[0].args[1]["transition"]["duration"] = 0

    return fig



# Distance from each player to the ball carrier.
# Step 1: keep only the ball carrier's rows and the columns needed to locate him in each frame.
# `df` must already be loaded and carry its 'ballCarrier' playerType tags when this line runs.
ball_carrier_df = df.loc[df['playerType'] == 'ballCarrier', ['key', 'frameId', 'nflId', 'x', 'y']]


# Function to Calculate Distances from Player to BallCarrier for a Play (Based on the Key)
def calculate_distances_from_ballCarrier(key):
    """Compute every player's distance to the ball carrier for one play.

    Reads the module-level ``df`` (tracking rows) and ``ball_carrier_df``
    (ball-carrier rows) and selects the rows whose 'key' equals ``key``.

    Parameters
    ----------
    key
        Play identifier compared against the 'key' column.

    Returns
    -------
    pandas.DataFrame
        Columns 'key', 'frameId', 'x', 'y', 'nflId' plus
        'distance_to_ballCarrier' (Euclidean distance in the units of x/y),
        grouped by frame in order of first appearance, with a fresh
        RangeIndex. Frames that have no ball-carrier row are omitted. If no
        frame has a ball-carrier row, an empty frame with the same columns is
        returned instead of raising.
    """
    tracking_columns = ['key', 'frameId', 'x', 'y', 'nflId']
    key_df = df.loc[df['key'] == key, tracking_columns]

    # Look the carrier position up once per play instead of re-filtering
    # ball_carrier_df for every frame. If a frame has several carrier rows the
    # first one is used.
    carrier_rows = ball_carrier_df.loc[ball_carrier_df['key'] == key, ['frameId', 'x', 'y']]
    carrier_rows = carrier_rows.drop_duplicates(subset='frameId', keep='first')
    carrier_xy = {
        frame: (carrier_x, carrier_y)
        for frame, carrier_x, carrier_y in carrier_rows.itertuples(index=False, name=None)
    }

    distances = []
    # sort=False keeps the frames in order of first appearance.
    for frame, frame_df in key_df.groupby('frameId', sort=False):
        if frame not in carrier_xy:
            continue

        carrier_x, carrier_y = carrier_xy[frame]
        distance = np.sqrt(
            (frame_df['x'].to_numpy() - carrier_x) ** 2
            + (frame_df['y'].to_numpy() - carrier_y) ** 2
        )
        # assign() builds a new frame, so nothing is written into a filtered slice.
        distances.append(frame_df.assign(distance_to_ballCarrier=distance))

    if not distances:
        # pd.concat([]) raises ValueError; return an empty, correctly shaped frame instead.
        return key_df.iloc[0:0].assign(distance_to_ballCarrier=np.array([], dtype=float))

    return pd.concat(distances, ignore_index=True)

### Read Data

In [3]:
# Load the tracking data and discard rows that have no nflId
# (presumably the football's rows - confirm against the source file).
df = pd.read_csv('AllGames_wAdvancedTracking.csv').dropna(subset=['nflId'])

# Supporting lookup tables
games, players = pd.read_csv('games.csv'), pd.read_csv('players.csv')
plays, tackles = pd.read_csv('plays.csv'), pd.read_csv('tackles.csv')

In [4]:
# Creating a unique key for each play: gameId and playId concatenated as strings
df['key'] = df['gameId'].astype(str) + df['playId'].astype(str)

# Keep only plays that contain at least one frame with a 'tackle' event
is_tackle_frame = df['event'] == 'tackle'
plays_with_tackles = df.loc[is_tackle_frame, 'key'].unique().tolist()
print(len(plays_with_tackles), ' Plays with Tackles in DataSet')
df = df[df['key'].isin(plays_with_tackles)]

10041  Plays with Tackles in DataSet


### Feature Creation

In [5]:
# Columns that are not used in the rest of the pipeline.
# Note: playAnimator uses 'distanceFromCarrier' as its hover label, so it
# cannot be given this frame once the column has been dropped here.
columns_to_drop = ['jerseyNumber', 'playDirection', 'distanceFromCarrier']
df = df.drop(columns_to_drop, axis='columns')

In [6]:
# Elapsed play time per frame: frameId scaled by 0.1
# (assumes 10 tracking frames per second - TODO confirm). The raw 'time' column is removed.
df = df.assign(playTime=df['frameId'] * 0.1).drop(columns='time')

In [7]:
# Player age: birthDate comes in two layouts. When the third character is '/'
# the year is the last four characters; otherwise it is the first four.
birth_date = players['birthDate']
slash_format = birth_date.str[2] == '/'
birth_year = birth_date.str[0:4].mask(slash_format, birth_date.str[-4:]).astype(float)
player_age = 2022 - birth_year  # 2022 is the hard-coded reference year

# Missing ages are filled with the median age
players['age'] = player_age.fillna(player_age.median())

# Height arrives as 'feet-inches'; convert it to total inches
height_parts = players['height'].str.split('-', expand=True).astype(int)
players['height'] = height_parts[0] * 12 + height_parts[1]

# Drop the columns that are no longer needed
players = players.drop(columns=['birthDate', 'collegeName', 'displayName'])

In [8]:
# Merging player data into the tracking data (default inner join on nflId);
# the shape is displayed before the merge.
df.shape
df = df.merge(players, on='nflId')

(9608126, 21)

In [9]:
# Every frame of the tagged ball carrier gets the 'ballCarrier' playerType
df.loc[df['tag'] == 'ballCarrier', 'playerType'] = 'ballCarrier'

# Ball-carrier rows on the frame where the 'tackle' event fires, selected once
carrier_at_tackle = df.loc[
    (df['event'] == 'tackle') & (df['playerType'] == 'ballCarrier'),
    ['key', 'x', 'y'],
]
# key -> carrier coordinate at the tackle (if a key repeats, the last row wins)
tackle_x_dict = dict(zip(carrier_at_tackle['key'], carrier_at_tackle['x']))
tackle_y_dict = dict(zip(carrier_at_tackle['key'], carrier_at_tackle['y']))

# Broadcast the tackle location onto every frame of the same play
df['tackle_x'] = df['key'].map(tackle_x_dict)
df['tackle_y'] = df['key'].map(tackle_y_dict)

In [10]:
# An assist counts as a tackle; any remaining missing label becomes 0
df['tackle'] = df['tackle'].mask(df['assist'] == 1, 1).fillna(0)

# Drop forced fumbles, missed tackles, tags, displayName and the now-merged assist flag
df = df.drop(['forcedFumble', 'missedTackle', 'tag', 'displayName', 'assist'], axis='columns')

In [33]:
#Calculating Distances in Parallel
from concurrent.futures import ProcessPoolExecutor
# One task per play key; each task returns that play's rows with a distance_to_ballCarrier column.
keys = df['key'].unique()
# Up to 8 worker processes; executor.map yields results in the same order as `keys`.
# The workers read the module-level `df` and `ball_carrier_df`, so both must exist before this cell runs.
with ProcessPoolExecutor(max_workers = 8) as executor:
    play_distances_dfs = list(executor.map(calculate_distances_from_ballCarrier, keys))

In [36]:
# Stack the per-play distance frames into one frame (each play's own 0..n index is kept, so index labels repeat).
distances_df = pd.concat(play_distances_dfs)

In [39]:
# Attach distance_to_ballCarrier to the tracking rows. The join keys include the float x/y columns;
# presumably they match exactly because distances_df was sliced from df itself - verify if df changes in between.
# how='outer' keeps tracking rows that have no computed distance (frames without a ball-carrier row).
df = pd.merge(df, distances_df, on = ['key','frameId','x','y','nflId'], how = 'outer')

In [48]:
# Write the assembled frame to disk without the index column.
df.to_csv('20231031_AllGames_wAdvancedTracking.csv', index = False)

### Data Preprocessing

In [15]:
# Print column dtypes and memory usage of the current frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9608126 entries, 0 to 9608125
Data columns (total 22 columns):
 #   Column      Dtype  
---  ------      -----  
 0   gameId      int64  
 1   playId      int64  
 2   nflId       float64
 3   frameId     int64  
 4   playerType  object 
 5   x           float64
 6   y           float64
 7   s           float64
 8   a           float64
 9   dis         float64
 10  o           float64
 11  dir         float64
 12  event       object 
 13  tackle      float64
 14  key         object 
 15  playTime    float64
 16  height      int64  
 17  weight      int64  
 18  position    object 
 19  age         float64
 20  tackle_x    float64
 21  tackle_y    float64
dtypes: float64(13), int64(5), object(4)
memory usage: 1.6+ GB


In [12]:
#Sorting by Key and then Frame to have everything be in sequential order
# 'key' is a string column, so plays are ordered lexicographically; frames are ascending within each play.
df = df.sort_values(by = ['key', 'frameId'])

In [41]:
# Dropping identifier columns
df = df.drop(['gameId', 'playId', 'nflId', 'playerType', 'key'], axis='columns')

# One-hot encode positions: one float column per distinct position value, joined on the index
position_dummies = pd.get_dummies(df['position'], dtype=float)
df = df.join(position_dummies)

In [66]:
# Splitting into training data and target variable.
# The tackle location columns are excluded from X and are not part of y.
target_columns = ['tackle', 'tackle_x', 'tackle_y']
X = df.drop(columns=target_columns + ['event', 'position'])
y = df[['tackle']]