In [1]:
import os,math,random, gc,datetime
import platform
import logging
logging.getLogger().setLevel(logging.CRITICAL)

#package imports
import numpy as np
import pandas as pd
from tqdm import tqdm

#plotting imports
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from IPython.display import HTML

#local imports
from utils.helpers import find_play_type
from utils.logging import create_logger

#Stats imports
import statsmodels as sm
from scipy import stats

#File paths below
#check kaggle or local
if platform.platform() == 'Linux-5.15.133+-x86_64-with-glibc2.31':
    FILE_PATH = '/kaggle/input/nfl-big-data-bowl-2024'
else:
    FILE_PATH = './nfl-big-data-bowl-2024'
GAMES_PATH = os.path.join(FILE_PATH,'games.csv')
PLAYS_PATH = os.path.join(FILE_PATH,'plays.csv')
PLAYERS_PATH = os.path.join(FILE_PATH,'players.csv')
TACKLES_PATH = os.path.join(FILE_PATH,'tackles.csv')

#Tracking weeks
WEEK1_PATH = os.path.join(FILE_PATH,'tracking_week_1.csv')
WEEK2_PATH = os.path.join(FILE_PATH,'tracking_week_2.csv')
WEEK3_PATH = os.path.join(FILE_PATH,'tracking_week_3.csv')
WEEK4_PATH = os.path.join(FILE_PATH,'tracking_week_4.csv')
WEEK5_PATH = os.path.join(FILE_PATH,'tracking_week_5.csv')
WEEK6_PATH = os.path.join(FILE_PATH,'tracking_week_6.csv')
WEEK7_PATH = os.path.join(FILE_PATH,'tracking_week_7.csv')
WEEK8_PATH = os.path.join(FILE_PATH,'tracking_week_8.csv')
WEEK9_PATH = os.path.join(FILE_PATH,'tracking_week_9.csv')


logger = create_logger('/output/log.txt')

### Load in all data

In [3]:
games_df = pd.read_csv(GAMES_PATH)
players_df = pd.read_csv(PLAYERS_PATH)
plays_df = pd.read_csv(PLAYS_PATH)
tackles_df = pd.read_csv(TACKLES_PATH)

#tracking week 1
week1_df = pd.read_csv(WEEK1_PATH)
week2_df = pd.read_csv(WEEK2_PATH)
week3_df = pd.read_csv(WEEK3_PATH)
week4_df = pd.read_csv(WEEK4_PATH)
week5_df = pd.read_csv(WEEK5_PATH)
week6_df = pd.read_csv(WEEK6_PATH)
week7_df = pd.read_csv(WEEK7_PATH)
week8_df = pd.read_csv(WEEK8_PATH)
week9_df = pd.read_csv(WEEK9_PATH)

#concat all weeks
all_weeks = pd.concat([week1_df,week2_df,week3_df,week4_df,week5_df,week6_df,week7_df,week8_df,week9_df])
display(all_weeks.sample(4))

#find all pass plays and run plays
pass_plays = find_play_type(plays_df,'pass')
pass_index = plays_df[plays_df['playDescription'].isin(pass_plays)].index
pass_plays_df = plays_df.iloc[pass_index]
run_plays_df = plays_df.loc[~plays_df.index.isin(pass_index)]

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
834852,2022091108,216,47891.0,Julian Love,10,2022-09-11 16:33:54.099999,20.0,NYG,left,14.66,6.46,7.81,1.58,0.79,119.03,139.67,
1135092,2022101611,3061,46088.0,Leighton Vander Esch,63,2022-10-16 22:46:02.299999,55.0,DAL,right,54.24,27.94,1.65,1.43,0.17,293.48,328.59,
388946,2022092503,621,48476.0,Nik Needham,28,2022-09-25 13:27:44.599999,40.0,MIA,right,33.91,12.95,6.52,4.21,0.66,17.76,53.43,
143559,2022102300,2194,41363.0,Brent Urban,3,2022-10-23 14:52:18.599999,97.0,BAL,right,41.2,26.77,0.0,0.0,0.0,287.38,142.85,


Number of matches for pass: 5646


## Some Data Organization

In [8]:
run_play_tracking = all_weeks[(all_weeks['gameId'].isin(run_plays_df['gameId'])) & (all_weeks['playId'].isin(run_plays_df['playId']))]
run_play_tracking

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
0,2022090800,56,35472.0,Rodger Saffold,1,2022-09-08 20:24:05.200000,76.0,BUF,left,88.370000,27.270000,1.62,1.15,0.16,231.74,147.90,
1,2022090800,56,35472.0,Rodger Saffold,2,2022-09-08 20:24:05.299999,76.0,BUF,left,88.470000,27.130000,1.67,0.61,0.17,230.98,148.53,pass_arrived
2,2022090800,56,35472.0,Rodger Saffold,3,2022-09-08 20:24:05.400000,76.0,BUF,left,88.560000,27.010000,1.57,0.49,0.15,230.98,147.05,
3,2022090800,56,35472.0,Rodger Saffold,4,2022-09-08 20:24:05.500000,76.0,BUF,left,88.640000,26.900000,1.44,0.89,0.14,232.38,145.42,
4,2022090800,56,35472.0,Rodger Saffold,5,2022-09-08 20:24:05.599999,76.0,BUF,left,88.720000,26.800000,1.29,1.24,0.13,233.36,141.95,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1150018,2022110700,3787,,football,40,2022-11-07 23:06:49.200000,,football,right,26.219999,19.680000,1.37,2.58,0.15,,,tackle
1150019,2022110700,3787,,football,41,2022-11-07 23:06:49.299999,,football,right,26.320000,19.610001,1.07,2.74,0.12,,,
1150020,2022110700,3787,,football,42,2022-11-07 23:06:49.400000,,football,right,26.389999,19.559999,0.80,2.49,0.09,,,
1150021,2022110700,3787,,football,43,2022-11-07 23:06:49.500000,,football,right,26.450001,19.520000,0.57,2.38,0.07,,,


In [46]:
# 8 mins
# tracking_run_index = []
# for i,j in tqdm(zip(run_plays_df['gameId'],run_plays_df['playId'])):
#     ind = all_weeks.loc[(all_weeks['gameId'] == i) & (all_weeks['playId'] == j)].index
#     tracking_run_index = tracking_run_index + ind.to_list()

# 8.5 mins
# for i in tqdm(run_plays_df[['gameId','playId']].itertuples()):
#     ind = all_weeks.loc[(all_weeks['gameId'] == i[1]) & (all_weeks['playId'] == i[2])].index
#     tracking_run_index = tracking_run_index + ind.to_list()
if not os.path.exists('./output/run_play_tracking.csv'):
    print('Creating run play tracking csv...')
    #2 mins
    def apply_func(gameId,playId):
        ind = all_weeks.loc[(all_weeks['gameId'] == gameId) & (all_weeks['playId'] == playId)].index.to_list()
        return ind

    run_plays = [apply_func(x,y) for x,y in tqdm(zip(run_plays_df['gameId'],run_plays_df['playId']))]
    tracking_run_index = [item for sublist in run_plays for item in sublist]

    run_play_tracking = all_weeks.iloc[tracking_run_index]
    run_play_tracking.to_csv('./output/run_play_tracking.csv')
else:
    print('Reading run play tracking csv...')
    run_play_tracking = pd.read_csv('./output/run_play_tracking.csv')

Reading run play tracking csv...


## A model to look at run plays and play results
Will only look at x,y,s,a,dis(or o)