In [1]:
import os,math,random, gc,datetime
import platform
import logging
logging.getLogger().setLevel(logging.CRITICAL)

#package imports
import numpy as np
import pandas as pd
from tqdm import tqdm

#plotting imports
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from IPython.display import HTML

#local imports
from utils.helpers import find_play_type
from utils.logging import create_logger

#Stats imports
import statsmodels as sm
from scipy import stats

#File paths below
#check kaggle or local
if platform.platform() == 'Linux-5.15.133+-x86_64-with-glibc2.31':
    FILE_PATH = '/kaggle/input/nfl-big-data-bowl-2024'
else:
    FILE_PATH = './nfl-big-data-bowl-2024'
GAMES_PATH = os.path.join(FILE_PATH,'games.csv')
PLAYS_PATH = os.path.join(FILE_PATH,'plays.csv')
PLAYERS_PATH = os.path.join(FILE_PATH,'players.csv')
TACKLES_PATH = os.path.join(FILE_PATH,'tackles.csv')

#Tracking weeks
WEEK1_PATH = os.path.join(FILE_PATH,'tracking_week_1.csv')
WEEK2_PATH = os.path.join(FILE_PATH,'tracking_week_2.csv')
WEEK3_PATH = os.path.join(FILE_PATH,'tracking_week_3.csv')
WEEK4_PATH = os.path.join(FILE_PATH,'tracking_week_4.csv')
WEEK5_PATH = os.path.join(FILE_PATH,'tracking_week_5.csv')
WEEK6_PATH = os.path.join(FILE_PATH,'tracking_week_6.csv')
WEEK7_PATH = os.path.join(FILE_PATH,'tracking_week_7.csv')
WEEK8_PATH = os.path.join(FILE_PATH,'tracking_week_8.csv')
WEEK9_PATH = os.path.join(FILE_PATH,'tracking_week_9.csv')


logger = create_logger('./output/log.txt','basicModel')

Logger initialized: basicModel


### Load in all data

In [2]:
games_df = pd.read_csv(GAMES_PATH)
players_df = pd.read_csv(PLAYERS_PATH)
plays_df = pd.read_csv(PLAYS_PATH)
tackles_df = pd.read_csv(TACKLES_PATH)

#tracking week 1
week1_df = pd.read_csv(WEEK1_PATH)
week2_df = pd.read_csv(WEEK2_PATH)
week3_df = pd.read_csv(WEEK3_PATH)
week4_df = pd.read_csv(WEEK4_PATH)
week5_df = pd.read_csv(WEEK5_PATH)
week6_df = pd.read_csv(WEEK6_PATH)
week7_df = pd.read_csv(WEEK7_PATH)
week8_df = pd.read_csv(WEEK8_PATH)
week9_df = pd.read_csv(WEEK9_PATH)

#concat all weeks
all_weeks = pd.concat([week1_df,week2_df,week3_df,week4_df,week5_df,week6_df,week7_df,week8_df,week9_df])
display(all_weeks.sample(4))

#find all pass plays and run plays
pass_plays = find_play_type(plays_df,'pass')
pass_index = plays_df[plays_df['playDescription'].isin(pass_plays)].index
pass_plays_df = plays_df.iloc[pass_index]
run_plays_df = plays_df.loc[~plays_df.index.isin(pass_index)]

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
315332,2022091102,1928,53440.0,Justin Fields,21,2022-09-11 14:21:59.400000,1.0,CHI,right,66.58,20.83,7.17,2.15,0.71,172.6,180.38,
98395,2022091100,762,53577.0,Ta'Quon Graham,11,2022-09-11 13:37:44.200000,95.0,ATL,left,44.84,21.07,1.07,1.8,0.11,91.03,19.97,
206784,2022101601,592,44870.0,Ethan Pocic,50,2022-10-16 13:21:28.500000,55.0,CLE,left,70.02,26.62,1.99,0.67,0.21,244.59,178.16,
182787,2022101600,3318,41291.0,Jimmy Garoppolo,22,2022-10-16 15:41:08.099999,10.0,SF,right,88.75,29.5,1.13,2.92,0.13,224.83,281.27,


Number of matches for pass: 5646


## Some Data Organization

In [3]:
# 8 mins
# tracking_run_index = []
# for i,j in tqdm(zip(run_plays_df['gameId'],run_plays_df['playId'])):
#     ind = all_weeks.loc[(all_weeks['gameId'] == i) & (all_weeks['playId'] == j)].index
#     tracking_run_index = tracking_run_index + ind.to_list()

# 8.5 mins
# for i in tqdm(run_plays_df[['gameId','playId']].itertuples()):
#     ind = all_weeks.loc[(all_weeks['gameId'] == i[1]) & (all_weeks['playId'] == i[2])].index
#     tracking_run_index = tracking_run_index + ind.to_list()
if not os.path.exists('./output/run_play_tracking.csv'):
    logger.info('Creating run play tracking csv...')
    #2 mins
    def apply_func(gameId,playId):
        ind = all_weeks.loc[(all_weeks['gameId'] == gameId) & (all_weeks['playId'] == playId)].index.to_list()
        return ind

    run_plays = [apply_func(x,y) for x,y in tqdm(zip(run_plays_df['gameId'],run_plays_df['playId']))]
    tracking_run_index = [item for sublist in run_plays for item in sublist]

    run_play_tracking = all_weeks.iloc[tracking_run_index]
    run_play_tracking.to_csv('./output/run_play_tracking.csv')
else:
    logger.info('Reading run play tracking csv...')
    run_play_tracking = pd.read_csv('./output/run_play_tracking.csv')

Reading run play tracking csv...


## A model to look at run plays and play results
Will only look at x,y,s,a,dis(or o)

In [52]:
frame_counts = {}
list_of_frames = []

#average frames
for game in tqdm(run_play_tracking['gameId'].unique()):
    frame_counts[game] = {}
    for play in run_play_tracking.loc[run_play_tracking['gameId'] == game]['playId'].unique():
        frames = run_play_tracking[(run_play_tracking['gameId'] == game) & (run_play_tracking['playId'] == play)]['frameId'].mean().astype(int)
        frame_counts[game][play] = frames
        list_of_frames.append(frames)

#trim the amount of plays with less than some percentile of frames. We need every play in a tensor to have the same amount of frames
#list_of_frames.sort()
frame_threshold = np.percentile(list_of_frames, 30)
print(frame_threshold)
        

100%|██████████| 17/17 [00:36<00:00,  2.16s/it]

14.399999999999977





In [56]:
#only plays with more than the threshold selected. then we ONLY take those frames
#this is to make sure that every play has the same amount of frames

keep_plays = {}

for k,v in frame_counts.items():
    keep_plays[k] = []
    for k1,v1 in v.items():
        if v1 > frame_threshold:
            keep_plays[k].append(k1)

keep_plays

{2022091103: [3126,
  253,
  274,
  1672,
  611,
  632,
  4462,
  4631,
  1610,
  520,
  2611,
  2670,
  4154,
  1126,
  1150,
  2291,
  2315,
  852,
  881,
  1493,
  1565,
  3105,
  3825,
  5039,
  1228,
  2483,
  2511,
  1384,
  2719,
  955,
  1016,
  2955,
  1315,
  1363,
  3011,
  1700,
  364,
  388,
  2201,
  2222,
  3755,
  1947,
  1968,
  2037,
  3251,
  4686,
  1037,
  4279,
  4353,
  743,
  764,
  319,
  3613,
  3680,
  1105,
  111,
  2462,
  1926,
  3294,
  3389,
  1589,
  3464,
  4789,
  340,
  2172,
  902,
  4710],
 2022091112: [2370,
  2475,
  3454,
  3492,
  3530,
  1994,
  1613,
  2612,
  2640,
  3416,
  1458,
  2501,
  965,
  86,
  1539,
  573,
  3551,
  2886,
  2072,
  181,
  3627,
  3002,
  3029,
  1144,
  1165,
  2931,
  2981,
  3766,
  1501,
  62,
  2246,
  2294,
  2167,
  2225,
  1260,
  736,
  1560,
  3589,
  3176,
  2315,
  1207,
  3128,
  3672,
  3696,
  917,
  991],
 2022091107: [723,
  3669,
  3443,
  3026,
  1260,
  1378,
  254,
  275,
  2285,
  2314,
  1994,

In [50]:
import torch
import torch.nn as nn

class Preprocess(nn.Module):
    def __init__(self):
        super().__init__()
        self.l1 = nn.Parameter(torch.randn(1,1,3))
        self.l2 = nn.Parameter(torch.randn(1,1,3))
        self.l3 = nn.Parameter(torch.randn(1,1,3))
        
    def forward(self,x):
        x = torch.cat([x,self.l1],dim=1)
        x = torch.cat([x,self.l2],dim=1)
        x = torch.cat([x,self.l3],dim=1)
        return x

#trying to create a tensor
def create_tensor(df):
    for i in df['gameId'].unique():
        for j in df.loc[df['gameId'] == i]['playId'].unique():
            pass
            
