# Data Prep

The purpose of this notebook is to use functions from Feature-Engineering.ipynb and data cleaning functions from Data-Preprocessing.ipynb. At the end we will split the data into 70% training, 15% validation, and 15% testing dataframes that will be used for model building.

In [1]:
#Import libraries
import pandas as pd
import numpy as np
import os
import warnings
import time
from sklearn.model_selection import GroupShuffleSplit

warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)


#Import Data
games = pd.read_csv("../Data/games.csv")
players = pd.read_csv("../Data/players.csv")
plays = pd.read_csv("../Data/plays.csv")
tackles = pd.read_csv("../Data/tackles.csv")

tracking_1 = pd.read_csv("../Data/tracking_week_1.csv")
tracking_2 = pd.read_csv("../Data/tracking_week_2.csv")
tracking_3 = pd.read_csv("../Data/tracking_week_3.csv")
tracking_4 = pd.read_csv("../Data/tracking_week_4.csv")
tracking_5 = pd.read_csv("../Data/tracking_week_5.csv")
tracking_6 = pd.read_csv("../Data/tracking_week_6.csv")
tracking_7 = pd.read_csv("../Data/tracking_week_7.csv")
tracking_8 = pd.read_csv("../Data/tracking_week_8.csv")
tracking_9 = pd.read_csv("../Data/tracking_week_9.csv")

In [2]:
#combine tracking
tracking = pd.concat([tracking_1,tracking_2,tracking_3,tracking_4,tracking_5,tracking_6,tracking_7,tracking_8,tracking_9], axis = 0).reset_index(drop = True)

#TODO: Temporarily make subset of data
tracking = tracking[tracking['gameId'].isin(tracking.gameId.sample(2))]
# tracking = tracking[tracking['gameId'] == 2022091111]
tracking

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
3426006,2022092506,56,41243.0,C.J. Mosley,1,2022-09-25 13:03:23.099999,57.0,NYJ,left,80.610000,25.460000,2.62,5.87,0.23,34.50,350.20,
3426007,2022092506,56,41243.0,C.J. Mosley,2,2022-09-25 13:03:23.200000,57.0,NYJ,left,80.540000,25.750000,3.23,5.70,0.29,19.07,345.02,
3426008,2022092506,56,41243.0,C.J. Mosley,3,2022-09-25 13:03:23.299999,57.0,NYJ,left,80.430000,26.090000,3.82,5.17,0.36,15.88,340.72,
3426009,2022092506,56,41243.0,C.J. Mosley,4,2022-09-25 13:03:23.400000,57.0,NYJ,left,80.280000,26.480000,4.43,4.86,0.42,7.32,337.66,
3426010,2022092506,56,41243.0,C.J. Mosley,5,2022-09-25 13:03:23.500000,57.0,NYJ,left,80.090000,26.920000,4.94,4.19,0.47,352.70,336.33,pass_arrived
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5989190,2022100902,4076,,football,56,2022-10-09 16:00:09.000000,,football,right,73.860001,40.270000,1.19,1.08,0.34,,,tackle
5989191,2022100902,4076,,football,57,2022-10-09 16:00:09.099999,,football,right,73.720001,40.750000,1.47,1.09,0.50,,,
5989192,2022100902,4076,,football,58,2022-10-09 16:00:09.200000,,football,right,73.760002,40.790001,1.24,0.99,0.06,,,
5989193,2022100902,4076,,football,59,2022-10-09 16:00:09.299999,,football,right,73.809998,40.860001,1.04,1.08,0.09,,,


In [153]:
print(len(tracking))

181953


In [154]:
unique_combinations = tracking[["gameId","playId"]].drop_duplicates()
len(unique_combinations)

180

In [5]:
#Run Notebooks
# %run Feature-Engineering.ipynb
# %run Data-Preprocessing.ipynb

%run Data-Preprocessing-Feature-Engineering.ipynb

In [12]:
#Filter frames in tracking to only include desired rows
start_time = time.time()
tracking = filter_frames_by_events(tracking)
end_time = time.time()
# Calculate the elapsed time
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")
print("Lastest Date:",tracking["time"].max())
print("Number of frames in tracking:",len(tracking))
unique_combinations = tracking[["gameId","playId"]].drop_duplicates()
print("Number of plays in data:",len(unique_combinations))

Elapsed time: 0.21668696403503418 seconds
Lastest Date: 2022-09-25 16:14:28.500000
Number of frames in tracking: 54004
Number of plays in data: 95


In [13]:
#Remove football data as we are only concerned with the position of the ball carrier
start_time = time.time()
tracking = remove_football_frames(tracking)
end_time = time.time()
# Calculate the elapsed time
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")
print("Lastest Date:",tracking["time"].max())
print("Number of frames in tracking:",len(tracking))
unique_combinations = tracking[["gameId","playId"]].drop_duplicates()
print("Number of plays in data:",len(unique_combinations))

Elapsed time: 0.016697168350219727 seconds
Lastest Date: 2022-09-25 16:14:28.500000
Number of frames in tracking: 51656
Number of plays in data: 95


In [14]:
#Features for changing the orientation and direction to unit circle
start_time = time.time()
tracking["unitDir"] = tracking["dir"].apply(orient_angle)
tracking["unitO"] = tracking["o"].apply(orient_angle)
end_time = time.time()
# Calculate the elapsed time
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")
print("Lastest Date:",tracking["time"].max())
print("Number of frames in tracking:",len(tracking))
unique_combinations = tracking[["gameId","playId"]].drop_duplicates()
print("Number of plays in data:",len(unique_combinations))

Elapsed time: 0.06482386589050293 seconds
Lastest Date: 2022-09-25 16:14:28.500000
Number of frames in tracking: 51656
Number of plays in data: 95


In [15]:
#Standardaize the tracking data
start_time = time.time()
tracking = standardize_field(tracking)
end_time = time.time()
# Calculate the elapsed time
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")
print("Lastest Date:",tracking["time"].max())
print("Number of frames in tracking:",len(tracking))
unique_combinations = tracking[["gameId","playId"]].drop_duplicates()
print("Number of plays in data:",len(unique_combinations))

Elapsed time: 0.028177261352539062 seconds
Lastest Date: 2022-09-25 16:14:28.500000
Number of frames in tracking: 51656
Number of plays in data: 95


In [16]:
#Create feature for calculating Force
start_time = time.time()
tracking = tracking.merge(calculate_mass_and_force(tracking, players), on = ["gameId", "playId", "nflId", "frameId"])
end_time = time.time()
# Calculate the elapsed time
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")
print("Lastest Date:",tracking["time"].max())
print("Number of frames in tracking:",len(tracking))
unique_combinations = tracking[["gameId","playId"]].drop_duplicates()
print("Number of plays in data:",len(unique_combinations))

Elapsed time: 0.04459238052368164 seconds
Lastest Date: 2022-09-25 16:14:28.500000
Number of frames in tracking: 51656
Number of plays in data: 95


In [17]:
#Create feature for home binary variable and win probability in terms of the defense
start_time = time.time()
tracking = tracking.merge(presnapDefenseWinProbability(games, tracking, plays), on = ["gameId", "playId","frameId","nflId"])
end_time = time.time()
# Calculate the elapsed time
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")
print("Lastest Date:",tracking["time"].max())
print("Number of frames in tracking:",len(tracking))
unique_combinations = tracking[["gameId","playId"]].drop_duplicates()
print("Number of plays in data:",len(unique_combinations))

Elapsed time: 0.8263900279998779 seconds
Lastest Date: 2022-09-25 16:14:28.500000
Number of frames in tracking: 51656
Number of plays in data: 95


In [18]:
#Create feature for ball carrier data to each player
start_time = time.time()
tracking = tracking.merge(ballCarrierData(plays,tracking,players), on = ["gameId", "playId","frameId"])
end_time = time.time()
# Calculate the elapsed time
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")
print("Lastest Date:",tracking["time"].max())
print("Number of frames in tracking:",len(tracking))
unique_combinations = tracking[["gameId","playId"]].drop_duplicates()
print("Number of plays in data:",len(unique_combinations))

Elapsed time: 0.08570313453674316 seconds
Lastest Date: 2022-09-25 16:14:28.500000
Number of frames in tracking: 51656
Number of plays in data: 95


In [19]:
#Create feature for play type
start_time = time.time()
tracking = tracking.merge(play_type(plays,tracking), on = ["gameId","playId"])
end_time = time.time()
# Calculate the elapsed time
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")
print("Lastest Date:",tracking["time"].max())
print("Number of frames in tracking:",len(tracking))
unique_combinations = tracking[["gameId","playId"]].drop_duplicates()
print("Number of plays in data:",len(unique_combinations))

Elapsed time: 0.1243906021118164 seconds
Lastest Date: 2022-09-25 16:14:28.500000
Number of frames in tracking: 51656
Number of plays in data: 95


In [20]:
#Create feature for distances and projections to offensive players
start_time = time.time()
tracking = tracking.merge(calculate_distance_angles(tracking, plays), on = ["gameId", "playId", "nflId", "frameId"])
end_time = time.time()
# Calculate the elapsed time
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")
print("Lastest Date:",tracking["time"].max())
print("Number of frames in tracking:",len(tracking))
unique_combinations = tracking[["gameId","playId"]].drop_duplicates()
print("Number of plays in data:",len(unique_combinations))

Elapsed time: 153.11632418632507 seconds
Lastest Date: 2022-09-25 16:14:28.500000
Number of frames in tracking: 51656
Number of plays in data: 95


In [172]:
#Write this dataframe to a file so that it is saved since the above takes a while to run
#tracking.to_csv("../Data/tracking_with_dist.csv", index = False)

In [24]:
#Read in the tracking data with distances
start_time = time.time()
#tracking = pd.read_csv("../Data/tracking_with_dist.csv")
end_time = time.time()
#Calculate the elapsed time
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")
print("Lastest Date:",tracking["time"].max())
print("Number of frames in tracking:",len(tracking))
unique_combinations = tracking[["gameId","playId"]].drop_duplicates()
print("Number of plays in data:",len(unique_combinations))

Elapsed time: 47.44386434555054 seconds
Lastest Date: 2022-11-07 23:06:49.200000
Number of frames in tracking: 7393078
Number of plays in data: 12473


In [25]:
#Calculate Voronoi Tessellations and features
start_time = time.time()
tracking = tracking.merge(voronoi_tessellations(tracking, plays), on = ['gameId','playId','frameId','nflId'])
end_time = time.time()
# Calculate the elapsed time
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")
print("Lastest Date:",tracking["time"].max())
print("Number of frames in tracking:",len(tracking))
unique_combinations = tracking[["gameId","playId"]].drop_duplicates()
print("Number of plays in data:",len(unique_combinations))

Elapsed time: 38947.94886493683 seconds
Lastest Date: 2022-11-07 23:06:49.200000
Number of frames in tracking: 7393078
Number of plays in data: 12473


In [27]:
#Create additional positional features
start_time = time.time()
tracking['bcx_adj'] = tracking.apply(lambda x: 110 - x['bcx'], axis = 1) # Yards BC is away from the endzone
tracking['bcy_toob'] = tracking.apply(lambda x: 53.3 - x['bcy'] if x['bcy'] >= 26.65 else x['bcy'], axis = 1) # Yards BC is away from the sideline
end_time = time.time()
# Calculate the elapsed time
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")
print("Lastest Date:",tracking["time"].max())
print("Number of frames in tracking:",len(tracking))
unique_combinations = tracking[["gameId","playId"]].drop_duplicates()
print("Number of plays in data:",len(unique_combinations))

Elapsed time: 130.6056957244873 seconds
Lastest Date: 2022-11-07 23:06:49.200000
Number of frames in tracking: 7393078
Number of plays in data: 12473


In [29]:
#Remove plays with tracking issues
start_time = time.time()
tracking = remove_tracking_issues(tracking)
end_time = time.time()
# Calculate the elapsed time
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")
print("Lastest Date:",tracking["time"].max())
print("Number of frames in tracking:",len(tracking))
unique_combinations = tracking[["gameId","playId"]].drop_duplicates()
print("Number of plays in data:",len(unique_combinations))

Elapsed time: 12.550302028656006 seconds
Lastest Date: 2022-11-07 23:06:49.200000
Number of frames in tracking: 7391472
Number of plays in data: 12470


In [30]:
#Remove plays with multiple tackles on the play
start_time = time.time()
tracking = remove_plays_with_mult_tackles(tracking,tackles)
end_time = time.time()
# Calculate the elapsed time
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")
print("Lastest Date:",tracking["time"].max())
print("Number of frames in tracking:",len(tracking))
unique_combinations = tracking[["gameId","playId"]].drop_duplicates()
print("Number of plays in data:",len(unique_combinations))

Elapsed time: 11.562179327011108 seconds
Lastest Date: 2022-11-07 23:06:49.200000
Number of frames in tracking: 7391098
Number of plays in data: 12469


In [31]:
# Calculate ingame_tackling metrics
start_time = time.time()
tracking = tracking.merge(ingame_tackling(tracking), how = 'left', on = ['gameId', 'playId', 'nflId'])
end_time = time.time()
#Calculate elapsed time
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")
print("Lastest Date:",tracking["time"].max())
print("Number of frames in tracking:",len(tracking))
unique_combinations = tracking[["gameId","playId"]].drop_duplicates()
print("Number of plays in data:",len(unique_combinations))

Elapsed time: 337.87190675735474 seconds
Lastest Date: 2022-11-07 23:06:49.200000
Number of frames in tracking: 7391098
Number of plays in data: 12469


In [32]:
# Calculate rolling_tackling metrics
start_time = time.time()
tracking = tracking.merge(rolling_tackling(), how = 'left', on = ['gameId', 'nflId'])
end_time = time.time()
#Calculate elapsed time
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")
print("Lastest Date:",tracking["time"].max())
print("Number of frames in tracking:",len(tracking))
unique_combinations = tracking[["gameId","playId"]].drop_duplicates()
print("Number of plays in data:",len(unique_combinations))

Elapsed time: 7.225990056991577 seconds
Lastest Date: 2022-11-07 23:06:49.200000
Number of frames in tracking: 7391098
Number of plays in data: 12469


In [33]:
#Create feature for defensive formation
start_time = time.time()
tracking = tracking.merge(defense_formation(plays,tackles, tracking,players), on = ["gameId", "playId"])
end_time = time.time()
# Calculate the elapsed time
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")
print("Lastest Date:",tracking["time"].max())
print("Number of frames in tracking:",len(tracking))
unique_combinations = tracking[["gameId","playId"]].drop_duplicates()
print("Number of plays in data:",len(unique_combinations))

Elapsed time: 18.826051235198975 seconds
Lastest Date: 2022-11-07 23:06:49.200000
Number of frames in tracking: 7391098
Number of plays in data: 12469


In [34]:
#Create feature for offensive formation
start_time = time.time()
tracking = tracking.merge(offense_formation(plays,tackles, tracking,players), on = ["gameId", "playId"])
end_time = time.time()
# Calculate the elapsed time
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")
print("Lastest Date:",tracking["time"].max())
print("Number of frames in tracking:",len(tracking))
unique_combinations = tracking[["gameId","playId"]].drop_duplicates()
print("Number of plays in data:",len(unique_combinations))

Elapsed time: 17.161552906036377 seconds
Lastest Date: 2022-11-07 23:06:49.200000
Number of frames in tracking: 7391098
Number of plays in data: 12469


In [35]:
#Remove offensive players from the data
start_time = time.time()
tracking = remove_offensive_players(tracking,plays)
end_time = time.time()
# Calculate the elapsed time
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")
print("Lastest Date:",tracking["time"].max())
print("Number of frames in tracking:",len(tracking))
unique_combinations = tracking[["gameId","playId"]].drop_duplicates()
print("Number of plays in data:",len(unique_combinations))

Elapsed time: 4.857673168182373 seconds
Lastest Date: 2022-11-07 23:06:49.200000
Number of frames in tracking: 3695549
Number of plays in data: 12469


In [36]:
#Create feature for dependent variables
start_time = time.time()
tracking = tracking.merge(tackle_dependent_variable(tackles,tracking), on = ["gameId", "playId", "nflId", "frameId"])
end_time = time.time()
# Calculate the elapsed time
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")
print("Lastest Date:",tracking["time"].max())
print("Number of frames in tracking:",len(tracking))
unique_combinations = tracking[["gameId","playId"]].drop_duplicates()
print("Number of plays in data:",len(unique_combinations))

done tackle_binary_all
done tackle_binary_single
done tackle_nonbinary_all
done tackle_non_binary_single
Elapsed time: 176.0892870426178 seconds
Lastest Date: 2022-11-07 23:06:49.200000
Number of frames in tracking: 3695549
Number of plays in data: 12469


# Merge Tracking with Other Data

In [37]:
#Merge for the rest of the data
start_time = time.time()

#Create feature for plays data to change game clock into numeric format
plays["timeSinceStart"] = plays.apply(nfl_clock_to_seconds,axis = 1)
final_df = tracking.merge(plays[['gameId','playId','down','yardsToGo','defendersInTheBox','offenseFormation','absoluteYardlineNumber', 'timeSinceStart','preSnapHomeScore', 'preSnapVisitorScore']], on =['gameId', 'playId'], how = 'left')

#Gather surface information for games data 
final_df = final_df.merge(game_miscs(), on = 'gameId', how = 'left')

#Create feature for obtaing the score in terms of defense TODO: FUNCTION NOT FOUND
# final_df["presnapDefScoreDiff"] = final_df.apply(presnapDefScoreDiff,axis=1)

#Merge player data
final_df = final_df.merge(players[['nflId', 'weight', 'position']], on = 'nflId', how = 'left')

end_time = time.time()
# Calculate the elapsed time
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")
print("Lastest Date:",tracking["time"].max())
print("Number of frames in tracking:",len(tracking))
unique_combinations = tracking[["gameId","playId"]].drop_duplicates()
print("Number of plays in data:",len(unique_combinations))

2022 done.
Downcasting floats.
Elapsed time: 11.45948338508606 seconds
Lastest Date: 2022-11-07 23:06:49.200000
Number of frames in tracking: 3695549
Number of plays in data: 12469


In [38]:
display(final_df)

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,dis,o,dir,event,unitDir,unitO,force,home,preSnapWinProbabilityDefense,bcx,bcy,bcs,bca,bco,bcdir,bcweight,bcPosition,bcForce,play_type,c1Dist,c2Dist,c3Dist,c4Dist,c5Dist,c6Dist,c7Dist,c8Dist,c9Dist,c10Dist,bcDist,c1Ang,c2Ang,c3Ang,c4Ang,c5Ang,c6Ang,c7Ang,c8Ang,c9Ang,c10Ang,bcAng,a,s,voronoi_min_dist_from_bc,bcx_adj,bcy_toob,tackles_ingame,assists_ingame,ff_ingame,misses_ingame,tackle_efficiency_ingame,tackle_rating_ingame,rolling_tackles,rolling_assists,rolling_ff,rolling_mt,DL,LB,DB,QB,RB,WR,TE,OL,defensiveTeam,tackle_binary_all,tackle_binary_single,tackle_nonbinary_all,tackle_nonbinary_single,down,yardsToGo,defendersInTheBox,offenseFormation,absoluteYardlineNumber,timeSinceStart,preSnapHomeScore,preSnapVisitorScore,surface,inside_outside,weight,position
0,2.022091e+09,56.0,38577.0,Bobby Wagner,6.0,2022-09-08 20:24:05.700000,45.0,LA,left,41.89,24.593333,0.32,349.47,357.71,pass_outcome_caught,272.29,280.53,288.200000,1,0.413347,40.15,17.743333,4.61,4.82,114.27,202.20,191,WR,418.463636,pass,3.195387,10.116071,10.461855,10.882909,12.035414,12.635874,12.701657,13.169210,14.799963,23.582173,7.067538,75.239538,89.059902,122.727777,103.094000,104.570954,166.159966,86.642246,80.903275,77.812451,158.729540,16.542527,2.62,3.35,3.948566,69.85,17.743333,0,0,0,0,,,0.0,0.0,0.0,0.0,3,2,6,1,1,3,1,5,LA,0,0,0.0,0.0,1,10,6.0,SHOTGUN,85,0,0,0,turf,inside,242,ILB
1,2.022091e+09,56.0,41239.0,Aaron Donald,6.0,2022-09-08 20:24:05.700000,99.0,LA,left,27.85,23.373333,0.37,186.16,157.65,pass_outcome_caught,112.35,83.84,364.000000,1,0.413347,40.15,17.743333,4.61,4.82,114.27,202.20,191,WR,418.463636,pass,1.400321,1.783620,2.496898,3.993257,4.414386,4.674409,8.228657,17.168183,21.436532,32.008038,13.527265,113.577579,163.037991,150.980380,102.982031,53.447313,68.130076,59.944064,110.580937,72.972206,65.058378,136.944687,2.86,3.62,12.659069,69.85,17.743333,0,0,0,0,,,,,,,3,2,6,1,1,3,1,5,LA,0,0,0.0,0.0,1,10,6.0,SHOTGUN,85,0,0,0,turf,inside,280,DT
2,2.022091e+09,56.0,42816.0,Troy Hill,6.0,2022-09-08 20:24:05.700000,2.0,LA,left,49.38,45.673333,0.27,331.57,278.33,pass_outcome_caught,351.67,298.43,346.254545,1,0.413347,40.15,17.743333,4.61,4.82,114.27,202.20,191,WR,418.463636,pass,1.233207,10.014569,22.204274,22.838312,26.325539,26.712411,27.894992,30.064028,31.255438,33.017583,29.415605,89.937075,111.358111,93.020417,127.965101,125.346367,127.777247,120.762863,123.703959,122.414454,123.035494,99.957121,4.14,2.60,23.831392,69.85,17.743333,0,0,0,0,,,0.0,0.0,0.0,0.0,3,2,6,1,1,3,1,5,LA,0,0,0.0,0.0,1,10,6.0,SHOTGUN,85,0,0,0,turf,inside,184,CB
3,2.022091e+09,56.0,43294.0,Jalen Ramsey,6.0,2022-09-08 20:24:05.700000,5.0,LA,left,41.85,15.483333,0.59,140.96,178.50,pass_outcome_caught,91.50,129.04,116.290909,1,0.413347,40.15,17.743333,4.61,4.82,114.27,202.20,191,WR,418.463636,pass,8.993442,13.196030,14.422794,14.850576,15.279797,15.418982,16.539265,16.979061,21.643128,32.342421,2.828003,22.070958,48.171854,61.668459,56.543692,67.764015,42.186107,43.622479,30.430267,8.319643,15.291336,35.450938,1.23,5.88,0.000000,69.85,17.743333,0,0,0,0,,,0.0,0.0,0.0,0.0,3,2,6,1,1,3,1,5,LA,1,0,1.0,0.0,1,10,6.0,SHOTGUN,85,0,0,0,turf,inside,208,CB
4,2.022091e+09,56.0,43298.0,Leonard Floyd,6.0,2022-09-08 20:24:05.700000,54.0,LA,left,27.89,20.193333,0.13,159.12,203.53,pass_outcome_caught,66.47,110.88,241.090909,1,0.413347,40.15,17.743333,4.61,4.82,114.27,202.20,191,WR,418.463636,pass,0.773886,2.104305,3.431049,5.466160,7.240836,7.311580,10.903687,17.517377,23.554390,34.387191,12.502404,48.770529,7.667234,0.177919,21.988835,3.672038,5.689796,3.646058,54.242746,21.039989,15.533146,77.770945,2.21,1.34,5.949168,69.85,17.743333,0,0,0,0,,,0.0,0.0,0.0,0.0,3,2,6,1,1,3,1,5,LA,0,0,0.0,0.0,1,10,6.0,SHOTGUN,85,0,0,0,turf,inside,240,DE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3695544,2.022111e+09,3787.0,52627.0,Geno Stone,40.0,2022-11-07 23:06:49.200000,26.0,BAL,right,33.31,24.130000,0.50,241.92,217.30,tackle,232.70,208.08,228.136364,0,0.992596,26.24,19.720000,1.60,4.04,103.01,121.39,223,RB,409.509091,run,5.762126,6.157175,8.404154,9.069785,10.327323,13.093854,13.440610,13.880295,14.914801,16.563155,8.332647,120.898591,18.958315,39.143277,34.784455,36.160176,56.158964,17.330810,130.175670,21.064381,50.416303,20.745637,2.39,4.86,6.488457,83.76,19.720000,0,1,0,0,1.0,0.50,18.0,0.0,1.0,1.0,2,4,5,1,1,3,1,5,BAL,0,0,0.0,0.0,1,10,4.0,SHOTGUN,21,3577,13,27,turf,inside,210,SS
3695545,2.022111e+09,3787.0,53460.0,Odafe Oweh,40.0,2022-11-07 23:06:49.200000,99.0,BAL,right,22.08,24.320000,0.06,60.14,80.83,tackle,9.17,29.86,112.254545,0,0.992596,26.24,19.720000,1.60,4.04,103.01,121.39,223,RB,409.509091,run,1.935355,3.400853,3.745557,3.954795,5.387476,7.096774,7.974572,10.452449,15.686236,16.136409,6.202064,152.769528,76.148384,44.387593,58.065848,179.907675,39.746067,97.229731,20.411704,49.227284,73.224253,57.045454,0.98,0.54,4.334894,83.76,19.720000,1,1,0,0,1.0,0.75,4.0,2.0,0.0,2.0,2,4,5,1,1,3,1,5,BAL,0,0,0.0,0.0,1,10,4.0,SHOTGUN,21,3577,13,27,turf,inside,252,OLB
3695546,2.022111e+09,3787.0,53533.0,Brandon Stephens,40.0,2022-11-07 23:06:49.200000,21.0,BAL,right,33.19,38.170000,0.56,190.20,188.24,tackle,261.76,259.80,38.390909,0,0.992596,26.24,19.720000,1.60,4.04,103.01,121.39,223,RB,409.509091,run,2.931245,8.921687,17.919894,18.161817,18.527412,18.859189,19.595122,22.046199,24.364277,28.647724,19.715603,72.137006,4.846074,18.453741,7.740063,36.103970,18.583200,21.700726,39.940849,18.177816,0.112726,12.401079,0.41,5.59,15.490166,83.76,19.720000,2,0,0,0,1.0,1.00,5.0,2.0,0.0,0.0,2,4,5,1,1,3,1,5,BAL,0,0,0.0,0.0,1,10,4.0,SHOTGUN,21,3577,13,27,turf,inside,206,FS
3695547,2.022111e+09,3787.0,54541.0,Travis Jones,40.0,2022-11-07 23:06:49.200000,98.0,BAL,right,24.78,19.810000,0.18,134.33,153.81,tackle,296.19,315.67,418.472727,0,0.992596,26.24,19.720000,1.60,4.04,103.01,121.39,223,RB,409.509091,run,1.533264,1.944557,2.377415,3.526769,4.228061,6.835474,8.815668,10.909152,11.590557,18.703136,1.462771,157.549503,161.398347,145.100484,78.594908,61.270834,164.570383,140.720017,2.632798,120.353008,136.644207,60.282532,2.74,1.74,0.000000,83.76,19.720000,1,0,0,0,1.0,1.00,5.0,1.0,0.0,0.0,2,4,5,1,1,3,1,5,BAL,0,0,0.0,0.0,1,10,4.0,SHOTGUN,21,3577,13,27,turf,inside,336,DT


# Processing for models datasets

In [39]:
pd.set_option('display.max_columns', None)
columns_to_drop = ['displayName','time','jerseyNumber','club','playDirection','defensiveTeam','dir','o','event', 'dis']

#Return df with only desired columns
final_df = final_df.drop(columns=columns_to_drop)
display(final_df)

Unnamed: 0,gameId,playId,nflId,frameId,x,y,unitDir,unitO,force,home,preSnapWinProbabilityDefense,bcx,bcy,bcs,bca,bco,bcdir,bcweight,bcPosition,bcForce,play_type,c1Dist,c2Dist,c3Dist,c4Dist,c5Dist,c6Dist,c7Dist,c8Dist,c9Dist,c10Dist,bcDist,c1Ang,c2Ang,c3Ang,c4Ang,c5Ang,c6Ang,c7Ang,c8Ang,c9Ang,c10Ang,bcAng,a,s,voronoi_min_dist_from_bc,bcx_adj,bcy_toob,tackles_ingame,assists_ingame,ff_ingame,misses_ingame,tackle_efficiency_ingame,tackle_rating_ingame,rolling_tackles,rolling_assists,rolling_ff,rolling_mt,DL,LB,DB,QB,RB,WR,TE,OL,tackle_binary_all,tackle_binary_single,tackle_nonbinary_all,tackle_nonbinary_single,down,yardsToGo,defendersInTheBox,offenseFormation,absoluteYardlineNumber,timeSinceStart,preSnapHomeScore,preSnapVisitorScore,surface,inside_outside,weight,position
0,2.022091e+09,56.0,38577.0,6.0,41.89,24.593333,272.29,280.53,288.200000,1,0.413347,40.15,17.743333,4.61,4.82,114.27,202.20,191,WR,418.463636,pass,3.195387,10.116071,10.461855,10.882909,12.035414,12.635874,12.701657,13.169210,14.799963,23.582173,7.067538,75.239538,89.059902,122.727777,103.094000,104.570954,166.159966,86.642246,80.903275,77.812451,158.729540,16.542527,2.62,3.35,3.948566,69.85,17.743333,0,0,0,0,,,0.0,0.0,0.0,0.0,3,2,6,1,1,3,1,5,0,0,0.0,0.0,1,10,6.0,SHOTGUN,85,0,0,0,turf,inside,242,ILB
1,2.022091e+09,56.0,41239.0,6.0,27.85,23.373333,112.35,83.84,364.000000,1,0.413347,40.15,17.743333,4.61,4.82,114.27,202.20,191,WR,418.463636,pass,1.400321,1.783620,2.496898,3.993257,4.414386,4.674409,8.228657,17.168183,21.436532,32.008038,13.527265,113.577579,163.037991,150.980380,102.982031,53.447313,68.130076,59.944064,110.580937,72.972206,65.058378,136.944687,2.86,3.62,12.659069,69.85,17.743333,0,0,0,0,,,,,,,3,2,6,1,1,3,1,5,0,0,0.0,0.0,1,10,6.0,SHOTGUN,85,0,0,0,turf,inside,280,DT
2,2.022091e+09,56.0,42816.0,6.0,49.38,45.673333,351.67,298.43,346.254545,1,0.413347,40.15,17.743333,4.61,4.82,114.27,202.20,191,WR,418.463636,pass,1.233207,10.014569,22.204274,22.838312,26.325539,26.712411,27.894992,30.064028,31.255438,33.017583,29.415605,89.937075,111.358111,93.020417,127.965101,125.346367,127.777247,120.762863,123.703959,122.414454,123.035494,99.957121,4.14,2.60,23.831392,69.85,17.743333,0,0,0,0,,,0.0,0.0,0.0,0.0,3,2,6,1,1,3,1,5,0,0,0.0,0.0,1,10,6.0,SHOTGUN,85,0,0,0,turf,inside,184,CB
3,2.022091e+09,56.0,43294.0,6.0,41.85,15.483333,91.50,129.04,116.290909,1,0.413347,40.15,17.743333,4.61,4.82,114.27,202.20,191,WR,418.463636,pass,8.993442,13.196030,14.422794,14.850576,15.279797,15.418982,16.539265,16.979061,21.643128,32.342421,2.828003,22.070958,48.171854,61.668459,56.543692,67.764015,42.186107,43.622479,30.430267,8.319643,15.291336,35.450938,1.23,5.88,0.000000,69.85,17.743333,0,0,0,0,,,0.0,0.0,0.0,0.0,3,2,6,1,1,3,1,5,1,0,1.0,0.0,1,10,6.0,SHOTGUN,85,0,0,0,turf,inside,208,CB
4,2.022091e+09,56.0,43298.0,6.0,27.89,20.193333,66.47,110.88,241.090909,1,0.413347,40.15,17.743333,4.61,4.82,114.27,202.20,191,WR,418.463636,pass,0.773886,2.104305,3.431049,5.466160,7.240836,7.311580,10.903687,17.517377,23.554390,34.387191,12.502404,48.770529,7.667234,0.177919,21.988835,3.672038,5.689796,3.646058,54.242746,21.039989,15.533146,77.770945,2.21,1.34,5.949168,69.85,17.743333,0,0,0,0,,,0.0,0.0,0.0,0.0,3,2,6,1,1,3,1,5,0,0,0.0,0.0,1,10,6.0,SHOTGUN,85,0,0,0,turf,inside,240,DE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3695544,2.022111e+09,3787.0,52627.0,40.0,33.31,24.130000,232.70,208.08,228.136364,0,0.992596,26.24,19.720000,1.60,4.04,103.01,121.39,223,RB,409.509091,run,5.762126,6.157175,8.404154,9.069785,10.327323,13.093854,13.440610,13.880295,14.914801,16.563155,8.332647,120.898591,18.958315,39.143277,34.784455,36.160176,56.158964,17.330810,130.175670,21.064381,50.416303,20.745637,2.39,4.86,6.488457,83.76,19.720000,0,1,0,0,1.0,0.50,18.0,0.0,1.0,1.0,2,4,5,1,1,3,1,5,0,0,0.0,0.0,1,10,4.0,SHOTGUN,21,3577,13,27,turf,inside,210,SS
3695545,2.022111e+09,3787.0,53460.0,40.0,22.08,24.320000,9.17,29.86,112.254545,0,0.992596,26.24,19.720000,1.60,4.04,103.01,121.39,223,RB,409.509091,run,1.935355,3.400853,3.745557,3.954795,5.387476,7.096774,7.974572,10.452449,15.686236,16.136409,6.202064,152.769528,76.148384,44.387593,58.065848,179.907675,39.746067,97.229731,20.411704,49.227284,73.224253,57.045454,0.98,0.54,4.334894,83.76,19.720000,1,1,0,0,1.0,0.75,4.0,2.0,0.0,2.0,2,4,5,1,1,3,1,5,0,0,0.0,0.0,1,10,4.0,SHOTGUN,21,3577,13,27,turf,inside,252,OLB
3695546,2.022111e+09,3787.0,53533.0,40.0,33.19,38.170000,261.76,259.80,38.390909,0,0.992596,26.24,19.720000,1.60,4.04,103.01,121.39,223,RB,409.509091,run,2.931245,8.921687,17.919894,18.161817,18.527412,18.859189,19.595122,22.046199,24.364277,28.647724,19.715603,72.137006,4.846074,18.453741,7.740063,36.103970,18.583200,21.700726,39.940849,18.177816,0.112726,12.401079,0.41,5.59,15.490166,83.76,19.720000,2,0,0,0,1.0,1.00,5.0,2.0,0.0,0.0,2,4,5,1,1,3,1,5,0,0,0.0,0.0,1,10,4.0,SHOTGUN,21,3577,13,27,turf,inside,206,FS
3695547,2.022111e+09,3787.0,54541.0,40.0,24.78,19.810000,296.19,315.67,418.472727,0,0.992596,26.24,19.720000,1.60,4.04,103.01,121.39,223,RB,409.509091,run,1.533264,1.944557,2.377415,3.526769,4.228061,6.835474,8.815668,10.909152,11.590557,18.703136,1.462771,157.549503,161.398347,145.100484,78.594908,61.270834,164.570383,140.720017,2.632798,120.353008,136.644207,60.282532,2.74,1.74,0.000000,83.76,19.720000,1,0,0,0,1.0,1.00,5.0,1.0,0.0,0.0,2,4,5,1,1,3,1,5,0,0,0.0,0.0,1,10,4.0,SHOTGUN,21,3577,13,27,turf,inside,336,DT


In [48]:
#fill NA values properly
final_df["rolling_tackles"] = final_df["rolling_tackles"].fillna(0)
final_df["rolling_assists"] = final_df["rolling_assists"].fillna(0)
final_df["rolling_ff"] = final_df["rolling_ff"].fillna(0)
final_df["rolling_mt"] = final_df["rolling_mt"].fillna(0)
final_df["voronoi_min_dist_from_bc"] = final_df["voronoi_min_dist_from_bc"].fillna(final_df["voronoi_min_dist_from_bc"].median())

#OffenseFormation
top_play = final_df["offenseFormation"].value_counts().index[0]
final_df['offenseFormation'].fillna(top_play, inplace=True)

#Defenders- fill with median
median_defendersInTheBox = final_df['defendersInTheBox'].median()
final_df['defendersInTheBox'].fillna(median_defendersInTheBox, inplace=True)

In [49]:
#Split data into train, validation, test

#Create unique id column
final_df['gamePlayId'] = final_df.apply(lambda x: str(x['gameId']) + str(x['playId']), axis=1)
unique_ids = final_df['gamePlayId']

#Create X dataset and Y dataset
x = final_df

#Initialize GroupShuffleSplit
gss = GroupShuffleSplit(n_splits=2, test_size=0.3, random_state=42)

# Split data into training, validation, and testing
for train_idx, test_idx in gss.split(x, groups=unique_ids):
    x_train, x_test = x.iloc[train_idx], x.iloc[test_idx]

# Further split the testing set into validation and testing
gss_val = GroupShuffleSplit(n_splits=1, test_size=0.5, random_state=42)

for val_idx, test_idx in gss_val.split(x_test, groups=unique_ids[test_idx]):
    x_val, x_test = x_test.iloc[val_idx], x_test.iloc[test_idx]

In [50]:
#Examine shapes and ensure no unique IDs are in other splits
print("xtrain: ", x_train.shape, "\nxval:  ", x_val.shape, "\nxtest: ", x_test.shape)
# display(x_train[x_train['gamePlayId'].isin(x_test['gamePlayId'])])
# display(x_val[x_val['gamePlayId'].isin(x_train['gamePlayId'])])
# display(x_test[x_test['gamePlayId'].isin(x_train['gamePlayId'])])

xtrain:  (2581854, 83) 
xval:   (556974, 83) 
xtest:  (556721, 83)


In [51]:
unique_combinations = x_train[["gameId","playId"]].drop_duplicates()
print("Number of plays in training:",len(unique_combinations))

Number of plays in training: 8728


In [52]:
unique_combinations = x_val[["gameId","playId"]].drop_duplicates()
print("Number of plays in validation:",len(unique_combinations))

Number of plays in validation: 1870


In [53]:
unique_combinations = x_test[["gameId","playId"]].drop_duplicates()
print("Number of plays in test:",len(unique_combinations))

Number of plays in test: 1871


In [54]:
#get a sample of the train
#Initialize GroupShuffleSplit
unique_ids = x_train['gamePlayId']
gss = GroupShuffleSplit(n_splits=2, test_size=0.85, random_state=42)
for train_idx, test_idx in gss.split(x_train, groups=unique_ids):
    x_train_sample,_ = x_train.iloc[train_idx], x_train.iloc[test_idx]
    
#get a sample of the test
unique_ids = x_val['gamePlayId']
for train_idx, test_idx in gss.split(x_val, groups=unique_ids):
    x_val_sample, _ = x_val.iloc[train_idx], x_val.iloc[test_idx]

In [55]:
print("xtrain_sample: ", x_train_sample.shape, "xval_sample: ", x_val_sample.shape)

xtrain_sample:  (382800, 83) xval_sample:  (82324, 83)


In [56]:
unique_combinations = x_train_sample[["gameId","playId"]].drop_duplicates()
print("Number of plays in train sample:",len(unique_combinations))

Number of plays in train sample: 1309


In [57]:
unique_combinations = x_val_sample[["gameId","playId"]].drop_duplicates()
print("Number of plays in val sample:",len(unique_combinations))

Number of plays in val sample: 280


In [58]:
#Now obtain dataframes with synthetic data
#x_train_synthetic = synthetic_data(x_train)
#x_train_sample_synthetic = synthetic_data(x_train_sample)

NameError: name 'synthetic_data' is not defined

In [None]:
display(x_train_synthetic)

In [59]:
#Write dataframes to csv files
final_df.to_csv("../Data/clean_tracking.csv", index = False)

In [60]:
x_train.to_csv("../Data/train.csv", index = False)

In [61]:
x_val.to_csv("../Data/val.csv", index = False)

In [62]:
x_test.to_csv("../Data/test.csv", index = False)

In [63]:
x_train_sample.to_csv("../Data/train_sample.csv", index = False)

In [64]:
x_val_sample.to_csv("../Data/val_sample.csv", index = False)

In [238]:
#x_train_synthetic.to_csv("../Data/train_synthetic.csv", index = False)

In [239]:
#x_train_sample_synthetic.to_csv("../Data/train_sample_synthetic.csv", index = False)