# Data Wrangling Section

In [1]:
import pandas as pd
import matplotlib as plt
import numpy as np
import math
from scipy.spatial import distance

print('done')

done


First, I will merge all the weekly data into one dataframe.

In [2]:
df_w1 = pd.read_csv('Data/week1.csv')
df_w2 = pd.read_csv('Data/week2.csv')
df_w3 = pd.read_csv('Data/week3.csv')
df_w4 = pd.read_csv('Data/week4.csv')
df_w5 = pd.read_csv('Data/week5.csv')
df_w6 = pd.read_csv('Data/week6.csv')
df_w7 = pd.read_csv('Data/week7.csv')
df_w8 = pd.read_csv('Data/week8.csv')
print('done')

done


In [3]:
df_weekly = pd.concat([df_w1,df_w2, df_w3, df_w4, df_w5, df_w6, df_w7, df_w8], axis=0, join='outer')
rows = df_w1.shape[0]+ df_w2.shape[0] + df_w3.shape[0] + df_w4.shape[0] + df_w5.shape[0] + df_w6.shape[0] + df_w7.shape[0] + df_w8.shape[0]
print('Weekly DF Shape: ', df_weekly.shape)
print('Count of rows: ', rows)

Weekly DF Shape:  (8314178, 16)
Count of rows:  8314178


I've merged all the weekly play by play game data and I've confirmed the shape of the dataframe is accurate. Now, I need to add data to the weekly dataframe.

Since my goal is to estimate the expected receving yards on a play, I need to know which players are eligible to receive the ball. Thankfully, the pffScoutingData dataset tracks this stat with the 'pff_role' column . I will use the 'nflId', 'gameId' and 'playid' to merge the 2 dataframes. I will drop the rest of the columns that I do not need to save memory.

In [4]:
df_pff = pd.read_csv('Data/pffScoutingData.csv')
print('done')

done


In [5]:
# df_pff.drop(['pff_hit', 'pff_hurry', 'pff_sack', 'pff_beatenByDefender', 'pff_hitAllowed', 'pff_hurryAllowed', 'pff_sackAllowed', 'pff_nflIdBlockedPlayer', 'pff_blockType', 'pff_backFieldBlock'], inplace=True, axis=1)
# df_pff.columns

In the play by play tracking data, the football is tracked as NaN and since the pandas merge method will drop NaN values, I will create two separate dataframes one with the football tracking time and the other without the football tracking times. I will merge the pff dataframe with the non-football dataframe and then use the concat function to put the data back in to main dataframe.

In [6]:
df_football = df_weekly[df_weekly['team'] == 'football']

In [7]:
df_main = df_pff.merge(df_weekly[pd.notnull(df_weekly.nflId)], on=['gameId', 'playId', 'nflId'])
df_main.shape

(7952692, 28)

In [8]:
df_main = pd.concat([df_main,df_football])
df_main

Unnamed: 0,gameId,playId,nflId,pff_role,pff_positionLinedUp,pff_hit,pff_hurry,pff_sack,pff_beatenByDefender,pff_hitAllowed,...,team,playDirection,x,y,s,a,dis,o,dir,event
0,2021090900,97,25511.0,Pass,QB,,,,,,...,TB,right,37.77,24.22,0.29,0.30,0.03,165.16,84.99,
1,2021090900,97,25511.0,Pass,QB,,,,,,...,TB,right,37.78,24.22,0.23,0.11,0.02,164.33,92.87,
2,2021090900,97,25511.0,Pass,QB,,,,,,...,TB,right,37.78,24.24,0.16,0.10,0.01,160.24,68.55,
3,2021090900,97,25511.0,Pass,QB,,,,,,...,TB,right,37.73,24.25,0.15,0.24,0.06,152.13,296.85,
4,2021090900,97,25511.0,Pass,QB,,,,,,...,TB,right,37.69,24.26,0.25,0.18,0.04,148.33,287.55,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
978944,2021110100,4433,,,,,,,,,...,football,right,24.17,27.89,2.47,2.28,0.25,,,
978945,2021110100,4433,,,,,,,,,...,football,right,24.36,28.03,2.36,2.16,0.24,,,
978946,2021110100,4433,,,,,,,,,...,football,right,24.55,28.17,2.25,1.45,0.23,,,
978947,2021110100,4433,,,,,,,,,...,football,right,24.73,28.31,2.28,0.72,0.23,,,


The pff and weekly dataset have been merged. Now, the 'plays' dataset needs to be merged with this current dataset. 

In [9]:
df_plays = pd.read_csv('Data/plays.csv')
print(df_plays.shape)
df_plays['dropBackType'].value_counts()
# df_plays = df_plays[['gameId', 'playId', 'passResult', 'penaltyYards', 'prePenaltyPlayResult', 'playResult', 'absoluteYardlineNumber', 'pff_playAction', 'pff_passCoverage', 'dropBackType']]
print(df_plays.shape)
df_plays.columns

(8557, 32)
(8557, 32)


Index(['gameId', 'playId', 'playDescription', 'quarter', 'down', 'yardsToGo',
       'possessionTeam', 'defensiveTeam', 'yardlineSide', 'yardlineNumber',
       'gameClock', 'preSnapHomeScore', 'preSnapVisitorScore', 'passResult',
       'penaltyYards', 'prePenaltyPlayResult', 'playResult', 'foulName1',
       'foulNFLId1', 'foulName2', 'foulNFLId2', 'foulName3', 'foulNFLId3',
       'absoluteYardlineNumber', 'offenseFormation', 'personnelO',
       'defendersInBox', 'personnelD', 'dropBackType', 'pff_playAction',
       'pff_passCoverage', 'pff_passCoverageType'],
      dtype='object')

In [10]:
df_main = df_main.merge(df_plays, on=['gameId', 'playId'])
df_main.shape

(8314178, 58)

I've added 8 new columns to the existing dataframe using 'gameId' and 'playId'. Now, I will clean up the main dataframe to get it ready for the EDA section.

Dropping the columns I do not need going forward.

In [11]:
df_main.drop(['jerseyNumber', 'playDirection'], inplace=True, axis=1)

In [12]:
pd.set_option('display.max_columns', None)
df_main.head()

Unnamed: 0,gameId,playId,nflId,pff_role,pff_positionLinedUp,pff_hit,pff_hurry,pff_sack,pff_beatenByDefender,pff_hitAllowed,pff_hurryAllowed,pff_sackAllowed,pff_nflIdBlockedPlayer,pff_blockType,pff_backFieldBlock,frameId,time,team,x,y,s,a,dis,o,dir,event,playDescription,quarter,down,yardsToGo,possessionTeam,defensiveTeam,yardlineSide,yardlineNumber,gameClock,preSnapHomeScore,preSnapVisitorScore,passResult,penaltyYards,prePenaltyPlayResult,playResult,foulName1,foulNFLId1,foulName2,foulNFLId2,foulName3,foulNFLId3,absoluteYardlineNumber,offenseFormation,personnelO,defendersInBox,personnelD,dropBackType,pff_playAction,pff_passCoverage,pff_passCoverageType
0,2021090900,97,25511.0,Pass,QB,,,,,,,,,,,1,2021-09-10T00:26:31.100,TB,37.77,24.22,0.29,0.3,0.03,165.16,84.99,,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,DAL,TB,33,13:33,0,0,I,,0,0,,,,,,,43.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 2 LB, 5 DB",TRADITIONAL,0,Cover-1,Man
1,2021090900,97,25511.0,Pass,QB,,,,,,,,,,,2,2021-09-10T00:26:31.200,TB,37.78,24.22,0.23,0.11,0.02,164.33,92.87,,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,DAL,TB,33,13:33,0,0,I,,0,0,,,,,,,43.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 2 LB, 5 DB",TRADITIONAL,0,Cover-1,Man
2,2021090900,97,25511.0,Pass,QB,,,,,,,,,,,3,2021-09-10T00:26:31.300,TB,37.78,24.24,0.16,0.1,0.01,160.24,68.55,,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,DAL,TB,33,13:33,0,0,I,,0,0,,,,,,,43.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 2 LB, 5 DB",TRADITIONAL,0,Cover-1,Man
3,2021090900,97,25511.0,Pass,QB,,,,,,,,,,,4,2021-09-10T00:26:31.400,TB,37.73,24.25,0.15,0.24,0.06,152.13,296.85,,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,DAL,TB,33,13:33,0,0,I,,0,0,,,,,,,43.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 2 LB, 5 DB",TRADITIONAL,0,Cover-1,Man
4,2021090900,97,25511.0,Pass,QB,,,,,,,,,,,5,2021-09-10T00:26:31.500,TB,37.69,24.26,0.25,0.18,0.04,148.33,287.55,,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,DAL,TB,33,13:33,0,0,I,,0,0,,,,,,,43.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 2 LB, 5 DB",TRADITIONAL,0,Cover-1,Man


I will check the x,y,s,a,dis,o,dir columns to make sure there are no values that do not fit their respective criteria

In [13]:
# Method to check the min and max number values of each column
def find_min_max(col, df):
    sub_arr = df[col].unique()
    unique, counts = np.unique(sub_arr, return_counts=True)
    unique_min = min(unique)
    unique_max = max(unique)
    print('min: ', unique_min)
    print('max: ', unique_max)

In [14]:
df_main = df_main[(df_main['x'] >= 0) & (df_main['x'] <= 120)]
print(df_main.shape)
find_min_max('x', df_main)

(8314066, 56)
min:  0.0
max:  120.0


In [15]:
df_main = df_main[(df_main['y'] >= 0) & (df_main['y'] <= 53.3)]
find_min_max('y', df_main)

min:  0.0
max:  53.3


In [16]:
find_min_max('o', df_main)

min:  0.0
max:  360.0


In [17]:
find_min_max('dir', df_main)

min:  0.0
max:  360.0


In [18]:
sub_arr = df_main['s'].unique()
unique, counts = np.unique(sub_arr, return_counts=True)
unique_min = min(unique)
print('min: ', unique_min)

min:  0.0


In [19]:
sub_arr = df_main['a'].unique()
unique, counts = np.unique(sub_arr, return_counts=True)
unique_min = min(unique)
print('min: ', unique_min)

min:  0.0


In [20]:
sub_arr = df_main['dis'].unique()
unique, counts = np.unique(sub_arr, return_counts=True)
unique_min = min(unique)
print('min: ', unique_min)

min:  0.0


I've confirmed that x,y,s,a,dis,o,dir columns all have accurate data now, I've dropped rows in columns that do not fit the criteria laid out by the NFL that came with the dataset. I will investigate the rest of the columns in the dataset to confirm they fit the criteria as well.

In [21]:
print('FrameID: ', find_min_max('frameId', df_main))
print('prePenaltyPlayResult: ', find_min_max('prePenaltyPlayResult', df_main))
print('penaltyYards: ', find_min_max('penaltyYards', df_main))
print('playResult: ', find_min_max('playResult', df_main))
print('pff_playAction: ', find_min_max('pff_playAction', df_main))

min:  1
max:  203
FrameID:  None
min:  -34
max:  91
prePenaltyPlayResult:  None
min:  -18.0
max:  50.0
penaltyYards:  None
min:  -34
max:  91
playResult:  None
min:  0
max:  1
pff_playAction:  None


In [22]:
count = 0
count_instances = 0
play_list = []
game_list = []
for game in df_main['gameId'].unique():
    df = df_main[df_main['gameId'] == game]
    unique_plays = df['playId'].unique()
    for play in unique_plays:
        df1 = df[df['playId'] == play]
        frames = df1['frameId'].unique()
        for frame in frames:
            df2 = df1[df1['frameId'] == frame]
            if df2.shape[0] == 23:
                pass
            else:
                count_instances += df2.shape[0]
                if (game in game_list) &( play in play_list):
                    pass
                else:
                    play_list.append(play)
                    game_list.append(game)
                    count += 1
                    
print(count)
print(count_instances)

94
19159


All the min and max units above are accurate.

In [23]:
df_main = df_main.drop(df_main[(df_main['gameId'].isin(game_list)) & (df_main['playId'].isin(play_list))].index)
df_main.shape

(8051817, 56)

I will confirm that each frame has 23 "players" or nflIds. 11 players on each team and an extra row for the football.

In [24]:
# count = 0
# count_instances = 0
# play_list = []
# game_list = []
# for game in df_main['gameId'].unique():
#     df = df_main[df_main['gameId'] == game]
#     unique_plays = df['playId'].unique()
#     for play in unique_plays:
#         df1 = df[df['playId'] == play]
#         frames = df1['frameId'].unique()
#         for frame in frames:
#             df2 = df1[df1['frameId'] == frame]
#             if df2.shape[0] == 23:
#                 pass
#             else:
#                 count_instances += df2.shape[0]
#                 if (game in game_list) &( play in play_list):
#                     pass
#                 else:
#                     play_list.append(play)
#                     game_list.append(game)
#                     count += 1
                    
# print(count)
# print(count_instances)

In [25]:
df_main['event'].value_counts()

None                         7426401
ball_snap                     191383
pass_forward                  169763
autoevent_ballsnap             84594
autoevent_passforward          84180
play_action                    44390
run                            10534
qb_sack                         9729
pass_arrived                    8303
autoevent_passinterrupted       4531
man_in_motion                   3956
line_set                        3174
shift                           2898
pass_tipped                     2530
first_contact                   1702
qb_strip_sack                   1311
pass_outcome_incomplete          874
pass_outcome_caught              483
fumble                           391
fumble_offense_recovered         253
handoff                          207
huddle_break_offense              69
tackle                            69
penalty_flag                      46
lateral                           23
dropped_pass                      23
Name: event, dtype: int64

In [26]:
df_main.loc[df_main["event"] == "autoevent_ballsnap", "event"] = 'ball_snap'

The dataframe now has accurate player numbers for each frame and play. Now is the time to deal with NaN values.

In [27]:
df_main.isna().sum()

gameId                          0
playId                          0
nflId                      350079
pff_role                   350079
pff_positionLinedUp        350079
pff_hit                   4200948
pff_hurry                 4200948
pff_sack                  4200948
pff_beatenByDefender      6069786
pff_hitAllowed            6069786
pff_hurryAllowed          6069786
pff_sackAllowed           6069786
pff_nflIdBlockedPlayer    6131495
pff_blockType             6077379
pff_backFieldBlock        6077438
frameId                         0
time                            0
team                            0
x                               0
y                               0
s                               0
a                               0
dis                             0
o                          350079
dir                        350079
event                           0
playDescription                 0
quarter                         0
down                            0
yardsToGo     

According to the NFL, the tracking of 'football' doesn't include nflId, pff_role, pff_positionLinedUp, o, dir. For o and dir, I will 0 out the NaNs. NflId, pff_role, and pff_positionLinedUp will be changed in the pre-processing step of the process. I will need the information for the exploratory data analysis step. 

First, I am going to confirm that the NaNs in the dataset are all related to the 'football' tracking rows, and then 0 out the fields.

In [28]:
df_main[df_main['team'] == 'football'].isna().sum()

gameId                         0
playId                         0
nflId                     350079
pff_role                  350079
pff_positionLinedUp       350079
pff_hit                   350079
pff_hurry                 350079
pff_sack                  350079
pff_beatenByDefender      350079
pff_hitAllowed            350079
pff_hurryAllowed          350079
pff_sackAllowed           350079
pff_nflIdBlockedPlayer    350079
pff_blockType             350079
pff_backFieldBlock        350079
frameId                        0
time                           0
team                           0
x                              0
y                              0
s                              0
a                              0
dis                            0
o                         350079
dir                       350079
event                          0
playDescription                0
quarter                        0
down                           0
yardsToGo                      0
possession

In [29]:
df_main[["o","dir"]] = df_main[["o","dir"]].fillna(0)
df_main['penaltyYards'] = df_main['penaltyYards'].fillna(0)

The penalty yards row is needed for further analysis in the future. But when there is no penalty on a play, it is tracked as NaN, I will replace them with 0s since there was 0 penalty yardage on the play.

The dropBackType field is only needed to drop all plays that were 'designed_run's. Designed Runs do not effect an analysis on passing plays. I will drop all fields where the play is a designed run, and then make sure that each frame/play has 23 participants.

In [30]:
df_main = df_main[df_main['dropBackType'] != 'DESIGNED_RUN']
df_main.shape

(8047378, 56)

In [31]:
# pass_route_list = []

# play_list = []
# game_list = []
# for game in df_main['gameId'].unique():
#     df = df_main[df_main['gameId'] == game]
#     unique_plays = df['playId'].unique()
#     for play in unique_plays:
#         df1 = df[df['playId'] == play]
#         frames = df1['frameId'].unique()
#         for frame in frames:
#             df2 = (df1[df1['frameId'] == frame]) 
#             pass_route_list.append(df2[df2['pff_role'] == 'Pass Route'].shape[0])

# # print(pass_route_list)
# print(max(pass_route_list))

In [32]:
def calculateDistance(x1,y1,x2,y2):  
     dist = math.sqrt((x2 - x1)**2 + (y2 - y1)**2)  
     return dist

In [33]:
def closest_node(center_point, surrounding_points):
    closest_index = distance.cdist([center_point], surrounding_points).argmin()
    return surrounding_points[closest_index]

In [34]:
def addCatchSeparation(dfcatch):
    ball_point = (dfcatch[(dfcatch['team']=='football')]['x'].values[0], dfcatch[(dfcatch['team']=='football')]['y'].values[0])

    
    off_team = dfcatch[dfcatch['pff_positionLinedUp'] == 'QB'].iloc[0]['team']
    def_team = dfcatch[dfcatch['pff_role'] == 'Coverage'].iloc[0]['team']
    
    df_offensive = dfcatch[dfcatch['pff_role'] == 'Pass Route']
    df_defensive = dfcatch[dfcatch['pff_role'].isin(['Coverage', 'Pass Rush'])]
    
    off_coord = df_offensive[['x','y']].values
    def_coord = df_defensive[['x','y']].values
    
    defender_to_ball = closest_node(ball_point, def_coord)
    defender_to_ball_dist = calculateDistance(ball_point[0], ball_point[1], defender_to_ball[0], defender_to_ball[1])
    
    for coord in range(off_coord.shape[0]):
        off_player_coordinates = (off_coord[coord][0], off_coord[coord][1])
        closest_player_coordinates = closest_node(off_player_coordinates, def_coord)
        def_player_coordinates = df_defensive[(df_defensive['x'] == closest_player_coordinates[0]) & (df_defensive['y'] == closest_player_coordinates[1])]
        corner_dist = calculateDistance(off_coord[coord][0], off_coord[coord][1], def_player_coordinates['x'].values[0], def_player_coordinates['y'].values[0])
        index = dfcatch.index[(dfcatch['x'] == off_coord[coord][0]) & (dfcatch['y'] == off_coord[coord][1]) & (dfcatch['pff_role'] == 'Pass Route')].tolist()[0]
        ball_dist = calculateDistance(off_coord[coord][0], off_coord[coord][1], ball_point[0], ball_point[1])
        dfcatch.at[index, 'separation_from_defender'] = corner_dist
        dfcatch.at[index, 'dist_from_ball'] = ball_dist
        dfcatch.at[index, 'defender_from_ball'] = defender_to_ball_dist

    
    return dfcatch

In [35]:
df_copy = df_main.iloc[:0,:].copy()
df_copy

Unnamed: 0,gameId,playId,nflId,pff_role,pff_positionLinedUp,pff_hit,pff_hurry,pff_sack,pff_beatenByDefender,pff_hitAllowed,pff_hurryAllowed,pff_sackAllowed,pff_nflIdBlockedPlayer,pff_blockType,pff_backFieldBlock,frameId,time,team,x,y,s,a,dis,o,dir,event,playDescription,quarter,down,yardsToGo,possessionTeam,defensiveTeam,yardlineSide,yardlineNumber,gameClock,preSnapHomeScore,preSnapVisitorScore,passResult,penaltyYards,prePenaltyPlayResult,playResult,foulName1,foulNFLId1,foulName2,foulNFLId2,foulName3,foulNFLId3,absoluteYardlineNumber,offenseFormation,personnelO,defendersInBox,personnelD,dropBackType,pff_playAction,pff_passCoverage,pff_passCoverageType


In [36]:
df_new = df_main[df_main['event'].isin(['ball_snap'])]
df_new['separation_from_defender'] = np.nan
df_new['dist_from_ball'] = np.nan
df_new['defender_from_ball'] = np.nan
df_copy = df_new.iloc[:0,:].copy()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['separation_from_defender'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['dist_from_ball'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['defender_from_ball'] = np.nan


In [37]:
for game in df_new['gameId'].unique():
    df_g = df_new[df_new['gameId'] == game]
    for play in df_g['playId'].unique():
        df_p = df_g[df_g['playId'] == play]
        new_df = addCatchSeparation(df_p)
        new_df = new_df.reset_index(drop=True)
        df_copy = pd.concat([df_copy,new_df[~new_df['separation_from_defender'].isna()]])
df_copy

Unnamed: 0,gameId,playId,nflId,pff_role,pff_positionLinedUp,pff_hit,pff_hurry,pff_sack,pff_beatenByDefender,pff_hitAllowed,pff_hurryAllowed,pff_sackAllowed,pff_nflIdBlockedPlayer,pff_blockType,pff_backFieldBlock,frameId,time,team,x,y,s,a,dis,o,dir,event,playDescription,quarter,down,yardsToGo,possessionTeam,defensiveTeam,yardlineSide,yardlineNumber,gameClock,preSnapHomeScore,preSnapVisitorScore,passResult,penaltyYards,prePenaltyPlayResult,playResult,foulName1,foulNFLId1,foulName2,foulNFLId2,foulName3,foulNFLId3,absoluteYardlineNumber,offenseFormation,personnelO,defendersInBox,personnelD,dropBackType,pff_playAction,pff_passCoverage,pff_passCoverageType,separation_from_defender,dist_from_ball,defender_from_ball
1,2021090900,97,35481.0,Pass Route,TE-L,,,,,,,,,,,6,2021-09-10T00:26:31.600,TB,41.65,29.34,1.20,1.78,0.10,78.97,63.46,ball_snap,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,DAL,TB,33,13:33,0,0,I,0.0,0,0,,,,,,,43.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 2 LB, 5 DB",TRADITIONAL,0,Cover-1,Man,3.030017,5.420747,2.188995
2,2021090900,97,35634.0,Pass Route,LWR,,,,,,,,,,,6,2021-09-10T00:26:31.600,TB,41.04,36.77,0.48,2.61,0.04,113.75,54.53,ball_snap,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,DAL,TB,33,13:33,0,0,I,0.0,0,0,,,,,,,43.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 2 LB, 5 DB",TRADITIONAL,0,Cover-1,Man,2.706917,12.860517,2.188995
3,2021090900,97,39985.0,Pass Route,HB-R,,,,,,,,,,,6,2021-09-10T00:26:31.600,TB,37.55,22.27,0.04,0.04,0.01,91.23,73.76,ball_snap,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,DAL,TB,33,13:33,0,0,I,0.0,0,0,,,,,,,43.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 2 LB, 5 DB",TRADITIONAL,0,Cover-1,Man,6.139422,4.336196,2.188995
5,2021090900,97,41233.0,Pass Route,RWR,,,,,,,,,,,6,2021-09-10T00:26:31.600,TB,42.01,16.20,0.00,0.00,0.00,129.28,4.76,ball_snap,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,DAL,TB,33,13:33,0,0,I,0.0,0,0,,,,,,,43.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 2 LB, 5 DB",TRADITIONAL,0,Cover-1,Man,1.377679,7.733104,2.188995
12,2021090900,97,44896.0,Pass Route,SLWR,,,,,,,,,,,6,2021-09-10T00:26:31.600,TB,40.16,35.09,2.98,1.93,0.31,120.74,169.40,ball_snap,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,DAL,TB,33,13:33,0,0,I,0.0,0,0,,,,,,,43.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 2 LB, 5 DB",TRADITIONAL,0,Cover-1,Man,4.278247,11.257393,2.188995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30,2021110100,4433,47954.0,Pass Route,RWR,,,,,,,,,,,6,2021-11-02T03:20:21.700,NYG,28.93,12.18,0.01,0.28,0.01,86.17,71.84,ball_snap,(:35) (Shotgun) D.Jones sacked at NYG 14 for -...,4,4,15,NYG,KC,NYG,20,00:35,20,17,S,0.0,-5,-5,,,,,,,30.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 1 LB, 6 DB",TRADITIONAL,0,Quarters,Zone,6.141010,11.601246,2.455219
38,2021110100,4433,52573.0,Pass Route,SLoWR,,,,,,,,,,,6,2021-11-02T03:20:21.700,NYG,28.83,37.12,0.00,0.00,0.01,85.74,87.73,ball_snap,(:35) (Shotgun) D.Jones sacked at NYG 14 for -...,4,4,15,NYG,KC,NYG,20,00:35,20,17,S,0.0,-5,-5,,,,,,,30.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 1 LB, 6 DB",TRADITIONAL,0,Quarters,Zone,7.460670,13.342732,2.455219
39,2021110100,4433,52573.0,Pass Route,SLoWR,,,,,,,,,,,7,2021-11-02T03:20:21.800,NYG,28.84,37.13,0.00,0.00,0.01,85.74,78.07,ball_snap,(:35) (Shotgun) D.Jones sacked at NYG 14 for -...,4,4,15,NYG,KC,NYG,20,00:35,20,17,S,0.0,-5,-5,,,,,,,30.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 1 LB, 6 DB",TRADITIONAL,0,Quarters,Zone,7.468400,13.352532,2.455219
42,2021110100,4433,53449.0,Pass Route,LWR,,,,,,,,,,,6,2021-11-02T03:20:21.700,NYG,27.93,41.92,0.00,0.00,0.01,90.88,12.31,ball_snap,(:35) (Shotgun) D.Jones sacked at NYG 14 for -...,4,4,15,NYG,KC,NYG,20,00:35,20,17,S,0.0,-5,-5,,,,,,,30.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 1 LB, 6 DB",TRADITIONAL,0,Quarters,Zone,11.243496,18.177692,2.455219


In [38]:
df_copy2 = df_copy.drop_duplicates(subset=['gameId', 'playId', 'nflId'], keep='first')
df_copy2

Unnamed: 0,gameId,playId,nflId,pff_role,pff_positionLinedUp,pff_hit,pff_hurry,pff_sack,pff_beatenByDefender,pff_hitAllowed,pff_hurryAllowed,pff_sackAllowed,pff_nflIdBlockedPlayer,pff_blockType,pff_backFieldBlock,frameId,time,team,x,y,s,a,dis,o,dir,event,playDescription,quarter,down,yardsToGo,possessionTeam,defensiveTeam,yardlineSide,yardlineNumber,gameClock,preSnapHomeScore,preSnapVisitorScore,passResult,penaltyYards,prePenaltyPlayResult,playResult,foulName1,foulNFLId1,foulName2,foulNFLId2,foulName3,foulNFLId3,absoluteYardlineNumber,offenseFormation,personnelO,defendersInBox,personnelD,dropBackType,pff_playAction,pff_passCoverage,pff_passCoverageType,separation_from_defender,dist_from_ball,defender_from_ball
1,2021090900,97,35481.0,Pass Route,TE-L,,,,,,,,,,,6,2021-09-10T00:26:31.600,TB,41.65,29.34,1.20,1.78,0.10,78.97,63.46,ball_snap,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,DAL,TB,33,13:33,0,0,I,0.0,0,0,,,,,,,43.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 2 LB, 5 DB",TRADITIONAL,0,Cover-1,Man,3.030017,5.420747,2.188995
2,2021090900,97,35634.0,Pass Route,LWR,,,,,,,,,,,6,2021-09-10T00:26:31.600,TB,41.04,36.77,0.48,2.61,0.04,113.75,54.53,ball_snap,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,DAL,TB,33,13:33,0,0,I,0.0,0,0,,,,,,,43.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 2 LB, 5 DB",TRADITIONAL,0,Cover-1,Man,2.706917,12.860517,2.188995
3,2021090900,97,39985.0,Pass Route,HB-R,,,,,,,,,,,6,2021-09-10T00:26:31.600,TB,37.55,22.27,0.04,0.04,0.01,91.23,73.76,ball_snap,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,DAL,TB,33,13:33,0,0,I,0.0,0,0,,,,,,,43.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 2 LB, 5 DB",TRADITIONAL,0,Cover-1,Man,6.139422,4.336196,2.188995
5,2021090900,97,41233.0,Pass Route,RWR,,,,,,,,,,,6,2021-09-10T00:26:31.600,TB,42.01,16.20,0.00,0.00,0.00,129.28,4.76,ball_snap,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,DAL,TB,33,13:33,0,0,I,0.0,0,0,,,,,,,43.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 2 LB, 5 DB",TRADITIONAL,0,Cover-1,Man,1.377679,7.733104,2.188995
12,2021090900,97,44896.0,Pass Route,SLWR,,,,,,,,,,,6,2021-09-10T00:26:31.600,TB,40.16,35.09,2.98,1.93,0.31,120.74,169.40,ball_snap,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,DAL,TB,33,13:33,0,0,I,0.0,0,0,,,,,,,43.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 2 LB, 5 DB",TRADITIONAL,0,Cover-1,Man,4.278247,11.257393,2.188995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12,2021110100,4433,43425.0,Pass Route,HB-R,,,,0.0,0.0,0.0,0.0,43326.0,CH,1.0,6,2021-11-02T03:20:21.700,NYG,24.52,21.06,0.02,0.02,0.01,92.03,82.62,ball_snap,(:35) (Shotgun) D.Jones sacked at NYG 14 for -...,4,4,15,NYG,KC,NYG,20,00:35,20,17,S,0.0,-5,-5,,,,,,,30.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 1 LB, 6 DB",TRADITIONAL,0,Quarters,Zone,5.675077,5.326800,2.455219
16,2021110100,4433,44835.0,Pass Route,SLiWR,,,,0.0,0.0,0.0,0.0,42406.0,CH,0.0,6,2021-11-02T03:20:21.700,NYG,27.53,29.29,0.01,0.08,0.00,96.89,151.12,ball_snap,(:35) (Shotgun) D.Jones sacked at NYG 14 for -...,4,4,15,NYG,KC,NYG,20,00:35,20,17,S,0.0,-5,-5,,,,,,,30.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 1 LB, 6 DB",TRADITIONAL,0,Quarters,Zone,2.824606,5.729311,2.455219
30,2021110100,4433,47954.0,Pass Route,RWR,,,,,,,,,,,6,2021-11-02T03:20:21.700,NYG,28.93,12.18,0.01,0.28,0.01,86.17,71.84,ball_snap,(:35) (Shotgun) D.Jones sacked at NYG 14 for -...,4,4,15,NYG,KC,NYG,20,00:35,20,17,S,0.0,-5,-5,,,,,,,30.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 1 LB, 6 DB",TRADITIONAL,0,Quarters,Zone,6.141010,11.601246,2.455219
38,2021110100,4433,52573.0,Pass Route,SLoWR,,,,,,,,,,,6,2021-11-02T03:20:21.700,NYG,28.83,37.12,0.00,0.00,0.01,85.74,87.73,ball_snap,(:35) (Shotgun) D.Jones sacked at NYG 14 for -...,4,4,15,NYG,KC,NYG,20,00:35,20,17,S,0.0,-5,-5,,,,,,,30.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 1 LB, 6 DB",TRADITIONAL,0,Quarters,Zone,7.460670,13.342732,2.455219


In [39]:
for index, row in df_copy2.iterrows():
    df = df_main[df_main['gameId'] == row['gameId']]
    dfplay = df[df['playId'] == row['playId']]
    xdis = abs(dfplay[(dfplay['nflId'] == row['nflId']) & (dfplay['frameId'] == dfplay['frameId'].min())]['x'].values[0] - dfplay[(dfplay['nflId'] == row['nflId']) & (dfplay['frameId'] == row['frameId'])]['x'].values[0])
    if xdis <= 10:
        xroute = 'short'
    elif xdis > 10 and xdis <= 30:
        xroute = 'medium'
    elif xdis > 30:
        xroute = 'deep'
    yi = dfplay[(dfplay['nflId'] == row['nflId']) & (dfplay['frameId'] == dfplay['frameId'].min())]['y'].values[0]
    yf = dfplay[(dfplay['nflId'] == row['nflId']) & ((dfplay['frameId'] == row['frameId']))]['y'].values[0]
    if yi <= 26.5:
        if yf < yi:
            yroute = 'out'
        elif yf > yi:
            yroute = 'in'
        else:
            yroute = 'fly'
    if yi > 26.5:
        if yf < yi:
            yroute = 'in'
        elif yf > yi:
            yroute = 'out'
        else:
            yroute = 'fly'

    # More descriptive route name
    if xroute == 'short' and yroute == 'out':
        yroute = 'flat'
    elif xroute == 'short' and yroute == 'in':
        yroute = 'slant'
    elif xroute == 'deep' and yroute == 'in':
        yroute = 'post'
    elif xroute == 'deep' and yroute == 'out':
        yroute = 'corner'

    if abs(yf - yi) < 3 and xroute == 'short':
        yroute = 'stop'

#     print(row['nflId'], " ran a ", xroute, " ", yroute, " route!")

    df_copy2.loc[index,'xcatchingReceiverRoute'] = xroute
    df_copy2.loc[index,'ycatchingReceiverRoute'] = yroute

df_copy2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_copy2.loc[index,'xcatchingReceiverRoute'] = xroute
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_copy2.loc[index,'ycatchingReceiverRoute'] = yroute


Unnamed: 0,gameId,playId,nflId,pff_role,pff_positionLinedUp,pff_hit,pff_hurry,pff_sack,pff_beatenByDefender,pff_hitAllowed,pff_hurryAllowed,pff_sackAllowed,pff_nflIdBlockedPlayer,pff_blockType,pff_backFieldBlock,frameId,time,team,x,y,s,a,dis,o,dir,event,playDescription,quarter,down,yardsToGo,possessionTeam,defensiveTeam,yardlineSide,yardlineNumber,gameClock,preSnapHomeScore,preSnapVisitorScore,passResult,penaltyYards,prePenaltyPlayResult,playResult,foulName1,foulNFLId1,foulName2,foulNFLId2,foulName3,foulNFLId3,absoluteYardlineNumber,offenseFormation,personnelO,defendersInBox,personnelD,dropBackType,pff_playAction,pff_passCoverage,pff_passCoverageType,separation_from_defender,dist_from_ball,defender_from_ball,xcatchingReceiverRoute,ycatchingReceiverRoute
1,2021090900,97,35481.0,Pass Route,TE-L,,,,,,,,,,,6,2021-09-10T00:26:31.600,TB,41.65,29.34,1.20,1.78,0.10,78.97,63.46,ball_snap,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,DAL,TB,33,13:33,0,0,I,0.0,0,0,,,,,,,43.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 2 LB, 5 DB",TRADITIONAL,0,Cover-1,Man,3.030017,5.420747,2.188995,short,stop
2,2021090900,97,35634.0,Pass Route,LWR,,,,,,,,,,,6,2021-09-10T00:26:31.600,TB,41.04,36.77,0.48,2.61,0.04,113.75,54.53,ball_snap,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,DAL,TB,33,13:33,0,0,I,0.0,0,0,,,,,,,43.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 2 LB, 5 DB",TRADITIONAL,0,Cover-1,Man,2.706917,12.860517,2.188995,short,stop
3,2021090900,97,39985.0,Pass Route,HB-R,,,,,,,,,,,6,2021-09-10T00:26:31.600,TB,37.55,22.27,0.04,0.04,0.01,91.23,73.76,ball_snap,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,DAL,TB,33,13:33,0,0,I,0.0,0,0,,,,,,,43.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 2 LB, 5 DB",TRADITIONAL,0,Cover-1,Man,6.139422,4.336196,2.188995,short,stop
5,2021090900,97,41233.0,Pass Route,RWR,,,,,,,,,,,6,2021-09-10T00:26:31.600,TB,42.01,16.20,0.00,0.00,0.00,129.28,4.76,ball_snap,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,DAL,TB,33,13:33,0,0,I,0.0,0,0,,,,,,,43.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 2 LB, 5 DB",TRADITIONAL,0,Cover-1,Man,1.377679,7.733104,2.188995,short,stop
12,2021090900,97,44896.0,Pass Route,SLWR,,,,,,,,,,,6,2021-09-10T00:26:31.600,TB,40.16,35.09,2.98,1.93,0.31,120.74,169.40,ball_snap,(13:33) (Shotgun) T.Brady pass incomplete deep...,1,3,2,TB,DAL,TB,33,13:33,0,0,I,0.0,0,0,,,,,,,43.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 2 LB, 5 DB",TRADITIONAL,0,Cover-1,Man,4.278247,11.257393,2.188995,short,stop
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12,2021110100,4433,43425.0,Pass Route,HB-R,,,,0.0,0.0,0.0,0.0,43326.0,CH,1.0,6,2021-11-02T03:20:21.700,NYG,24.52,21.06,0.02,0.02,0.01,92.03,82.62,ball_snap,(:35) (Shotgun) D.Jones sacked at NYG 14 for -...,4,4,15,NYG,KC,NYG,20,00:35,20,17,S,0.0,-5,-5,,,,,,,30.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 1 LB, 6 DB",TRADITIONAL,0,Quarters,Zone,5.675077,5.326800,2.455219,short,stop
16,2021110100,4433,44835.0,Pass Route,SLiWR,,,,0.0,0.0,0.0,0.0,42406.0,CH,0.0,6,2021-11-02T03:20:21.700,NYG,27.53,29.29,0.01,0.08,0.00,96.89,151.12,ball_snap,(:35) (Shotgun) D.Jones sacked at NYG 14 for -...,4,4,15,NYG,KC,NYG,20,00:35,20,17,S,0.0,-5,-5,,,,,,,30.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 1 LB, 6 DB",TRADITIONAL,0,Quarters,Zone,2.824606,5.729311,2.455219,short,stop
30,2021110100,4433,47954.0,Pass Route,RWR,,,,,,,,,,,6,2021-11-02T03:20:21.700,NYG,28.93,12.18,0.01,0.28,0.01,86.17,71.84,ball_snap,(:35) (Shotgun) D.Jones sacked at NYG 14 for -...,4,4,15,NYG,KC,NYG,20,00:35,20,17,S,0.0,-5,-5,,,,,,,30.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 1 LB, 6 DB",TRADITIONAL,0,Quarters,Zone,6.141010,11.601246,2.455219,short,stop
38,2021110100,4433,52573.0,Pass Route,SLoWR,,,,,,,,,,,6,2021-11-02T03:20:21.700,NYG,28.83,37.12,0.00,0.00,0.01,85.74,87.73,ball_snap,(:35) (Shotgun) D.Jones sacked at NYG 14 for -...,4,4,15,NYG,KC,NYG,20,00:35,20,17,S,0.0,-5,-5,,,,,,,30.0,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"4 DL, 1 LB, 6 DB",TRADITIONAL,0,Quarters,Zone,7.460670,13.342732,2.455219,short,stop


In [40]:
df_copy3 = df_copy2[['gameId', 'playId', 'nflId', 'pff_role', 'pff_positionLinedUp', 'frameId', 'x','y','s','a','dis','o','dir', 'yardsToGo', 'passResult', 'prePenaltyPlayResult', 'offenseFormation', 'defendersInBox', 'dropBackType', 'pff_passCoverage', 'pff_passCoverageType', 'separation_from_defender','defender_from_ball' ,'dist_from_ball', 'xcatchingReceiverRoute', 'ycatchingReceiverRoute']]
df_copy3

Unnamed: 0,gameId,playId,nflId,pff_role,pff_positionLinedUp,frameId,x,y,s,a,dis,o,dir,yardsToGo,passResult,prePenaltyPlayResult,offenseFormation,defendersInBox,dropBackType,pff_passCoverage,pff_passCoverageType,separation_from_defender,defender_from_ball,dist_from_ball,xcatchingReceiverRoute,ycatchingReceiverRoute
1,2021090900,97,35481.0,Pass Route,TE-L,6,41.65,29.34,1.20,1.78,0.10,78.97,63.46,2,I,0,SHOTGUN,6.0,TRADITIONAL,Cover-1,Man,3.030017,2.188995,5.420747,short,stop
2,2021090900,97,35634.0,Pass Route,LWR,6,41.04,36.77,0.48,2.61,0.04,113.75,54.53,2,I,0,SHOTGUN,6.0,TRADITIONAL,Cover-1,Man,2.706917,2.188995,12.860517,short,stop
3,2021090900,97,39985.0,Pass Route,HB-R,6,37.55,22.27,0.04,0.04,0.01,91.23,73.76,2,I,0,SHOTGUN,6.0,TRADITIONAL,Cover-1,Man,6.139422,2.188995,4.336196,short,stop
5,2021090900,97,41233.0,Pass Route,RWR,6,42.01,16.20,0.00,0.00,0.00,129.28,4.76,2,I,0,SHOTGUN,6.0,TRADITIONAL,Cover-1,Man,1.377679,2.188995,7.733104,short,stop
12,2021090900,97,44896.0,Pass Route,SLWR,6,40.16,35.09,2.98,1.93,0.31,120.74,169.40,2,I,0,SHOTGUN,6.0,TRADITIONAL,Cover-1,Man,4.278247,2.188995,11.257393,short,stop
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12,2021110100,4433,43425.0,Pass Route,HB-R,6,24.52,21.06,0.02,0.02,0.01,92.03,82.62,15,S,-5,SHOTGUN,6.0,TRADITIONAL,Quarters,Zone,5.675077,2.455219,5.326800,short,stop
16,2021110100,4433,44835.0,Pass Route,SLiWR,6,27.53,29.29,0.01,0.08,0.00,96.89,151.12,15,S,-5,SHOTGUN,6.0,TRADITIONAL,Quarters,Zone,2.824606,2.455219,5.729311,short,stop
30,2021110100,4433,47954.0,Pass Route,RWR,6,28.93,12.18,0.01,0.28,0.01,86.17,71.84,15,S,-5,SHOTGUN,6.0,TRADITIONAL,Quarters,Zone,6.141010,2.455219,11.601246,short,stop
38,2021110100,4433,52573.0,Pass Route,SLoWR,6,28.83,37.12,0.00,0.00,0.01,85.74,87.73,15,S,-5,SHOTGUN,6.0,TRADITIONAL,Quarters,Zone,7.460670,2.455219,13.342732,short,stop


In [41]:
df_copy3.to_csv('Data/ball_snap_data', index=False)

This data set is now ready for the exploratory data analysis phase. I have dealt with missing values, consolidated the required for my analysis, and checked it's quality.

In [42]:
# df_main.to_csv('Data/cleaned_data', index=False)

In [43]:
# df_copy3.to_csv('Data/new_metrics', index=False)