# Data preprocessing for clustering pre-snap movements

## Importing the libraries

Here we import all the necessary libraries

In [1]:
import pandas as pd

## Loading the data

First, we need to load all the necessary datasets.

In [2]:
# Game data: contains the teams playing in each game
games = pd.read_csv(filepath_or_buffer='data/games.csv')

In [3]:
# Play data: contains play-level information from each game
plays = pd.read_csv(filepath_or_buffer='data/plays.csv')

In [4]:
# Player data: contains player-level information for players that
# participated in any of the tracking data files
players = pd.read_csv(filepath_or_buffer='data/players.csv')

In [5]:
# Player play data: contains player-level stats for each game and play
player_play = pd.read_csv(filepath_or_buffer='data/player_play.csv')

In [6]:
# Tracking data: player tracking data, one CSV file per week (nine weeks).
# The files are read in week order and bound to track1 ... track9.
(
    track1, track2, track3,
    track4, track5, track6,
    track7, track8, track9,
) = map(pd.read_csv, (
    'data/tracking_week_1.csv',
    'data/tracking_week_2.csv',
    'data/tracking_week_3.csv',
    'data/tracking_week_4.csv',
    'data/tracking_week_5.csv',
    'data/tracking_week_6.csv',
    'data/tracking_week_7.csv',
    'data/tracking_week_8.csv',
    'data/tracking_week_9.csv',
))

Let's also combine all the tracking data into one DataFrame.

In [7]:
# Stack the nine weekly frames row-wise; ignore_index gives the result a fresh 0..n-1 index.
track = pd.concat(
    objs=[
        track1, track2, track3,
        track4, track5, track6,
        track7, track8, track9,
    ],
    ignore_index=True,
)

## Adjusting play direction

Since we want our data to be consistent, we will adjust play direction, so that all the plays are directed left to right.

In [8]:
FIELD_LENGTH = 120.0  # extent of the x axis used for the flip
FIELD_WIDTH = 53.3    # extent of the y axis used for the flip

# Rows that belong to plays moving right-to-left and therefore need flipping.
left_plays = track['playDirection'] == 'left'

# Build a NEW frame instead of aliasing `track`: the raw data stays untouched
# and this cell can be re-run without a KeyError on 'playDirection'.
# After the flip every play runs left to right, so the column carries no
# information and is dropped.
track_adjusted = track.drop(columns=['playDirection'])

# Flip coordinates to the other side of the field (x -> L - x, y -> W - y).
track_adjusted.loc[left_plays, 'x'] = FIELD_LENGTH - track_adjusted.loc[left_plays, 'x']
track_adjusted.loc[left_plays, 'y'] = FIELD_WIDTH - track_adjusted.loc[left_plays, 'y']

# Flipping BOTH axes is a 180-degree rotation of the field, so every angle
# must be rotated by 180 degrees as well. (360 - angle) would be a mirror
# about a single axis and is inconsistent with the x/y flip above.
for angle_col in ('o', 'dir'):
    track_adjusted.loc[left_plays, angle_col] = (
        track_adjusted.loc[left_plays, angle_col] + 180
    ) % 360

## Leaving possession team only

Since we are interested only in the offensive team's pre-snap movements, let's remove all the rows that correspond to the defensive team from the dataset. The rows for the football itself are kept.

In [10]:
# Attach 'possessionTeam' from 'plays' to every tracking row of the same play.
# A left join keeps every tracking row, even if its play has no match in 'plays'.
possession_lookup = plays[['gameId', 'playId', 'possessionTeam']]
track_with_possession_team = track_adjusted.merge(
    possession_lookup,
    how='left',
    on=['gameId', 'playId'],
)

In [11]:
# Keep rows of the team in possession plus the football rows
# (football rows are recognised by club == 'football', case-insensitively).
club = track_with_possession_team['club']
is_possession_team = club.eq(track_with_possession_team['possessionTeam'])
is_football = club.str.lower().eq('football')
mask = is_possession_team | is_football

# .copy() detaches the result from the merged frame.
track_possession_team_only = track_with_possession_team.loc[mask].copy()

In [12]:
# 'possessionTeam' was only needed for the filter above.
track_possession_team_only = track_possession_team_only.drop('possessionTeam', axis='columns')

## Leaving only actions before snap

We are going to work with player movements before snap, so let's leave only corresponding rows in the data.

In [14]:
# Keep only the frames flagged as happening before the snap.
is_before_snap = track_possession_team_only['frameType'].eq('BEFORE_SNAP')
track_before = track_possession_team_only.loc[is_before_snap]

## Adjusting the coordinates

We've already added some consistency to the dataset by ensuring all the plays are displayed in the same direction. But similar plays can still differ a lot in their coordinates. To take this into account, we will adjust the coordinates based on the ball position.

Since the ball essentially does not move before the snap, we will take the final position of the ball before the snap and shift the coordinates of all the players in that play relative to it. In the end, the football will have coordinates (0, 0) in its final pre-snap frame. If a player is one yard to the left of the ball, he will have coordinates (-1, 0), and so on.

So first, let's get the position of the football.

In [16]:
# Only ball coordinates: rows with a missing 'nflId' are treated as the football.
has_no_player_id = track_before['nflId'].isnull()
ball_presnap = track_before.loc[has_no_player_id]

In [17]:
# Final position of the ball before snap: one row per (gameId, playId).
# Sort by frame number first so that "last row of each play" really is the
# last pre-snap frame, independent of the row order of the input files.
ball_final = (
    ball_presnap
    .sort_values(['gameId', 'playId', 'frameId'])
    .groupby(['gameId', 'playId'])
    .tail(1)
    .reset_index(drop=True)
)

In [18]:
# Leaving only necessary columns: play identifiers and the ball coordinates.
ball_final = ball_final.loc[:, ['gameId', 'playId', 'x', 'y']]

In [20]:
# Rename the coordinates so they do not clash with the players' 'x'/'y'
# when merged back in; rename() returns a new frame, leaving ball_final as is.
ball_position = (
    ball_final[['gameId', 'playId', 'x', 'y']]
    .rename(columns={'x': 'ball_x', 'y': 'ball_y'})
)

And now, let's adjust the coordinates.

In [21]:
# Attach the final pre-snap ball position to every row of the same play,
# then express each row's coordinates as an offset from that position.
track_ball_adjusted = (
    track_before
    .merge(ball_position, how='left', on=['gameId', 'playId'])
    .assign(
        x_adjusted=lambda df: df['x'] - df['ball_x'],
        y_adjusted=lambda df: df['y'] - df['ball_y'],
    )
)

In [22]:
# Overwrite 'x'/'y' with the ball-relative values, then discard the helper columns.
track_ball_adjusted = (
    track_ball_adjusted
    .assign(
        x=track_ball_adjusted['x_adjusted'],
        y=track_ball_adjusted['y_adjusted'],
    )
    .drop(columns=['x_adjusted', 'y_adjusted', 'ball_x', 'ball_y'])
)

Let's just confirm the data looks like we want now.

In [23]:
# Preview the first five rows of the ball-relative data.
track_ball_adjusted.head(n=5)

Unnamed: 0,gameId,playId,nflId,displayName,frameId,frameType,time,jerseyNumber,club,x,y,s,a,dis,o,dir,event
0,2022091200,64,39987.0,Geno Smith,1,BEFORE_SNAP,2022-09-13 00:16:03.5,7.0,SEA,-4.949999,-0.7,0.31,0.31,0.03,44.04,17.42,huddle_break_offense
1,2022091200,64,39987.0,Geno Smith,2,BEFORE_SNAP,2022-09-13 00:16:03.6,7.0,SEA,-4.939999,-0.66,0.38,0.41,0.04,56.72,19.89,
2,2022091200,64,39987.0,Geno Smith,3,BEFORE_SNAP,2022-09-13 00:16:03.7,7.0,SEA,-4.929999,-0.62,0.44,0.41,0.04,71.65,19.91,
3,2022091200,64,39987.0,Geno Smith,4,BEFORE_SNAP,2022-09-13 00:16:03.8,7.0,SEA,-4.909999,-0.58,0.53,0.54,0.05,83.46,24.17,
4,2022091200,64,39987.0,Geno Smith,5,BEFORE_SNAP,2022-09-13 00:16:03.9,7.0,SEA,-4.879999,-0.53,0.56,0.47,0.05,91.62,27.66,


## Removing data before line set

We will keep only the entries from the line set onward, since everything before it doesn't really have any significance.

In [24]:
# Earliest 'time' carrying a 'line_set' event, per play.
# Plays without such an event do not appear in the result.
is_line_set = track_ball_adjusted['event'].eq('line_set')
line_set_times = (
    track_ball_adjusted.loc[is_line_set]
    .groupby(['gameId', 'playId'])['time']
    .agg('min')
    .rename('line_set_time')
    .reset_index()
)

In [26]:
# Attach each play's line-set time to all of its rows.
track_after_line_set = track_ball_adjusted.merge(
    line_set_times,
    how='left',
    on=['gameId', 'playId'],
)

# Keep rows at or after the line set. Rows of plays with no line-set time
# have a missing 'line_set_time', fail the comparison, and are dropped too.
at_or_after_line_set = track_after_line_set['time'].ge(track_after_line_set['line_set_time'])
track_after_line_set = track_after_line_set.loc[at_or_after_line_set].copy()

In [28]:
# The helper column was only needed for the filter above.
track_after_line_set = track_after_line_set.drop(columns=['line_set_time'])

## Reducing the number of frames

Often players don't start to move right after the line set. So we will keep only the last 50 frames (which corresponds to 5 seconds) before the ball snap. This is a reasonable number that still reflects the trajectory of the movement while reducing the dataset size. Keeping the number of frames consistent for each situation will also help us when we go on to train the model. (Players with fewer than 50 frames after the line set keep all of their frames.)

In [30]:
# Convert the 'time' column to datetime format.
# errors='coerce' turns values that cannot be parsed into NaT instead of raising.
parsed_time = pd.to_datetime(track_after_line_set['time'], errors='coerce')
track_after_line_set = track_after_line_set.assign(time=parsed_time)

In [32]:
# Order rows chronologically within each (game, play, player).
sort_keys = ['gameId', 'playId', 'nflId', 'time']
track_after_line_set_sorted = track_after_line_set.sort_values(sort_keys)

In [34]:
# Number of trailing frames to keep per player and play
# (50 frames correspond to 5 seconds, per the notebook text above).
FRAMES_BEFORE_SNAP = 50

# One group per player within a play. Rows with a missing 'nflId'
# (the football) presumably fall out here, since groupby drops NaN keys by
# default -- TODO confirm for the pandas version in use.
track_after_line_set_grouped = track_after_line_set_sorted.groupby(['gameId', 'playId', 'nflId'])

# Leave only the last FRAMES_BEFORE_SNAP rows of each group, in the current
# (time-sorted) row order. Groups with fewer rows keep all of them, so the
# frame count is capped, not guaranteed to be equal across players.
track_after_line_set_50_frames = (
    track_after_line_set_grouped
    .tail(FRAMES_BEFORE_SNAP)
    .reset_index(drop=True)
)

## Detecting pre-snap movements

We are interested only in players that moved before the ball snap. To be more specific, we are interested in players' motions, and to be even more specific – in motions that don't stop before the snap. Luckily, the 'player_play' dataset has a boolean column 'inMotionAtBallSnap' that indicates whether the player was in motion at the moment of the ball snap. We will use it to keep only such players in our dataset.

In [35]:
# Attach the per-player 'inMotionAtBallSnap' flag from 'player_play'.
# A left join keeps every tracking row; rows without a match get a missing flag.
motion_flags = player_play[['gameId', 'playId', 'nflId', 'inMotionAtBallSnap']]
track_with_motion_column = track_after_line_set_50_frames.merge(
    motion_flags,
    how='left',
    on=['gameId', 'playId', 'nflId'],
)

In [39]:
# Leave only players in motion: rows whose flag is exactly True
# (False and missing flags are both excluded).
in_motion = track_with_motion_column['inMotionAtBallSnap'].eq(True)
track_movers = track_with_motion_column.loc[in_motion].copy()

In [40]:
# Drop unnecessary column: the flag is True on every remaining row.
track_movers = track_movers.drop(columns=['inMotionAtBallSnap'])

## A bit more preprocessing

Let's look at the number of moving players in each play.

In [49]:
# Number of distinct players ('nflId') in motion for each (gameId, playId).
movers_count = (
    track_movers
    .groupby(['gameId', 'playId'])['nflId']
    .agg('nunique')
    .reset_index()
)

In [52]:
# Frequency table: for each mover count, how many plays have it.
# sort_index() orders the table by the number of movers.
movers_frequency = (
    movers_count['nflId']
    .value_counts()
    .sort_index()
    .reset_index()
    .set_axis(['Number of Movers', 'Number of Plays'], axis=1)
)

In [53]:
# Display the results: plain-text table of how many plays have each number of movers
print(movers_frequency)

   Number of Movers  Number of Plays
0                 1             4372
1                 2               60
2                 3                5
3                 4                2
4                 5                4


We can see that most plays have only one mover at the ball snap. Multiple movers (especially high values) may be errors in the data, so for consistency let's just keep the plays with one mover.

In [55]:
# Rename the per-play mover count column to something descriptive.
movers_count = movers_count.rename(columns={'nflId': 'count'})

# Plays with exactly one player in motion at the snap.
movers_count_1 = movers_count[movers_count['count'] == 1].copy()

# Match rows to those plays on the (gameId, playId) pair directly.
# This avoids building a string key and, unlike the previous version, does
# not add a leftover 'key' column to `track_movers` or `movers_count_1`.
single_mover_plays = pd.MultiIndex.from_frame(movers_count_1[['gameId', 'playId']])
row_plays = pd.MultiIndex.from_frame(track_movers[['gameId', 'playId']])

# Boolean array aligned with track_movers' rows; the original index is kept.
track_movers_1 = track_movers[row_plays.isin(single_mover_plays)].copy()

## Dropping unnecessary columns

Finally, let's look at the features we still have in the dataset.

In [56]:
# Preview the first five rows to see which columns are left.
track_movers_1.head(n=5)

Unnamed: 0,gameId,playId,nflId,displayName,frameId,frameType,time,jerseyNumber,club,x,y,s,a,dis,o,dir,event
850,2022090800,80,47857.0,Devin Singletary,40,BEFORE_SNAP,2022-09-09 00:24:28.400,26.0,BUF,-2.399998,-9.88,0.0,0.0,0.0,92.95,83.97,
851,2022090800,80,47857.0,Devin Singletary,41,BEFORE_SNAP,2022-09-09 00:24:28.500,26.0,BUF,-2.399998,-9.88,0.0,0.0,0.0,92.95,91.04,
852,2022090800,80,47857.0,Devin Singletary,42,BEFORE_SNAP,2022-09-09 00:24:28.600,26.0,BUF,-2.399998,-9.88,0.0,0.09,0.0,93.56,130.42,
853,2022090800,80,47857.0,Devin Singletary,43,BEFORE_SNAP,2022-09-09 00:24:28.700,26.0,BUF,-2.389998,-9.88,0.02,0.17,0.0,93.56,123.26,
854,2022090800,80,47857.0,Devin Singletary,44,BEFORE_SNAP,2022-09-09 00:24:28.800,26.0,BUF,-2.389998,-9.87,0.05,0.27,0.0,93.56,127.13,


We will not need 'frameType' and 'event' columns anymore, so let's drop them.

In [57]:
# 'frameType' and 'event' were only needed for the earlier filtering steps.
track_movers_1 = track_movers_1.drop(['frameType', 'event'], axis='columns')

## Saving the dataset

After all the preprocessing is done let's save our data.

In [59]:
# Write the preprocessed data to disk; index=False leaves the row index out of the file.
track_movers_1.to_csv(path_or_buf='data/track_preprocessed.csv', index=False)