In [1]:
import os, cv2
from tqdm import tqdm 
import pandas as pd 
from sklearn.model_selection import GroupKFold
import numpy as np 


## Functions

In [2]:
def compute_distance(df_l, tr_tracking, merge_col="datetime", use_cols=["x_position", "y_position"]):
    """
    Attach tracking features for both players of each labelled pair and compute their distance.

    Parameters
    ----------
    df_l : pandas.DataFrame
        Pair labels. Must contain "game_play", `merge_col`, "nfl_player_id_1"
        and "nfl_player_id_2".
    tr_tracking : pandas.DataFrame
        Player tracking rows. Must contain "game_play", `merge_col`,
        "nfl_player_id" and every column named in `use_cols`.
    merge_col : str
        Name of the time key present in both frames (default "datetime").
    use_cols : list of str
        Tracking columns copied once per player. Must include "x_position"
        and "y_position", because the distance is computed from them.

    Returns
    -------
    pandas.DataFrame
        A new frame (inputs are not modified) holding the columns of `df_l`
        with both player-id columns cast to str, followed by "<col>_1" and
        "<col>_2" for every entry of `use_cols`, and a "distance" column with
        the Euclidean distance between the two (x, y) positions. Both joins
        are left joins, so a player without a matching tracking row gets NaN
        features and a NaN distance.
    """
    use_cols = list(use_cols)  # private copy; also accepts tuples
    key_cols = ["game_play", merge_col]

    # Cast and slice the tracking frame once instead of once per merge.
    tracking = tr_tracking.astype({"nfl_player_id": "str"})[key_cols + ["nfl_player_id"] + use_cols]

    # Compare ids as strings on BOTH sides. Casting only player 1 makes the
    # player-2 merge fail (or silently miss rows) whenever that column is not
    # already made of strings.
    df_combo = df_l.astype({"nfl_player_id_1": "str", "nfl_player_id_2": "str"})

    for player in ("1", "2"):
        df_combo = (
            df_combo.merge(
                tracking,
                left_on=key_cols + ["nfl_player_id_" + player],  # label-side keys
                right_on=key_cols + ["nfl_player_id"],           # tracking-side keys
                how="left",
            )
            .drop("nfl_player_id", axis=1)
            .rename(columns={c: c + "_" + player for c in use_cols})
        )

    # Euclidean distance between the two players on the field plane.
    df_combo["distance"] = np.sqrt(
        np.square(df_combo["x_position_1"] - df_combo["x_position_2"])
        + np.square(df_combo["y_position_1"] - df_combo["y_position_2"])
    )
    return df_combo

## Operation

In [3]:
# Load the player-pair contact labels and preview the first three rows.
df_l = pd.read_csv(filepath_or_buffer='data/train_labels.csv')
df_l.iloc[:3]

Unnamed: 0,contact_id,game_play,datetime,step,nfl_player_id_1,nfl_player_id_2,contact
0,58168_003392_0_38590_43854,58168_003392,2020-09-11T03:01:48.100Z,0,38590,43854,0
1,58168_003392_0_38590_41257,58168_003392,2020-09-11T03:01:48.100Z,0,38590,41257,0
2,58168_003392_0_38590_41944,58168_003392,2020-09-11T03:01:48.100Z,0,38590,41944,0


In [6]:
# Sanity check: show label rows at step -172 (the recorded output is empty).
df_l.loc[df_l['step'] == -172].head()

Unnamed: 0,contact_id,game_play,datetime,step,nfl_player_id_1,nfl_player_id_2,contact


In [7]:
# Show the first label rows recorded at step 0.
df_l.loc[df_l['step'] == 0].head()

Unnamed: 0,contact_id,game_play,datetime,step,nfl_player_id_1,nfl_player_id_2,contact
0,58168_003392_0_38590_43854,58168_003392,2020-09-11T03:01:48.100Z,0,38590,43854,0
1,58168_003392_0_38590_41257,58168_003392,2020-09-11T03:01:48.100Z,0,38590,41257,0
2,58168_003392_0_38590_41944,58168_003392,2020-09-11T03:01:48.100Z,0,38590,41944,0
3,58168_003392_0_38590_42386,58168_003392,2020-09-11T03:01:48.100Z,0,38590,42386,0
4,58168_003392_0_38590_47944,58168_003392,2020-09-11T03:01:48.100Z,0,38590,47944,0


In [8]:
# Load the per-player tracking rows and preview the first three.
tr_tracking = pd.read_csv(filepath_or_buffer="data/train_player_tracking.csv")
tr_tracking.iloc[:3]

Unnamed: 0,game_play,game_key,play_id,nfl_player_id,datetime,step,team,position,jersey_number,x_position,y_position,speed,distance,direction,orientation,acceleration,sa
0,58580_001136,58580,1136,44830,2021-10-10T21:08:20.900Z,-108,away,CB,22,61.59,42.6,1.11,0.11,320.33,263.93,0.71,-0.64
1,58580_001136,58580,1136,47800,2021-10-10T21:08:20.900Z,-108,away,DE,97,59.48,26.81,0.23,0.01,346.84,247.16,1.29,0.9
2,58580_001136,58580,1136,52444,2021-10-10T21:08:20.900Z,-108,away,FS,29,72.19,31.46,0.61,0.06,11.77,247.69,0.63,-0.33


In [9]:
# Tracking features copied for each player of a pair; x/y are required for the distance.
use_cols = [
    'x_position',
    'y_position',
    'speed',
    'distance',
    'direction',
    'orientation',
    'acceleration',
    'sa',
]
# Replace the labels frame with its feature-enriched version (merge key defaults to "datetime").
df_l = compute_distance(df_l=df_l, tr_tracking=tr_tracking, use_cols=use_cols)

In [10]:
# Keep only the Endzone view rows, and only the play id and video start time columns.
df_m = pd.read_csv(filepath_or_buffer='data/train_video_metadata.csv')
df_m = df_m.loc[df_m['view'] == 'Endzone', ['game_play', 'start_time']]

In [11]:
# Attach the video start time to every label row (default inner join on game_play),
# then parse both timestamp columns as timezone-aware UTC datetimes.
df_l = df_l.merge(df_m, on=['game_play'])
for ts_col in ("datetime", "start_time"):
    df_l[ts_col] = pd.to_datetime(df_l[ts_col], utc=True)

# Worked example for the first row:
#   df_l['datetime'][0]        => Timestamp('2020-09-11 03:01:48.100000+0000', tz='UTC')
#   df_l['datetime'][0].value  => 1599793308100000000
#   df_l['start_time'][0]      => Timestamp('2020-09-11 03:01:43.134000+0000', tz='UTC')
#   pd.to_timedelta(50, "ms")  => Timedelta('0 days 00:00:00.050000')
#   datetime - start_time          => Timedelta('0 days 00:00:04.966000')
#   datetime - start_time - 50 ms  => Timedelta('0 days 00:00:04.916000')
#   (datetime - start_time - 50 ms) * 59.94 / 1000
#       => Timedelta('0 days 00:00:00.294665040')  (a scalar Timedelta stays a Timedelta when scaled)

In [45]:
# Confirm the parsed dtype of the label timestamps.
df_l['datetime'].iloc[0:2]

0   2020-09-11 03:01:48.100000+00:00
1   2020-09-11 03:01:48.100000+00:00
Name: datetime, dtype: datetime64[ns, UTC]

In [12]:
# ['frame']: estimated video frame index at the labelled timestamp.
#   frame = (label time - video start time - 50 ms) * 59.94 frames/s
# Subtracting 50 ms is the same as moving the video start 50 ms later; at
# 59.94 frames per second that is 50 * 59.94 / 1000 ~= 3 frames.
# Original author's note: without this offset the frame numbers are 2 or 3
# frames ahead of those in "train_baseline_helmets.csv".
#
# Dividing by a 1 ms Timedelta gives float milliseconds on every pandas
# version. `.astype('timedelta64[ms]')` only did so before pandas 2.0; from 2.0
# on it keeps a timedelta dtype, which would turn 'frame' into a timedelta.
df_l['frame'] = (
    (df_l['datetime'] - df_l['start_time'] - pd.to_timedelta(50, "ms"))
    / pd.to_timedelta(1, "ms")  # elapsed time as float milliseconds
) * 59.94 / 1000                # ms -> s (/1000), s -> frames (*59.94)
    # example: 4916 ms * 59.94 / 1000 => 294.66504 frames

In [13]:
# Preview the enriched labels: tracking features, distance, start_time and frame.
df_l.iloc[:3]

Unnamed: 0,contact_id,game_play,datetime,step,nfl_player_id_1,nfl_player_id_2,contact,x_position_1,y_position_1,speed_1,...,y_position_2,speed_2,distance_2,direction_2,orientation_2,acceleration_2,sa_2,distance,start_time,frame
0,58168_003392_0_38590_43854,58168_003392,2020-09-11 03:01:48.100000+00:00,0,38590,43854,0,40.33,25.28,0.52,...,16.79,0.74,0.06,263.92,294.74,1.74,1.74,8.650763,2020-09-11 03:01:43.134000+00:00,294.66504
1,58168_003392_0_38590_41257,58168_003392,2020-09-11 03:01:48.100000+00:00,0,38590,41257,0,40.33,25.28,0.52,...,15.59,0.67,0.07,180.38,270.88,0.34,0.08,11.112592,2020-09-11 03:01:43.134000+00:00,294.66504
2,58168_003392_0_38590_41944,58168_003392,2020-09-11 03:01:48.100000+00:00,0,38590,41944,0,40.33,25.28,0.52,...,22.85,0.68,0.08,234.17,282.07,0.81,0.81,2.948525,2020-09-11 03:01:43.134000+00:00,294.66504


In [44]:
# Confirm 'frame' is a float frame index.
df_l['frame'].iloc[0:2]

0    294.66504
1    294.66504
Name: frame, dtype: float64

In [74]:
# Report (rows, columns) of the labels frame.
print((len(df_l.index), len(df_l.columns)))

(4721618, 27)


In [48]:
# game_play looks like "<game_key>_<play_id>".
df_l['game_play'].iloc[0:2]

0    58168_003392
1    58168_003392
Name: game_play, dtype: object

In [49]:
# game_key is the part of game_play before the first underscore.
df_l['game_key'] = [play.partition('_')[0] for play in df_l['game_play']]

In [52]:
# Check the extracted game keys.
df_l['game_key'].iloc[0:2]

0    58168
1    58168
Name: game_key, dtype: object

In [75]:
# Assign every row to one of 5 validation folds, grouped by game_key.
# Rows sharing a game_key are passed as one group, so a game is never split
# between folds; there are more than 5 distinct game keys, so each fold holds
# several games.
df_l['fold'] = -1
# Locate the column by name. A positional -1 only works while 'fold' happens
# to be the last column, and would overwrite another column otherwise.
fold_col = df_l.columns.get_loc('fold')
group_kfold = GroupKFold(n_splits=5)
# Only the groups determine the split, so no target is passed.
for fold_id, (train_index, val_index) in enumerate(
    group_kfold.split(df_l, groups=df_l['game_key'].values)
):
    # val_index holds positional row numbers, hence iloc.
    df_l.iloc[val_index, fold_col] = fold_id

In [76]:
# Keep only the columns written to the folds file: ids, the eight tracking
# features for player 1 then player 2, the label, frame, pair distance and fold.
df_l = df_l[
    ['contact_id', 'nfl_player_id_1', 'nfl_player_id_2']
    + [
        f'{feature}_{player}'
        for player in (1, 2)
        for feature in (
            'x_position', 'y_position', 'speed', 'distance',
            'direction', 'orientation', 'acceleration', 'sa',
        )
    ]
    + ['contact', 'frame', 'distance', 'fold']
]

In [77]:
# Write the fold table without the index; floats are formatted to 3 decimals.
df_l.to_csv('data/train_folds.csv', index=False, float_format='%.3f')

# Class balance (contact vs. no contact) inside each of the 5 folds.
for i in range(5):
    in_fold = df_l['fold'] == i
    print(df_l.loc[in_fold, 'contact'].value_counts())

0    933661
1     12053
Name: contact, dtype: int64
0    931485
1     13976
Name: contact, dtype: int64
0    934228
1     11365
Name: contact, dtype: int64
0    932929
1     15821
Name: contact, dtype: int64
0    924793
1     11307
Name: contact, dtype: int64
