In [319]:
import sys
sys.path.append("../")
from settings import RAW_DATA_DIR, INTERIM_DATA_DIR
import pandas as pd
import plotly.express as px
import os
import numpy as np

In [2]:
# Load the play-level and game-level tables from the raw data directory.
plays_csv_path = os.path.join(RAW_DATA_DIR, 'plays.csv')
games_csv_path = os.path.join(RAW_DATA_DIR, 'games.csv')
play_df = pd.read_csv(plays_csv_path)
games_df = pd.read_csv(games_csv_path)

In [3]:
# Week of the season to process. A single constant drives both the schedule
# filter and the tracking file name so the two cannot drift apart.
WEEK = 1
week_and_games = games_df[games_df.week == WEEK]
tracking_df = pd.read_csv(os.path.join(RAW_DATA_DIR, f'week{WEEK}.csv'))

In [4]:
# Inner join on gameId: keeps only the plays whose game is present in
# week_and_games and attaches that game's columns to every such play.
games_n_plays_df = play_df.merge(week_and_games, how='inner', on='gameId')

In [5]:
from src.features.helpers.processing import add_missing_timestamp_values

In [6]:
tracking_df = add_missing_timestamp_values(tracking_df)

Start record processing. Differentiate timestamps that have multiple records...
Time: 20:05:15
End record processing: 20:08:00


In [7]:
# Left join: every tracking row is kept and gets the play/game columns of its
# (gameId, playId); rows without a matching play get missing values there.
df_t = tracking_df.merge(games_n_plays_df, how='left', on=['gameId', 'playId'])

In [8]:
df_t.columns

Index(['time', 'x', 'y', 's', 'a', 'dis', 'o', 'dir', 'event', 'nflId',
       'displayName', 'jerseyNumber', 'position', 'frameId', 'team', 'gameId',
       'playId', 'playDirection', 'route', 'playDescription', 'quarter',
       'down', 'yardsToGo', 'possessionTeam', 'playType', 'yardlineSide',
       'yardlineNumber', 'offenseFormation', 'personnelO', 'defendersInTheBox',
       'numberOfPassRushers', 'personnelD', 'typeDropback',
       'preSnapVisitorScore', 'preSnapHomeScore', 'gameClock',
       'absoluteYardlineNumber', 'penaltyCodes', 'penaltyJerseyNumbers',
       'passResult', 'offensePlayResult', 'playResult', 'epa', 'isDefensivePI',
       'gameDate', 'gameTimeEastern', 'homeTeamAbbr', 'visitorTeamAbbr',
       'week'],
      dtype='object')

In [9]:
df_t.shape

(986022, 49)

In [10]:
gb = df_t.groupby(['gameId', 'playId'])
len(gb)

1034

In [11]:
df_c = df_t.copy()
df_c.shape

(986022, 49)

In [12]:
# Keep only the plays that contain at least one 'pass_forward' row.
# A single grouped mask replaces the per-play loop, which re-filtered the whole
# frame once for every play that had to be dropped.
is_pass_forward = df_c.event == 'pass_forward'
play_has_pass = (
    is_pass_forward
    .groupby([df_c.gameId, df_c.playId])
    .transform('any')
    .astype(bool)
)
df_c = df_c[play_has_pass]
df_c.shape

(918456, 49)

## Sort by gameId, playId, time and event

In [74]:
# Group rows by play first (gameId, playId), then order them by time and event
# inside each play.
# NOTE(review): `time` looks like ISO-8601 text in the displayed rows, so string
# order should match chronological order — confirm the column dtype.
df_s = df_c.sort_values(by=['gameId', 'playId', 'time', 'event'])

In [76]:
# Discard the old row labels and relabel rows 0..n-1 in the new sorted order.
df_s = df_s.reset_index(drop=True)

In [77]:
df_s.head(5)

Unnamed: 0,time,x,y,s,a,dis,o,dir,event,nflId,...,passResult,offensePlayResult,playResult,epa,isDefensivePI,gameDate,gameTimeEastern,homeTeamAbbr,visitorTeamAbbr,week
0,2018-09-07T01:07:14.599Z,91.73,26.67,0.0,0.01,0.02,289.57,240.93,,310.0,...,C,10,10,0.261827,False,09/06/2018,20:20:00,PHI,ATL,1
1,2018-09-07T01:07:14.599Z,88.89,36.47,0.01,0.01,0.01,105.63,66.66,,79848.0,...,C,10,10,0.261827,False,09/06/2018,20:20:00,PHI,ATL,1
2,2018-09-07T01:07:14.599Z,91.35,44.16,0.02,0.03,0.01,290.45,16.86,,2495454.0,...,C,10,10,0.261827,False,09/06/2018,20:20:00,PHI,ATL,1
3,2018-09-07T01:07:14.599Z,86.31,22.01,0.09,0.42,0.01,70.12,168.91,,2495613.0,...,C,10,10,0.261827,False,09/06/2018,20:20:00,PHI,ATL,1
4,2018-09-07T01:07:14.599Z,90.78,36.15,0.0,0.0,0.0,257.61,193.97,,2533040.0,...,C,10,10,0.261827,False,09/06/2018,20:20:00,PHI,ATL,1


# Find 2 closest players to the ball when event changes from None to something

In [78]:
df_s[df_s.event == 'pass_forward'].shape

(14166, 49)

## Remove all values before pass forward

In [323]:
gb_2 = df_s.groupby(['gameId', 'playId'])

In [324]:
df_d = df_s.copy()
df_d.shape

(918456, 49)

In [325]:
pd.set_option('display.max_rows', None)

In [None]:
# For every play keep only the rows from its first 'pass_forward' row onwards.
#
# This relies on df_d still being in the sorted (gameId, playId, time, event)
# order: a running count of 'pass_forward' rows inside each play is 0 for every
# row before the pass and positive from the pass onwards. One vectorised mask
# replaces ~1000 successive DataFrame.drop calls, each of which copied the frame.
# Plays without any 'pass_forward' row would be removed entirely; they were
# already filtered out in an earlier step.
is_pass_forward = (df_d.event == 'pass_forward').astype(int)
pass_rows_seen = is_pass_forward.groupby([df_d.gameId, df_d.playId]).cumsum()
df_d = df_d[pass_rows_seen > 0]

416/962 from 396642 to 397091

In [84]:
df_d.shape

(385372, 49)

In [85]:
df_d[df_d.event == 'pass_forward'].shape

(14166, 49)

## Which events are possible after pass forward?

In [86]:
df_d.event.value_counts()

None                         329940
pass_forward                  14166
pass_arrived                  11410
pass_outcome_caught            8133
tackle                         6567
first_contact                  6457
pass_outcome_incomplete        5291
out_of_bounds                  1728
pass_outcome_interception       485
touchdown                       352
pass_tipped                     280
pass_outcome_touchdown          214
fumble                          143
fumble_defense_recovered        100
fumble_offense_recovered         44
penalty_flag                     32
touchback                        30
Name: event, dtype: int64

In [89]:
df_next = df_d.copy()
df_next.shape

(385372, 49)

In [91]:
# Keep only the rows that carry a named event. In this frame a missing event is
# stored as the literal string 'None' (see the value counts above), hence the
# string comparison rather than a null check.
df_next = df_next[df_next.event != 'None']
df_next.shape

(55432, 49)

In [92]:
gb_3 = df_next.groupby(['gameId', 'playId'])

## Find 'situation' dataframe

In [97]:
gb_4 = df_d.groupby(['gameId', 'playId'])

In [98]:
df_s2 = df_d.copy()
df_s2.shape

(385372, 49)

In [112]:
df_s2.columns

Index(['time', 'x', 'y', 's', 'a', 'dis', 'o', 'dir', 'event', 'nflId',
       'displayName', 'jerseyNumber', 'position', 'frameId', 'team', 'gameId',
       'playId', 'playDirection', 'route', 'playDescription', 'quarter',
       'down', 'yardsToGo', 'possessionTeam', 'playType', 'yardlineSide',
       'yardlineNumber', 'offenseFormation', 'personnelO', 'defendersInTheBox',
       'numberOfPassRushers', 'personnelD', 'typeDropback',
       'preSnapVisitorScore', 'preSnapHomeScore', 'gameClock',
       'absoluteYardlineNumber', 'penaltyCodes', 'penaltyJerseyNumbers',
       'passResult', 'offensePlayResult', 'playResult', 'epa', 'isDefensivePI',
       'gameDate', 'gameTimeEastern', 'homeTeamAbbr', 'visitorTeamAbbr',
       'week'],
      dtype='object')

In [219]:
def _closest_to_ball(team_df: pd.DataFrame, ball_x: float, ball_y: float) -> pd.DataFrame:
    """Return the one row of ``team_df`` nearest to the ball, or an empty frame."""
    team_df = team_df.copy()
    team_df['distance'] = np.sqrt(np.square(team_df.x - ball_x) + np.square(team_df.y - ball_y))

    distances = team_df.distance.values.astype(float)
    if len(distances) == 0 or np.isnan(distances).all():
        # No player (or no usable coordinates): keep the columns, return no rows.
        return team_df.iloc[0:0]

    # nanargmin picks the first minimum, so equidistant players can no longer
    # produce a multi-row result that breaks a later `.item()` call.
    return team_df.iloc[[int(np.nanargmin(distances))]]


def get_closest_players(situation_df: pd.DataFrame, ball_x: float, ball_y: float) -> tuple:
    """Find the home player and the away player closest to the ball.

    Parameters
    ----------
    situation_df : pd.DataFrame
        Player rows of one moment of a play; needs the columns ``team``
        (``'home'`` / ``'away'``), ``x`` and ``y``.
    ball_x, ball_y : float
        Ball coordinates at that moment.

    Returns
    -------
    tuple
        ``(p1, p2)``: ``p1`` is the closest ``'home'`` row and ``p2`` the closest
        ``'away'`` row, each as a DataFrame with an added ``distance`` column.
        Each frame has exactly one row, or zero rows when that team has no
        player with usable coordinates in ``situation_df``.
    """
    # TODO: determine who has the ball and measure the closest opposing player
    # relative to that ball carrier.
    p1 = _closest_to_ball(situation_df[situation_df.team == 'home'], ball_x, ball_y)
    p2 = _closest_to_ball(situation_df[situation_df.team == 'away'], ball_x, ball_y)
    return (p1, p2)

In [220]:
def get_players_and_ball_indices(df: pd.DataFrame, p1: pd.Series, p2: pd.Series) -> list:
    """Return the index labels of the ball rows and of the two given players.

    Parameters
    ----------
    df : pd.DataFrame
        Rows to search; needs the columns ``team`` and ``nflId``.
    p1, p2 : pd.Series or single-row pd.DataFrame
        The two selected players. ``.nflId.item()`` must yield exactly one id
        for each of them.

    Returns
    -------
    list
        Index labels of every row in ``df`` whose ``team`` is ``'football'`` or
        whose ``nflId`` equals the id of ``p1`` or ``p2``.

    Raises
    ------
    ValueError
        If ``p1`` or ``p2`` does not hold exactly one ``nflId`` (for example an
        empty selection).
    """
    try:
        p1_id = p1.nflId.item()
        p2_id = p2.nflId.item()
    except ValueError as exc:
        # Fail loudly with context instead of dropping into an interactive
        # debugger, which hangs a non-interactive "Run All".
        raise ValueError(
            'expected exactly one nflId for each player, '
            f'got selections of size {len(p1)} and {len(p2)}'
        ) from exc

    selected = (df.team == 'football') | (df.nflId == p1_id) | (df.nflId == p2_id)
    return df[selected].index.tolist()

In [322]:
# Position, within a play's distinct events (in order of appearance), of the
# event that defines the "situation" frame.
# NOTE(review): presumably the distinct events start with 'pass_forward' followed
# by 'None', which would make position 2 the first named event after the throw —
# confirm against the data.
SITUATION_EVENT_POS = 2

# Silences pandas' SettingWithCopyWarning for the rest of the session.
pd.options.mode.chained_assignment = None

keep_indices = []
groups = len(gb_4)
for i, (name, group) in enumerate(gb_4, start=1):
    distinct_events = group.event.unique()
    if len(distinct_events) <= SITUATION_EVENT_POS:
        raise ValueError(
            f'play {name} has only {len(distinct_events)} distinct events: {list(distinct_events)}'
        )
    event_3rd = distinct_events[SITUATION_EVENT_POS]

    # All rows (players and ball) recorded at the situation event.
    situation_df = group[group.event == event_3rd]

    # First ball row of the situation; its x/y is the reference point.
    ball_row = situation_df[situation_df.team == 'football'].head(1)
    if ball_row.empty:
        raise ValueError(f'play {name} has no football row at event {event_3rd!r}')

    # Players only: the ball must not compete for "closest to the ball".
    player_situation_df = situation_df[situation_df.team != 'football']
    p1, p2 = get_closest_players(player_situation_df, ball_row.x.item(), ball_row.y.item())

    # Keep the ball and the two selected players for every row of the play.
    keep_indices.extend(get_players_and_ball_indices(group, p1, p2))

    print(f'{i}/{groups}', end='\r')

962/962

In [226]:
len(keep_indices)

78434

In [262]:
# Keep the rows whose index label was collected in keep_indices; the boolean
# mask preserves the row order of df_s2.
clean_df = df_s2[df_s2.index.isin(keep_indices)]

In [264]:
clean_df.shape

(78434, 49)

In [273]:
clean_df.head(5)

Unnamed: 0,time,x,y,s,a,dis,o,dir,event,nflId,...,passResult,offensePlayResult,playResult,epa,isDefensivePI,gameDate,gameTimeEastern,homeTeamAbbr,visitorTeamAbbr,week
491,2018-09-07T01:07:18.099Z,84.82,38.33,5.53,4.3,0.54,295.34,318.58,pass_forward,79848.0,...,C,10,10,0.261827,False,09/06/2018,20:20:00,PHI,ATL,1
492,2018-09-07T01:07:18.099Z,80.69,44.91,3.11,6.67,0.34,155.11,297.88,pass_forward,2495454.0,...,C,10,10,0.261827,False,09/06/2018,20:20:00,PHI,ATL,1
503,2018-09-07T01:07:18.099Z,96.9,26.97,2.05,3.76,0.22,,,pass_forward,,...,C,10,10,0.261827,False,09/06/2018,20:20:00,PHI,ATL,1
505,2018-09-07T01:07:18.200Z,84.46,38.77,5.87,4.26,0.57,306.15,323.06,,79848.0,...,C,10,10,0.261827,False,09/06/2018,20:20:00,PHI,ATL,1
506,2018-09-07T01:07:18.200Z,80.46,45.07,2.59,6.7,0.28,145.59,309.96,,2495454.0,...,C,10,10,0.261827,False,09/06/2018,20:20:00,PHI,ATL,1


## Check and plot

In [266]:
def animated_slice(df):
    """Build an animated scatter plot of the tracking rows in ``df``.

    Points are placed at (``x``, ``y``), coloured by ``team``, labelled with
    ``jerseyNumber`` and given a marker symbol per ``event``. The animation has
    one frame per ``time`` value and follows points by ``displayName``.

    Returns the plotly figure without showing it.
    """
    hover_columns = [
        'displayName', 'visitorTeamAbbr', 'jerseyNumber',
        's', 'a', 'dis', 'o', 'dir', 'playDirection',
    ]
    scatter_options = dict(
        x='x',
        y='y',
        color='team',
        text='jerseyNumber',
        animation_frame='time',
        animation_group='displayName',
        range_x=[-10, 130],
        range_y=[-10, 60],
        symbol='event',
        hover_data=hover_columns,
    )
    fig = px.scatter(df, **scatter_options)

    # Jersey numbers sit above the markers; markers get a fixed size.
    fig.update_traces(textposition='top center', marker_size=10)
    # Green background with white text.
    fig.update_layout(paper_bgcolor='darkgreen', plot_bgcolor='darkgreen', font_color='white')

    return fig

In [267]:
gb_5 = clean_df.groupby(['gameId', 'playId'])

In [269]:
clean_df.isDefensivePI.value_counts()

False    77360
True      1074
Name: isDefensivePI, dtype: int64

In [272]:
# Visual sanity check: animate plays in which every row is flagged as defensive
# pass interference. The original `if` body contained only comments, which is a
# syntax error, so the cell could not run. Rendering is opt-in and capped so a
# full "Run All" stays fast; set SHOW_DPI_ANIMATIONS = True to inspect the plays.
SHOW_DPI_ANIMATIONS = False
MAX_DPI_ANIMATIONS = 10

if SHOW_DPI_ANIMATIONS:
    dpi_plays_shown = 0
    for name, group in gb_5:
        is_dpi = (group.isDefensivePI).all()
        if not is_dpi:
            continue
        if dpi_plays_shown >= MAX_DPI_ANIMATIONS:
            break
        fig = animated_slice(group)
        fig.show()
        dpi_plays_shown += 1

## 10 plays were validated visually and the processing was successful

## Next: computing the important features

In [276]:
min_df = clean_df[['time', 'x', 'y', 's', 'event', 'team', 'gameId', 'playId', 'frameId', 'isDefensivePI']]
min_df.columns

Index(['time', 'x', 'y', 's', 'event', 'team', 'gameId', 'playId', 'frameId',
       'isDefensivePI'],
      dtype='object')

In [277]:
gb_6 = clean_df.groupby(['gameId', 'playId', 'frameId'])

In [280]:
len(gb_6)

26155

In [286]:
clean_df.shape

(78434, 49)

In [287]:
def calculate_distance(x1: float, y1: float, x2: float, y2: float) -> float:
    """Return the Euclidean distance between the points (x1, y1) and (x2, y2).

    The arguments are passed straight to numpy, so scalars as well as
    array-likes of matching shape are accepted.
    """
    delta_x = x1 - x2
    delta_y = y1 - y2
    return np.sqrt(np.square(delta_x) + np.square(delta_y))

In [316]:
# Build one feature row per (gameId, playId, frameId): the pairwise distances
# between the two selected players and the ball, plus their speeds.
#
# Rows are collected in a list and turned into a DataFrame once at the end.
# Calling DataFrame.append inside the loop copies the whole frame on every
# iteration, and the method was removed in pandas 2.0.
calc_columns = [
    'time',
    'p1_p2_d', 'p1_b_d', 'p2_b_d',
    'p1_s', 'p2_s', 'b_s',
    'event', 'gameId', 'playId', 'frameId', 'isDefensivePI'
]

records = []
groups = len(gb_6)
for i, (name, group) in enumerate(gb_6, start=1):
    # First row of each kind in this frame, squeezed from a one-row frame.
    ball = group[group.team == 'football'].head(1).squeeze()
    p1 = group[group.team == 'home'].head(1).squeeze()
    p2 = group[group.team == 'away'].head(1).squeeze()

    # Any row of the frame supplies the frame-level metadata.
    group_row = group.head(1).squeeze()
    records.append(
        {
            'time': group_row.time,
            'p1_p2_d': calculate_distance(p1.x, p1.y, p2.x, p2.y),
            'p1_b_d': calculate_distance(p1.x, p1.y, ball.x, ball.y),
            'p2_b_d': calculate_distance(p2.x, p2.y, ball.x, ball.y),
            'p1_s': p1.s,
            'p2_s': p2.s,
            'b_s': ball.s,
            'event': group_row.event,
            'gameId': group_row.gameId,
            'playId': group_row.playId,
            'frameId': group_row.frameId,
            'isDefensivePI': group_row.isDefensivePI
        }
    )

    print(f'{i}/{groups}', end='\r')

calc_df = pd.DataFrame(records, columns=calc_columns)

26155/26155

In [320]:
calc_df.head(5)

Unnamed: 0,time,p1_p2_d,p1_b_d,p2_b_d,p1_s,p2_s,b_s,event,gameId,playId,frameId,isDefensivePI
0,2018-09-07T01:07:18.099Z,7.76874,16.5824,24.1787,5.53,3.11,2.05,pass_forward,2018090600,75,36,False
1,2018-09-07T01:07:18.200Z,7.46257,17.2778,24.5724,5.87,2.59,1.74,,2018090600,75,37,False
2,2018-09-07T01:07:18.299Z,7.09597,15.235,22.1209,6.15,2.25,23.01,,2018090600,75,38,False
3,2018-09-07T01:07:18.400Z,6.68027,12.5596,19.0074,6.38,2.18,22.63,,2018090600,75,39,False
4,2018-09-07T01:07:18.500Z,6.23705,10.8,16.7732,6.57,2.28,22.09,,2018090600,75,40,False


In [321]:
# Persist the per-frame feature table to the interim data directory.
features_csv_path = os.path.join(INTERIM_DATA_DIR, 'processing_v3_dataframe_week1.csv')
calc_df.to_csv(features_csv_path, index=False)