In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
import lightgbm as lgb

In [5]:
tracking1 = pd.read_csv('./data/nfl-big-data-bowl-2021/week1.csv')
tracking2 = pd.read_csv('./data/nfl-big-data-bowl-2021/week2.csv')
tracking3 = pd.read_csv('./data/nfl-big-data-bowl-2021/week3.csv')
tracking4 = pd.read_csv('./data/nfl-big-data-bowl-2021/week4.csv')
tracking5 = pd.read_csv('./data/nfl-big-data-bowl-2021/week5.csv')
tracking6 = pd.read_csv('./data/nfl-big-data-bowl-2021/week6.csv')
tracking7 = pd.read_csv('./data/nfl-big-data-bowl-2021/week7.csv')
tracking8 = pd.read_csv('./data/nfl-big-data-bowl-2021/week8.csv')
# tracking9 = pd.read_csv('./data/nfl-big-data-bowl-2021/week9.csv')
# tracking10 = pd.read_csv('./data/nfl-big-data-bowl-2021/week10.csv')
# tracking11 = pd.read_csv('./data/nfl-big-data-bowl-2021/week11.csv')
# tracking12 = pd.read_csv('./data/nfl-big-data-bowl-2021/week12.csv')
# tracking13 = pd.read_csv('./data/nfl-big-data-bowl-2021/week13.csv')
# tracking14 = pd.read_csv('./data/nfl-big-data-bowl-2021/week14.csv')
# tracking15 = pd.read_csv('./data/nfl-big-data-bowl-2021/week15.csv')
# tracking16 = pd.read_csv('./data/nfl-big-data-bowl-2021/week16.csv')
# tracking17 = pd.read_csv('./data/nfl-big-data-bowl-2021/week17.csv')
df = pd.concat([tracking1, 
                      tracking2, 
                      tracking3, 
                      tracking4,
                      tracking5,
                      tracking6, 
                      tracking7, 
                      tracking8
                     ], axis=0)


# Create Dataset

In [119]:
defense_positions = ['SS', 'FS', 'MLB', 'CB', 'LB', 'OLB', 'ILB', 'DL', 'DB', 'NT', 'S', 'DE']
pass_arrived_time = ['pass_outcome_caught', 'pass_outcome_incomplete', 'pass_outcome_interception', 'pass_outcome_touchdown']

### Get Targeted Receiver

In [122]:
pass_arrived = df[df['event'].isin(pass_arrived_time)]
ball_positions = pass_arrived[pass_arrived['nflId'].isna()][['gameId', 'playId', 'x', 'y']].rename(columns={'x': 'ball_x', 'y': 'ball_y'})
pass_arrived = pass_arrived.merge(ball_positions, on=['gameId', 'playId'], how='left')
pass_arrived['distance_to_ball'] = np.sqrt((pass_arrived['x'] - pass_arrived['ball_x'])**2 + (pass_arrived['y'] - pass_arrived['ball_y'])**2)
pass_arrived = pass_arrived[~pass_arrived['position'].isin(defense_positions)]
pass_arrived = pass_arrived[pass_arrived['nflId'].notna()]
pass_arrived = pass_arrived[pass_arrived['distance_to_ball'].notna()]
closest_players = pass_arrived.loc[pass_arrived.groupby(['gameId', 'playId'])['distance_to_ball'].idxmin()][['gameId', 'playId', 'nflId', 'distance_to_ball']]
closest_players = closest_players.rename(columns={'nflId':'target_receiver_nflId'})
closest_players = closest_players.drop('distance_to_ball', axis=1)

### Get Targeted Reciever Location When Ball Thrown

In [125]:
targeted_receiver_thrown_time_location = df.merge(closest_players, left_on=['gameId', 'playId', 'nflId'], right_on=['gameId', 'playId', 'target_receiver_nflId'])

In [126]:
targeted_receiver_thrown_time_location = targeted_receiver_thrown_time_location[targeted_receiver_thrown_time_location['event'] == 'pass_forward'][['gameId', 
                                                                                                                                                    'playId',
                                                                                                                                                    'x', 
                                                                                                                                                    'y', 
                                                                                                                                                    's', 
                                                                                                                                                    'a', 
                                                                                                                                                    'dis', 
                                                                                                                                                    'o', 
                                                                                                                                                    'dir', 
                                                                                                                                                    'target_receiver_nflId']]

In [127]:
targeted_receiver_thrown_time_location = targeted_receiver_thrown_time_location.rename(columns={'x':'x_targeted_receiver',
                                                      'y':'y_targeted_receiver',
                                                      's':'s_targeted_receiver',
                                                      'a':'a_targeted_receiver',
                                                      'dis':'dis_targeted_receiver',
                                                      'o':'o_targeted_receiver',
                                                      'dir':'dir_targeted_receiver'})

### Get Pass Forward Locations

In [132]:
non_qb_rushers_blockers_positions = ['SS', 'WR', 'FS', 'RB', 'MLB', 'CB', 'TE', 'LB', 'OLB', 'HB', 'ILB', 'DB', 'S']

pass_thrown_time = ['pass_forward']

pass_thrown_locations = df[df.isin(non_qb_rushers_blockers_positions)]

pass_thrown_locations = df[df['event'].isin(pass_thrown_time)][['gameId', 'playId','x', 'y', 's', 'a', 'dis', 'o', 'dir', 'nflId']]

pass_thrown_locations = pass_thrown_locations.dropna()


In [133]:
### Get Pass Arrived Locations

pass_arrived_time = ['pass_outcome_caught', 'pass_outcome_incomplete', 'pass_outcome_interception', 'pass_outcome_touchdown']

pass_arrived_locations = df[df.isin(non_qb_rushers_blockers_positions)]

pass_arrived_locations = df[df['event'].isin(pass_arrived_time)][['gameId','playId','x', 'y', 's', 'a', 'dis', 'o', 'dir', 'nflId']]

pass_arrived_locations = pass_arrived_locations.dropna()

### Merge pass forward, pass arrived, and targeted receiver data

In [225]:
data = pass_thrown_locations.merge(pass_arrived_locations, on=['gameId', 'playId', 'nflId'], suffixes=('_thrown_time', '_arrived_time'))

In [227]:
data = data.merge(targeted_receiver_thrown_time_location, 
           how='left',
           left_on=['gameId', 'playId'], 
           right_on=['gameId', 'playId'])

# Build Model For All Players On Field

In [230]:
y = data[[col for col in data.columns if col.endswith('_arrived_time')]]

X = data[[col for col in data.columns if col not in y.columns]]

#X = X.drop(['gameId', 'playId', 'nflId', 'target_receiver_nflId'], axis=1)

In [232]:
X['o_diff_from_targeted_receiver'] = X['o_thrown_time'] - X['o_targeted_receiver']

X['dir_diff_from_targeted_receiver'] = X['dir_thrown_time'] - X['dir_targeted_receiver']

X['s_diff_from_targeted_receiver'] = X['s_thrown_time'] - X['s_targeted_receiver']

X['a_diff_from_targeted_receiver'] = X['a_thrown_time'] - X['a_targeted_receiver']

X['dist_from_targeted_receiver'] = np.sqrt(((X['x_thrown_time'] - X['x_targeted_receiver']) ** 2) + ((X['y_thrown_time'] - X['y_targeted_receiver']) ** 2))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['o_diff_from_targeted_receiver'] = X['o_thrown_time'] - X['o_targeted_receiver']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['dir_diff_from_targeted_receiver'] = X['dir_thrown_time'] - X['dir_targeted_receiver']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['s_diff_from_targeted_receiver'

In [234]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [236]:
X_train = X_train.drop(['gameId', 'playId', 'nflId', 'target_receiver_nflId'], axis=1)
X_test = X_test.drop(['gameId', 'playId', 'nflId', 'target_receiver_nflId'], axis=1)

### LightGBM

In [239]:
pipeline = Pipeline([
    ('model', MultiOutputRegressor(lgb.LGBMRegressor()))  # Model step
])

param_grid = {
    'model__estimator__objective': ['regression'],
    'model__estimator__verbosity': [-1],
    'model__estimator__n_estimators': [50],
    'model__estimator__learning_rate': [0.1],
    'model__estimator__num_leaves': [50]
}


grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, 
                           scoring='r2', cv=5)

grid_search.fit(X_train, y_train)

y_pred = grid_search.predict(X_test)



### Evaluate Model

In [242]:
r2_scores = {}
for i, column in enumerate(y_test.columns):
    r2_scores[column] = r2_score(y_test[column], y_pred[:, i])

for target, score in r2_scores.items():
    print(f"Target {target}: R² = {score}")

Target x_arrived_time: R² = 0.9922638903272535
Target y_arrived_time: R² = 0.9657788543168788
Target s_arrived_time: R² = 0.45972711889928985
Target a_arrived_time: R² = 0.2802402769571213
Target dis_arrived_time: R² = 0.4722118869322669
Target o_arrived_time: R² = 0.2674626858348996
Target dir_arrived_time: R² = 0.29400558795331033


### Analyze Output

In [246]:
X_test

Unnamed: 0,x_thrown_time,y_thrown_time,s_thrown_time,a_thrown_time,dis_thrown_time,o_thrown_time,dir_thrown_time,x_targeted_receiver,y_targeted_receiver,s_targeted_receiver,a_targeted_receiver,dis_targeted_receiver,o_targeted_receiver,dir_targeted_receiver,o_diff_from_targeted_receiver,dir_diff_from_targeted_receiver,s_diff_from_targeted_receiver,a_diff_from_targeted_receiver,dist_from_targeted_receiver
101912,69.14,7.90,9.47,2.07,0.95,71.35,91.08,69.14,7.90,9.47,2.07,0.95,71.35,91.08,0.00,0.00,0.00,0.00,0.000000
59863,88.41,37.98,5.53,4.49,0.56,64.18,59.01,79.08,34.33,7.24,2.54,0.72,32.86,333.74,31.32,-274.73,-1.71,1.95,10.018553
77282,91.37,7.54,7.19,1.48,0.71,148.61,107.12,89.81,28.15,6.57,6.77,0.67,206.57,136.83,-57.96,-29.71,0.62,-5.29,20.668955
95297,89.41,10.19,3.09,4.19,0.33,37.70,97.42,91.90,13.19,1.61,6.11,0.18,251.71,158.06,-214.01,-60.64,1.48,-1.92,3.898731
45815,84.08,20.59,2.37,5.03,0.22,270.08,300.85,90.19,37.41,7.30,1.20,0.73,153.97,102.83,116.11,198.02,-4.93,3.83,17.895376
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59844,62.10,43.47,5.98,2.13,0.61,108.43,100.42,59.89,26.00,3.74,3.10,0.39,210.77,124.41,-102.34,-23.99,2.24,-0.97,17.609231
44882,95.87,28.21,2.51,1.39,0.26,35.67,56.77,80.42,39.21,5.59,3.16,0.58,216.55,280.96,-180.88,-224.19,-3.08,-1.77,18.965825
29983,50.50,15.49,4.77,1.43,0.48,172.07,162.70,60.17,1.48,2.77,2.31,0.29,283.47,251.70,-111.40,-89.00,2.00,-0.88,17.023190
45966,52.57,29.53,1.17,3.79,0.11,239.21,132.99,44.75,23.29,4.24,0.91,0.43,298.08,87.65,-58.87,45.34,-3.07,2.88,10.004499


In [248]:
index_to_evaluate = 59863

In [250]:
grid_search.predict(X_test.loc[[index_to_evaluate]])

array([[ 92.86132479,  41.85623384,   4.21224338,   3.19488887,
          0.44728069, 198.01982844, 148.02856965]])

In [252]:
X_test.loc[[index_to_evaluate]]

Unnamed: 0,x_thrown_time,y_thrown_time,s_thrown_time,a_thrown_time,dis_thrown_time,o_thrown_time,dir_thrown_time,x_targeted_receiver,y_targeted_receiver,s_targeted_receiver,a_targeted_receiver,dis_targeted_receiver,o_targeted_receiver,dir_targeted_receiver,o_diff_from_targeted_receiver,dir_diff_from_targeted_receiver,s_diff_from_targeted_receiver,a_diff_from_targeted_receiver,dist_from_targeted_receiver
59863,88.41,37.98,5.53,4.49,0.56,64.18,59.01,79.08,34.33,7.24,2.54,0.72,32.86,333.74,31.32,-274.73,-1.71,1.95,10.018553


In [254]:
y_test.loc[[index_to_evaluate]]

Unnamed: 0,x_arrived_time,y_arrived_time,s_arrived_time,a_arrived_time,dis_arrived_time,o_arrived_time,dir_arrived_time
59863,93.1,39.6,4.05,2.46,0.42,330.5,76.0


# Finding Depth Of Target

In [428]:
play_direction = df[['gameId', 'playId', 'playDirection']].drop_duplicates()

plays = pd.read_csv('./data/nfl-big-data-bowl-2021/plays.csv')

plays = plays[['gameId', 'playId', 'yardlineNumber', 'absoluteYardlineNumber']]

X_with_yards = X.merge(plays, left_on=['gameId', 'playId'], right_on=['gameId', 'playId'])

X_with_yards = X_with_yards.merge(play_direction, left_on=['gameId', 'playId'], right_on=['gameId', 'playId'])

In [429]:
location_predictions = pd.DataFrame(grid_search.predict(X_with_yards[grid_search.feature_names_in_]), columns=y_test.columns)

play_start_info = X_with_yards[['yardlineNumber', 'absoluteYardlineNumber', 'playDirection', 'nflId', 'target_receiver_nflId']]

dot_df = location_predictions.join(play_start_info)

dot_df = dot_df[dot_df['nflId'] == dot_df['target_receiver_nflId']]

In [430]:
dot_df[dot_df['playDirection'] == 'right']

Unnamed: 0,x_arrived_time,y_arrived_time,s_arrived_time,a_arrived_time,dis_arrived_time,o_arrived_time,dir_arrived_time,yardlineNumber,absoluteYardlineNumber,playDirection,nflId,target_receiver_nflId
71,19.241409,49.422817,5.356701,2.730257,0.534120,218.403735,189.930071,4,14.0,right,2552600.0,2552600.0
77,23.563425,47.910394,5.462609,2.613332,0.549695,227.605653,188.661484,8,18.0,right,2506467.0,2506467.0
164,52.068463,39.940213,3.610765,2.497932,0.365378,192.957457,186.525095,37,47.0,right,2552600.0,2552600.0
189,21.151342,40.614181,3.647690,2.586474,0.386938,133.051196,185.829766,18,28.0,right,2543583.0,2543583.0
305,62.697269,47.858900,5.009226,2.701374,0.496575,202.120518,214.214258,27,37.0,right,2495454.0,2495454.0
...,...,...,...,...,...,...,...,...,...,...,...,...
103557,46.414938,28.105603,5.712899,2.703576,0.589466,192.528976,166.593109,25,35.0,right,2495454.0,2495454.0
103582,50.640491,7.583312,4.445490,2.548891,0.442147,214.629907,177.538505,40,50.0,right,2560854.0,2560854.0
103598,65.679573,5.740680,2.636309,2.458396,0.272795,214.626501,179.742294,45,55.0,right,2560854.0,2560854.0
103607,65.048867,41.725630,6.786053,2.459775,0.685646,212.723931,77.744652,45,65.0,right,2552418.0,2552418.0


In [434]:
dot_df[dot_df['playDirection'] == 'left']

Unnamed: 0,x_arrived_time,y_arrived_time,s_arrived_time,a_arrived_time,dis_arrived_time,o_arrived_time,dir_arrived_time,yardlineNumber,absoluteYardlineNumber,playDirection,nflId,target_receiver_nflId
2,78.920312,48.099028,3.085539,2.223917,0.312922,137.512622,137.874800,20,90.0,left,2495454.0,2495454.0
18,44.248250,33.266638,4.679571,2.384242,0.469935,209.469926,247.692236,39,49.0,left,2552418.0,2552418.0
31,50.687940,8.521576,4.318805,2.437028,0.450821,207.281998,202.128159,39,49.0,left,2543583.0,2543583.0
41,27.303922,12.099759,6.075998,2.724712,0.631208,190.461783,225.804973,39,49.0,left,2495454.0,2495454.0
56,7.626714,41.267420,4.332839,2.427056,0.468315,150.814026,166.092619,1,11.0,left,2543583.0,2543583.0
...,...,...,...,...,...,...,...,...,...,...,...,...
103636,47.013308,1.718071,5.574914,2.683263,0.584320,230.542958,256.369225,28,82.0,left,2555295.0,2555295.0
103654,6.883066,48.128817,6.238726,2.743917,0.635819,208.055121,295.300195,14,24.0,left,2557858.0,2557858.0
103665,19.619838,35.431682,5.609359,2.789232,0.574681,166.801546,183.139311,14,24.0,left,2555295.0,2555295.0
103673,11.922854,40.772351,4.335746,2.398035,0.464000,156.740265,218.245162,8,18.0,left,2532835.0,2532835.0


In [438]:
plays_test['playDescription'][71]

'(4:32) (Shotgun) N.Foles pass short right to D.Sproles pushed ob at ATL 37 for 6 yards (R.Allen).'