In [125]:
import pandas as pd
import numpy as np

week1 = pd.read_csv('nfl-big-data-bowl-2021/week1.csv')

In [126]:
week1 = week1.copy()

In [127]:
week1['time'] = pd.to_datetime(week1['time'], format='%Y-%m-%dT%H:%M:%S')

In [128]:
week1['time_diff'] = week1.groupby('displayName')['time'].diff()

In [129]:
week1['time_diff'][week1['time_diff'].isnull()] = pd.Timedelta(0)

week1['time_acc_s'] = week1.groupby('displayName')['time_diff'].transform(lambda x: x.map(lambda x: x.microseconds).cumsum()).div(1e6)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  week1['time_diff'][week1['time_diff'].isnull()] = pd.Timedelta(0)


In [130]:
plays = pd.read_csv('nfl-big-data-bowl-2021/plays.csv')

In [131]:
week1 = week1.merge(plays, on=['playId', 'gameId'])

In [132]:
week1.head()

Unnamed: 0,time,x,y,s,a,dis,o,dir,event,nflId,...,preSnapHomeScore,gameClock,absoluteYardlineNumber,penaltyCodes,penaltyJerseyNumbers,passResult,offensePlayResult,playResult,epa,isDefensivePI
0,2018-09-07 01:07:14.599000+00:00,91.73,26.67,0.0,0.01,0.02,289.57,240.93,,310.0,...,0.0,15:00:00,90.0,,,C,10,10,0.261827,False
1,2018-09-07 01:07:14.599000+00:00,88.89,36.47,0.01,0.01,0.01,105.63,66.66,,79848.0,...,0.0,15:00:00,90.0,,,C,10,10,0.261827,False
2,2018-09-07 01:07:14.599000+00:00,91.35,44.16,0.02,0.03,0.01,290.45,16.86,,2495454.0,...,0.0,15:00:00,90.0,,,C,10,10,0.261827,False
3,2018-09-07 01:07:14.599000+00:00,86.31,22.01,0.09,0.42,0.01,70.12,168.91,,2495613.0,...,0.0,15:00:00,90.0,,,C,10,10,0.261827,False
4,2018-09-07 01:07:14.599000+00:00,90.78,36.15,0.0,0.0,0.0,257.61,193.97,,2533040.0,...,0.0,15:00:00,90.0,,,C,10,10,0.261827,False


In [133]:
week1['yardline_100'] = week1['absoluteYardlineNumber'].copy()
week1['yardline_first'] = np.where(week1['yardline_100'].gt(week1['x'].max()),
                                  week1['yardline_100'].add(week1['yardsToGo']),
                                  week1['yardline_100'].sub(week1['yardsToGo']))

In [134]:
week1['yardsToGo']

0         15
1         15
2         15
3         15
4         15
          ..
986017     2
986018     2
986019     2
986020     2
986021     2
Name: yardsToGo, Length: 986022, dtype: int64

In [135]:
import altair as alt

alt.themes.enable('fivethirtyeight')
df = week1[week1['playId'] == week1['playId'].min()].drop(['time', 'time_diff'], axis=1)

players = alt.Chart(df).encode(
    x=alt.X('x:Q', title="Yard Line", scale=alt.Scale(zero=True)),
    y=alt.Y('y:Q', axis=None),
    color=alt.Color('displayName:N', title='Player',
                    scale=alt.Scale(scheme='category10'))
).mark_line(width=20)

scrimmage = alt.Chart(df).encode(
    x=alt.X('yardline_100:Q')
).mark_rule(color='blue')

first = alt.Chart(df).encode(
    x=alt.X('yardline_first:Q')
).mark_rule(color='yellow')

In [136]:
((players + scrimmage + first)
 .configure_view(fill='#348C31', fillOpacity=0.7)
 .properties(height=55*6, width=100*6, title=alt.TitleParams('Routes on First Play', anchor='middle')))

In [137]:
week1['time_elapsed'] = week1.groupby('displayName')['time_acc_s'].cumsum()

In [138]:
week1['x_behind_line'] = np.where(week1.groupby(['nflId', 'playId'])['x'].transform(lambda x: x.iloc[0]).gt(week1['absoluteYardlineNumber']),
         week1['x'].rsub(week1['absoluteYardlineNumber']),
         week1['x'].sub(week1['absoluteYardlineNumber']))

**Inputs into route prediction:**
1. Categorical position of player
2. Formation (note: given in dataset, but predictive model needed for real-time)
3. Initial position of player relative to line and in general
4. Initial positions of all defensive players, subdivided by position
5. Line of scrimmage formatted as yards needed for touchdown
6. Yards needed for a first down
7. Time elapsed since beginning of play

In [429]:
starting_pos_count = week1.groupby(['gameId', 'playId', 'team', 'nflId'])['position'].first().reset_index().groupby(
    ['gameId', 'playId', 'team', 'position']
)['position'].apply(lambda x: x.cumsum()).rename({'position': 'position_num'}, axis=1)
starting_idx = week1.groupby(['gameId', 'playId', 'team', 'nflId']).first().reset_index()[['gameId', 'playId', 'team', 'nflId']]
starting_idx['position_num'] = starting_pos_count.values
week1_pos = week1.merge(starting_idx, on=['gameId', 'playId', 'team', 'nflId'])
week1_pos['position_num'] = week1_pos['position_num'].map(lambda x: x[:2] + str(len(x) // 2))

In [431]:
week1_pos['x_starting_behind_line'] = week1_pos.groupby(['gameId', 'playId', 'position'])['x_behind_line'].transform(lambda x: x.iloc[0])

In [432]:
week1_pos['x_starting'] = week1_pos.groupby(['gameId', 'playId', 'position'])['x'].transform(lambda x: x.iloc[0])
week1_pos['y_starting'] = week1_pos.groupby(['gameId', 'playId', 'position'])['y'].transform(lambda x: x.iloc[0])

In [433]:
week1_pos['yards_needed_touch'] = np.where(week1_pos['yardline_100'].gt(week1_pos['yardline_first']),
                                           week1_pos['yardline_100'],
                                           week1_pos['yardline_100'].rsub(100))

In [434]:
off_def = (week1_pos
           .groupby(['gameId', 'playId', 'team'])['position']
           .apply(lambda x: 'QB' in x.unique() or 'WR' in x.unique())
           .reset_index()
           .rename({'position': 'off'}, axis=1)
           )

In [435]:
week1_off_def = week1_pos.merge(off_def, on=['gameId', 'playId', 'team'])

In [436]:
week1_def = week1_off_def[week1_off_def['off'] == False]
week1_off = week1_off_def[week1_off_def['off'] == True]

In [437]:
week1_def_starting = (week1_def
 .pivot_table(
    columns='position_num',
    values=['x_starting', 'x_starting_behind_line', 'y_starting'],
    index=['gameId', 'playId']))

In [438]:
week1_def_starting_cols = ['_'.join(x) for x in week1_def_starting.columns]
week1_def_starting.columns = week1_def_starting_cols

In [439]:
week1_def_starting.reset_index(inplace=True)

In [440]:
week1_off_starting = week1_off.merge(week1_def_starting, on=['gameId', 'playId'])

In [441]:
week1_off_starting.columns

Index(['time', 'x', 'y', 's', 'a', 'dis', 'o', 'dir', 'event', 'nflId',
       ...
       'y_starting_NT1', 'y_starting_OL1', 'y_starting_OL3', 'y_starting_OL4',
       'y_starting_OL6', 'y_starting_OL7', 'y_starting_S0', 'y_starting_SS1',
       'y_starting_SS2', 'y_starting_TE1'],
      dtype='object', length=137)

In [442]:
cols_chosen = ['x', 'y', 'position_num', 'x_starting_behind_line',
               'y_starting', 'yardsToGo', 'yards_needed_touch', 'time_acc_s', 'yardline_first'] + week1_def_starting_cols

In [443]:
X_y = week1_off_starting[cols_chosen]

In [457]:
X = X_y.drop(['x', 'y'], axis=1)
y = X_y[['x', 'y']]

In [458]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [459]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

impute = SimpleImputer(strategy="constant")
scaler = StandardScaler()
onehot = OneHotEncoder()

cat_cols = X_train.columns[X_train.dtypes == 'object']
num_cols = X_train.columns[X_train.dtypes != 'object']

cat_pipe = Pipeline([('one_hot', onehot)])
num_pipe = Pipeline([('impute', impute), ('scaler', scaler)])

full_pipe = ColumnTransformer([('cat', cat_pipe, cat_cols), ('num', num_pipe, num_cols)])
X_train_trans = full_pipe.fit_transform(X_train)

In [460]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

lin_reg = LinearRegression()
cross_val_score(lin_reg, X_train_trans, y_train, cv=5, scoring='r2')

array([0.48016271, 0.47989667, 0.47978325, 0.48137025, 0.47803811])

In [461]:
from sklearn.tree import DecisionTreeRegressor

dtree = DecisionTreeRegressor(random_state=0)
cross_val_score(dtree, X_train_trans, y_train, cv=5, scoring='r2')

array([0.99373808, 0.99310472, 0.99325725, 0.99344381, 0.99261938])

In [None]:
from sklearn.model_selection import GridSearchCV

params = {'min_samples_split': range(2, 11)}
grid_dtree_x = GridSearchCV(dtree, params, cv=5, scoring='r2')
grid_dtree_x.fit(X_train_trans, y_train)

In [450]:
grid_dtree_x.best_score_

0.9964318038312744

In [451]:
X_test_trans = full_pipe.transform(X_test)

In [452]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

y_pred = grid_dtree_x.predict(X_test_trans)
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print('RMSE:' , rmse)
mae = mean_absolute_error(y_test, y_pred)
print('MAE:', mae)
r2 = r2_score(y_test, y_pred)
print('R2:', r2)

RMSE: 1.4554549118404627
MAE: 0.43122031974810415
R2: 0.9966078135935623


In [455]:
y_train_y = y_train.iloc[:, 1]
y_test_y = y_test.iloc[:, 1]

In [456]:
dtree_y = DecisionTreeRegressor(random_state=0)
cross_val_score(dtree_y, X_train_trans, y_train_y, cv=5)

array([0.99064576, 0.98997035, 0.98975131, 0.98952702, 0.98950014])

In [359]:
lin_reg_y = LinearRegression()
cross_val_score(lin_reg_y, X_train_trans, y_train_y, cv=5)

array([0.09757562, 0.1000148 , 0.0962663 , 0.09981098, 0.09285922])

In [361]:
grid_dtree_y = GridSearchCV(dtree_y, params, cv=5, scoring='r2')
grid_dtree_y.fit(X_train_trans, y_train_y)

GridSearchCV(cv=5, estimator=DecisionTreeRegressor(random_state=0),
             param_grid={'min_samples_split': range(2, 11)}, scoring='r2')

In [362]:
grid_dtree_y.best_score_

0.7198346870470026

In [363]:
grid_dtree_y.best_params_

{'min_samples_split': 10}

In [364]:
params2 = {'min_samples_split': range(10, 21)}
grid_dtree_y2 = GridSearchCV(dtree_y, params2, cv=5, scoring='r2')
grid_dtree_y2.fit(X_train_trans, y_train_y)

GridSearchCV(cv=5, estimator=DecisionTreeRegressor(random_state=0),
             param_grid={'min_samples_split': range(10, 21)}, scoring='r2')

In [365]:
grid_dtree_y2.best_score_

0.7610923550346148

In [366]:
grid_dtree_y2.best_params_

{'min_samples_split': 20}

In [367]:
params3 = {'min_samples_split': range(20, 41, 2)}
grid_dtree_y3 = GridSearchCV(dtree_y, params3, cv=5, scoring='r2')
grid_dtree_y3.fit(X_train_trans, y_train_y)

GridSearchCV(cv=5, estimator=DecisionTreeRegressor(random_state=0),
             param_grid={'min_samples_split': range(20, 41, 2)}, scoring='r2')

In [368]:
grid_dtree_y3.best_params_

{'min_samples_split': 34}

In [369]:
grid_dtree_y3.best_score_

0.7717880338095024

In [370]:
best_model_x = grid_dtree_x
best_model_y = grid_dtree_y3

In [372]:
y_pred_y = best_model_y.predict(X_test_trans)
rmse_y = mean_squared_error(y_test_y, y_pred_y) ** 0.5
print('RMSE:' , rmse_y)
mae_y = mean_absolute_error(y_test_y, y_pred_y)
print('MAE:', mae_y)
r2_y = r2_score(y_test_y, y_pred_y)
print('R2:', r2_y)

RMSE: 5.534329742421087
MAE: 2.662524747472196
R2: 0.7755224685845884


In [393]:
plays = week1_off_starting[cols_chosen + ['playId', 'gameId', 'yardline_100']]

In [394]:
play1 = plays[plays['playId'] == plays['playId'].min()]

In [395]:
X_play1 = play1.drop(['x', 'y', 'playId', 'gameId'], axis=1)
y_play1_x = play1['x']

In [396]:
X_trans_play1 = full_pipe.transform(X_play1)

In [397]:
play1['x_pred'] = best_model_x.predict(X_trans_play1)
play1['y_pred'] = best_model_y.predict(X_trans_play1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  play1['x_pred'] = best_model_x.predict(X_trans_play1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  play1['y_pred'] = best_model_y.predict(X_trans_play1)


In [399]:
play1_pred = play1[['x', 'y', 'position_num', 'x_pred', 'y_pred', 'yardline_first', 'yardline_100']]

In [404]:
chart1 = alt.Chart(play1_pred).encode(x=alt.X('x:Q'), y=alt.Y('y'), color=alt.Color('position_num')).mark_line()

In [406]:
chart1

In [307]:
import statsmodels.api as sm

endog_x = y_train.iloc[:, 0]
exog_x = X_train_trans
mod_x = sm.OLS(endog_x, exog_x)
results_x = mod_x.fit()
print(results_x.summary())

                            OLS Regression Results                            
Dep. Variable:                      x   R-squared:                       0.858
Model:                            OLS   Adj. R-squared:                  0.857
Method:                 Least Squares   F-statistic:                 3.934e+04
Date:                Fri, 25 Feb 2022   Prob (F-statistic):               0.00
Time:                        20:37:06   Log-Likelihood:            -1.1011e+06
No. Observations:              300744   AIC:                         2.202e+06
Df Residuals:                  300697   BIC:                         2.203e+06
Df Model:                          46                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1            58.7784      1.390     42.283      0.0

In [266]:
endog_y = y_train.iloc[:, 1]
exog_y = X_train_trans
mod_y = sm.OLS(endog_y, exog_y)
results_y = mod_y.fit()
print(results_y.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.099
Model:                            OLS   Adj. R-squared:                  0.099
Method:                 Least Squares   F-statistic:                     612.4
Date:                Fri, 25 Feb 2022   Prob (F-statistic):               0.00
Time:                        20:32:37   Log-Likelihood:            -1.1494e+06
No. Observations:              300744   AIC:                         2.299e+06
Df Residuals:                  300689   BIC:                         2.300e+06
Df Model:                          54                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1         -4.103e+12   1.61e+12     -2.548      0.0

In [267]:
y_pred_x = results_x.predict(X_test_trans)
y_test_x = y_test.iloc[:, 0]

In [268]:
rmse_x = mean_squared_error(y_test_x, y_pred_x) ** 0.5
mae_x = mean_absolute_error(y_test_x, y_pred_x)
print("RMSE for x model:", rmse_x)
print("MAE for x model:", mae_x)

RMSE for x model: 8.808742911843032
MAE for x model: 6.2646044077173695


In [269]:
y_pred_y = results_y.predict(X_test_trans)
y_test_y = y_test.iloc[:, 1]

In [270]:
rmse_y = mean_squared_error(y_test_y, y_pred_y) ** 0.5
mae_y = mean_absolute_error(y_test_y, y_pred_y)
print("RMSE for y model:", rmse_y)
print("MAE for y model:", mae_y)

RMSE for y model: 11.121118954063988
MAE for y model: 8.543724992188883


In [271]:
ci_results_x = results_x.conf_int(0.1)

In [272]:
ci_results_linspace = ci_results_x.apply(lambda x: np.linspace(x[0], x[1], 10), axis=1).explode()

In [273]:
idx_part = (ci_results_linspace
 .groupby(ci_results_linspace.index)
 .apply(lambda x: range(x.shape[0]))
 .explode().rename({'data': 'idx_part'}, axis=1))
ci_results_linspace = ci_results_linspace.reset_index()

In [274]:
ci_results_linspace['idx_part'] = idx_part.values

In [275]:
ci_results_linspace.rename({0: 'param_val'}, axis=1, inplace=True)
ci_results_linspace = ci_results_linspace.pivot_table(columns='idx_part', values='param_val', index='index')

In [276]:
ci_results_linspace['idx'] = ci_results_linspace.index.str.extract('(\d+)').astype(int).values
ci_results_linspace = ci_results_linspace.sort_values('idx').drop('idx', axis=1)

In [277]:
result_range = X_test_trans @ ci_results_linspace

In [278]:
result_range

idx_part,0,1,2,3,4,5,6,7,8,9
0,-4.152091e+12,-3.229404e+12,-2.306717e+12,-1.384030e+12,-4.613434e+11,4.613434e+11,1.384030e+12,2.306717e+12,3.229404e+12,4.152091e+12
1,-4.152091e+12,-3.229404e+12,-2.306717e+12,-1.384030e+12,-4.613434e+11,4.613434e+11,1.384030e+12,2.306717e+12,3.229404e+12,4.152091e+12
2,-4.152091e+12,-3.229404e+12,-2.306717e+12,-1.384030e+12,-4.613434e+11,4.613434e+11,1.384030e+12,2.306717e+12,3.229404e+12,4.152091e+12
3,-4.152091e+12,-3.229404e+12,-2.306717e+12,-1.384030e+12,-4.613434e+11,4.613434e+11,1.384030e+12,2.306717e+12,3.229404e+12,4.152091e+12
4,-4.152091e+12,-3.229404e+12,-2.306717e+12,-1.384030e+12,-4.613434e+11,4.613434e+11,1.384030e+12,2.306717e+12,3.229404e+12,4.152091e+12
...,...,...,...,...,...,...,...,...,...,...
100244,-4.152091e+12,-3.229404e+12,-2.306717e+12,-1.384030e+12,-4.613434e+11,4.613434e+11,1.384030e+12,2.306717e+12,3.229404e+12,4.152091e+12
100245,-4.152091e+12,-3.229404e+12,-2.306717e+12,-1.384030e+12,-4.613434e+11,4.613434e+11,1.384030e+12,2.306717e+12,3.229404e+12,4.152091e+12
100246,-4.152091e+12,-3.229404e+12,-2.306717e+12,-1.384030e+12,-4.613434e+11,4.613434e+11,1.384030e+12,2.306717e+12,3.229404e+12,4.152091e+12
100247,-4.152091e+12,-3.229404e+12,-2.306717e+12,-1.384030e+12,-4.613434e+11,4.613434e+11,1.384030e+12,2.306717e+12,3.229404e+12,4.152091e+12


In [200]:
results_x.conf_int(0.05)

Unnamed: 0,0,1
x1,58.585366,63.655791
x2,60.088821,60.842807
x3,58.358905,59.251027
x4,58.821125,58.989128
x5,59.009803,59.195778
x6,58.510172,58.653734
x7,59.00478,59.110223
x8,14.513444,14.779341
x9,0.05093,0.144205
x10,-0.135939,-0.07272


In [201]:
ci_results_linspace

idx_part,0,1,2,3,4,5,6,7,8,9
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
x1,58.992963,59.465767,59.93857,60.411374,60.884177,61.35698,61.829784,62.302587,62.775391,63.248194
x10,-0.130857,-0.124962,-0.119067,-0.113172,-0.107277,-0.101382,-0.095487,-0.089592,-0.083697,-0.077802
x11,-1.814158,-1.789007,-1.763857,-1.738706,-1.713556,-1.688405,-1.663255,-1.638104,-1.612954,-1.587804
x12,9.489679,9.630273,9.770867,9.911461,10.052054,10.192648,10.333242,10.473836,10.61443,10.755024
x13,0.181897,0.188157,0.194418,0.200678,0.206938,0.213198,0.219458,0.225718,0.231978,0.238239
x14,-11.767903,-11.613103,-11.458302,-11.303501,-11.1487,-10.9939,-10.839099,-10.684298,-10.529498,-10.374697
x15,8.621393,8.648199,8.675004,8.70181,8.728616,8.755422,8.782228,8.809034,8.83584,8.862645
x16,0.026881,0.040087,0.053294,0.0665,0.079707,0.092913,0.10612,0.119326,0.132533,0.145739
x17,-0.762619,-0.743176,-0.723732,-0.704289,-0.684845,-0.665402,-0.645958,-0.626515,-0.607071,-0.587628
x18,-0.450942,-0.423767,-0.396592,-0.369416,-0.342241,-0.315065,-0.28789,-0.260715,-0.233539,-0.206364


In [189]:
X_test_trans @ results_x.conf_int(alpha=0.05)

Unnamed: 0,0,1
0,51.404999,51.656685
1,86.063966,90.768407
2,40.110251,36.945019
3,51.995652,51.755175
4,78.041109,82.180387
...,...,...
100244,41.760524,38.276722
100245,34.900764,31.909587
100246,25.043493,19.771509
100247,26.506075,21.984185
