In [188]:
import pandas as pd
import numpy as np

week1 = pd.read_csv('nfl-big-data-bowl-2021/week1.csv')

In [189]:
week1['time'] = pd.to_datetime(week1['time'], format='%Y-%m-%dT%H:%M:%S')

In [190]:
plays = pd.read_csv('nfl-big-data-bowl-2021/plays.csv')

In [191]:
week1 = week1.merge(plays, on=['playId', 'gameId'])

In [192]:
week1['time_diff'] = week1.groupby(['playId', 'gameId', 'displayName'])['time'].diff()
week1['time_diff'][week1['time_diff'].isnull()] = pd.Timedelta(0)
week1['time_acc_s'] = week1.groupby(['playId', 'gameId', 'displayName'])['time_diff'].transform(lambda x: x.map(lambda x: x.microseconds).cumsum()).div(1e6)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  week1['time_diff'][week1['time_diff'].isnull()] = pd.Timedelta(0)


In [193]:
week1['yardline_first'] = np.where(week1['absoluteYardlineNumber'].gt(week1['x'].max()),
                                  week1['absoluteYardlineNumber'].add(week1['yardsToGo']),
                                  week1['absoluteYardlineNumber'].sub(week1['yardsToGo']))

In [194]:
week1['yardsToGo']

0         15
1         15
2         15
3         15
4         15
          ..
986017     2
986018     2
986019     2
986020     2
986021     2
Name: yardsToGo, Length: 986022, dtype: int64

In [195]:
import altair as alt

alt.themes.enable('fivethirtyeight')
df = week1[week1['playId'] == week1['playId'].min()].drop(['time', 'time_diff'], axis=1)

players = alt.Chart(df).encode(
    x=alt.X('x:Q', title="Yard Line", scale=alt.Scale(zero=True)),
    y=alt.Y('y:Q', axis=None),
    color=alt.Color('displayName:N', title='Player',
                    scale=alt.Scale(scheme='category10'))
).mark_line(width=20)

scrimmage = alt.Chart(df).encode(
    x=alt.X('absoluteYardlineNumber:Q')
).mark_rule(color='blue')

first = alt.Chart(df).encode(
    x=alt.X('yardline_first:Q')
).mark_rule(color='yellow')

In [196]:
((players + scrimmage + first)
 .configure_view(fill='#348C31', fillOpacity=0.7)
 .properties(height=55*6, width=100*6, title=alt.TitleParams('Routes on First Play', anchor='middle')))

In [197]:
week1['time_elapsed'] = week1.groupby(['gameId', 'playId', 'nflId'])['time_acc_s'].cumsum()

In [198]:
week1['x_behind_line'] = np.where(week1.groupby(['nflId', 'playId'])['x'].transform(lambda x: x.iloc[0]).gt(week1['absoluteYardlineNumber']),
         week1['x'].rsub(week1['absoluteYardlineNumber']),
         week1['x'].sub(week1['absoluteYardlineNumber']))

**Inputs into route prediction:**
1. Categorical position of player
2. Formation (note: given in dataset, but predictive model needed for real-time)
3. Initial position of player relative to line and in general
4. Initial positions of all defensive players, subdivided by position
5. Line of scrimmage formatted as yards needed for touchdown
6. Yards needed for a first down
7. Time elapsed since beginning of play

In [199]:
starting_pos_count = week1.groupby(['gameId', 'playId', 'team', 'nflId'])['position'].first().reset_index().groupby(
    ['gameId', 'playId', 'team', 'position']
)['position'].apply(lambda x: x.cumsum()).rename({'position': 'position_num'}, axis=1)
starting_idx = week1.groupby(['gameId', 'playId', 'team', 'nflId']).first().reset_index()[['gameId', 'playId', 'team', 'nflId']]
starting_idx['position_num'] = starting_pos_count.values
week1_pos = week1.merge(starting_idx, on=['gameId', 'playId', 'team', 'nflId'])
week1_pos['position_num'] = week1_pos['position_num'].map(lambda x: x[:2] + str(len(x) // 2))

In [200]:
week1_pos['x_starting_behind_line'] = week1_pos.groupby(['gameId', 'playId', 'nflId'])['x_behind_line'].transform(lambda x: x.iloc[0])

In [201]:
week1_pos['y_starting'] = week1_pos.groupby(['gameId', 'playId', 'nflId'])['y'].transform(lambda x: x.iloc[0])

In [202]:
week1_pos['yards_needed_touch'] = np.where(week1_pos['absoluteYardlineNumber'].gt(week1_pos['yardline_first']),
                                           week1_pos['absoluteYardlineNumber'],
                                           week1_pos['absoluteYardlineNumber'].rsub(100))

In [203]:
off_def = (week1_pos
           .groupby(['gameId', 'playId', 'team'])['position']
           .apply(lambda x: 'QB' in x.unique() or 'WR' in x.unique())
           .reset_index()
           .rename({'position': 'off'}, axis=1)
           )

In [204]:
week1_off_def = week1_pos.merge(off_def, on=['gameId', 'playId', 'team'])

In [205]:
week1_def = week1_off_def[week1_off_def['off'] == False]
week1_off = week1_off_def[week1_off_def['off'] == True]

In [206]:
week1_def_starting = (week1_def
 .pivot_table(
    columns='position_num',
    values=['x_starting_behind_line', 'y_starting'],
    index=['gameId', 'playId']))

In [207]:
week1_def_starting_cols = ['_'.join(x) for x in week1_def_starting.columns]
week1_def_starting.columns = week1_def_starting_cols

In [208]:
week1_def_starting.reset_index(inplace=True)

In [209]:
week1_off_starting = week1_off.merge(week1_def_starting, on=['gameId', 'playId'])

In [210]:
cols_chosen = ['x', 'y', 'position_num', 'x_starting_behind_line',
               'y_starting', 'yardsToGo', 'yards_needed_touch', 'time_acc_s', 'yardline_first'] + week1_def_starting_cols

In [211]:
X_y = week1_off_starting.groupby(['nflId', 'playId', 'gameId']).apply(lambda x: x.iloc[1:])[cols_chosen]
X_y.reset_index(inplace=True, drop=True)
X_y = X_y[X_y['position_num'].notnull()]

In [212]:
X = X_y.drop(['x', 'y'], axis=1)
y = X_y[['x', 'y']]

In [213]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [215]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

impute = SimpleImputer(strategy="constant", fill_value=0)
scaler = StandardScaler()
onehot = OneHotEncoder(handle_unknown='ignore')

cat_cols = X_train.columns[X_train.dtypes == 'object']
num_cols = X_train.columns[X_train.dtypes != 'object']

cat_pipe = Pipeline([('one_hot', onehot)])
num_pipe = Pipeline([('impute', impute), ('scaler', scaler)])

full_pipe = ColumnTransformer([('cat', cat_pipe, cat_cols), ('num', num_pipe, num_cols)])
X_train_trans = full_pipe.fit_transform(X_train)

In [216]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

lin_reg = LinearRegression()
cross_val_score(lin_reg, X_train_trans, y_train, cv=5, scoring='r2')

array([0.72478918, 0.72706   , 0.72916623, 0.72867148, 0.72595438])

In [217]:
from sklearn.tree import DecisionTreeRegressor

dtree = DecisionTreeRegressor(random_state=0)
cross_val_score(dtree, X_train_trans, y_train, cv=5, scoring='r2')

array([0.99115362, 0.9913447 , 0.99171329, 0.99197275, 0.99151985])

In [218]:
from sklearn.model_selection import GridSearchCV

params = {'min_samples_split': range(2, 11)}
grid_dtree = GridSearchCV(dtree, params, cv=5, scoring='r2')
grid_dtree.fit(X_train_trans, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeRegressor(random_state=0),
             param_grid={'min_samples_split': range(2, 11)}, scoring='r2')

In [219]:
grid_dtree.best_score_

0.9915408414103082

In [220]:
grid_dtree.best_params_

{'min_samples_split': 2}

In [221]:
X_test_trans = full_pipe.transform(X_test)

In [222]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

y_pred = grid_dtree.predict(X_test_trans)
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print('RMSE:' , rmse)
mae = mean_absolute_error(y_test, y_pred)
print('MAE:', mae)
r2 = r2_score(y_test, y_pred)
print('R2:', r2)

RMSE: 1.40305672780808
MAE: 0.4253946272480993
R2: 0.9924726276606295


In [223]:
best_model = grid_dtree

In [224]:
import pickle

pickle.dump(best_model, open('ml_pipe/best_model.pkl', 'wb'))
pickle.dump(full_pipe, open('ml_pipe/full_pipe.pkl', 'wb'))

In [225]:
np.save('ml_pipe/chosen_cols.npy', np.array(cols_chosen))

In [40]:
plays = week1_off_starting[cols_chosen + ['playId', 'gameId', 'absoluteYardlineNumber']]

In [142]:
play1 = plays[plays['playId'] == plays['playId'].min()]

In [143]:
X_play1 = play1.drop(['x', 'y', 'playId', 'gameId', 'absoluteYardlineNumber'], axis=1)
y_play1 = play1[['x', 'y']]

In [144]:
X_trans_play1 = full_pipe.transform(X_play1)

In [145]:
X_train_trans.shape

(300744, 103)

In [146]:
play1[['x_pred', 'y_pred']] = best_model.predict(X_trans_play1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


In [147]:
play1.drop(['x', 'y', 'time_acc_s', 'x_pred', 'y_pred'], axis=1).drop_duplicates()

Unnamed: 0,position_num,x_starting_behind_line,y_starting,yardsToGo,yards_needed_touch,yardline_first,x_starting_CB1,x_starting_CB2,x_starting_CB3,x_starting_CB4,...,y_starting_OL4,y_starting_OL6,y_starting_OL7,y_starting_S0,y_starting_SS1,y_starting_SS2,y_starting_TE1,playId,gameId,yardline_100
33240,QB1,-4.47,23.84,10,81.0,71.0,78.36,73.12,74.86,,...,,,,,30.25,21.36,,58,2018090906,81.0
33291,TE1,-1.0,30.25,10,81.0,71.0,78.36,73.12,74.86,,...,,,,,30.25,21.36,,58,2018090906,81.0
33342,WR1,-1.25,36.54,10,81.0,71.0,78.36,73.12,74.86,,...,,,,,30.25,21.36,,58,2018090906,81.0
33393,WR2,-1.31,14.39,10,81.0,71.0,78.36,73.12,74.86,,...,,,,,30.25,21.36,,58,2018090906,81.0
33444,RB1,-4.94,21.2,10,81.0,71.0,78.36,73.12,74.86,,...,,,,,30.25,21.36,,58,2018090906,81.0
33495,WR3,-0.78,17.53,10,81.0,71.0,78.36,73.12,74.86,,...,,,,,30.25,21.36,,58,2018090906,81.0


In [148]:
play1_pred = play1[['x', 'y', 'position_num', 'x_pred', 'y_pred', 'yardline_first', 'absoluteYardlineNumber']]

In [149]:
chart1 = alt.Chart(play1_pred).encode(x=alt.X('x:Q'), y=alt.Y('y'), color=alt.Color('position_num')).mark_line()

In [150]:
chart2 = alt.Chart(play1_pred).encode(x=alt.X('x_pred:Q'), y=alt.Y('y_pred:Q'), color=alt.Color('position_num')).mark_line()
(chart1 & chart2).resolve_scale(x='shared')

In [156]:
np.random.seed(0)
[]


play1['x_starting_behind_line'] = play1['x_starting_behind_line'] + np.random.random(size=play1.shape[0])
play1['y_starting'] = play1['y_starting'] + np.random.random(size=play1.shape[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  play1['x_starting_behind_line'] = play1['x_starting_behind_line'] + np.random.random(size=play1.shape[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  play1['y_starting'] = play1['y_starting'] + np.random.random(size=play1.shape[0])


In [157]:
X_play1_alt = play1.drop(['x', 'y', 'playId', 'gameId', 'absoluteYardlineNumber'], axis=1)
y_play1_alt = play1[['x', 'y']]

In [158]:
X_play1_alt_tran = full_pipe.transform(X_play1_alt)
play1[['x_pred', 'y_pred']] = best_model.predict(X_play1_alt_tran)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


In [159]:
play1_pred_alt = play1[['x', 'y', 'position_num', 'x_pred', 'y_pred', 'yardline_first', 'absoluteYardlineNumber']]

In [160]:
chart2_alt = alt.Chart(play1_pred_alt).encode(x=alt.X('x_pred:Q'), y=alt.Y('y_pred:Q'), color=alt.Color('position_num')).mark_line()
(chart2 & chart2_alt).resolve_scale(x='shared')

In [164]:
play1[play1['position_num'] == 'WR3']['x_starting_behind_line']

33495   -0.613155
33496    0.324385
33497    0.388952
33498    1.143873
33499   -0.195705
33500   -0.298342
33501   -0.579412
33502   -0.747141
33503    1.079059
33504    0.559833
33505    0.790306
33506   -0.216540
33507    0.392820
33508   -0.652089
33509    0.191255
33510    1.174990
33511    0.973010
33512   -0.103682
33513    1.143140
33514   -0.316597
33515    1.118638
33516    1.102755
33517    0.818405
33518    0.480896
33519    0.968576
33520   -0.193959
33521    0.917887
33522    0.455753
33523   -0.753526
33524   -0.085533
33525   -0.483718
33526    1.183659
33527    0.176741
33528    0.214783
33529    0.498945
33530   -0.042831
33531   -0.506199
33532    0.864235
33533   -0.400304
33534    0.242638
33535   -0.331366
33536   -0.584311
33537    0.944383
33538    1.165839
33539    1.141669
33540    1.033111
33541    0.768095
33542   -0.113710
33543   -0.617797
33544    0.034482
33545   -0.315532
Name: x_starting_behind_line, dtype: float64

In [125]:
X_play1['x_starting_behind_line'] == X_play1_alt['x_starting_behind_line']

33240    True
33241    True
33242    True
33243    True
33244    True
         ... 
33541    True
33542    True
33543    True
33544    True
33545    True
Name: x_starting_behind_line, Length: 306, dtype: bool

In [50]:
plays_test = week1_off_starting.iloc[X_test.index]
plays_test_sorted = plays_test.reset_index().sort_values('index')

In [51]:
play1_test =  plays_test_sorted[plays_test_sorted['playId'] == plays_test_sorted['playId'].min()]

In [52]:
play1_test = play1_test[cols_chosen + ['playId', 'gameId', 'absoluteYardlineNumber']]

In [65]:
X_play1 = play1_test.drop(['x', 'y', 'playId', 'gameId', 'absoluteYardlineNumber'], axis=1)
y_play1 = play1_test[['x', 'y']]

In [66]:
X_play1_tran = full_pipe.transform(X_play1)

In [67]:
play1_test[['x_pred', 'y_pred']] = best_model.predict(X_play1_tran)

In [68]:
play1_test

Unnamed: 0,x,y,position_num,x_starting_behind_line,y_starting,yardsToGo,yards_needed_touch,time_acc_s,yardline_first,x_starting_CB1,...,y_starting_OL7,y_starting_S0,y_starting_SS1,y_starting_SS2,y_starting_TE1,playId,gameId,yardline_100,x_pred,y_pred
28164,85.34,23.92,QB1,-4.47,23.84,10,81.0,0.601,71.0,78.36,...,,,30.25,21.36,,58,2018090906,81.0,85.35,23.92
7144,85.33,23.92,QB1,-4.47,23.84,10,81.0,0.700,71.0,78.36,...,,,30.25,21.36,,58,2018090906,81.0,85.33,23.92
32676,86.76,23.85,QB1,-4.47,23.84,10,81.0,2.101,71.0,78.36,...,,,30.25,21.36,,58,2018090906,81.0,87.10,23.50
47888,87.05,23.86,QB1,-4.47,23.84,10,81.0,2.200,71.0,78.36,...,,,30.25,21.36,,58,2018090906,81.0,87.32,23.87
63886,87.96,24.00,QB1,-4.47,23.84,10,81.0,2.601,71.0,78.36,...,,,30.25,21.36,,58,2018090906,81.0,88.10,24.07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20928,74.76,16.62,WR3,-0.78,17.53,10,81.0,3.401,71.0,78.36,...,,,30.25,21.36,,58,2018090906,81.0,74.94,16.91
18154,74.64,16.34,WR3,-0.78,17.53,10,81.0,3.500,71.0,78.36,...,,,30.25,21.36,,58,2018090906,81.0,74.55,16.08
25486,74.53,15.38,WR3,-0.78,17.53,10,81.0,4.000,71.0,78.36,...,,,30.25,21.36,,58,2018090906,81.0,74.49,15.49
18849,74.58,15.33,WR3,-0.78,17.53,10,81.0,4.101,71.0,78.36,...,,,30.25,21.36,,58,2018090906,81.0,74.66,15.33


In [75]:
play1_test_pred = play1_test[['x', 'y', 'position_num', 'x_pred', 'y_pred', 'yardline_first', 'absoluteYardlineNumber', 'time_acc_s']]

In [90]:
chart1 = alt.Chart(play1_test_pred).encode(x=alt.X('x:Q'), y=alt.Y('y'), color=alt.Color('position_num')).mark_line()
chart2 = alt.Chart(play1_test_pred).encode(x=alt.X('x_pred:Q'), y=alt.Y('y_pred:Q'), color=alt.Color('position_num')).mark_line()
(chart1 & chart2).resolve_scale(x='shared')

In [77]:
play1_test_pred

Unnamed: 0,x,y,position_num,x_pred,y_pred,yardline_first,yardline_100,time_acc_s
28164,85.34,23.92,QB1,85.35,23.92,71.0,81.0,0.601
7144,85.33,23.92,QB1,85.33,23.92,71.0,81.0,0.700
32676,86.76,23.85,QB1,87.10,23.50,71.0,81.0,2.101
47888,87.05,23.86,QB1,87.32,23.87,71.0,81.0,2.200
63886,87.96,24.00,QB1,88.10,24.07,71.0,81.0,2.601
...,...,...,...,...,...,...,...,...
20928,74.76,16.62,WR3,74.94,16.91,71.0,81.0,3.401
18154,74.64,16.34,WR3,74.55,16.08,71.0,81.0,3.500
25486,74.53,15.38,WR3,74.49,15.49,71.0,81.0,4.000
18849,74.58,15.33,WR3,74.66,15.33,71.0,81.0,4.101


In [80]:
X_play1.drop(['time_acc_s'], axis=1).drop_duplicates()

Unnamed: 0,position_num,x_starting_behind_line,y_starting,yardsToGo,yards_needed_touch,yardline_first,x_starting_CB1,x_starting_CB2,x_starting_CB3,x_starting_CB4,...,y_starting_NT1,y_starting_OL1,y_starting_OL3,y_starting_OL4,y_starting_OL6,y_starting_OL7,y_starting_S0,y_starting_SS1,y_starting_SS2,y_starting_TE1
28164,QB1,-4.47,23.84,10,81.0,71.0,78.36,73.12,74.86,,...,,27.02,,,,,,30.25,21.36,
41450,TE1,-1.0,30.25,10,81.0,71.0,78.36,73.12,74.86,,...,,27.02,,,,,,30.25,21.36,
93415,WR1,-1.25,36.54,10,81.0,71.0,78.36,73.12,74.86,,...,,27.02,,,,,,30.25,21.36,
90260,WR2,-1.31,14.39,10,81.0,71.0,78.36,73.12,74.86,,...,,27.02,,,,,,30.25,21.36,
94836,RB1,-4.94,21.2,10,81.0,71.0,78.36,73.12,74.86,,...,,27.02,,,,,,30.25,21.36,
28004,WR3,-0.78,17.53,10,81.0,71.0,78.36,73.12,74.86,,...,,27.02,,,,,,30.25,21.36,
