In [2]:
import pandas as pd
import numpy as np
import os

dir = 'assets'
fp = dir + '/full_position.csv'
if not os.path.exists(fp):
    os.mkdir(dir)
    positions = pd.DataFrame()
    for week in range(1, 18):
        week = pd.read_csv(f'nfl-big-data-bowl-2021/week{week}.csv')
        positions = pd.concat([positions, week], axis=0)
    positions.to_csv(fp, index=False)
else:
    positions = pd.read_csv(fp)

In [3]:
positions.shape

(18309388, 19)

In [4]:
plays = pd.read_csv('nfl-big-data-bowl-2021/plays.csv')

In [5]:
positions['time'] = pd.to_datetime(positions['time'], format='%Y-%m-%dT%H:%M:%S')

In [6]:
starting_pos = positions.groupby(['gameId', 'playId', 'position', 'nflId', 'team'])[['x', 'y']].first().reset_index()

In [7]:
starting_pos_plays = starting_pos.merge(plays, on=['gameId', 'playId'], how='left')

In [8]:
starting_pos_plays.head()

Unnamed: 0,gameId,playId,position,nflId,team,x,y,playDescription,quarter,down,...,preSnapHomeScore,gameClock,absoluteYardlineNumber,penaltyCodes,penaltyJerseyNumbers,passResult,offensePlayResult,playResult,epa,isDefensivePI
0,2018090600,75,CB,2552689.0,home,82.67,20.53,(15:00) M.Ryan pass short right to J.Jones pus...,1,1,...,0.0,15:00:00,90.0,,,C,10,10,0.261827,False
1,2018090600,75,CB,2555383.0,home,84.0,43.49,(15:00) M.Ryan pass short right to J.Jones pus...,1,1,...,0.0,15:00:00,90.0,,,C,10,10,0.261827,False
2,2018090600,75,FB,2559033.0,away,95.13,26.71,(15:00) M.Ryan pass short right to J.Jones pus...,1,1,...,0.0,15:00:00,90.0,,,C,10,10,0.261827,False
3,2018090600,75,FS,2495613.0,home,86.31,22.01,(15:00) M.Ryan pass short right to J.Jones pus...,1,1,...,0.0,15:00:00,90.0,,,C,10,10,0.261827,False
4,2018090600,75,FS,2534832.0,home,73.64,28.7,(15:00) M.Ryan pass short right to J.Jones pus...,1,1,...,0.0,15:00:00,90.0,,,C,10,10,0.261827,False


In [9]:
starting_pos_plays.shape

(263173, 32)

In [10]:
games = pd.read_csv('nfl-big-data-bowl-2021/games.csv')

In [11]:
starting_pos_play_game = starting_pos_plays.merge(games, on='gameId', how='left')

In [12]:
starting_pos_play_game['offdef'] = np.where((starting_pos_play_game['team'] == 'away') &
                                            (starting_pos_play_game['possessionTeam'] == starting_pos_play_game['visitorTeamAbbr']),
                                            'offense', 'defense')

In [13]:
starting_off = starting_pos_play_game[starting_pos_play_game['offdef'] == 'offense']

In [14]:
personnel = starting_off['personnelO'].str.extract('(?P<RB>\d+)\sRB\,\s(?P<TE>\d+)\sTE\,\s(?P<WR>\d+)\sWR')
personnel = personnel.astype(float)

In [41]:
starting_off_pers = pd.concat([starting_off, personnel], axis=1)

In [42]:
starting_off_pers['yardline_100'] = starting_off_pers['absoluteYardlineNumber'].copy()

In [43]:
starting_off_pers['x_behind_line'] = np.where(starting_off_pers['x'].lt(starting_off_pers['yardline_100']),
                                              starting_off_pers['yardline_100'].sub(starting_off_pers['x']),
                                              starting_off_pers['x'].sub(starting_off_pers['yardline_100']))

In [46]:
starting_pos_count = starting_off_pers.groupby(['gameId', 'playId', 'position', 'team']).apply(lambda x: x.cumsum())['position']

KeyError: 'time'

In [24]:
position = starting_pos_count.map(lambda x: x[:2] + str(int(len(x) / 2)))

In [25]:
starting_off_pers['position'] = position

In [26]:
starting_x = starting_off_pers.pivot_table(columns='position', index=['gameId', 'playId'], values='x_behind_line').rename(lambda x: x + '_x', axis=1)

In [27]:
starting_y = starting_off_pers.pivot_table(columns='position', index=['gameId', 'playId'], values='y').rename(lambda x: x + '_y', axis=1)

In [28]:
starting_pos = starting_x.merge(starting_y, left_index=True, right_index=True)

In [29]:
x_col = starting_pos.columns[starting_pos.columns.str.match('.*\_x$')]
cols = [col[:3] + '_in' for col in x_col]
cols

['CB1_in',
 'DB1_in',
 'DL1_in',
 'FB1_in',
 'HB1_in',
 'HB2_in',
 'IL1_in',
 'K0__in',
 'LB1_in',
 'LS1_in',
 'OL1_in',
 'OL3_in',
 'OL4_in',
 'P0__in',
 'QB1_in',
 'QB2_in',
 'RB1_in',
 'RB2_in',
 'RB3_in',
 'S0__in',
 'SS1_in',
 'TE1_in',
 'TE2_in',
 'TE3_in',
 'TE4_in',
 'WR1_in',
 'WR2_in',
 'WR3_in',
 'WR4_in',
 'WR5_in']

In [30]:
starting_pos[cols] = starting_pos[x_col].notnull()

In [31]:
starting_pos.fillna(0, inplace=True)

In [32]:
data = starting_pos.merge(starting_off_pers[['gameId', 'playId', 'offenseFormation']],
                   left_index=True,
                   right_on=['gameId', 'playId']).drop(['gameId', 'playId'], axis=1)

In [33]:
data.drop_duplicates(inplace=True)

In [34]:
data.dropna(axis=0, inplace=True)

In [35]:
X = data.iloc[:, :-1]
y = data['offenseFormation']

In [36]:
data

Unnamed: 0,CB1_x,DB1_x,DL1_x,FB1_x,HB1_x,HB2_x,IL1_x,K0_x,LB1_x,LS1_x,...,TE1_in,TE2_in,TE3_in,TE4_in,WR1_in,WR2_in,WR3_in,WR4_in,WR5_in,offenseFormation
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,True,True,True,False,False,I_FORM
19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,False,False,False,True,True,True,False,False,SINGLEBACK
32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,False,False,False,True,True,False,False,False,SHOTGUN
45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,True,False,False,True,True,False,False,False,SHOTGUN
52,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,False,False,False,True,False,False,False,False,SHOTGUN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,False,False,False,True,True,True,False,False,SHOTGUN
57092,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,True,True,True,False,False,SHOTGUN
57025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,True,False,False,True,True,False,False,False,EMPTY
56918,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,False,False,False,False,True,True,False,False,EMPTY


In [31]:
y.iloc[:10]

2          I_FORM
19     SINGLEBACK
32        SHOTGUN
45        SHOTGUN
52        SHOTGUN
97     SINGLEBACK
110    SINGLEBACK
123    SINGLEBACK
136    SINGLEBACK
149       SHOTGUN
Name: offenseFormation, dtype: object

In [32]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify=y)

In [33]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_reg = LogisticRegression(max_iter=10000)
cross_val_score(log_reg, X_train_scaled, y_train, cv=5)



array([0.95172911, 0.95028818, 0.96178803, 0.95602019, 0.95890411])

In [35]:
from sklearn.ensemble import RandomForestClassifier

rfor = RandomForestClassifier()
cross_val_score(rfor, X_train_scaled, y_train, cv=5)



array([0.95965418, 0.95461095, 0.9704398 , 0.96106705, 0.96539293])

In [36]:
from sklearn.model_selection import GridSearchCV

rfor = RandomForestClassifier(random_state=0)
params = {'max_depth': [None] + list(range(1, 5))}
grid_rfor = GridSearchCV(rfor, param_grid=params, cv=3, scoring='f1_micro')
grid_rfor.fit(X_train_scaled, y_train)
grid_rfor.best_score_

0.9586281042261008

In [37]:
grid_rfor.best_params_

{'max_depth': None}

In [38]:
from sklearn.tree import DecisionTreeClassifier

dtree = DecisionTreeClassifier()
cross_val_score(dtree, X_train_scaled, y_train)



array([0.95028818, 0.94884726, 0.96250901, 0.95746215, 0.95097332])

In [39]:
dtree = DecisionTreeClassifier(random_state=0)
params = {'max_depth': [None] + list(range(1, 5)), 'min_samples_split': range(2, 10)}
grid_dtree = GridSearchCV(dtree, param_grid=params, cv=3, scoring='f1_micro')
grid_dtree.fit(X_train_scaled, y_train)
grid_dtree.best_score_

0.9603582080323293

In [40]:
X_test_scaled = scaler.transform(X_test)

In [41]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

y_pred = grid_rfor.predict(X_test_scaled)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

       EMPTY       0.94      0.97      0.96       298
      I_FORM       0.90      0.86      0.88       114
       JUMBO       0.86      0.86      0.86         7
      PISTOL       1.00      0.10      0.18        31
     SHOTGUN       0.98      0.99      0.98      1528
  SINGLEBACK       0.95      0.97      0.96       334
     WILDCAT       0.00      0.00      0.00         1

    accuracy                           0.96      2313
   macro avg       0.80      0.68      0.69      2313
weighted avg       0.96      0.96      0.96      2313

[[ 289    0    0    0    9    0    0]
 [   0   98    0    0    0   16    0]
 [   0    1    6    0    0    0    0]
 [   2    0    0    3   26    0    0]
 [  15    0    0    0 1513    0    0]
 [   0   10    1    0    0  323    0]
 [   0    0    0    0    1    0    0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [42]:
y_pred_train = grid_rfor.predict(X_train_scaled)
print(classification_report(y_train, y_pred_train))
print(confusion_matrix(y_train, y_pred_train))

              precision    recall  f1-score   support

       EMPTY       1.00      1.00      1.00       894
      I_FORM       1.00      1.00      1.00       342
       JUMBO       1.00      1.00      1.00        20
      PISTOL       1.00      1.00      1.00        92
     SHOTGUN       1.00      1.00      1.00      4581
  SINGLEBACK       1.00      1.00      1.00      1004
     WILDCAT       1.00      1.00      1.00         4

    accuracy                           1.00      6937
   macro avg       1.00      1.00      1.00      6937
weighted avg       1.00      1.00      1.00      6937

[[ 894    0    0    0    0    0    0]
 [   0  342    0    0    0    0    0]
 [   0    0   20    0    0    0    0]
 [   0    0    0   92    0    0    0]
 [   0    0    0    0 4581    0    0]
 [   0    0    0    0    0 1004    0]
 [   0    0    0    0    0    0    4]]


In [43]:
from sklearn.dummy import DummyClassifier

dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_train_scaled, y_train)
y_pred_dummy = dummy.predict(X_test_scaled)
np.mean(y_test == y_pred_dummy)

0.6606139213143104