In [1]:
import pandas as pd
import numpy as np
import os

dir = 'assets'
fp = dir + '/full_position.csv'
if not os.path.exists(fp):
    os.mkdir(dir)
    positions = pd.DataFrame()
    for week in range(1, 18):
        week = pd.read_csv(f'nfl-big-data-bowl-2021/week{week}.csv')
        positions = pd.concat([positions, week], axis=0)
    positions.to_csv(fp, index=False)
else:
    positions = pd.read_csv(fp)

In [2]:
positions.shape

(18309388, 19)

In [3]:
plays = pd.read_csv('nfl-big-data-bowl-2021/plays.csv')

In [4]:
positions['time'] = pd.to_datetime(positions['time'], format='%Y-%m-%dT%H:%M:%S')

In [5]:
starting_pos = positions.groupby(['gameId', 'playId', 'position', 'nflId', 'team'])[['x', 'y']].first().reset_index()

In [6]:
starting_pos_plays = starting_pos.merge(plays, on=['gameId', 'playId'], how='left')

In [7]:
starting_pos_plays = starting_pos_plays[starting_pos_plays['absoluteYardlineNumber'].notnull()]

In [8]:
games = pd.read_csv('nfl-big-data-bowl-2021/games.csv')

In [9]:
starting_pos_play_game = starting_pos_plays.merge(games, on='gameId', how='left')

In [10]:
starting_pos_play_game['offdef'] = np.where((starting_pos_play_game['team'] == 'away') &
                                            (starting_pos_play_game['possessionTeam'] == starting_pos_play_game['visitorTeamAbbr']),
                                            'offense', 'defense')

In [11]:
starting_off = starting_pos_play_game[starting_pos_play_game['offdef'] == 'offense']

In [12]:
personnel = starting_off['personnelO'].str.extract('(?P<RB>\d+)\sRB\,\s(?P<TE>\d+)\sTE\,\s(?P<WR>\d+)\sWR')
personnel = personnel.astype(float)

In [13]:
starting_off_pers = pd.concat([starting_off, personnel], axis=1)

In [14]:
starting_off_pers['yardline_100'] = starting_off_pers['absoluteYardlineNumber'].sub(10)

In [15]:
starting_off_pers['x_behind_line'] = np.where(starting_off_pers['x'].lt(starting_off_pers['yardline_100']),
                                              starting_off_pers['yardline_100'].sub(starting_off_pers['x']),
                                              starting_off_pers['x'].sub(starting_off_pers['yardline_100']))
starting_off_pers['y_starting'] = starting_off_pers.groupby(['playId', 'gameId', 'nflId'])['y'].transform(lambda x: x.iloc[0])

In [16]:
starting_pos_count = starting_off_pers.groupby(['gameId', 'playId', 'position', 'team']).apply(lambda x: x.cumsum())['position']

In [17]:
position = starting_pos_count.map(lambda x: x[:2] + str(int(len(x) / 2)))

In [18]:
starting_off_pers['position'] = position

In [19]:
starting_x = (starting_off_pers
    .pivot_table(columns='position', index=['gameId', 'playId'], values='x_behind_line').rename(lambda x: x + '_x', axis=1))

In [20]:
starting_y = (starting_off_pers
              .pivot_table(columns='position', index=['gameId', 'playId'], values='y_starting').rename(lambda x: x + '_y', axis=1))

In [21]:
starting_pos = starting_x.merge(starting_y, left_index=True, right_index=True)

In [22]:
x_col = starting_pos.columns[starting_pos.columns.str.match('.*\_x$')]
cols = [col[:3] + '_in' for col in x_col]

In [23]:
starting_pos[cols] = starting_pos[x_col].notnull()

In [24]:
starting_pos.fillna(0, inplace=True)

In [25]:
data = starting_pos.merge(starting_off_pers[['gameId', 'playId', 'offenseFormation']],
                   left_index=True,
                   right_on=['gameId', 'playId']).drop(['gameId', 'playId'], axis=1)

In [26]:
data.head()

Unnamed: 0,CB1_x,CB2_x,DB1_x,DB2_x,DE1_x,DL1_x,DT1_x,FB1_x,FS1_x,HB1_x,...,TE1_in,TE2_in,TE3_in,TE4_in,WR1_in,WR2_in,WR3_in,WR4_in,WR5_in,offenseFormation
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.13,0.0,0.0,...,True,False,False,False,True,True,False,False,False,I_FORM
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.13,0.0,0.0,...,True,False,False,False,True,True,False,False,False,I_FORM
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.13,0.0,0.0,...,True,False,False,False,True,True,False,False,False,I_FORM
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.13,0.0,0.0,...,True,False,False,False,True,True,False,False,False,I_FORM
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.13,0.0,0.0,...,True,False,False,False,True,True,False,False,False,I_FORM


In [27]:
data.drop_duplicates(inplace=True)

In [28]:
data.dropna(axis=0, inplace=True)

In [29]:
X = data.iloc[:, :-1]
y = data['offenseFormation']

In [30]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify=y)

In [31]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_reg = LogisticRegression(max_iter=10000)
cross_val_score(log_reg, X_train_scaled, y_train, cv=5)



array([0.68227666, 0.69092219, 0.67844268, 0.68276857, 0.68421053])

In [33]:
from sklearn.ensemble import RandomForestClassifier

rfor = RandomForestClassifier()
cross_val_score(rfor, X_train_scaled, y_train, cv=5)



array([0.95172911, 0.95317003, 0.95313627, 0.94232156, 0.95385725])

In [34]:
from sklearn.model_selection import GridSearchCV

rfor = RandomForestClassifier(random_state=0)
params = {'max_depth': [None] + list(range(1, 5))}
grid_rfor = GridSearchCV(rfor, param_grid=params, cv=3, scoring='f1_micro')
grid_rfor.fit(X_train_scaled, y_train)
grid_rfor.best_score_

0.946229837272006

In [35]:
grid_rfor.best_params_

{'max_depth': None}

In [36]:
from sklearn.tree import DecisionTreeClassifier

dtree = DecisionTreeClassifier()
cross_val_score(dtree, X_train_scaled, y_train)



array([0.94236311, 0.93804035, 0.94881038, 0.93439077, 0.93943764])

In [37]:
dtree = DecisionTreeClassifier(random_state=0)
params = {'max_depth': [None] + list(range(1, 5)), 'min_samples_split': range(2, 10)}
grid_dtree = GridSearchCV(dtree, param_grid=params, cv=3, scoring='f1_micro')
grid_dtree.fit(X_train_scaled, y_train)
grid_dtree.best_score_

0.9420488154062266

In [38]:
X_test_scaled = scaler.transform(X_test)

In [39]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

y_pred = grid_rfor.predict(X_test_scaled)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

       EMPTY       0.97      0.91      0.94       298
      I_FORM       0.89      0.72      0.80       114
       JUMBO       0.60      0.86      0.71         7
      PISTOL       1.00      0.10      0.18        31
     SHOTGUN       0.96      0.99      0.98      1528
  SINGLEBACK       0.90      0.96      0.93       334
     WILDCAT       0.00      0.00      0.00         1

    accuracy                           0.95      2313
   macro avg       0.76      0.65      0.65      2313
weighted avg       0.95      0.95      0.95      2313

[[ 270    0    0    0   26    2    0]
 [   0   82    0    0    1   31    0]
 [   0    1    6    0    0    0    0]
 [   1    1    0    3   26    0    0]
 [   8    0    3    0 1516    1    0]
 [   0    8    1    0    3  322    0]
 [   0    0    0    0    1    0    0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [40]:
y_pred_train = grid_rfor.predict(X_train_scaled)
print(classification_report(y_train, y_pred_train))
print(confusion_matrix(y_train, y_pred_train))

              precision    recall  f1-score   support

       EMPTY       1.00      1.00      1.00       894
      I_FORM       1.00      1.00      1.00       342
       JUMBO       1.00      1.00      1.00        20
      PISTOL       1.00      1.00      1.00        92
     SHOTGUN       1.00      1.00      1.00      4581
  SINGLEBACK       1.00      1.00      1.00      1004
     WILDCAT       1.00      1.00      1.00         4

    accuracy                           1.00      6937
   macro avg       1.00      1.00      1.00      6937
weighted avg       1.00      1.00      1.00      6937

[[ 894    0    0    0    0    0    0]
 [   0  342    0    0    0    0    0]
 [   0    0   20    0    0    0    0]
 [   0    0    0   92    0    0    0]
 [   0    0    0    0 4581    0    0]
 [   0    0    0    0    0 1004    0]
 [   0    0    0    0    0    0    4]]


In [41]:
from sklearn.dummy import DummyClassifier

dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_train_scaled, y_train)
y_pred_dummy = dummy.predict(X_test_scaled)
np.mean(y_test == y_pred_dummy)

0.6606139213143104