In [1]:
import pandas as pd
import numpy as np
import os

dir = 'assets'
fp = dir + '/full_position.csv'
if not os.path.exists(fp):
    os.mkdir(dir)
    positions = pd.DataFrame()
    for week in range(1, 18):
        week = pd.read_csv(f'nfl-big-data-bowl-2021/week{week}.csv')
        positions = pd.concat([positions, week], axis=0)
    positions.to_csv(fp, index=False)
else:
    positions = pd.read_csv(fp)

In [2]:
positions.shape

(18309388, 19)

In [3]:
plays = pd.read_csv('nfl-big-data-bowl-2021/plays.csv')

In [4]:
positions['time'] = pd.to_datetime(positions['time'], format='%Y-%m-%dT%H:%M:%S')

In [5]:
starting_pos = positions.groupby(['gameId', 'playId', 'position', 'nflId', 'team'])[['x', 'y']].first().reset_index()

In [6]:
starting_pos_plays = starting_pos.merge(plays, on=['gameId', 'playId'], how='left')

In [7]:
starting_pos_plays.head()

Unnamed: 0,gameId,playId,position,nflId,team,x,y,playDescription,quarter,down,...,preSnapHomeScore,gameClock,absoluteYardlineNumber,penaltyCodes,penaltyJerseyNumbers,passResult,offensePlayResult,playResult,epa,isDefensivePI
0,2018090600,75,CB,2552689.0,home,82.67,20.53,(15:00) M.Ryan pass short right to J.Jones pus...,1,1,...,0.0,15:00:00,90.0,,,C,10,10,0.261827,False
1,2018090600,75,CB,2555383.0,home,84.0,43.49,(15:00) M.Ryan pass short right to J.Jones pus...,1,1,...,0.0,15:00:00,90.0,,,C,10,10,0.261827,False
2,2018090600,75,FB,2559033.0,away,95.13,26.71,(15:00) M.Ryan pass short right to J.Jones pus...,1,1,...,0.0,15:00:00,90.0,,,C,10,10,0.261827,False
3,2018090600,75,FS,2495613.0,home,86.31,22.01,(15:00) M.Ryan pass short right to J.Jones pus...,1,1,...,0.0,15:00:00,90.0,,,C,10,10,0.261827,False
4,2018090600,75,FS,2534832.0,home,73.64,28.7,(15:00) M.Ryan pass short right to J.Jones pus...,1,1,...,0.0,15:00:00,90.0,,,C,10,10,0.261827,False


In [8]:
starting_pos_plays.shape

(263173, 32)

In [9]:
games = pd.read_csv('nfl-big-data-bowl-2021/games.csv')

In [10]:
starting_pos_play_game = starting_pos_plays.merge(games, on='gameId', how='left')

In [11]:
starting_pos_play_game['offdef'] = np.where((starting_pos_play_game['team'] == 'away') &
                                            (starting_pos_play_game['possessionTeam'] == starting_pos_play_game['visitorTeamAbbr']),
                                            'offense', 'defense')

In [12]:
starting_off = starting_pos_play_game[starting_pos_play_game['offdef'] == 'offense']

In [13]:
personnel = starting_off['personnelO'].str.extract('(?P<RB>\d+)\sRB\,\s(?P<TE>\d+)\sTE\,\s(?P<WR>\d+)\sWR')
personnel = personnel.astype(float)

In [14]:
starting_off_pers = pd.concat([starting_off, personnel], axis=1)

In [15]:
starting_off_pers['yardline_100'] = starting_off_pers['absoluteYardlineNumber'].sub(10)

In [16]:
starting_off_pers['x_behind_line'] = np.where(starting_off_pers['x'].lt(starting_off_pers['yardline_100']),
                                              starting_off_pers['yardline_100'].sub(starting_off_pers['x']),
                                              starting_off_pers['x'].sub(starting_off_pers['yardline_100']))
starting_off_pers['y_starting'] = starting_off_pers.groupby(['playId', 'gameId', 'nflId'])['y'].transform(lambda x: x.iloc[0])

In [17]:
starting_pos_count = starting_off_pers.groupby(['gameId', 'playId', 'position', 'team']).apply(lambda x: x.cumsum())['position']

In [18]:
position = starting_pos_count.map(lambda x: x[:2] + str(int(len(x) / 2)))

In [19]:
starting_off_pers['position'] = position

In [20]:
starting_x = (starting_off_pers
    .pivot_table(columns='position', index=['gameId', 'playId'], values='x_behind_line').rename(lambda x: x + '_x', axis=1))

In [21]:
starting_y = (starting_off_pers
              .pivot_table(columns='position', index=['gameId', 'playId'], values='y_starting').rename(lambda x: x + '_y', axis=1))

In [22]:
starting_pos = starting_x.merge(starting_y, left_index=True, right_index=True)

In [23]:
x_col = starting_pos.columns[starting_pos.columns.str.match('.*\_x$')]
cols = [col[:3] + '_in' for col in x_col]

['CB1_in',
 'CB2_in',
 'DB1_in',
 'DB2_in',
 'DE1_in',
 'DL1_in',
 'DT1_in',
 'FB1_in',
 'FS1_in',
 'HB1_in',
 'HB2_in',
 'IL1_in',
 'IL3_in',
 'K0__in',
 'LB1_in',
 'LB2_in',
 'LB3_in',
 'LS1_in',
 'NT1_in',
 'OL1_in',
 'OL3_in',
 'OL4_in',
 'P0__in',
 'QB1_in',
 'QB2_in',
 'RB1_in',
 'RB2_in',
 'RB3_in',
 'S0__in',
 'SS1_in',
 'SS2_in',
 'TE1_in',
 'TE2_in',
 'TE3_in',
 'TE4_in',
 'WR1_in',
 'WR2_in',
 'WR3_in',
 'WR4_in',
 'WR5_in']

In [24]:
starting_pos[cols] = starting_pos[x_col].notnull()

In [25]:
starting_pos.fillna(0, inplace=True)

In [26]:
data = starting_pos.merge(starting_off_pers[['gameId', 'playId', 'offenseFormation']],
                   left_index=True,
                   right_on=['gameId', 'playId']).drop(['gameId', 'playId'], axis=1)

In [27]:
data.drop_duplicates(inplace=True)

In [28]:
data.dropna(axis=0, inplace=True)

In [29]:
X = data.iloc[:, :-1]
y = data['offenseFormation']

In [30]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify=y)

In [31]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_reg = LogisticRegression(max_iter=10000)
cross_val_score(log_reg, X_train_scaled, y_train, cv=5)



array([0.69596542, 0.68371758, 0.68781543, 0.68493151, 0.69286229])

In [33]:
from sklearn.ensemble import RandomForestClassifier

rfor = RandomForestClassifier()
cross_val_score(rfor, X_train_scaled, y_train, cv=5)



array([0.95028818, 0.94092219, 0.95962509, 0.94664744, 0.95457823])

In [34]:
from sklearn.model_selection import GridSearchCV

rfor = RandomForestClassifier(random_state=0)
params = {'max_depth': [None] + list(range(1, 5))}
grid_rfor = GridSearchCV(rfor, param_grid=params, cv=3, scoring='f1_micro')
grid_rfor.fit(X_train_scaled, y_train)
grid_rfor.best_score_

0.9449330697412099

In [35]:
grid_rfor.best_params_

{'max_depth': None}

In [36]:
from sklearn.tree import DecisionTreeClassifier

dtree = DecisionTreeClassifier()
cross_val_score(dtree, X_train_scaled, y_train)



array([0.93731988, 0.9445245 , 0.9516943 , 0.9444845 , 0.93583273])

In [37]:
dtree = DecisionTreeClassifier(random_state=0)
params = {'max_depth': [None] + list(range(1, 5)), 'min_samples_split': range(2, 10)}
grid_dtree = GridSearchCV(dtree, param_grid=params, cv=3, scoring='f1_micro')
grid_dtree.fit(X_train_scaled, y_train)
grid_dtree.best_score_

0.9387349024195523

In [38]:
X_test_scaled = scaler.transform(X_test)

In [39]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

y_pred = grid_rfor.predict(X_test_scaled)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

       EMPTY       0.96      0.93      0.94       298
      I_FORM       0.84      0.80      0.82       114
       JUMBO       0.67      0.86      0.75         7
      PISTOL       1.00      0.06      0.12        31
     SHOTGUN       0.97      0.99      0.98      1528
  SINGLEBACK       0.93      0.95      0.94       334
     WILDCAT       0.00      0.00      0.00         1

    accuracy                           0.95      2313
   macro avg       0.77      0.66      0.65      2313
weighted avg       0.95      0.95      0.95      2313

[[ 276    0    0    0   22    0    0]
 [   0   91    0    0    0   23    0]
 [   0    1    6    0    0    0    0]
 [   1    2    0    2   26    0    0]
 [  11    0    2    0 1515    0    0]
 [   0   14    1    0    2  317    0]
 [   0    0    0    0    1    0    0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [40]:
y_pred_train = grid_rfor.predict(X_train_scaled)
print(classification_report(y_train, y_pred_train))
print(confusion_matrix(y_train, y_pred_train))

              precision    recall  f1-score   support

       EMPTY       1.00      1.00      1.00       894
      I_FORM       1.00      1.00      1.00       342
       JUMBO       1.00      1.00      1.00        20
      PISTOL       1.00      1.00      1.00        92
     SHOTGUN       1.00      1.00      1.00      4581
  SINGLEBACK       1.00      1.00      1.00      1004
     WILDCAT       1.00      1.00      1.00         4

    accuracy                           1.00      6937
   macro avg       1.00      1.00      1.00      6937
weighted avg       1.00      1.00      1.00      6937

[[ 894    0    0    0    0    0    0]
 [   0  342    0    0    0    0    0]
 [   0    0   20    0    0    0    0]
 [   0    0    0   92    0    0    0]
 [   0    0    0    0 4581    0    0]
 [   0    0    0    0    0 1004    0]
 [   0    0    0    0    0    0    4]]


In [41]:
from sklearn.dummy import DummyClassifier

dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_train_scaled, y_train)
y_pred_dummy = dummy.predict(X_test_scaled)
np.mean(y_test == y_pred_dummy)

0.6606139213143104