In [2]:
import pandas as pd
import numpy as np
import os

dir = 'assets'
fp = dir + '/full_position.csv'
if not os.path.exists(fp):
    os.mkdir(dir)
    positions = pd.DataFrame()
    for week in range(1, 18):
        week = pd.read_csv(f'nfl-big-data-bowl-2021/week{week}.csv')
        positions = pd.concat([positions, week], axis=0)
    positions.to_csv(fp, index=False)
else:
    positions = pd.read_csv(fp)

In [3]:
positions.shape

(18309388, 19)

In [4]:
plays = pd.read_csv('nfl-big-data-bowl-2021/plays.csv')

In [5]:
positions['time'] = pd.to_datetime(positions['time'], format='%Y-%m-%dT%H:%M:%S')

In [6]:
starting_pos = positions.groupby(['gameId', 'playId', 'position', 'nflId', 'team'])[['x', 'y']].first().reset_index()

In [7]:
starting_pos_plays = starting_pos.merge(plays, on=['gameId', 'playId'], how='left')

In [8]:
starting_pos_plays.head()

Unnamed: 0,gameId,playId,position,nflId,team,x,y,playDescription,quarter,down,...,preSnapHomeScore,gameClock,absoluteYardlineNumber,penaltyCodes,penaltyJerseyNumbers,passResult,offensePlayResult,playResult,epa,isDefensivePI
0,2018090600,75,CB,2552689.0,home,82.67,20.53,(15:00) M.Ryan pass short right to J.Jones pus...,1,1,...,0.0,15:00:00,90.0,,,C,10,10,0.261827,False
1,2018090600,75,CB,2555383.0,home,84.0,43.49,(15:00) M.Ryan pass short right to J.Jones pus...,1,1,...,0.0,15:00:00,90.0,,,C,10,10,0.261827,False
2,2018090600,75,FB,2559033.0,away,95.13,26.71,(15:00) M.Ryan pass short right to J.Jones pus...,1,1,...,0.0,15:00:00,90.0,,,C,10,10,0.261827,False
3,2018090600,75,FS,2495613.0,home,86.31,22.01,(15:00) M.Ryan pass short right to J.Jones pus...,1,1,...,0.0,15:00:00,90.0,,,C,10,10,0.261827,False
4,2018090600,75,FS,2534832.0,home,73.64,28.7,(15:00) M.Ryan pass short right to J.Jones pus...,1,1,...,0.0,15:00:00,90.0,,,C,10,10,0.261827,False


In [9]:
starting_pos_plays.shape

(263173, 32)

In [10]:
games = pd.read_csv('nfl-big-data-bowl-2021/games.csv')

In [11]:
starting_pos_play_game = starting_pos_plays.merge(games, on='gameId', how='left')

In [12]:
starting_pos_play_game['offdef'] = np.where((starting_pos_play_game['team'] == 'away') &
                                            (starting_pos_play_game['possessionTeam'] == starting_pos_play_game['visitorTeamAbbr']),
                                            'offense', 'defense')

In [13]:
starting_off = starting_pos_play_game[starting_pos_play_game['offdef'] == 'offense']

In [14]:
personnel = starting_off['personnelO'].str.extract('(?P<RB>\d+)\sRB\,\s(?P<TE>\d+)\sTE\,\s(?P<WR>\d+)\sWR')
personnel = personnel.astype(float)

In [15]:
starting_off_pers = pd.concat([starting_off, personnel], axis=1)

In [16]:
starting_off_pers['yardline_100'] = starting_off_pers['absoluteYardlineNumber'].sub(10)

In [17]:
starting_off_pers['x_behind_line'] = np.where(starting_off_pers['x'].lt(starting_off_pers['yardline_100']),
                                              starting_off_pers['yardline_100'].sub(starting_off_pers['x']),
                                              starting_off_pers['x'].sub(starting_off_pers['yardline_100']))

In [18]:
starting_pos_count = starting_off_pers.groupby(['gameId', 'playId', 'position', 'team']).apply(lambda x: x.cumsum())['position']

In [19]:
position = starting_pos_count.map(lambda x: x[:2] + str(int(len(x) / 2)))

In [20]:
starting_off_pers['position'] = position

In [21]:
starting_x = starting_off_pers.pivot_table(columns='position', index=['gameId', 'playId'], values='x_behind_line').rename(lambda x: x + '_x', axis=1)

In [22]:
starting_y = starting_off_pers.pivot_table(columns='position', index=['gameId', 'playId'], values='y').rename(lambda x: x + '_y', axis=1)

In [23]:
starting_pos = starting_x.merge(starting_y, left_index=True, right_index=True)

In [24]:
perc_null = starting_pos.isnull().sum() / starting_pos.shape[0]

In [25]:
starting_pos_notnull = starting_pos[perc_null[perc_null < 0.5].index]

In [26]:
x_col = starting_pos_notnull.columns[starting_pos_notnull.columns.str.match('.*\_x$')]
cols = [col[:3] + '_in' for col in x_col]
cols

['QB1_in', 'RB1_in', 'TE1_in', 'WR1_in', 'WR2_in', 'WR3_in']

In [27]:
starting_pos_notnull[cols] = starting_pos_notnull[x_col].notnull()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [28]:
starting_pos_notnull.fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [29]:
data = starting_pos_notnull.merge(starting_off_pers[['gameId', 'playId', 'offenseFormation']],
                   left_index=True,
                   right_on=['gameId', 'playId']).drop(['gameId', 'playId'], axis=1)

In [30]:
data.isnull().sum()

QB1_x                 0
RB1_x                 0
TE1_x                 0
WR1_x                 0
WR2_x                 0
WR3_x                 0
QB1_y                 0
RB1_y                 0
TE1_y                 0
WR1_y                 0
WR2_y                 0
WR3_y                 0
QB1_in                0
RB1_in                0
TE1_in                0
WR1_in                0
WR2_in                0
WR3_in                0
offenseFormation    304
dtype: int64

In [31]:
data.dropna(axis=0, inplace=True)

In [32]:
X = data.iloc[:, :-1]
y = data['offenseFormation']

In [33]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify=y)

In [34]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_reg = LogisticRegression(max_iter=10000)
cross_val_score(log_reg, X_train_scaled, y_train, cv=5)

array([0.66927522, 0.67080072, 0.66959663, 0.67116195, 0.67670078])

In [36]:
from sklearn.ensemble import RandomForestClassifier

rfor = RandomForestClassifier()
cross_val_score(rfor, X_train_scaled, y_train, cv=5)

array([0.99891645, 0.99963877, 0.9986755 , 0.99891632, 1.        ])

In [37]:
from sklearn.model_selection import GridSearchCV

rfor = RandomForestClassifier()
params = {'max_depth': [None] + list(range(1, 5))}
grid_rfor = GridSearchCV(rfor, param_grid=params, cv=5)
grid_rfor.fit(X_train_scaled, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [None, 1, 2, 3, 4]})

In [38]:
grid_rfor.best_score_

0.9993498066795755

In [39]:
X_test_scaled = scaler.transform(X_test)

In [40]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

y_pred = grid_rfor.predict(X_test_scaled)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

       EMPTY       1.00      1.00      1.00      1787
      I_FORM       1.00      1.00      1.00       681
       JUMBO       1.00      1.00      1.00        36
      PISTOL       1.00      1.00      1.00       185
     SHOTGUN       1.00      1.00      1.00      9154
  SINGLEBACK       1.00      1.00      1.00      1992
     WILDCAT       1.00      1.00      1.00         7

    accuracy                           1.00     13842
   macro avg       1.00      1.00      1.00     13842
weighted avg       1.00      1.00      1.00     13842

[[1787    0    0    0    0    0    0]
 [   0  681    0    0    0    0    0]
 [   0    0   36    0    0    0    0]
 [   0    0    0  185    0    0    0]
 [   0    0    0    0 9154    0    0]
 [   0    0    0    0    0 1992    0]
 [   0    0    0    0    0    0    7]]


In [41]:
y_pred_train = grid_rfor.predict(X_train_scaled)
print(classification_report(y_train, y_pred_train))
print(confusion_matrix(y_train, y_pred_train))

              precision    recall  f1-score   support

       EMPTY       1.00      1.00      1.00      5362
      I_FORM       1.00      1.00      1.00      2042
       JUMBO       1.00      1.00      1.00       109
      PISTOL       1.00      1.00      1.00       553
     SHOTGUN       1.00      1.00      1.00     27460
  SINGLEBACK       1.00      1.00      1.00      5977
     WILDCAT       1.00      1.00      1.00        23

    accuracy                           1.00     41526
   macro avg       1.00      1.00      1.00     41526
weighted avg       1.00      1.00      1.00     41526

[[ 5362     0     0     0     0     0     0]
 [    0  2042     0     0     0     0     0]
 [    0     0   109     0     0     0     0]
 [    0     0     0   553     0     0     0]
 [    0     0     0     0 27460     0     0]
 [    0     0     0     0     0  5977     0]
 [    0     0     0     0     0     0    23]]
