In [1]:
## instructions on getting data through Kaggle API. API key is read from user/'your name'/.kaggle directory on a mac.
## https://www.kaggle.com/general/74235
import os

!pip install kaggle
!kaggle competitions download -c 'nfl-big-data-bowl-2021'
import zipfile

cwd = str(os.getcwd())
with zipfile.ZipFile(cwd + '/nfl-big-data-bowl-2021.zip', 'r') as zip_ref:
    zip_ref.extractall(cwd + '/Kaggle-Data-Files')

401 - Unauthorized


In [2]:
import pandas as pd
import numpy as np
import os

dir = 'assets'
fp = dir + '/full_position.csv'
if not os.path.exists(dir):
    os.mkdir(dir)
    positions = pd.DataFrame()
    for week in range(1, 18):
        week = pd.read_csv(cwd + f'/Kaggle-Data-Files/week{week}.csv')
        positions = pd.concat([positions, week], axis=0)
    positions.to_csv(fp, index=False)
else:
    positions = pd.read_csv(fp)

In [3]:
positions.shape

(18309388, 19)

In [4]:
# reading plays (see play data https://www.kaggle.com/c/nfl-big-data-bowl-2021/data)
plays = pd.read_csv('nfl-big-data-bowl-2021/plays.csv')

In [5]:
#time to datetime
positions['time'] = pd.to_datetime(positions['time'], format='%Y-%m-%dT%H:%M:%S')

In [6]:
# Get starting position of offensive players
# I Think this should be where frame == 1, not .first in case of data quality issues.
starting_pos = positions.groupby(['gameId', 'playId', 'position', 'nflId', 'team'])[['x', 'y']].first().reset_index()

In [7]:
# merging play data (see play data https://www.kaggle.com/c/nfl-big-data-bowl-2021/data)
starting_pos_plays = starting_pos.merge(plays, on=['gameId', 'playId'], how='left')

In [8]:
# data cleaning where yardline is not Null
starting_pos_plays = starting_pos_plays[starting_pos_plays['absoluteYardlineNumber'].notnull()]

In [9]:
# bring in game info (see game info data https://www.kaggle.com/c/nfl-big-data-bowl-2021/data)
games = pd.read_csv('nfl-big-data-bowl-2021/games.csv')

In [10]:
#bringing in features from games
starting_pos_play_game = starting_pos_plays.merge(games, on='gameId', how='left')

In [11]:
#naming which team has the ball as offense or defense
starting_pos_play_game['offdef'] = np.where((starting_pos_play_game['team'] == 'away') &
                                            (starting_pos_play_game['possessionTeam'] == starting_pos_play_game['visitorTeamAbbr']),
                                            'offense', 'defense')

In [12]:
#starting position from offense players 
starting_off = starting_pos_play_game[starting_pos_play_game['offdef'] == 'offense']

In [13]:
# What personal is on the field
personnel = starting_off['personnelO'].str.extract('(?P<RB>\d+)\sRB\,\s(?P<TE>\d+)\sTE\,\s(?P<WR>\d+)\sWR')
personnel = personnel.astype(float)

In [14]:
# Adding that as a feature in the new DF
starting_off_pers = pd.concat([starting_off, personnel], axis=1)

In [15]:
# Subtracting 10 because the endzone adds 10 years to field 
starting_off_pers['yardline_100'] = starting_off_pers['absoluteYardlineNumber'].sub(10)

In [16]:
# If position X is less than yardline100, return yardline100 - starting position, else, starting position - yardline. 
# This gets # of yards behind line no matter which way they are facing.

# Y starting is the y coords of the starting position.
starting_off_pers['off_pos'] = np.where(starting_off_pers['x'].lt(starting_off_pers['absoluteYardlineNumber']), 'left', 'right')
starting_off_pers['x_behind_line'] = np.where(starting_off_pers['off_pos'] == 'right',
                                              starting_off_pers['absoluteYardlineNumber'].sub(starting_off_pers['x']),
                                              starting_off_pers['x'].sub(starting_off_pers['absoluteYardlineNumber']))
starting_off_pers['y_starting'] = np.where(starting_off_pers['off_pos'] == 'right',
                                           starting_off_pers['y'].rsub(53.3), starting_off_pers['y'])

In [17]:
def find_rank(df, col, reverse=False):
    """
    Find the ranking of a series based on values.
    :param df: Dataframe for ranking; pd.DataFrame
    :param col: Column from dataframe to rank; str
    :param reverse: Flag of whether to reverse rank direction; bool
    :return: Array with rankings; np.array
    """
    # Extract series and use arsort to find rankings.
    ser = df[col]
    temp = np.argsort(ser)

    # Reverse direction based on flag.
    if reverse:
        temp = temp[::-1]

    # Fill ranking array.
    ranks = np.empty_like(temp)
    ranks[temp] = np.arange(ser.shape[0])
    return ranks

In [18]:
# Find the order of positions based on offensive direction.
# First, group and extract first value of the y starting position and direction.
pos_start = (starting_off_pers
             .groupby(['gameId', 'playId', 'position', 'nflId'])
             [['y_starting', 'x', 'off_pos']].first()
             .reset_index())

# Next, group and extract ranking of positions based on whether team is home or away
# and the starting position.
pos_order = (pos_start.groupby(['gameId', 'playId', 'position'])
             .apply(lambda x: np.where(x['off_pos'] == 'right', find_rank(x, 'y_starting'),
                                       find_rank(x, 'y_starting')))
             .explode()
             .reset_index()
             )

# Add column with the position order to the df with indexed starting position.
pos_start['pos_order'] = pos_order[0]

In [19]:
# Add number of position to position label to get position number.
pos_start['pos_num'] = pos_start.apply(lambda x: x['position'] + str(x['pos_order']), axis=1)

In [20]:
#Adding a label of the players position (WR1, WR2). This makes sense from a numerical stand point, but shouldn't be used
#to classify a team's WR1 WR2 etc.

starting_off_pers = starting_off_pers.merge(pos_start[['gameId', 'playId', 'nflId', 'pos_num', 'pos_order']],
                                            on=['gameId', 'playId', 'nflId'])

In [21]:
# Convert to matrix of GameID and PlayID. Grab number of yards behind line for each player. 
starting_x = (starting_off_pers
    .pivot_table(columns='pos_num', index=['gameId', 'playId'], values='x_behind_line').rename(lambda x: x + '_x', axis=1))

In [22]:
#Same as above, but for Y coords.
starting_y = (starting_off_pers
              .pivot_table(columns='pos_num', index=['gameId', 'playId'], values='y_starting').rename(lambda x: x + '_y', axis=1))

In [23]:
#merging to get coords of players with _X and _Y
starting_pos = starting_x.merge(starting_y, left_index=True, right_index=True)

In [24]:
#X_col is getting all the X columns. Cols is creating a list that say "WR1_in", "FB1_in" etc
x_col = starting_pos.columns[starting_pos.columns.str.match('.*\_x$')]
cols = [col[:3] + '_in' for col in x_col]

In [25]:
# Creating addition columns (boolean) for X player being in. If TE1 is in, flag says TRUE
starting_pos[cols] = starting_pos[x_col].notnull()

In [26]:
starting_pos.fillna(0, inplace=True)

In [27]:
#turning into sparse matrix
starting_pos

Unnamed: 0_level_0,pos_num,CB0_x,CB1_x,DB0_x,DB1_x,DE0_x,DL0_x,DT0_x,FB0_x,FS0_x,HB0_x,...,SS1_in,TE0_in,TE1_in,TE2_in,TE3_in,WR0_in,WR1_in,WR2_in,WR3_in,WR4_in
gameId,playId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2018090600,75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.13,0.0,0.0,...,False,True,False,False,False,True,True,False,False,False
2018090600,146,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,...,False,True,False,False,False,True,True,True,False,False
2018090600,168,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,...,False,True,False,False,False,True,True,False,False,False
2018090600,190,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,...,False,True,False,False,False,True,True,True,False,False
2018090600,256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4.66,0.0,0.0,...,False,True,True,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018123015,3601,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,...,False,True,False,False,False,True,True,True,False,False
2018123015,3794,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4.77,0.0,0.0,...,False,True,False,False,False,True,True,False,False,False
2018123015,3819,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,...,False,True,True,False,False,True,True,False,False,False
2018123015,3932,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,...,False,True,False,False,False,True,True,True,False,False


In [47]:
#Final data! Everything is getting merged together.
data = starting_pos.merge(starting_off_pers[['gameId', 'playId', 'offenseFormation']],
                   left_index=True,
                   right_on=['gameId', 'playId']).drop(['gameId', 'playId'], axis=1)

In [48]:
#Aint she a beaut?
data.head()

Unnamed: 0,CB0_x,CB1_x,DB0_x,DB1_x,DE0_x,DL0_x,DT0_x,FB0_x,FS0_x,HB0_x,...,TE3_in,WR0_in,WR1_in,WR2_in,WR3_in,WR4_in,RB,TE,WR,offenseFormation
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.13,0.0,0.0,...,False,True,True,False,False,False,2.0,1.0,2.0,I_FORM
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.13,0.0,0.0,...,False,True,True,False,False,False,2.0,1.0,2.0,I_FORM
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.13,0.0,0.0,...,False,True,True,False,False,False,2.0,1.0,2.0,I_FORM
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.13,0.0,0.0,...,False,True,True,False,False,False,2.0,1.0,2.0,I_FORM
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.13,0.0,0.0,...,False,True,True,False,False,False,2.0,1.0,2.0,I_FORM


In [49]:
data.drop_duplicates(inplace=True)

In [50]:
data.dropna(axis=0, inplace=True)

In [51]:
X = data.iloc[:, :-1]
y = data['offenseFormation']

In [52]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify=y)

In [53]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [54]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_reg = LogisticRegression(max_iter=10000)
cross_val_score(log_reg, X_train_scaled, y_train, cv=5)



array([0.95533141, 0.95028818, 0.96106705, 0.95602019, 0.95313627])

In [55]:
from sklearn.ensemble import RandomForestClassifier

rfor = RandomForestClassifier()
cross_val_score(rfor, X_train_scaled, y_train, cv=5)



array([0.96397695, 0.96037464, 0.9668349 , 0.95529921, 0.95890411])

In [56]:
from sklearn.model_selection import GridSearchCV

rfor = RandomForestClassifier(random_state=0)
params = {'max_depth': [None] + list(range(1, 5))}
grid_rfor = GridSearchCV(rfor, param_grid=params, cv=3, scoring='f1_micro')
grid_rfor.fit(X_train_scaled, y_train)
grid_rfor.best_score_

0.9571853537325512

In [57]:
grid_rfor.best_params_

{'max_depth': None}

In [58]:
from sklearn.tree import DecisionTreeClassifier

dtree = DecisionTreeClassifier()
cross_val_score(dtree, X_train_scaled, y_train)



array([0.94740634, 0.94524496, 0.96250901, 0.95025234, 0.95097332])

In [59]:
dtree = DecisionTreeClassifier(random_state=0)
params = {'max_depth': [None] + list(range(1, 5)), 'min_samples_split': range(2, 10)}
grid_dtree = GridSearchCV(dtree, param_grid=params, cv=3, scoring='f1_micro')
grid_dtree.fit(X_train_scaled, y_train)
grid_dtree.best_score_

0.9592043068339974

In [60]:
X_test_scaled = scaler.transform(X_test)

In [62]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

y_pred = grid_rfor.predict(X_test_scaled)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

       EMPTY       0.89      0.97      0.93       298
      I_FORM       0.86      0.87      0.86       114
       JUMBO       1.00      0.43      0.60         7
      PISTOL       1.00      0.39      0.56        31
     SHOTGUN       0.98      0.98      0.98      1528
  SINGLEBACK       0.94      0.96      0.95       334
     WILDCAT       0.00      0.00      0.00         1

    accuracy                           0.96      2313
   macro avg       0.81      0.66      0.70      2313
weighted avg       0.96      0.96      0.96      2313

[[ 289    0    0    0    6    3    0]
 [   0   99    0    0    0   15    0]
 [   0    3    3    0    0    1    0]
 [   2    0    0   12   17    0    0]
 [  32    0    0    0 1495    1    0]
 [   0   13    0    0    0  321    0]
 [   0    0    0    0    0    1    0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [43]:
y_pred_train = grid_rfor.predict(X_train_scaled)
print(classification_report(y_train, y_pred_train))
print(confusion_matrix(y_train, y_pred_train))

              precision    recall  f1-score   support

       EMPTY       1.00      1.00      1.00       894
      I_FORM       1.00      1.00      1.00       342
       JUMBO       1.00      1.00      1.00        20
      PISTOL       1.00      1.00      1.00        92
     SHOTGUN       1.00      1.00      1.00      4581
  SINGLEBACK       1.00      1.00      1.00      1004
     WILDCAT       1.00      1.00      1.00         4

    accuracy                           1.00      6937
   macro avg       1.00      1.00      1.00      6937
weighted avg       1.00      1.00      1.00      6937

[[ 894    0    0    0    0    0    0]
 [   0  342    0    0    0    0    0]
 [   0    0   20    0    0    0    0]
 [   0    0    0   92    0    0    0]
 [   0    0    0    0 4581    0    0]
 [   0    0    0    0    0 1004    0]
 [   0    0    0    0    0    0    4]]


In [44]:
from sklearn.dummy import DummyClassifier

dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_train_scaled, y_train)
y_pred_dummy = dummy.predict(X_test_scaled)
np.mean(y_test == y_pred_dummy)

0.6606139213143104