In [2]:
## instructions on getting data through Kaggle API. API key is read from user/'your name'/.kaggle directory on a mac.
## https://www.kaggle.com/general/74235
import os

!pip install kaggle
!kaggle competitions download -c 'nfl-big-data-bowl-2021'
import zipfile

cwd = str(os.getcwd())
with zipfile.ZipFile(cwd + '/nfl-big-data-bowl-2021.zip', 'r') as zip_ref:
    zip_ref.extractall(cwd + '/Kaggle-Data-Files')

401 - Unauthorized


In [3]:
import pandas as pd
import numpy as np
import os

dir = 'assets'
fp = dir + '/full_position.csv'
if not os.path.exists(dir):
    os.mkdir(dir)
    positions = pd.DataFrame()
    for week in range(1, 18):
        week = pd.read_csv(cwd + f'/Kaggle-Data-Files/week{week}.csv')
        positions = pd.concat([positions, week], axis=0)
    positions.to_csv(fp, index=False)
else:
    positions = pd.read_csv(fp)

In [4]:
positions.shape

(18309388, 19)

In [5]:
# reading plays (see play data https://www.kaggle.com/c/nfl-big-data-bowl-2021/data)
plays = pd.read_csv('nfl-big-data-bowl-2021/plays.csv')

In [6]:
#time to datetime
positions['time'] = pd.to_datetime(positions['time'], format='%Y-%m-%dT%H:%M:%S')

In [7]:
# Get starting position of offensive players
# I Think this should be where frame == 1, not .first in case of data quality issues.
starting_pos = positions.groupby(['gameId', 'playId', 'position', 'nflId', 'team'])[['x', 'y']].first().reset_index()

In [8]:
# merging play data (see play data https://www.kaggle.com/c/nfl-big-data-bowl-2021/data)
starting_pos_plays = starting_pos.merge(plays, on=['gameId', 'playId'], how='left')

In [9]:
# data cleaning where yardline is not Null
starting_pos_plays = starting_pos_plays[starting_pos_plays['absoluteYardlineNumber'].notnull()]

In [10]:
# bring in game info (see game info data https://www.kaggle.com/c/nfl-big-data-bowl-2021/data)
games = pd.read_csv('nfl-big-data-bowl-2021/games.csv')

In [11]:
#bringing in features from games
starting_pos_play_game = starting_pos_plays.merge(games, on='gameId', how='left')

In [12]:
#naming which team has the ball as offense or defense
starting_pos_play_game['offdef'] = np.where((starting_pos_play_game['team'] == 'away') &
                                            (starting_pos_play_game['possessionTeam'] == starting_pos_play_game['visitorTeamAbbr']),
                                            'offense', 'defense')

In [13]:
#starting position from offense players 
starting_off = starting_pos_play_game[starting_pos_play_game['offdef'] == 'offense']

In [14]:
# What personal is on the field
personnel = starting_off['personnelO'].str.extract('(?P<RB>\d+)\sRB\,\s(?P<TE>\d+)\sTE\,\s(?P<WR>\d+)\sWR')
personnel = personnel.astype(float)

In [15]:
# Adding that as a feature in the new DF
starting_off_pers = pd.concat([starting_off, personnel], axis=1)

In [16]:
# Subtracting 10 because the endzone adds 10 years to field 
starting_off_pers['yardline_100'] = starting_off_pers['absoluteYardlineNumber'].sub(10)

In [17]:
# If position X is less than yardline100, return yardline100 - starting position, else, starting position - yardline. 
# This gets # of yards behind line no matter which way they are facing.

# Y starting is the y coords of the starting position.
starting_off_pers['off_pos'] = np.where(starting_off_pers['x'].lt(starting_off_pers['absoluteYardlineNumber']), 'left', 'right')
starting_off_pers['x_behind_line'] = np.where(starting_off_pers['off_pos'] == 'right',
                                              starting_off_pers['absoluteYardlineNumber'].sub(starting_off_pers['x']),
                                              starting_off_pers['x'].sub(starting_off_pers['absoluteYardlineNumber']))
starting_off_pers['y_starting'] = np.where(starting_off_pers['off_pos'] == 'right',
                                           starting_off_pers['y'].rsub(53.3), starting_off_pers['y'])

In [18]:
def find_rank(df, col, reverse=False):
    """
    Find the ranking of a series based on values.
    :param df: Dataframe for ranking; pd.DataFrame
    :param col: Column from dataframe to rank; str
    :param reverse: Flag of whether to reverse rank direction; bool
    :return: Array with rankings; np.array
    """
    # Extract series and use arsort to find rankings.
    ser = df[col]
    temp = np.argsort(ser)

    # Reverse direction based on flag.
    if reverse:
        temp = temp[::-1]

    # Fill ranking array.
    ranks = np.empty_like(temp)
    ranks[temp] = np.arange(ser.shape[0])
    return ranks

In [20]:
# Find the order of positions based on offensive direction.
# First, group and extract first value of the y starting position and direction.
pos_start = (starting_off_pers
             .groupby(['gameId', 'playId', 'position', 'nflId'])
             [['y_starting', 'x', 'off_pos']].first()
             .reset_index())

# Next, group and extract ranking of positions based on whether team is home or away
# and the starting position.
pos_order = (pos_start.groupby(['gameId', 'playId', 'position'])
             .apply(lambda x: np.where(x['off_pos'] == 'right', find_rank(x, 'y_starting'),
                                       find_rank(x, 'y_starting', reverse=True)))
             .explode()
             .reset_index()
             )

# Add column with the position order to the df with indexed starting position.
pos_start['pos_order'] = pos_order[0]

In [21]:
# Add number of position to position label to get position number.
pos_start['pos_num'] = pos_start.apply(lambda x: x['position'] + str(x['pos_order']), axis=1)

In [22]:
#Adding a label of the players position (WR1, WR2). This makes sense from a numerical stand point, but shouldn't be used
#to classify a team's WR1 WR2 etc.

starting_off_pers = starting_off_pers.merge(pos_start[['gameId', 'playId', 'nflId', 'pos_num', 'pos_order']],
                                            on=['gameId', 'playId', 'nflId'])

In [23]:
# Convert to matrix of GameID and PlayID. Grab number of yards behind line for each player. 
starting_x = (starting_off_pers
    .pivot_table(columns='position', index=['gameId', 'playId'], values='x_behind_line').rename(lambda x: x + '_x', axis=1))

In [24]:
#Same as above, but for Y coords.
starting_y = (starting_off_pers
              .pivot_table(columns='position', index=['gameId', 'playId'], values='y_starting').rename(lambda x: x + '_y', axis=1))

In [25]:
#merging to get coords of players with _X and _Y
starting_pos = starting_x.merge(starting_y, left_index=True, right_index=True)

In [26]:
#X_col is getting all the X columns. Cols is creating a list that say "WR1_in", "FB1_in" etc
x_col = starting_pos.columns[starting_pos.columns.str.match('.*\_x$')]
cols = [col[:3] + '_in' for col in x_col]

In [27]:
# Creating addition columns (boolean) for X player being in. If TE1 is in, flag says TRUE
starting_pos[cols] = starting_pos[x_col].notnull()

In [28]:
starting_pos.fillna(0, inplace=True)

In [29]:
#turning into sparse matrix
starting_pos

Unnamed: 0_level_0,position,CB_x,DB_x,DE_x,DL_x,DT_x,FB_x,FS_x,HB_x,ILB_x,K_x,...,LS__in,NT__in,OLB_in,P_x_in,QB__in,RB__in,S_x_in,SS__in,TE__in,WR__in
gameId,playId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2018090600,75,0.0,0.0,0.0,0.0,0.0,-5.13,0.0,0.0,0.0,0.0,...,False,False,False,False,True,True,False,False,True,True
2018090600,146,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,False,False,False,False,True,True,False,False,True,True
2018090600,168,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,False,False,False,False,True,True,False,False,True,True
2018090600,190,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,False,False,False,False,True,True,False,False,True,True
2018090600,256,0.0,0.0,0.0,0.0,0.0,-4.66,0.0,0.0,0.0,0.0,...,False,False,False,False,True,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018123015,3601,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,False,False,False,False,True,True,False,False,True,True
2018123015,3794,0.0,0.0,0.0,0.0,0.0,-4.77,0.0,0.0,0.0,0.0,...,False,False,False,False,True,True,False,False,True,True
2018123015,3819,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,False,False,False,False,True,True,False,False,True,True
2018123015,3932,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,False,False,False,False,True,True,False,False,True,True


In [30]:
#Final data! Everything is getting merged together.
data = starting_pos.merge(starting_off_pers[['gameId', 'playId', 'offenseFormation']],
                   left_index=True,
                   right_on=['gameId', 'playId']).drop(['gameId', 'playId'], axis=1)

In [31]:
#Aint she a beaut?
data.head()

Unnamed: 0,CB_x,DB_x,DE_x,DL_x,DT_x,FB_x,FS_x,HB_x,ILB_x,K_x,...,NT__in,OLB_in,P_x_in,QB__in,RB__in,S_x_in,SS__in,TE__in,WR__in,offenseFormation
0,0.0,0.0,0.0,0.0,0.0,-5.13,0.0,0.0,0.0,0.0,...,False,False,False,True,True,False,False,True,True,I_FORM
1,0.0,0.0,0.0,0.0,0.0,-5.13,0.0,0.0,0.0,0.0,...,False,False,False,True,True,False,False,True,True,I_FORM
2,0.0,0.0,0.0,0.0,0.0,-5.13,0.0,0.0,0.0,0.0,...,False,False,False,True,True,False,False,True,True,I_FORM
3,0.0,0.0,0.0,0.0,0.0,-5.13,0.0,0.0,0.0,0.0,...,False,False,False,True,True,False,False,True,True,I_FORM
4,0.0,0.0,0.0,0.0,0.0,-5.13,0.0,0.0,0.0,0.0,...,False,False,False,True,True,False,False,True,True,I_FORM


In [32]:
data.drop_duplicates(inplace=True)

In [33]:
data.dropna(axis=0, inplace=True)

In [34]:
X = data.iloc[:, :-1]
y = data['offenseFormation']

In [35]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify=y)

In [36]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_reg = LogisticRegression(max_iter=10000)
cross_val_score(log_reg, X_train_scaled, y_train, cv=5)



array([0.94164265, 0.93804035, 0.93366979, 0.93583273, 0.92790195])

In [38]:
from sklearn.ensemble import RandomForestClassifier

rfor = RandomForestClassifier()
cross_val_score(rfor, X_train_scaled, y_train, cv=5)



array([0.95461095, 0.9574928 , 0.94881038, 0.94953136, 0.94232156])

In [39]:
from sklearn.model_selection import GridSearchCV

rfor = RandomForestClassifier(random_state=0)
params = {'max_depth': [None] + list(range(1, 5))}
grid_rfor = GridSearchCV(rfor, param_grid=params, cv=3, scoring='f1_micro')
grid_rfor.fit(X_train_scaled, y_train)
grid_rfor.best_score_

0.9495452462412192

In [40]:
grid_rfor.best_params_

{'max_depth': None}

In [41]:
from sklearn.tree import DecisionTreeClassifier

dtree = DecisionTreeClassifier()
cross_val_score(dtree, X_train_scaled, y_train)



array([0.92867435, 0.92146974, 0.92934391, 0.93006489, 0.93294881])

In [42]:
dtree = DecisionTreeClassifier(random_state=0)
params = {'max_depth': [None] + list(range(1, 5)), 'min_samples_split': range(2, 10)}
grid_dtree = GridSearchCV(dtree, param_grid=params, cv=3, scoring='f1_micro')
grid_dtree.fit(X_train_scaled, y_train)
grid_dtree.best_score_

0.9354187454590696

In [43]:
X_test_scaled = scaler.transform(X_test)

In [44]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

y_pred = grid_rfor.predict(X_test_scaled)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

       EMPTY       0.96      0.97      0.97       298
      I_FORM       0.86      0.66      0.75       114
       JUMBO       1.00      0.71      0.83         7
      PISTOL       1.00      0.32      0.49        31
     SHOTGUN       0.98      0.99      0.99      1528
  SINGLEBACK       0.89      0.97      0.93       334
     WILDCAT       0.00      0.00      0.00         1

    accuracy                           0.96      2313
   macro avg       0.81      0.66      0.71      2313
weighted avg       0.96      0.96      0.96      2313

[[ 290    0    0    0    6    2    0]
 [   0   75    0    0    1   38    0]
 [   0    2    5    0    0    0    0]
 [   1    1    0   10   19    0    0]
 [  12    0    0    0 1516    0    0]
 [   0    9    0    0    0  325    0]
 [   0    0    0    0    1    0    0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [45]:
y_pred_train = grid_rfor.predict(X_train_scaled)
print(classification_report(y_train, y_pred_train))
print(confusion_matrix(y_train, y_pred_train))

              precision    recall  f1-score   support

       EMPTY       1.00      1.00      1.00       894
      I_FORM       1.00      1.00      1.00       342
       JUMBO       1.00      1.00      1.00        20
      PISTOL       1.00      1.00      1.00        92
     SHOTGUN       1.00      1.00      1.00      4581
  SINGLEBACK       1.00      1.00      1.00      1004
     WILDCAT       1.00      1.00      1.00         4

    accuracy                           1.00      6937
   macro avg       1.00      1.00      1.00      6937
weighted avg       1.00      1.00      1.00      6937

[[ 894    0    0    0    0    0    0]
 [   0  342    0    0    0    0    0]
 [   0    0   20    0    0    0    0]
 [   0    0    0   92    0    0    0]
 [   0    0    0    0 4581    0    0]
 [   0    0    0    0    0 1004    0]
 [   0    0    0    0    0    0    4]]


In [46]:
from sklearn.dummy import DummyClassifier

dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_train_scaled, y_train)
y_pred_dummy = dummy.predict(X_test_scaled)
np.mean(y_test == y_pred_dummy)

0.6606139213143104