In [3]:
import pandas as pd
import numpy as np
from get_data import get_assets, get_positional_data

get_assets()
positions = get_positional_data()


This process will pip install Kaggle and download data through Kaggle API.

Please confirm that you've downloaded Kaggle JSON credentials into directory

Data Successfully Downloaded


In [47]:
def clean_positional(positions, first = 1, last = 17):
    # reading plays (see play data https://www.kaggle.com/c/nfl-big-data-bowl-2021/data)
    plays = pd.read_csv('nfl-big-data-bowl-2021/plays.csv')
    games = pd.read_csv('nfl-big-data-bowl-2021/games.csv')
    
    #to_datetime
    positions['time'] = pd.to_datetime(positions['time'], format='%Y-%m-%dT%H:%M:%S')
    #print(positions.columns)

    if (first != 1) or (last != 17):
        week_game_id = list(games[games['week'].isin(np.arange(first,last+1))]['gameId'].drop_duplicates())
        positions = positions[positions['gameId'].isin(week_game_id)]

    #get frame id of snap for each game and play id
    snap_frames = positions[positions['event'] == 'ball_snap'][['gameId','playId','frameId']]

    #get frame prior (unless snapped on frame 1)
    snap_frames['presnapId'] = snap_frames['frameId'].apply(lambda x: int(x)-1 if x>1 else x)

    #merge to remove all non frame snap -1 data
    presnap_df = positions.merge(snap_frames[['gameId','playId','presnapId']], left_on= ['gameId','playId','frameId'], right_on=['gameId','playId','presnapId'], how = 'right')
    
    # Get starting position of offensive players
    starting_pos = presnap_df.groupby(['gameId', 'playId', 'position', 'nflId', 'team'])[['x', 'y']].first().reset_index()
    
    # merging play data (see play data https://www.kaggle.com/c/nfl-big-data-bowl-2021/data)
    starting_pos_plays = starting_pos.merge(plays, on=['gameId', 'playId'], how='left')

    # data cleaning where yardline is not Null
    starting_pos_plays = starting_pos_plays[starting_pos_plays['absoluteYardlineNumber'].notnull()]
    # bring in game info (see game info data https://www.kaggle.com/c/nfl-big-data-bowl-2021/data)
    games = pd.read_csv('nfl-big-data-bowl-2021/games.csv')

    #bringing in features from games
    starting_pos_play_game = starting_pos_plays.merge(games, on='gameId', how='left')
    #naming which team has the ball as offense or defense
    starting_pos_play_game['offdef'] = np.where((starting_pos_play_game['team'] == 'away') &
                                                (starting_pos_play_game['possessionTeam'] == starting_pos_play_game['visitorTeamAbbr']) | (starting_pos_play_game['team'] == 'home') &
                                                (starting_pos_play_game['possessionTeam'] == starting_pos_play_game['homeTeamAbbr']),
                                                'offense', 'defense')

    #starting position from offense players 
    starting_off = starting_pos_play_game[starting_pos_play_game['offdef'] == 'offense']

    # What personal is on the field
    personnel = starting_off['personnelO'].str.extract('(?P<RB>\d+)\sRB\,\s(?P<TE>\d+)\sTE\,\s(?P<WR>\d+)\sWR')
    personnel = personnel.astype(float)

    # Adding that as a feature in the new DF
    starting_off_pers = pd.concat([starting_off, personnel], axis=1)

    # Subtracting 10 because the endzone adds 10 years to field 
    starting_off_pers['yardline_100'] = starting_off_pers['absoluteYardlineNumber'].sub(10)

    # If position X is less than yardline100, return yardline100 - starting position, else, starting position - yardline. 
    # This gets # of yards behind line no matter which way they are facing.

    # Y starting is the y coords of the starting position.
    starting_off_pers['off_pos'] = np.where(starting_off_pers['x'].lt(starting_off_pers['absoluteYardlineNumber']), 'left', 'right')
    starting_off_pers['x_behind_line'] = np.where(starting_off_pers['off_pos'] == 'right',
                                                starting_off_pers['absoluteYardlineNumber'].sub(starting_off_pers['x']),
                                                starting_off_pers['x'].sub(starting_off_pers['absoluteYardlineNumber']))
    starting_off_pers['y_starting'] = np.where(starting_off_pers['off_pos'] == 'right',
                                            starting_off_pers['y'].rsub(53.3), starting_off_pers['y'])

    # Y QB is the y starting position of the quarterback.
    starting_off_pers['y_qb'] = starting_off_pers.groupby(['gameId', 'playId']).apply(lambda x: np.repeat(53.3/2, x.shape[0])
        if x[x['position'] == 'QB'].shape[0] == 0 else np.repeat(x[x['position'] == 'QB']['y_starting'].iloc[0], x.shape[0])).explode().values
    starting_off_pers['y_qb'] = starting_off_pers['y_qb'].astype(float)

    # Find side of player relative to QB and the starting y coordinates relative to the QB.
    starting_off_pers['qb_side'] = np.where(starting_off_pers['y_starting'].gt(starting_off_pers['y_qb']), 'R', 'L')
    starting_off_pers['y_starting_qb'] = starting_off_pers['y_starting'].sub(starting_off_pers['y_qb'])

    def find_rank(df, col, reverse=False):
        """
        Find the ranking of a series based on values.
        :param df: Dataframe for ranking; pd.DataFrame
        :param col: Column from dataframe to rank; str
        :param reverse: Flag of whether to reverse rank direction; bool
        :return: Array with rankings; np.array
        """
        # Extract series and use arsort to find rankings.
        ser = df[col]
        temp = np.argsort(ser)

        # Reverse direction based on flag.
        if reverse:
            temp = temp[::-1]

        # Fill ranking array.
        ranks = np.empty_like(temp)
        ranks[temp] = np.arange(ser.shape[0])
        return ranks

    # Find the order of positions based on offensive direction.
    # First, group and extract first value of the y starting position and direction.
    pos_start = (starting_off_pers
                .groupby(['gameId', 'playId', 'position', 'nflId'])
                [['y_starting', 'x', 'off_pos', 'qb_side']].first()
                .reset_index())

    # Next, group and extract ranking of positions based on whether team is home or away
    # and the starting position.
    pos_order = np.where(pos_start['position'] != 'QB',
                         (pos_start.groupby(['gameId', 'playId', 'position', 'qb_side'])
                          .apply(lambda x: np.where(x.index.get_level_values(-1) == 'R',
                                                    find_rank(x, 'y_starting'),
                                                    find_rank(x, 'y_starting', reverse=True)))
                          .explode()
                          .values
                ),
                         (pos_start.groupby(['gameId', 'playId', 'position'])
                          .apply(lambda x: find_rank(x, 'y_starting'))
                          .explode()
                          .values
                          )
                         )

    # Add column with the position order to the df with indexed starting position.
    pos_start['pos_order'] = pos_order

    # Add number of position to position label to get position number.
    pos_start['pos_num'] = np.where(pos_start['position'] != 'QB',
                                    pos_start['position'].add(pos_start['qb_side']).add(pos_start['pos_order'].astype(str)),
                                    pos_start['position'].add(pos_start['pos_order'].astype(str)))

    #Adding a label of the players position (WR1, WR2). This makes sense from a numerical stand point, but shouldn't be used
    #to classify a team's WR1 WR2 etc.

    starting_off_pers = starting_off_pers.merge(pos_start[['gameId', 'playId', 'nflId', 'pos_num', 'pos_order']],
                                                on=['gameId', 'playId', 'nflId'])

    # Convert to matrix of GameID and PlayID. Grab number of yards behind line for each player. 
    starting_x = (starting_off_pers
        .pivot_table(columns='pos_num', index=['gameId', 'playId'], values='x_behind_line').rename(lambda x: x + '_x', axis=1))

    #Same as above, but for Y coords relative to the QB.
    starting_y = (starting_off_pers
                .pivot_table(columns='pos_num', index=['gameId', 'playId'], values='y_starting_qb').rename(lambda x: x + '_y', axis=1))

    #merging to get coords of players with _X and _Y
    starting_pos = starting_x.merge(starting_y, left_index=True, right_index=True)

    #X_col is getting all the X columns. Cols is creating a list that say "WR1_in", "FB1_in" etc
    x_col = starting_pos.columns[starting_pos.columns.str.match('.*\_x$')]
    cols = [col[:4] + '_in' for col in x_col]

    # Creating addition columns (boolean) for X player being in. If TE1 is in, flag says TRUE
    starting_pos[cols] = starting_pos[x_col].notnull()

    #Sparse Matrix
    starting_pos.fillna(0, inplace=True)

    #Final data! Everything is getting merged together.
    data = starting_pos.merge(starting_off_pers[['gameId', 'playId', 'offenseFormation']].drop_duplicates(),
                    left_index=True,
                    right_on=['gameId', 'playId'])

    data.dropna(axis=0, inplace=True)
    data = data.loc[:, ~np.all(data == 0, axis=0)]

    return data



In [48]:
# Will need to determine what gameIDs constitute our train/test sets. Probably just week 1-14 train 15-17 test. 
#  #train = clean_positional(positions, first = 1 , last = 14)
#  #test = clean_positional(positions, first = 15, last = 17)

data = clean_positional(positions)

In [49]:
data

Unnamed: 0,CBL0_x,CBR0_x,DEL0_x,DER0_x,DLL0_x,DLR0_x,DTL0_x,DTL1_x,DTR0_x,FBL0_x,...,WRL1_in,WRL2_in,WRL3_in,WRR0_in,WRR1_in,WRR2_in,WRR3_in,gameId,playId,offenseFormation
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.12,...,True,False,False,False,False,False,False,2018090600,75,I_FORM
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,False,False,False,True,True,False,False,2018090600,146,SINGLEBACK
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,False,False,False,True,False,False,False,2018090600,168,SHOTGUN
18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,False,False,False,True,True,True,False,2018090600,190,SHOTGUN
24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,False,False,False,False,False,False,False,2018090600,256,SHOTGUN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111219,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,False,False,False,True,False,False,False,2018123015,3819,SINGLEBACK
111225,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,False,False,False,True,True,False,False,2018123015,3932,SHOTGUN
111231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,True,True,False,False,False,False,False,2018123015,3969,SHOTGUN
111237,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,True,False,False,True,False,False,False,2018123015,4057,SHOTGUN


In [43]:
data.columns

Index(['gameId', 'playId', 'position', 'nflId', 'team', 'x', 'y',
       'playDescription', 'quarter', 'down', 'yardsToGo', 'possessionTeam',
       'playType', 'yardlineSide', 'yardlineNumber', 'offenseFormation',
       'personnelO', 'defendersInTheBox', 'numberOfPassRushers', 'personnelD',
       'typeDropback', 'preSnapVisitorScore', 'preSnapHomeScore', 'gameClock',
       'absoluteYardlineNumber', 'penaltyCodes', 'penaltyJerseyNumbers',
       'passResult', 'offensePlayResult', 'playResult', 'epa', 'isDefensivePI',
       'gameDate', 'gameTimeEastern', 'homeTeamAbbr', 'visitorTeamAbbr',
       'week', 'offdef'],
      dtype='object')

In [52]:
train = clean_positional(positions, first = 1 , last = 14)
test = clean_positional(positions, first = 15, last = 17)

In [53]:
#setting missing columns as all zeroes for Test set
for missing_col in set(train.columns).difference(test.columns):
    test[missing_col] = 0

In [54]:
X_train = train.drop('offenseFormation', axis = 1)
y_train = train['offenseFormation']

X_test = test.drop('offenseFormation', axis = 1)[X_train.columns]
y_test = test['offenseFormation']

In [346]:
#from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify=y)

In [55]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [56]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_reg = LogisticRegression(max_iter=10000)
cross_val_score(log_reg, X_train_scaled, y_train, cv=5)

array([0.95842956, 0.96073903, 0.95645002, 0.96865721, 0.9630363 ])

In [57]:
from sklearn.model_selection import GridSearchCV

params_lr = {'C': [10**x for x in range(-4, 4)]}
grid_lr = GridSearchCV(log_reg, params_lr, cv=3, scoring='f1_micro')
grid_lr.fit(X_train_scaled, y_train)
grid_lr.best_score_

0.9613963829169445

In [58]:
grid_lr.best_params_

{'C': 10}

In [59]:
from sklearn.ensemble import RandomForestClassifier

rfor = RandomForestClassifier()
cross_val_score(rfor, X_train_scaled, y_train, cv=5)

array([0.96931706, 0.96601782, 0.96337842, 0.97987463, 0.97458746])

In [60]:
rfor = RandomForestClassifier(random_state=0)
params = {'max_depth': [None] + list(range(1, 5))}
grid_rfor = GridSearchCV(rfor, param_grid=params, cv=3, scoring='f1_micro')
grid_rfor.fit(X_train_scaled, y_train)
grid_rfor.best_score_

0.9661477997009024

In [61]:
grid_rfor.best_params_

{'max_depth': None}

In [62]:
from sklearn.tree import DecisionTreeClassifier

dtree = DecisionTreeClassifier()
cross_val_score(dtree, X_train_scaled, y_train)

array([0.68624216, 0.96667766, 0.7743319 , 0.97228637, 0.88415842])

In [63]:
dtree = DecisionTreeClassifier(random_state=0)
params = {'max_depth': [None] + list(range(1, 5)), 'min_samples_split': range(2, 10)}
grid_dtree = GridSearchCV(dtree, param_grid=params, cv=3, scoring='f1_micro')
grid_dtree.fit(X_train_scaled, y_train)
grid_dtree.best_score_

0.9381682008464832

In [64]:
X_test_scaled = scaler.transform(X_test)

In [65]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

y_pred = grid_lr.predict(X_test_scaled)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

       EMPTY       0.94      0.95      0.95       454
      I_FORM       0.94      0.86      0.90       176
       JUMBO       0.77      1.00      0.87        10
      PISTOL       1.00      0.39      0.56        57
     SHOTGUN       0.97      0.99      0.98      2172
  SINGLEBACK       0.95      0.98      0.96       459
     WILDCAT       1.00      1.00      1.00         1

    accuracy                           0.96      3329
   macro avg       0.94      0.88      0.89      3329
weighted avg       0.96      0.96      0.96      3329

[[ 430    0    0    0   23    1    0]
 [   0  152    3    0    0   21    0]
 [   0    0   10    0    0    0    0]
 [   1    0    0   22   33    1    0]
 [  25    0    0    0 2144    3    0]
 [   0   10    0    0    0  449    0]
 [   0    0    0    0    0    0    1]]


In [66]:
y_pred_train = grid_lr.predict(X_train_scaled)
print(classification_report(y_train, y_pred_train))
print(confusion_matrix(y_train, y_pred_train))

              precision    recall  f1-score   support

       EMPTY       0.96      0.94      0.95      1894
      I_FORM       0.93      0.89      0.91       710
       JUMBO       0.97      0.97      0.97        39
      PISTOL       0.91      0.38      0.54       183
     SHOTGUN       0.98      0.99      0.99     10047
  SINGLEBACK       0.96      0.98      0.97      2247
     WILDCAT       1.00      0.97      0.99        34

    accuracy                           0.97     15154
   macro avg       0.96      0.88      0.90     15154
weighted avg       0.97      0.97      0.97     15154

[[1787    0    0    1  102    4    0]
 [   0  635    0    0    0   75    0]
 [   0    0   38    0    0    1    0]
 [   7    0    0   70  106    0    0]
 [  76    0    0    6 9964    1    0]
 [   0   47    1    0    0 2199    0]
 [   0    0    0    0    0    1   33]]


In [67]:
from sklearn.dummy import DummyClassifier

dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_train_scaled, y_train)
y_pred_dummy = dummy.predict(X_test_scaled)
np.mean(y_test == y_pred_dummy)

0.6524481826374287