In [1]:
import boto3
import pandas as pd
import glob
import matplotlib.pyplot as plt
import numpy as np

# Handle to the hackathon S3 bucket through the boto3 resource API.
# NOTE(review): `bucket` is not referenced in the cells visible here (the CSV
# reads below go through pandas with s3a:// URLs) — confirm it is still needed.
s3 = boto3.resource('s3')
bucket = s3.Bucket("nyg-hackathon-949955964069")

In [2]:
# Load every team's player-stats CSV from S3 and stack them into a single frame.
teams = [
    'TEN', 'BAL', 'MIN', 'NO', 'TB', 'SF', 'CAR', 'NE',
    'KC', 'LA', 'LAC', 'SEA', 'OAK', 'BUF', 'CLE', 'CIN',
    'HOU', 'DEN', 'CHI', 'IND', 'NYJ', 'MIA', 'ARI', 'DAL',
    'ATL', 'JAX', 'PHI', 'WAS', 'DET', 'PIT', 'NYG', 'GB',
]

# One DataFrame per team, read in the same order as `teams`.
dfs = [
    pd.read_csv(
        's3a://nyg-hackathon-949955964069/technicaldangerousness/SparkTables/' + team + '_playerstats.csv',
        header=0,
    )
    for team in teams
]

# ignore_index=True gives the combined frame a fresh 0..n-1 row index.
player_stats = pd.concat(dfs, ignore_index=True)

#### Due Diligence
Check the data output by GetNFLData against ESPN's official PBWR rankings from 2019. The goal is to quantitatively assess how far off I am from the NFL-wide accepted stats.
A couple of reasons my dataset might be off:
- I used a significantly pared down dataset for performance reasons, sampling 3 plays per quarter of each game
- I am using a simplified version of the PBWR stat that only takes proximity into account, as I hypothesized this was the most impactful factor

In [3]:
# ESPN 2019 pass-block win rate leaders, by position. Source:
# https://www.espn.com/nfl/story/_/id/27584726/nfl-pass-blocking-pass-rushing-rankings-2019-pbwr-prwr-leaderboard
OT_top10s = [
    'David Bakhtiari', 'Andrew Whitworth', 'Kelvin Beachum', 'Ronnie Stanley', 'Mike McGlinchey',
    'Orlando Brown', 'Alejandro Villanueva', 'Jack Conklin', 'Trent Brown', 'Taylor Lewan',
]
OG_top10s = [
    'Marshal Yanda', 'Joe Thuney', 'Richie Incognito', 'Joel Bitonio', 'Zack Martin',
    'Joe Dahl', 'Quenton Nelson', 'Elgton Jenkins', 'Wyatt Teller', 'Ali Marpet',
]
C_top10s = [
    'Corey Linsley', 'JC Tretter', 'Rodney Hudson', 'Brandon Linder', 'A.Q. Shipley',
    'Chase Roullier', 'Nick Martin', 'Cody Whitehair', 'Ryan Jensen', 'Ryan Kelly',
]

# For each ESPN list, pull the matching rows out of player_stats (matched on
# player name only) and order them by my computed win rate, best first.
top_10s = [
    player_stats.loc[player_stats['player'].isin(espn_names)].sort_values('winrate', ascending=False)
    for espn_names in (OT_top10s, OG_top10s, C_top10s)
]
top_10_OTs, top_10_OGs, top_10_Cs = top_10s

print('Top 10 OTs, OGs, and Cs from ESPN: ')
for top_10 in top_10s:
    print(top_10)

Top 10 OTs, OGs, and Cs from ESPN: 
                   player position observations blockwins   winrate
11         Ronnie Stanley        T          150       139  0.926667
9           Orlando Brown        T          178       161  0.904494
169  Alejandro Villanueva        T          178       150  0.842697
183       David Bakhtiari        T          156       130  0.833333
79            Trent Brown        T           95        78  0.821053
119        Kelvin Beachum        T          148       119  0.804054
53       Andrew Whitworth        T          178       141  0.792135
32        Mike McGlinchey       OT          115        87  0.756522
5            Taylor Lewan        T          117        85  0.726496
2            Jack Conklin        T          151       108  0.715232
               player position observations blockwins   winrate
7       Marshal Yanda        G          156       146  0.935897
77   Richie Incognito        G          113       103  0.911504
184    Elgton Jenkins   

<b>Data similarity criteria:</b> Heuristically defined - my values are dissimilar to ESPN's values, but it is possible that they still capture the relative skill of the linemen with respect to each other.

It is reasonable to say that if most of the ESPN top 10 linemen within each position (OT, OG, C) are at or above the 75th percentile of my data (i.e. in the top 25%, which constitutes roughly the top ~15 players in each position), we can consider this data useful as a representation of the relative skill of each lineman.

In [4]:
# Counts how many rows of ONE top-10 dataframe meet ONE percentile threshold.
# The caller is responsible for pairing each position's dataframe with its threshold.
def check_top10_accuracy(top_10, p_75):
    """Count the players in `top_10` whose win rate is at or above `p_75`.

    Parameters
    ----------
    top_10 : pandas.DataFrame
        Rows for one position group; must contain a 'winrate' column
        unless the frame has no rows.
    p_75 : float
        Win-rate threshold to compare against (inclusive).

    Returns
    -------
    int
        Number of rows with ``winrate >= p_75``. NaN win rates (or a NaN
        threshold) never satisfy the comparison and are not counted.
    """
    # A frame with no rows has nothing to count; returning early also avoids a
    # KeyError when an empty frame carries no 'winrate' column.
    if top_10.empty:
        return 0

    # Vectorised comparison instead of a row-by-row iterrows() loop; int() keeps
    # the return type a plain Python int rather than a numpy integer.
    return int((top_10['winrate'] >= p_75).sum())

Then compute the 75th-percentile win rate for each o-line position and run the analysis.

In [5]:
# Win rates of every player at each o-line position; tackles and guards appear
# under two position codes each in the data.
position_col = player_stats['position']
ot_PBWR = player_stats.loc[position_col.isin(['T', 'OT']), 'winrate'].tolist()
og_PBWR = player_stats.loc[position_col.isin(['G', 'OG']), 'winrate'].tolist()
c_PBWR = player_stats.loc[position_col.isin(['C']), 'winrate'].tolist()

# 75th-percentile win rate per position, in the same order as top_10s (OT, OG, C).
p_75s = [np.percentile(winrates, 75) for winrates in (ot_PBWR, og_PBWR, c_PBWR)]

# Total number of ESPN top-10 players at or above their position's threshold.
above_75_count = sum(
    check_top10_accuracy(espn_rows, threshold)
    for espn_rows, threshold in zip(top_10s, p_75s)
)

# 30 = 3 positions x 10 ESPN players each.
print(above_75_count / 30 * 100, "% of the ESPN top 10 is in my upper 75th percentile for each position")

50.0 % of the ESPN top 10 is in my upper 75th percentile for each position


50% could use some improvement, but good enough for our POC. On to the feature building:

### Feature Building:
Strategy: 
- Create a function that does the following:
    - Pull in tracking data for a team from S3
    - Reduce the tracking data to the players listed above
    - Calculate each feature (and showcase the method of each calculation clearly)
    - Append each feature to playerstats as a new column
- Run the function iteratively for each NFL team

<b>Note:</b> below code is experimental - strategy is to get the code working for one team, then package it into function and use for each team

In [6]:
# Append all feature columns to player_stats.
# They are initialised as floats (0.0, not 0): the values written later are means
# of tracking measurements, and writing a float into an int64 column relies on
# silent upcasting, which pandas >= 2.1 deprecates.
feature_columns = [
    'init_speed',
    'init_accel',
    'qb_distance_delta',
    'contact_o_dir_angle',
    'assnmt_speed_after_contact',
]
for feature in feature_columns:
    player_stats[feature] = 0.0

# Index by player name so features can be written with player_stats.at[name, column].
# drop=False keeps 'player' available as a regular column too. Rebinding instead
# of inplace=True gives the same result.
player_stats = player_stats.set_index('player', drop=False)
player_stats.head()

Unnamed: 0_level_0,player,position,observations,blockwins,winrate,init_speed,init_accel,qb_distance_delta,contact_o_dir_angle,assnmt_speed_after_contact
player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Ben Jones,Ben Jones,C,161,138,0.857143,0,0,0,0,0
Dennis Kelly,Dennis Kelly,OT,55,43,0.781818,0,0,0,0,0
Jack Conklin,Jack Conklin,T,151,108,0.715232,0,0,0,0,0
Nate Davis,Nate Davis,G,119,98,0.823529,0,0,0,0,0
Rodger Saffold,Rodger Saffold,G,151,125,0.827815,0,0,0,0,0


In [7]:
# Offensive and defensive tracking tables for a single team (ARI).
trackingfile = 's3a://nyg-hackathon-949955964069/technicaldangerousness/SparkTables/ARI_tracking.csv'
trackingfile_defense = 's3a://nyg-hackathon-949955964069/technicaldangerousness/SparkTables/ARI_trackingdefense.csv'

# Both CSVs are read the same way: first row is the header, first column is the index.
df, df_d = (
    pd.read_csv(tracking_path, header=0, index_col=0)
    for tracking_path in (trackingfile, trackingfile_defense)
)

### Feature 1: Explosiveness (Speed / Acceleration off the snap)
- Average speed of a player in the first moment of the play
- Average acceleration over the first second of the play

In [8]:
def get_first_speeds(df, player_stats):
    """Write each player's average initial speed into player_stats['init_speed'].

    For every (ballsnaptime, player) group in `df` the first recorded speed `s`
    is taken, and those values are averaged per player.

    Parameters
    ----------
    df : pandas.DataFrame
        Tracking data with 'ballsnaptime', 'player' and 's' columns.
        NOTE(review): "first" depends on row order — presumably rows are in
        chronological order within a play; confirm. GroupBy.first() also skips
        null values, so a missing first speed falls through to the next row.
    player_stats : pandas.DataFrame
        Player table indexed by player name. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same `player_stats` object, with 'init_speed' updated for every
        player that appears in `df`. Players in `df` that are not in
        `player_stats` are ignored rather than appended as all-NaN rows.
    """
    # Aggregate only the speed column: averaging the whole frame raises on
    # non-numeric tracking columns in pandas >= 2.0.
    first_speed_per_play = df.groupby(['ballsnaptime', 'player'])['s'].first()
    avg_first_speeds = first_speed_per_play.groupby(level='player').mean()

    # The target column must be float before means are written into it
    # (an int column would need deprecated silent upcasting).
    if 'init_speed' in player_stats.columns:
        player_stats['init_speed'] = player_stats['init_speed'].astype(float)
    else:
        player_stats['init_speed'] = float('nan')

    # Update only players that already exist in player_stats; writing an unknown
    # label with .at would silently enlarge the frame with a NaN-filled row.
    known = player_stats.index.isin(avg_first_speeds.index)
    if known.any():
        player_stats.loc[known, 'init_speed'] = (
            avg_first_speeds.reindex(player_stats.index[known]).to_numpy()
        )

    return player_stats
    
# get_first_speeds writes into player_stats in place and returns it; dropna()
# then yields a copy without any rows that contain NaN.
speeds_filled = get_first_speeds(df, player_stats)
init_speeds = speeds_filled.dropna()

In [9]:
# Show only the rows whose init_speed is positive (presumably the players
# found in the ARI tracking file loaded above — verify).
init_speeds.loc[init_speeds['init_speed'].gt(0)]

Unnamed: 0_level_0,player,position,observations,blockwins,winrate,init_speed,init_accel,qb_distance_delta,contact_o_dir_angle,assnmt_speed_after_contact
player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
D.J. Humphries,D.J. Humphries,OT,177,147,0.830508,0.12526,0.0,0.0,0.0,0.0
J.R. Sweezy,J.R. Sweezy,G,170,151,0.888235,0.163132,0.0,0.0,0.0,0.0
Justin Murray,Justin Murray,OT,138,120,0.869565,0.309145,0.0,0.0,0.0,0.0
Justin Pugh,Justin Pugh,G,172,160,0.930233,0.235829,0.0,0.0,0.0,0.0


### TODO
Build remaining features:
- Initial acceleration
- Average o-lineman displacement relative to QB
- Angle between orientation and direction of movement after contact
- Average speed after contact