In [206]:
from collections import defaultdict
import json

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

In [207]:
# Team abbreviations used to build (team, season) pairs further down.
# The list carries more than one code for what look like the same franchises
# (e.g. 'JAX' and 'JAC', 'LAC' and 'SD', 'LA' and 'STL'); presumably the
# source data changed codes over the years -- TODO confirm against the files.
teams = [
    'PHI', 'ATL', 'BUF', 'BAL', 'CLE', 'PIT', 'IND', 'CIN', 'MIA',
    'TEN', 'SF', 'MIN', 'HOU', 'NE', 'TB', 'NO', 'NYG', 'JAX', 'KC',
    'LAC', 'ARI', 'WAS', 'CAR', 'DAL', 'SEA', 'DEN', 'CHI', 'GB',
    'DET', 'NYJ', 'LA', 'OAK', 'JAC', 'SD', 'STL'
]

# Seasons 2009 through 2018 inclusive (range() excludes its stop value).
seasons = list(range(2009, 2019))

In [422]:
def get_next_opponent_drive(df):
    """Attach details of the following drive when the opponent has it.

    Rows are assumed to be in playing order. For each row the next row's
    start yard line, end yard line and offensive team are copied into
    ``next_*`` columns on ``df`` (in place). The yard lines are blanked to NaN
    when the next row belongs to the same offense or to a different game;
    ``next_offensive_team`` is blanked only when the next row belongs to a
    different game. The last row has no successor and is treated as a game
    change.

    Returns a new frame without the helper ``next_home_team`` and
    ``next_away_team`` columns.
    """
    # Pull each attribute of the following row up onto the current row.
    shifted = ('start_yard_line', 'end_yard_line', 'offensive_team',
               'home_team', 'away_team')
    for column in shifted:
        df['next_%s' % column] = df[column].shift(-1)

    # A change in either side of the matchup means the next row is a new game.
    game_changes = (
        df['home_team'].ne(df['next_home_team'])
        | df['away_team'].ne(df['next_away_team'])
    )
    possession_kept = df['offensive_team'].eq(df['next_offensive_team'])

    not_opponent_drive = possession_kept | game_changes
    df.loc[not_opponent_drive, ['next_start_yard_line', 'next_end_yard_line']] = np.nan
    df.loc[game_changes, 'next_offensive_team'] = np.nan
    return df.drop(columns=['next_home_team', 'next_away_team'])


def bin_yard_lines(df, binned_column, prefix, bin_width=10):
    """Label each yard line with the fixed-width bin it falls in.

    Adds (or overwrites) the column ``'<prefix>_yard_line_bin'`` on ``df`` in
    place and returns ``df``. Bin edges start at 0 and step by ``bin_width``
    until they reach or pass 100. Bins are closed on the right, so with the
    default width 10 is labelled ``'0-10'`` and 10.5 is labelled ``'10-20'``.
    Values that fall in no bin -- including exactly 0 -- and missing values
    are left as NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``binned_column``.
    binned_column : str
        Name of the numeric yard-line column to bin.
    prefix : str
        Prefix of the output column name.
    bin_width : int, optional
        Width of each bin in yards (default 10).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the bin column added.
    """
    edges = np.arange(0, 100 + bin_width, bin_width)
    labels = ['%s-%s' % (low, high) for low, high in zip(edges[:-1], edges[1:])]
    # Let pd.cut attach the text labels itself. Cutting on an IntervalIndex
    # and then mapping a formatter over the result runs the formatter on NaN
    # in recent pandas, where it fails because NaN has no .left/.right.
    df['%s_yard_line_bin' % prefix] = pd.cut(
        df[binned_column], bins=edges, labels=labels
    )
    return df


def add_offensive_scores(df):
    """Create the ``points`` column: 7 for 'Touchdown' drives, 0 otherwise.

    The column is written onto ``df`` in place and ``df`` is returned.
    """
    scored_touchdown = df['result'].eq('Touchdown')
    df['points'] = scored_touchdown.astype('int64') * 7
    return df


def subtract_defensive_scores(df):
    """Charge the offense for points its drive handed to the defense.

    Works on ``df`` in place and returns it:

    * 'Interception' or 'Fumble' drives whose ``last_play_desc`` contains
      'TOUCHDOWN' are relabelled 'Interception, Touchdown' /
      'Fumble, Touchdown' and get ``points = -7``.
    * Drives whose result is 'Safety' or 'Fumble, Safety' get ``points = -2``.

    A missing ``last_play_desc`` is treated as "no touchdown".
    """
    # na=False keeps the mask strictly boolean when a description is missing,
    # rather than leaving NaN for the & operators below to interpret.
    ends_in_touchdown = df['last_play_desc'].str.contains('TOUCHDOWN', na=False)

    # Build every mask before relabelling so the result edits cannot affect them.
    interception_td = (df['result'] == 'Interception') & ends_in_touchdown
    fumble_td = (df['result'] == 'Fumble') & ends_in_touchdown
    safety = df['result'].isin(['Safety', 'Fumble, Safety'])

    df.loc[interception_td, 'result'] = 'Interception, Touchdown'
    df.loc[fumble_td, 'result'] = 'Fumble, Touchdown'
    df.loc[interception_td | fumble_td, 'points'] = -7
    df.loc[safety, 'points'] = -2
    return df
    

def preprocess_drives(drives):
    """Turn one season's raw drive records into an analysis-ready frame.

    ``drives`` is whatever ``pd.DataFrame`` accepts (the notebook passes the
    parsed JSON for a season). Steps, in order: record the original row
    position as ``drive_id``; drop games whose away team is one of the
    excluded codes; derive total yards and the ending yard line; attach the
    opponent's next drive; bin the start, end and next-start yard lines; and
    assign offensive and defensive points.
    """
    frame = pd.DataFrame(drives)
    frame['drive_id'] = frame.index

    # Presumably all-star / exhibition sides -- TODO confirm. Only the away
    # team column is checked.
    excluded_away_teams = ['APR', 'NPR', 'AFC', 'NFC', 'IRV', 'CRT']
    keep = ~frame['away_team'].isin(excluded_away_teams)
    frame = frame.loc[keep].copy()

    frame['total_yards'] = frame['penalty_yards'] + frame['yards_gained']
    frame['end_yard_line'] = frame['start_yard_line'] + frame['total_yards']
    frame = get_next_opponent_drive(frame)

    columns_to_bin = (
        ('start_yard_line', 'start'),
        ('end_yard_line', 'end'),
        ('next_start_yard_line', 'next_start'),
    )
    for column, prefix in columns_to_bin:
        frame = bin_yard_lines(frame, binned_column=column, prefix=prefix)

    frame = add_offensive_scores(frame)
    return subtract_defensive_scores(frame)

In [423]:
# Load and preprocess each season, then combine them once at the end.
# Row labels are not reset, so index values (and drive_id) repeat across
# seasons.
season_frames = []

for season in seasons:
    # The context manager closes the file as soon as it has been parsed.
    with open('./data/%i_drives.json' % season, 'r') as drives_file:
        drives = json.load(drives_file)
    sdf = preprocess_drives(drives)
    sdf['season'] = season
    season_frames.append(sdf)

# A single concat avoids re-copying the accumulated frame on every iteration.
df = pd.concat(season_frames) if season_frames else pd.DataFrame()

In [424]:
def add_field_goal_points(df):
    """Replace points on field-goal drives with expected field-goal points.

    A field-goal drive is one whose result is 'Field Goal', 'Missed FG',
    'Blocked FG' or 'Blocked FG, Downs'. For those rows ``points`` becomes
    3 times the share of made kicks among field-goal drives in the same
    ``end_yard_line_bin``, so made and missed kicks from one bin get the
    same value.

    ``df`` is modified in place (``points`` is overwritten and a temporary
    ``made_field_goal`` column is added); the returned frame is a copy
    without that temporary column.
    """
    attempt_results = ['Field Goal', 'Missed FG', 'Blocked FG', 'Blocked FG, Downs']
    field_goal_mask = df['result'].isin(attempt_results)

    df['made_field_goal'] = 0
    df.loc[df['result'] == 'Field Goal', 'made_field_goal'] = 1

    if field_goal_mask.any():
        # Make rates are fractional. Cast up front so writing them into the
        # integer points column does not depend on implicit upcasting.
        df['points'] = df['points'].astype(float)

    make_rate = (
        df.loc[field_goal_mask]
        .groupby('end_yard_line_bin')['made_field_goal']
        .transform('mean')
    )
    df.loc[field_goal_mask, 'points'] = make_rate * 3
    return df.drop('made_field_goal', axis=1)


def add_field_position_points(df):
    """Add or subtract points based on field position changes.

    1. Where does the average team get the ball based on your start_yard_line?
    2. How many expected points is that worth?
    3. Where does the average team get the ball based on your end_yard_line?
    4. How many expected points is that worth?
    5. What is the change in your opponent's expected points on their next drive?

    Adds ``start_opp_expected_yard_line_bin``,
    ``end_opp_expected_yard_line_bin``, ``expected_points_opp_from_start``,
    ``expected_points_opp_from_end``, ``field_position_points`` and
    ``drive_score`` (``points + field_position_points``). Every average is
    taken over all rows of ``df``. ``df`` is modified in place; the returned
    frame is a copy without the two intermediate ``*_opp_expected_start``
    columns.
    """
    # Steps 1 and 3: mean opponent starting yard line, grouped by the bin this
    # drive started in and by the bin it ended in.
    df['start_opp_expected_start'] = (
        df.groupby('start_yard_line_bin')['next_start_yard_line'].transform('mean')
    )
    df['end_opp_expected_start'] = (
        df.groupby('end_yard_line_bin')['next_start_yard_line'].transform('mean')
    )
    df = bin_yard_lines(
        df, binned_column='start_opp_expected_start', prefix='start_opp_expected'
    )
    df = bin_yard_lines(
        df, binned_column='end_opp_expected_start', prefix='end_opp_expected'
    )

    # Steps 2 and 4: mean points of drives starting in each bin, looked up for
    # the opponent's expected starting bin.
    nfl_agg = df.groupby('start_yard_line_bin')['points'].mean().to_dict()
    # Mapping a categorical bin column can return a categorical result, which
    # cannot be subtracted; cast to float so step 5 is always plain arithmetic.
    df['expected_points_opp_from_start'] = (
        df['start_opp_expected_yard_line_bin'].map(nfl_agg).astype(float)
    )
    df['expected_points_opp_from_end'] = (
        df['end_opp_expected_yard_line_bin'].map(nfl_agg).astype(float)
    )

    # Step 5: positive when the drive lowered the opponent's expectation.
    df['field_position_points'] = (
        df['expected_points_opp_from_start'] - df['expected_points_opp_from_end']
    )
    df['drive_score'] = df['points'] + df['field_position_points']
    return df.drop(['start_opp_expected_start', 'end_opp_expected_start'], axis=1)


def postprocess_drives(df):
    """Apply the scoring steps that depend on averages across ``df``.

    Field-goal points are assigned first, then field-position points and the
    final ``drive_score``. Both steps average over every row they are given,
    so to get decade averages call this on the frame holding all seasons.
    """
    with_field_goals = add_field_goal_points(df)
    return add_field_position_points(with_field_goals)

In [425]:
# Score every drive; the averages inside are taken over all loaded seasons.
df = postprocess_drives(df)

In [426]:
# Spot-check the last few processed drives.
df.tail(5)

Unnamed: 0,away_team,defensive_team,drive_time,end_quarter,end_time,first_play_desc,game_id,home_score_diff_last_quarter,home_team,last_play_desc,...,end_yard_line_bin,next_start_yard_line_bin,points,season,start_opp_expected_yard_line_bin,end_opp_expected_yard_line_bin,expected_points_opp_from_start,expected_points_opp_from_end,field_position_points,drive_score
5760,NE,LA,2:49,4,07:00,(9:49) T.Brady pass short right to R.Gronkowsk...,2019020300,0,LA,"S.Gostkowski extra point is GOOD, Center-J.Car...",...,90-100,20-30,7.0,2018,20-30,20-30,1.626964,1.626964,0.0,7.0
5761,NE,NE,2:43,4,04:17,S.Gostkowski kicks 65 yards from NE 35 to end ...,2019020300,0,LA,(4:24) (Shotgun) J.Goff pass deep right intend...,...,70-80,0-10,0.0,2018,20-30,20-30,1.626964,1.626964,0.0,0.0
5762,NE,LA,3:05,4,01:12,(4:17) S.Michel left guard to NE 5 for 1 yard ...,2019020300,0,LA,(1:16) S.Gostkowski 41 yard field goal is GOOD...,...,70-80,20-30,2.41806,2018,30-40,20-30,1.902903,1.626964,0.275939,2.693999
5763,NE,NE,1:07,4,00:05,S.Gostkowski kicks 65 yards from NE 35 to end ...,2019020300,0,LA,(:08) G.Zuerlein 48 yard field goal is No Good...,...,60-70,30-40,1.976094,2018,20-30,20-30,1.626964,1.626964,0.0,1.976094
5764,NE,LA,0:05,4,00:00,(:05) T.Brady kneels to NE 37 for -1 yards.,2019020300,0,LA,END GAME,...,30-40,,0.0,2018,20-30,20-30,1.626964,1.626964,0.0,0.0


In [427]:
# League-wide mean drive score for each starting field-position bin.
all_agg = df.groupby('start_yard_line_bin')[['drive_score']].mean()

all_agg.head()

Unnamed: 0_level_0,drive_score
start_yard_line_bin,Unnamed: 1_level_1
0-10,0.952955
10-20,1.433972
20-30,1.541456
30-40,1.888022
40-50,2.238817


In [428]:
# Every (team, season) pair, with seasons varying fastest. The comprehension
# keeps its loop variables local, so it does not rebind the notebook-level
# `team` / `season` names.
team_seasons = [(team_code, year) for team_code in teams for year in seasons]

In [429]:
# Offense and season examined in the worked example that follows.
team = 'KC'
season = 2018

In [430]:
# Drives run by the selected offense in the selected season.
team_df = df.loc[df['offensive_team'].eq(team) & df['season'].eq(season)].copy()

# Drive count and mean drive score per starting bin for that team.
team_agg = (
    team_df
    .groupby('start_yard_line_bin')
    .agg({'drive_id': 'count', 'drive_score': 'mean'})
    .rename(columns={'drive_id': 'n_team_drives'})
)

# Both frames carry a drive_score column; the suffixes tell them apart.
comp_agg = all_agg.join(team_agg, lsuffix='_NFL', rsuffix='_%s' % team)
# Weight each bin by its share of the team's drives.
comp_agg['weight'] = comp_agg['n_team_drives'].div(comp_agg['n_team_drives'].sum())
# Team-minus-league difference in each bin, scaled by that bin's weight.
comp_agg['performance_score'] = (
    comp_agg['drive_score_%s' % team]
    .sub(comp_agg['drive_score_NFL'])
    .mul(comp_agg['weight'])
)

comp_agg

Unnamed: 0_level_0,drive_score_NFL,n_team_drives,drive_score_KC,weight,performance_score
start_yard_line_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0-10,0.952955,17,2.872697,0.093923,0.180307
10-20,1.433972,31,2.247312,0.171271,0.139301
20-30,1.541456,77,3.057195,0.425414,0.644817
30-40,1.888022,22,3.868041,0.121547,0.240665
40-50,2.238817,12,2.33152,0.066298,0.006146
50-60,2.447644,6,1.625043,0.033149,-0.027269
60-70,3.281765,7,3.345437,0.038674,0.002462
70-80,3.831773,5,6.083612,0.027624,0.062206
80-90,4.268313,1,7.0,0.005525,0.015092
90-100,5.306655,3,4.666667,0.016575,-0.010608


In [431]:
# Overall score: sum of the weighted per-bin differences in comp_agg.
# NOTE(review): the variable name and the message hard-code the Chiefs and
# 2018, while comp_agg is built from the `team` / `season` variables.
kco18_score = comp_agg['performance_score'].sum()

print('Chiefs offensive score in 2018 = %f' % kco18_score)

Chiefs offensive score in 2018 = 1.253121


### To do:
* Adjust for strength of schedule
* Training a model will need rolling calculations (averages built only from earlier games) to avoid leakage; the leakage from full-period averages is fine for this analysis

In [432]:
# List the columns available on the processed drive table.
df.columns

Index(['away_team', 'defensive_team', 'drive_time', 'end_quarter', 'end_time',
       'first_play_desc', 'game_id', 'home_score_diff_last_quarter',
       'home_team', 'last_play_desc', 'n_plays', 'offensive_team',
       'penalty_yards', 'result', 'start_quarter', 'start_time',
       'start_yard_line', 'yards_gained', 'drive_id', 'total_yards',
       'end_yard_line', 'next_start_yard_line', 'next_end_yard_line',
       'next_offensive_team', 'start_yard_line_bin', 'end_yard_line_bin',
       'next_start_yard_line_bin', 'points', 'season',
       'start_opp_expected_yard_line_bin', 'end_opp_expected_yard_line_bin',
       'expected_points_opp_from_start', 'expected_points_opp_from_end',
       'field_position_points', 'drive_score'],
      dtype='object')

In [433]:
# Re-express drive_score relative to the league mean for the same starting bin.
# drive_score is overwritten in place, so any cell that reads it gives
# different numbers depending on whether this cell has already run.
df['nfl_avg_score'] = df.groupby('start_yard_line_bin')['drive_score'].transform('mean')
df['drive_score'] = df['drive_score'].sub(df['nfl_avg_score'])

In [434]:
# Mean drive score per offense per game.
# NOTE(review): gdf is not referenced by any later cell visible here.
gdf = df.groupby(['game_id', 'offensive_team', 'season'], as_index=False)['drive_score'].mean()

In [435]:
# Mean drive score per offense per game (opponent kept alongside), highest first.
godf = (
    df
    .groupby(['game_id', 'offensive_team', 'defensive_team', 'season'], as_index=False)
    ['drive_score']
    .mean()
    .sort_values('drive_score', ascending=False)
)
godf.head(10)

Unnamed: 0,game_id,offensive_team,defensive_team,season,drive_score
2381,2013111011,NO,DAL,2013,3.751371
1241,2011102311,NO,IND,2011,3.594897
2724,2014092800,BAL,CAR,2014,3.464063
2727,2014092801,GB,CHI,2014,3.353756
3239,2015092706,NE,JAC,2015,3.175624
4198,2017012200,ATL,GB,2016,3.168133
2504,2013120900,CHI,DAL,2013,3.090391
181,2009102500,CIN,CHI,2009,2.977407
4925,2018102105,KC,CIN,2018,2.945671
1541,2012010111,NO,CAR,2011,2.935815


In [436]:
# Season score per offense: the mean of its per-game means, highest first.
sodf = (
    godf
    .groupby(['season', 'offensive_team'], as_index=False)
    ['drive_score']
    .mean()
    .sort_values('drive_score', ascending=False)
)
sodf.head()

Unnamed: 0,season,offensive_team,drive_score
83,2011,NO,1.382731
306,2018,KC,1.317282
227,2016,ATL,1.248747
75,2011,GB,1.059814
114,2012,NE,1.022916


In [437]:
# First five 2018 rows of the offensive season table.
sodf.loc[sodf['season'] == 2018].head()

Unnamed: 0,season,offensive_team,drive_score
306,2018,KC,1.317282
312,2018,NO,0.911086
307,2018,LA,0.769429
317,2018,PIT,0.61923
292,2018,ATL,0.578407


In [438]:
# Mean drive score allowed per defense per game, lowest first.
gddf = (
    df
    .groupby(['game_id', 'defensive_team', 'offensive_team', 'season'], as_index=False)
    ['drive_score']
    .mean()
    .sort_values('drive_score')
)
gddf.head()

Unnamed: 0,game_id,defensive_team,offensive_team,season,drive_score
1162,2011100214,BAL,NYJ,2011,-3.018868
1234,2011102308,KC,OAK,2011,-2.761274
4379,2017102201,CHI,CAR,2017,-2.740749
3107,2014122800,CAR,ATL,2014,-2.605675
118,2009100411,SF,STL,2009,-2.583042


In [439]:
# Season score per defense: the mean of its per-game means, lowest first.
sddf = (
    gddf
    .groupby(['season', 'defensive_team'], as_index=False)
    ['drive_score']
    .mean()
    .sort_values('drive_score')
)
sddf.head()

Unnamed: 0,season,defensive_team,drive_score
273,2017,JAX,-0.661317
56,2010,PIT,-0.64602
156,2013,SEA,-0.64467
21,2009,NYJ,-0.634608
101,2012,CHI,-0.633679


In [440]:
# First five 2018 rows of the defensive season table.
sddf.loc[sddf['season'] == 2018].head()

Unnamed: 0,season,defensive_team,drive_score
296,2018,CHI,-0.514197
293,2018,BAL,-0.331437
310,2018,MIN,-0.211008
321,2018,TEN,-0.189003
303,2018,HOU,-0.121742


In [441]:
# season -> defensive team -> that defense's season-average drive score.
DEFENSE_DICT = defaultdict(dict)

# Columns are selected by name so the unpacking does not depend on column
# order, and the loop variables are named so they do not overwrite the
# notebook-level `season` / `team` selectors.
for def_season, def_team, def_score in sddf[['season', 'defensive_team', 'drive_score']].values:
    DEFENSE_DICT[def_season][def_team] = def_score

In [442]:
# season -> offensive team -> that offense's season-average drive score.
OFFENSE_DICT = defaultdict(dict)

# Columns are selected by name so the unpacking does not depend on column
# order, and the loop variables are named so they do not overwrite the
# notebook-level `season` / `team` selectors.
for off_season, off_team, off_score in sodf[['season', 'offensive_team', 'drive_score']].values:
    OFFENSE_DICT[off_season][off_team] = off_score

In [443]:
def get_defense_average(row):
    """Return the season-average score of the defense named in ``row``.

    Reads ``row['season']`` and ``row['defensive_team']`` and looks them up
    in ``DEFENSE_DICT``. Raises ``KeyError`` if the team has no entry for
    that season.
    """
    season_lookup = DEFENSE_DICT[row['season']]
    return season_lookup[row['defensive_team']]


def get_offense_average(row):
    """Return the season-average score of the offense named in ``row``.

    Reads ``row['season']`` and ``row['offensive_team']`` and looks them up
    in ``OFFENSE_DICT``. Raises ``KeyError`` if the team has no entry for
    that season.
    """
    season_lookup = OFFENSE_DICT[row['season']]
    return season_lookup[row['offensive_team']]


In [444]:
# Attach the opponent's season average to every game row: the opposing
# defense's for offensive games, the opposing offense's for defensive games.
godf = godf.assign(defensive_adjustment=godf.apply(get_defense_average, axis=1))
gddf = gddf.assign(offensive_adjustment=gddf.apply(get_offense_average, axis=1))

In [445]:
# Game score minus the opponent's season average.
godf['adjusted_score'] = godf['drive_score'].sub(godf['defensive_adjustment'])
gddf['adjusted_score'] = gddf['drive_score'].sub(gddf['offensive_adjustment'])

In [446]:
# Opponent-adjusted season score per offense, highest first.
asodf = (
    godf
    .groupby(['season', 'offensive_team'], as_index=False)
    ['adjusted_score']
    .mean()
    .sort_values('adjusted_score', ascending=False)
)

# Opponent-adjusted season score per defense, lowest first.
asddf = (
    gddf
    .groupby(['season', 'defensive_team'], as_index=False)
    ['adjusted_score']
    .mean()
    .sort_values('adjusted_score')
)

In [447]:
# Unadjusted offensive season scores, shown for comparison.
sodf.head()

Unnamed: 0,season,offensive_team,drive_score
83,2011,NO,1.382731
306,2018,KC,1.317282
227,2016,ATL,1.248747
75,2011,GB,1.059814
114,2012,NE,1.022916


In [448]:
# Opponent-adjusted offensive season scores.
asodf.head()

Unnamed: 0,season,offensive_team,adjusted_score
83,2011,NO,1.238181
306,2018,KC,1.118112
227,2016,ATL,1.097977
114,2012,NE,1.049674
50,2010,NE,1.020202


In [449]:
# Unadjusted defensive season scores, shown for comparison.
sddf.head()

Unnamed: 0,season,defensive_team,drive_score
273,2017,JAX,-0.661317
56,2010,PIT,-0.64602
156,2013,SEA,-0.64467
21,2009,NYJ,-0.634608
101,2012,CHI,-0.633679


In [450]:
# Opponent-adjusted defensive season scores.
asddf.head()

Unnamed: 0,season,defensive_team,adjusted_score
156,2013,SEA,-0.751283
293,2018,BAL,-0.6747
101,2012,CHI,-0.667197
235,2016,DEN,-0.661721
203,2015,DEN,-0.658616


In [451]:
# Opponent-adjusted offensive scores for the 2018 season.
asodf.loc[asodf['season'] == 2018]

Unnamed: 0,season,offensive_team,adjusted_score
306,2018,KC,1.118112
312,2018,NO,0.578786
307,2018,LA,0.555546
311,2018,NE,0.439922
317,2018,PIT,0.354853
308,2018,LAC,0.343494
304,2018,IND,0.294917
292,2018,ATL,0.290321
318,2018,SEA,0.21062
295,2018,CAR,0.177462


In [452]:
# Opponent-adjusted defensive scores for the 2018 season.
asddf.loc[asddf['season'] == 2018]

Unnamed: 0,season,defensive_team,adjusted_score
293,2018,BAL,-0.6747
296,2018,CHI,-0.575277
310,2018,MIN,-0.346401
317,2018,PIT,-0.277413
305,2018,JAX,-0.25028
298,2018,CLE,-0.221926
321,2018,TEN,-0.21241
300,2018,DEN,-0.200664
311,2018,NE,-0.148291
303,2018,HOU,-0.107941
