***Interesting note: Chiefs v Jags last year was one of Chiefs best defensive, and Jacksonville's worst offensive, games. With Jax O improved, the game could be much closer than last year. Good early season test to see if the defense can hold up against what should be a mediocre offense. If the D doesn't perform this year, the O may need to play Jax better than last year to keep the game comfortable.***

***Were the Chiefs more consistently good on O last year than other good Os? Were they more consistent than other historically good teams? Is consistency one year a predictor of success the next year?***

***How did the top 3 offenses do in the following season?***

***How did second year starters do compared to their first year?***

In [1]:
import numpy as np
import pandas as pd

from score_teams import score_drives
from score_teams import aggregate_game_drives, aggregate_season_drives

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
start_season = 2009
end_season = 2018

In [None]:
ddf = score_drives(start_season, end_season, exclude_playoffs=True, exclude_blowouts=14)

ddf.head()

In [None]:
pdf = ddf.groupby(['offensive_team', 'season'], as_index=False).agg(
    {'drive_id': 'count', 'offensive_points': 'sum', 'drive_time': 'sum', 
     'game_id': 'nunique', 'is_touchdown': 'sum', 'expected_points': 'sum',
     'drive_score': 'mean', 'start_yard_line': 'mean'}
)

rename_dict = {
    'offensive_team': 'team',
    'drive_id': 'drive_count',
    'game_id': 'game_count',
    'is_touchdown': 'offensive_tds',
}
pdf = pdf.rename(rename_dict, axis=1)

pdf['points_per_game'] = pdf['offensive_points'] / pdf['game_count']
pdf['epoints_per_game'] = pdf['expected_points'] / pdf['game_count']
pdf['points_per_drive'] = pdf['offensive_points'] / pdf['drive_count']
pdf['epoints_per_drive'] = pdf['expected_points'] / pdf['drive_count']
pdf['td_rate'] = pdf['offensive_tds'] / pdf['drive_count']
pdf = pdf.sort_values('points_per_game', ascending=False)

pdf = pdf.reset_index(drop=True)
pdf.head(5)

In [None]:
pdf = pdf.sort_values('points_per_drive', ascending=False)

pdf.head(5)

In [None]:
pdf = pdf.sort_values('drive_score', ascending=False)

pdf = pdf.reset_index(drop=True)
pdf.head(5)

In [None]:
pdf.describe([.1, .25, .5, .75, .9])

In [None]:
pdf.corr()

In [None]:
td_rate = ddf.groupby(['season', 'offensive_team'], as_index=False)['is_touchdown'].mean()
td_rate = td_rate.sort_values('is_touchdown', ascending=False)

td_rate.head(20)

In [None]:
td_rate['is_touchdown'].describe([.05, .25, .5, .75, .95])

In [None]:
score_rate = ddf.groupby(['season', 'offensive_team'], as_index=False)['is_score'].mean()
score_rate = score_rate.sort_values('is_score', ascending=False)

score_rate.head(20)

In [None]:
score_rate['is_score'].describe([.05, .25, .5, .75, .95])

In [None]:
rddf = ddf.loc[ddf['start_quarter'] < 5]

poss_time = rddf.groupby(['season', 'offensive_team', 'game_id'], as_index=False).agg({'drive_time': 'sum'})
poss_time = poss_time.groupby(['season', 'offensive_team'], as_index=False)['drive_time'].mean()
poss_time = poss_time.sort_values('drive_time')

poss_time.head(10)

In [None]:
poss_time.loc[(poss_time['season'] == 2018)]

In [None]:
poss_time['drive_time'].describe([.05, .1, .2, .25, .5, .75, .95])

In [None]:
close_game_mask = np.abs(ddf['home_final_score'] - ddf['away_final_score']) <= 7
cddf = ddf.loc[close_game_mask].copy()

cposs_time = cddf.groupby(['season', 'offensive_team', 'game_id'], as_index=False).agg({'drive_time': 'sum'})
cposs_time = cposs_time.groupby(['season', 'offensive_team'], as_index=False)['drive_time'].mean()
cposs_time = cposs_time.sort_values('drive_time')

cposs_time.head(10)

In [None]:
cposs_time.loc[(cposs_time['season'] == 2018) & (cposs_time['offensive_team'] == 'KC')]

In [None]:
cposs_time['drive_time'].describe([.05, .25, .5, .75, .95])

In [None]:
ctdagg = cddf.groupby(['season', 'offensive_team'], as_index=False)['is_touchdown'].mean()
ctdagg = ctdagg.sort_values('is_touchdown', ascending=False)

ctdagg.head()

In [None]:
cepagg = cddf.groupby(['season', 'offensive_team'], as_index=False)['drive_score'].mean()
cepagg = cepagg.sort_values('drive_score', ascending=False)

cepagg.head()

In [None]:
tdf = ddf.loc[ddf['is_touchdown'] == 1].copy()
aggtdf = tdf.groupby(['season', 'offensive_team'], as_index=False)['drive_time'].mean()

aggtdf = aggtdf.sort_values('drive_time', ascending=False)

aggtdf.head()

In [None]:
tdf.loc[(tdf['season'] == 2018) & (tdf['offensive_team'] == 'KC'), 'drive_time'].describe([.05, .25, .5, .75, .95])

In [None]:
kcdd = ddf.loc[(ddf['defensive_team'] == 'KC') & (ddf['season'] == 2018)]
kcdd_agg = kcdd.groupby(['defensive_team', 'game_id'], as_index=False).agg(
    {'drive_time': 'mean', 'drive_id': 'count', 'defensive_win': 'mean',
     'drive_score': 'mean'}
)

In [None]:
kcdd_agg

In [10]:
gdf = aggregate_game_drives(2009, 2018, 'offensive_team', exclude_playoffs=True, exclude_blowouts=14)
gdf = gdf.sort_values('game_id')

gdf.head(10)

Unnamed: 0,game_id,offensive_team,defensive_team,home_team,away_team,season,drive_score,home_final_score,away_final_score,offensive_win,defensive_win,tie,is_touchdown,possession_time,drive_count,avg_drive_time,adj_offensive_score,adj_defensive_score
0,2009091000,PIT,TEN,PIT,TEN,2009,-0.826254,13,10,1,0,0,0.076923,36.183333,13,2.783333,-1.238945,-0.94849
1,2009091000,TEN,PIT,PIT,TEN,2009,-0.300776,13,10,0,1,0,0.083333,28.35,12,2.3625,0.088645,-0.410646
3,2009091300,MIA,ATL,ATL,MIA,2009,-1.483729,19,7,0,1,0,0.0,24.066667,9,2.674074,-1.566343,-1.426075
2,2009091300,ATL,MIA,ATL,MIA,2009,-0.360881,19,7,1,0,0,0.2,26.966667,10,2.696667,-0.515937,-0.646442
4,2009091301,BAL,KC,BAL,KC,2009,1.795222,38,24,1,0,0,0.454545,39.816667,11,3.619697,1.602561,1.351691
5,2009091301,KC,BAL,BAL,KC,2009,-0.374493,38,24,0,1,0,0.2,19.666667,10,1.966667,-0.029261,0.24152
6,2009091302,CAR,PHI,CAR,PHI,2009,-1.553423,10,38,0,1,0,0.090909,22.0,11,2.0,-1.229064,-1.307677
7,2009091302,PHI,CAR,CAR,PHI,2009,0.228135,10,38,1,0,0,0.333333,23.133333,9,2.57037,0.559528,-0.192517
9,2009091303,DEN,CIN,CIN,DEN,2009,-0.502151,7,12,1,0,0,0.090909,26.55,11,2.413636,-0.123606,-0.318718
8,2009091303,CIN,DEN,CIN,DEN,2009,-1.001702,7,12,0,1,0,0.090909,33.45,11,3.040909,-0.75307,-0.906264


In [11]:
kcddf = gdf.loc[(gdf['defensive_team'] == 'KC') & (gdf['season'] == 2018)]

kcddf

Unnamed: 0,game_id,offensive_team,defensive_team,home_team,away_team,season,drive_score,home_final_score,away_final_score,offensive_win,defensive_win,tie,is_touchdown,possession_time,drive_count,avg_drive_time,adj_offensive_score,adj_defensive_score
4627,2018090908,LAC,KC,LAC,KC,2018,0.816833,28,38,0,1,0,0.2,29.75,10,2.975,0.254562,0.202334
4653,2018091605,PIT,KC,PIT,KC,2018,2.24037,37,42,0,1,0,0.5,30.716667,10,3.071667,1.678099,1.514541
4685,2018092305,SF,KC,KC,SF,2018,1.384325,38,27,0,1,0,0.375,21.483333,8,2.685417,0.822054,1.379673
4732,2018100100,DEN,KC,DEN,KC,2018,0.609214,23,27,0,1,0,0.2,24.966667,10,2.496667,0.046943,0.767565
4746,2018100705,JAX,KC,KC,JAX,2018,-1.588724,30,14,0,1,0,0.125,17.4,8,2.175,-2.150995,-1.015407
4791,2018101412,NE,KC,NE,KC,2018,1.444874,43,40,1,0,0,0.333333,36.15,12,3.0125,0.882603,0.765952
4806,2018102105,CIN,KC,KC,CIN,2018,-1.33619,45,10,0,1,0,0.111111,21.033333,9,2.337037,-1.898461,-1.390243
4834,2018102805,DEN,KC,KC,DEN,2018,0.572449,30,23,0,1,0,0.25,32.683333,12,2.723611,0.010178,0.7308
4858,2018110403,CLE,KC,CLE,KC,2018,2.293291,21,37,0,1,0,0.5,28.033333,6,4.672222,1.73102,2.201609
4888,2018111105,ARI,KC,KC,ARI,2018,-0.297063,26,14,0,1,0,0.181818,31.783333,11,2.889394,-0.859334,0.598369


In [None]:
kcodf = gdf.loc[(gdf['offensive_team'] == 'KC') & (gdf['season'] == 2018)]
kcodf

In [None]:
sodf = gdf.groupby(['offensive_team', 'season'], as_index=False)['adj_offensive_score'].mean()
sodf = sodf.sort_values('adj_offensive_score', ascending=False)

sodf.loc[sodf['season'] == 2018].head()

In [None]:
sddf = gdf.groupby(['defensive_team', 'season'], as_index=False)['adj_defensive_score'].mean()
sddf = sddf.sort_values('adj_defensive_score', ascending=False)

sddf.loc[sddf['season'] == 2018].head()

In [None]:
# Adjusted performance splits.
print(kcddf.groupby('defensive_win')['adj_defensive_score'].mean())
print(kcodf.groupby('offensive_win')['adj_offensive_score'].mean())

In [None]:
# Drive count splits.
print(kcddf.groupby('defensive_win')['drive_count'].mean())
print(kcodf.groupby('offensive_win')['drive_count'].mean())

In [None]:
# Drive time splits.
print(kcddf.groupby('defensive_win')['avg_drive_time'].mean())
print(kcodf.groupby('offensive_win')['avg_drive_time'].mean())

In [None]:
print(kcddf.groupby('defensive_win')['is_touchdown'].mean())
print(kcodf.groupby('offensive_win')['is_touchdown'].mean())

In [None]:
kcodf

In [None]:
kcddf

In [None]:
gdf.loc[gdf['season'] == 2018].sort_values('adj_offensive_score', ascending=False)

In [None]:
gdf['avg_drive_time'].describe([.1, .25, .5, .75, .9])

In [None]:
gdf.loc[gdf['season'] == 2018].groupby('offensive_team')['adj_offensive_score'].std()

In [None]:
ddf['drive_time'].describe([.1, .25, .5, .75, .9])

In [None]:
kcddf['avg_drive_time'].mean()

In [None]:
kcodf['avg_drive_time'].mean()

In [6]:
agg_columns = {
    'offensive_win': 'win', 
    'defensive_win': 'loss', 
    'tie': 'tie'
}
agg_list = [column for column in agg_columns.keys()]

wldf = gdf.groupby(['season', 'offensive_team'], as_index=False)[agg_list].sum()
agg_columns['offensive_team'] = 'team'
wldf = wldf.rename(agg_columns, axis=1)

print(wldf['win'].describe([.5, .75, .90, .95]))

wldf.head()

count    320.000000
mean       7.978125
std        3.078313
min        0.000000
50%        8.000000
75%       10.000000
90%       12.000000
95%       13.000000
max       15.000000
Name: win, dtype: float64


Unnamed: 0,season,team,win,loss,tie
0,2009,ARI,10,6,0
1,2009,ATL,9,7,0
2,2009,BAL,9,7,0
3,2009,BUF,6,10,0
4,2009,CAR,8,8,0


In [7]:
sodf = aggregate_season_drives(2009, 2018, 'offensive_team', exclude_playoffs=True)

rename_dict = {
    'offensive_team': 'team',
    'drive_score': 'offensive_score'
}

sodf = sodf.rename(rename_dict, axis=1)

sodf.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,adj_offensive_score,adj_offensive_score,offensive_score,offensive_score
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std
season,offensive_team,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2009,ARI,-0.199518,0.675489,-0.177095,0.814903
2009,ATL,0.445022,0.913319,0.2915,0.908543
2009,BAL,0.27218,0.767176,0.220197,0.898049
2009,BUF,-0.596438,0.860568,-0.653791,0.776631
2009,CAR,-0.179352,0.816724,-0.275106,1.006341
2009,CHI,-0.230285,0.584803,-0.328305,0.708216
2009,CIN,-0.152866,0.948019,-0.2305,1.034822
2009,CLE,-0.651279,0.831284,-0.747611,1.013044
2009,DAL,0.312319,0.590812,0.293322,0.70082
2009,DEN,-0.263108,0.653284,-0.28585,0.72496


In [8]:
sodf.loc[sodf['season'] == 2018]

KeyError: 'season'

In [9]:
sddf = aggregate_season_drives(2009, 2018, 'defensive_team', exclude_playoffs=True)

rename_dict = {
    'defensive_team': 'team',
    'drive_score': 'defensive_score'
}

sddf = sddf.rename(rename_dict, axis=1)

sddf.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,adj_defensive_score,adj_defensive_score,defensive_score,defensive_score
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std
season,defensive_team,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2009,ARI,-0.053338,0.495643,-0.181514,0.674668
2009,ATL,0.154772,0.7706,0.097989,1.059865
2009,BAL,-0.320488,0.569834,-0.379019,0.889826
2009,BUF,-0.256715,0.644227,-0.229831,0.737824
2009,CAR,-0.386619,0.682683,-0.315846,0.689732
2009,CHI,0.213443,1.110673,0.161798,1.176536
2009,CIN,-0.287315,0.627984,-0.383476,0.679914
2009,CLE,0.304263,0.989501,0.243571,1.097939
2009,DAL,-0.31035,0.690926,-0.329774,0.685971
2009,DEN,-0.245345,0.846629,-0.238395,0.893941


In [None]:
sddf.tail(10)

In [None]:
sddf.loc[sddf['season'] == 2018]

In [None]:
def merge_features(sodf, sddf, wldf):
    df = sodf.merge(sddf, on=['season', 'team'])
    df = df.merge(wldf, on=['season', 'team'])
    df = df.sort_values('season')
    return df

In [None]:
df = merge_features(sodf, sddf, wldf)
df['adj_team_score'] = df['adj_offensive_score'] - df['adj_defensive_score']
df['team_score'] = df['offensive_score'] - df['defensive_score']

df.head()

In [None]:
df.corr()

In [None]:
df_lag = df.copy()
df_lag['season'] = df_lag['season'] + 1
df_lag.columns = ['%s_lag' % column for column in df_lag.columns]

df_lag.head()

In [None]:
fdf = df.merge(
    df_lag, left_on=['season', 'team'], right_on=['season_lag', 'team_lag'], how='left'
)
fdf = fdf.dropna()

fdf.tail()

In [None]:
fdf.corr()

In [None]:
features = [
    'adj_offensive_score_lag', 'offensive_score_lag', 
    'adj_defensive_score_lag', 'defensive_score_lag', 
    'adj_team_score_lag', 'team_score_lag', 'win_lag'
]

In [None]:
from sklearn.linear_model import LinearRegression, Lasso 
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
test_seasons = range(2015, 2019)
model = LinearRegression()

for test_season in test_seasons:
    train_df = fdf.loc[fdf['season'] < test_season].copy()
    test_df = fdf.loc[fdf['season'] == test_season].copy()
    y_test = test_df['win'].values
    y_scores = test_df['win_lag'].values
    print(test_season, mean_absolute_error(y_test, y_scores))

In [None]:
test_seasons = range(2015, 2019)
model = LinearRegression()

for test_season in test_seasons:
    train_df = fdf.loc[fdf['season'] < test_season].copy()
    test_df = fdf.loc[fdf['season'] == test_season].copy()
    X_train, y_train = train_df[features].values, train_df['win'].values
    X_test, y_test = test_df[features].values, test_df['win'].values
    model.fit(X_train, y_train)
    y_scores = model.predict(X_test)
    print(test_season, mean_absolute_error(y_test, y_scores))

In [None]:
test_seasons = range(2015, 2019)
model = KNeighborsRegressor(9)

for test_season in test_seasons:
    train_df = fdf.loc[fdf['season'] < test_season].copy()
    test_df = fdf.loc[fdf['season'] == test_season].copy()
    X_train, y_train = train_df[features].values, train_df['win'].values
    X_test, y_test = test_df[features].values, test_df['win'].values
    model.fit(X_train, y_train)
    y_scores = model.predict(X_test)
    print(test_season, mean_absolute_error(y_test, y_scores))

In [None]:
test_seasons = range(2015, 2019)
model = RandomForestRegressor(500)

for test_season in test_seasons:
    train_df = fdf.loc[fdf['season'] < test_season].copy()
    test_df = fdf.loc[fdf['season'] == test_season].copy()
    X_train, y_train = train_df[features].values, train_df['win'].values
    X_test, y_test = test_df[features].values, test_df['win'].values
    model.fit(X_train, y_train)
    y_scores = model.predict(X_test)
    print(test_season, mean_absolute_error(y_test, y_scores))

In [None]:
test_seasons = range(2015, 2019)
model = XGBRegressor(learning_rate=.01, n_estimators=500, max_depth=3,
                     objective='reg:squarederror')

for test_season in test_seasons:
    train_df = fdf.loc[fdf['season'] < test_season].copy()
    test_df = fdf.loc[fdf['season'] == test_season].copy()
    X_train, y_train = train_df[features].values, train_df['win'].values
    X_test, y_test = test_df[features].values, test_df['win'].values
    model.fit(X_train, y_train)
    y_scores = model.predict(X_test)
    print(test_season, mean_absolute_error(y_test, y_scores))

In [None]:
df.describe()

In [None]:
ddf.groupby(['season', 'offensive_team'])['game_id'].nunique().max()

### 2018 Chiefs
* Chiefs offense was second most productive offense in last 10 years -- first in 2018 by a long shot
* Chiefs defense was eighth least productive defense in last 10 years -- last in 2018
* Chiefs offensive score in losses was still top 10 in the last 10 years.
* How bad was the defense in losses comparatively?

In [None]:
kco = gdf.loc[(gdf['offensive_team'] == 'KC') & 
              (gdf['season'] == 2018)]
kco = kco.sort_values('game_id')

kco.groupby('offensive_win')['adj_offensive_score'].mean()

In [None]:
kcd = gdf.loc[(gdf['defensive_team'] == 'KC') & (gdf['season'] == 2018)]
kcd = kcd.sort_values('game_id')

kcd.groupby('defensive_win')['adj_defensive_score'].mean()

In [None]:
kco