In [1]:
import pandas as pd 

#Enter desired years of data
YEARS = range(2006, 2020)

pbp = pd.DataFrame()

for i in YEARS:  
    #low_memory=False eliminates a warning
    i_data = pd.read_csv('https://github.com/guga31bb/nflfastR-data/blob/master/data/' \
                         'play_by_play_' + str(i) + '.csv.gz?raw=True',
                         compression='gzip', low_memory=False)

    #sort=True eliminates a warning and alphabetically sorts columns
    pbp = pbp.append(i_data, sort=True)

#Give each row a unique index
pbp.reset_index(drop=True, inplace=True)

In [5]:
# first set of model mutations - https://github.com/guga31bb/nflfastR-data/blob/106adcf0033fa741fe8be11a2d52232aecc26a5e/models/train_xyac_model.R#L27

# new columns https://github.com/guga31bb/nflfastR-data/blob/106adcf0033fa741fe8be11a2d52232aecc26a5e/models/train_xyac_model.R#L30
import regex # regex that supports variable-length positive lookbehind https://bitbucket.org/mrabarnett/mrab-regex/src/hg/
import math

def create_model_mutations(dfa):
    df = dfa.copy()
    df["distance_to_goal"] = df.yardline_100 - df.air_yards
    df["pass_middle"] = df.pass_location.apply(lambda x: 1 if (x == 'middle') else 0)
    df["air_is_zero"] = df.air_yards.apply(lambda x: 1 if (x == 0) else 0)
    df["distance_to_sticks"] = df.air_yards - df.ydstogo
    df["yards_after_catch"] = df.yards_after_catch.clip(-5, 70)
    df["receiver_player_name"] = df.desc.apply(lambda x: regex.search("(?<=((to)|(for))\\s[0-9]{0,2}\\-{0,1})[A-Z][A-z]*\\.\\s?[A-Z][A-z]+(\\s(I{2,3})|(IV))?", x))
    # note: receiver_player_name contains regex.Match objects, not strings
    df["down1"] = df.down.apply(lambda x: 1 if (x == 1) else 0)
    df["down2"] = df.down.apply(lambda x: 1 if (x == 2) else 0)
    df["down3"] = df.down.apply(lambda x: 1 if (x == 3) else 0)
    df["down4"] = df.down.apply(lambda x: 1 if (x == 4) else 0)

    # from docs: 1 - 2006-2013, 2 - 2014-2017, 3 - 2018 and beyond
    df["era1"] = df.season.apply(lambda x: 1 if (x < 2006) else 0)
    df["era2"] = df.season.apply(lambda x: 1 if (x >= 2006 and x <= 2013) else 0)
    df["era3"] = df.season.apply(lambda x: 1 if (x >= 2014 and x <= 2017) else 0)
    df["era4"] = df.season.apply(lambda x: 1 if (x >= 2018) else 0)

    # roof: "outdoors", "retractable", "dome"; values: ['dome', 'outdoors', 'closed', 'open']
    df["dome"] = df.roof.apply(lambda x: 1 if (x == 'dome') else 0)
    df["outdoors"] = df.roof.apply(lambda x: 1 if (x == 'outdoors') else 0)
    df["retractable"] = df.roof.apply(lambda x: 1 if ((x == 'closed') or (x == 'open') or (pd.isnull(x)) or (x == None)) else 0)

    # "home" -- assuming that offense is home
    df["home"] = df.apply(lambda x: 1 if (x.posteam == x.home_team) else 0, axis=1)

    # checking pass validity: https://github.com/mrcaseb/nflfastR/blob/master/R/helper_add_cp_cpoe.R#L64
    def check_pass_validity(row):
        return ((row.complete_pass == 1) | (row.incomplete_pass == 1) | (row.interception == 1)) & (~math.isnan(row.air_yards)) & (row.air_yards >= -15) & (row.air_yards < 70) & (row.receiver_player_name != None) & (row.pass_location != None)

    df["valid_pass"] = df.apply(lambda x: check_pass_validity(x), axis=1)

    return df

model_data = create_model_mutations(pbp)
model_data.head()

Unnamed: 0,aborted_play,air_epa,air_wpa,air_yards,assist_tackle,assist_tackle_1_player_id,assist_tackle_1_player_name,assist_tackle_1_team,assist_tackle_2_player_id,assist_tackle_2_player_name,...,down4,era1,era2,era3,era4,dome,outdoors,retractable,home,valid_pass
0,0,,,,1.0,32013030-2d30-3031-3939-3231db5ac90c,J.Baker,CAR,32013030-2d30-3032-3432-3733f1f1361e,R.Marshall,...,0,0,1,0,0,0,1,0,0,0
1,0,,,,1.0,32013030-2d30-3032-3038-3637fb7fb875,M.Kemoeatu,CAR,00-0020380,K.Jenkins,...,0,0,1,0,0,0,1,0,0,0
2,0,,,,0.0,,,,,,...,0,0,1,0,0,0,1,0,0,0
3,0,1.773846,0.05181,9.0,0.0,,,,,,...,0,0,1,0,0,0,1,0,0,1
4,0,,,,0.0,,,,,,...,0,0,1,0,0,0,1,0,0,0


In [6]:
filtered_data = model_data[(~model_data.air_yards.isna()) & (model_data.air_yards >= -15) & (model_data.air_yards < 70) & (~model_data.receiver_player_name.isna()) & (~model_data.pass_location.isna())]
filtered_data

Unnamed: 0,aborted_play,air_epa,air_wpa,air_yards,assist_tackle,assist_tackle_1_player_id,assist_tackle_1_player_name,assist_tackle_1_team,assist_tackle_2_player_id,assist_tackle_2_player_name,...,down4,era1,era2,era3,era4,dome,outdoors,retractable,home,valid_pass
3,0,1.773846,0.051810,9.0,0.0,,,,,,...,0,0,1,0,0,0,1,0,0,1
5,0,0.184565,-0.005514,5.0,0.0,,,,,,...,0,0,1,0,0,0,1,0,0,1
7,0,0.285128,0.000000,6.0,0.0,,,,,,...,0,0,1,0,0,0,1,0,0,1
14,0,0.383261,0.007829,6.0,0.0,,,,,,...,0,0,1,0,0,0,1,0,1,1
15,0,0.645990,0.013199,9.0,0.0,,,,,,...,0,0,1,0,0,0,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
669899,0,0.599199,0.017857,9.0,0.0,,,,,,...,0,0,0,0,1,0,1,0,0,1
669900,0,0.322293,0.033869,7.0,0.0,,,,,,...,0,0,0,0,1,0,1,0,0,1
669901,0,3.797892,0.316607,44.0,0.0,,,,,,...,0,0,0,0,1,0,1,0,0,1
669908,0,-0.171758,-0.002598,0.0,0.0,,,,,,...,0,0,0,0,1,0,1,0,0,1


In [7]:
tester = model_data[(model_data.home_team == 'ATL') & (model_data.valid_pass == True) & (model_data.season == 2019)]
tester[["season", "complete_pass", "air_yards", "yardline_100", "ydstogo",
    "down1", "down2", "down3", "down4", "air_is_zero", "pass_middle",
    "era2", "era3", "era4", "qb_hit", "home",
    "outdoors", "retractable", "dome", "distance_to_sticks",'cp']].head(10).to_dict(orient="records")

[{'season': 2019,
  'complete_pass': 1.0,
  'air_yards': 6.0,
  'yardline_100': 72.0,
  'ydstogo': 7,
  'down1': 0,
  'down2': 1,
  'down3': 0,
  'down4': 0,
  'air_is_zero': 0,
  'pass_middle': 0,
  'era2': 0,
  'era3': 0,
  'era4': 1,
  'qb_hit': 0.0,
  'home': 1,
  'outdoors': 0,
  'retractable': 1,
  'dome': 0,
  'distance_to_sticks': -1.0,
  'cp': 0.7769984602928162},
 {'season': 2019,
  'complete_pass': 1.0,
  'air_yards': -1.0,
  'yardline_100': 57.0,
  'ydstogo': 4,
  'down1': 0,
  'down2': 1,
  'down3': 0,
  'down4': 0,
  'air_is_zero': 0,
  'pass_middle': 0,
  'era2': 0,
  'era3': 0,
  'era4': 1,
  'qb_hit': 0.0,
  'home': 1,
  'outdoors': 0,
  'retractable': 1,
  'dome': 0,
  'distance_to_sticks': -5.0,
  'cp': 0.8788162469863892},
 {'season': 2019,
  'complete_pass': 1.0,
  'air_yards': 7.0,
  'yardline_100': 40.0,
  'ydstogo': 10,
  'down1': 1,
  'down2': 0,
  'down3': 0,
  'down4': 0,
  'air_is_zero': 0,
  'pass_middle': 1,
  'era2': 0,
  'era3': 0,
  'era4': 1,
  'qb_hit

In [8]:
prepare_cp_data = filtered_data[["season", "complete_pass", "air_yards", "yardline_100", "ydstogo",
    "down1", "down2", "down3", "down4", "air_is_zero", "pass_middle",
    "era2", "era3", "era4", "qb_hit", "home",
    "outdoors", "retractable", "dome", "distance_to_sticks"]]
print("Overall Mean:", prepare_cp_data.complete_pass.mean())
prepare_cp_data.head()


Overall Mean: 0.629519810232081


Unnamed: 0,season,complete_pass,air_yards,yardline_100,ydstogo,down1,down2,down3,down4,air_is_zero,pass_middle,era2,era3,era4,qb_hit,home,outdoors,retractable,dome,distance_to_sticks
3,2006,1.0,9.0,63.0,6,0,0,1,0,0,0,1,0,0,0.0,0,1,0,0,3.0
5,2006,1.0,5.0,38.0,10,1,0,0,0,0,0,1,0,0,0.0,0,1,0,0,-5.0
7,2006,0.0,6.0,27.0,10,1,0,0,0,0,1,1,0,0,0.0,0,1,0,0,-4.0
14,2006,1.0,6.0,57.0,7,0,1,0,0,0,0,1,0,0,0.0,1,1,0,0,-1.0
15,2006,1.0,9.0,49.0,10,1,0,0,0,0,0,1,0,0,0.0,1,1,0,0,-1.0


In [9]:
import xgboost as xgb

dtrain = xgb.DMatrix(prepare_cp_data[["season", "air_yards", "yardline_100", "ydstogo",
    "down1", "down2", "down3", "down4", "air_is_zero", "pass_middle",
    "era2", "era3", "era4", "qb_hit", "home",
    "outdoors", "retractable", "dome", "distance_to_sticks"]], label=prepare_cp_data.complete_pass)
dtrain

<xgboost.core.DMatrix at 0x7fde423c6690>

In [11]:
# from https://github.com/apple/coremltools/issues/605

# XGBoost model
nrounds = 560
params = {
    'objective': 'binary:logistic',
    'booster' : 'gbtree',
    'eval_metric' : ['logloss'],
    'eta' : 0.025,
    'gamma' : 5,
    'subsample' : 0.8,
    'colsample_bytree' : 0.8,
    'max_depth': 4,
    'min_child_weight' : 6,
    'base_score' : prepare_cp_data.complete_pass.mean()
}


xcp_model = xgb.train(params, dtrain, num_boost_round=nrounds, verbose_eval=2)
# save for debug
xcp_model.dump_model('xgb_dump.json', with_stats=True, dump_format='json')

In [12]:
# testing the model with 2020 data

base_data = pd.read_csv('https://github.com/guga31bb/nflfastR-data/blob/master/data/' \
                         'play_by_play_2020.csv.gz?raw=True',
                         compression='gzip', low_memory=False)

test_data = create_model_mutations(base_data)
test_data = test_data[(~test_data.air_yards.isna()) & (test_data.air_yards >= -15) & (test_data.air_yards < 70) & (~test_data.receiver_player_name.isna()) & (~test_data.pass_location.isna())]
dtest = xgb.DMatrix(test_data[["season", "air_yards", "yardline_100", "ydstogo",
    "down1", "down2", "down3", "down4", "air_is_zero", "pass_middle",
    "era2", "era3", "era4", "qb_hit", "home",
    "outdoors", "retractable", "dome", "distance_to_sticks"]], label=test_data.complete_pass)

xgb_predictions = xcp_model.predict(dtest).tolist()
print("XGBoost mean vs actual mean:")
# print(xgb_predictions)
test_data['xCP'] = xgb_predictions
print(test_data['xCP'].mean())
print(test_data.complete_pass.mean())

XGBoost mean vs actual mean:
0.6646639428054913
0.68875


In [13]:
# # CoreML model
# feature_names = list(map(lambda i: "f{}".format(i), range(0, n_features)))
# # force_32bit_float=False has no effect
import coremltools as cml
cml_model = cml.converters.xgboost.convert(xcp_model, force_32bit_float=False, feature_names=["season", "air_yards", "yardline_100", "ydstogo",
    "down1", "down2", "down3", "down4", "air_is_zero", "pass_middle",
    "era2", "era3", "era4", "qb_hit", "home",
    "outdoors", "retractable", "dome", "distance_to_sticks"], target="complete_pass")
cml_model.author = 'Ben Baldwin (@benbbaldwin on Twitter, @guga31bb on GitHub) and Sebastian Carl (@mrcaseb on Twitter/GitHub) for R code; Akshay Easwaran for Python port.'
cml_model.license = 'MIT'
cml_model.short_description = 'Predicts the expected completion percentage of a given pass attempt. Translated from R, original model available as part of https://github.com/mrcaseb/nflfastR/.'

# Set feature descriptions manually
cml_model.input_description['season'] = 'Season in which the pass was thrown.'
cml_model.input_description['air_yards'] = 'Yards traveled by the ball in the air before it was caught, batted down, intercepted, or fell incomplete.'
cml_model.input_description['down1'] = 'Signifies first down.'
cml_model.input_description['down2'] = 'Signifies second down.'
cml_model.input_description['down3'] = 'Signifies third down.'
cml_model.input_description['down4'] = 'Signifies fourth down.'
cml_model.input_description['air_is_zero'] = 'If the pass attempt went for zero air yards.'
cml_model.input_description['pass_middle'] = 'If the pass attempt was thrown over the middle of the field.'
cml_model.input_description['era2'] = 'Signifies the second air-yards era in the NFL, from 2006 to 2013.'
cml_model.input_description['era3'] = 'Signifies the third air-yards era in the NFL, from 2014 to 2017.'
cml_model.input_description['era4'] = 'Signifies the fourth air-yards era in the NFL, from 2018 to present.'
cml_model.input_description['qb_hit'] = 'If the QB throwing the pass was hit as the ball was thrown.'
cml_model.input_description['home'] = 'If the current offense is the home team in the game.'
cml_model.input_description['outdoors'] = 'Signifies that the game is being played in an outdoor stadium.'
cml_model.input_description['retractable'] = 'Signifies that the game is being played in a retractable-roof stadium, regardless of whether said roof is closed or open.'
cml_model.input_description['dome'] = 'Signifies that the game is being played in a domed/indoor stadium.'
cml_model.input_description['distance_to_sticks'] = 'The difference between the air yards of the pass and the yards to gain for a first down.'


# Set the output descriptions
cml_model.output_description['complete_pass'] = 'The chance of the pass attempt being completed.'

# Save the model
cml_model.save('NFLxCP.mlmodel')
# cml_model.get_spec()



In [14]:
import numpy as np

print("Actual Min:",test_data.complete_pass.min())
print("Actual Mean:",test_data.complete_pass.mean())
print("Actual Max:",test_data.complete_pass.max())

print("XGB Min:",test_data.xCP.min())
print("XGB Mean:",test_data.xCP.mean())
print("XGB Max:",test_data.xCP.max())

# cml.__version__
feature_names = ["season", "air_yards", "yardline_100", "ydstogo",
    "down1", "down2", "down3", "down4", "air_is_zero", "pass_middle",
    "era2", "era3", "era4", "qb_hit", "home",
    "outdoors", "retractable", "dome", "distance_to_sticks"]
test_stripped = test_data[["season", "air_yards", "yardline_100", "ydstogo",
    "down1", "down2", "down3", "down4", "air_is_zero", "pass_middle",
    "era2", "era3", "era4", "qb_hit", "home",
    "outdoors", "retractable", "dome", "distance_to_sticks"]]

based = test_stripped.to_dict(orient="records")

cml_predictions = []
for row in based:
    pred = cml_model.predict(row)
    cml_predictions.append(pred["complete_pass"])


# fix basePredictionValue: 0.5
# cml_predictions = map(lambda x: x - 0.5, cml_predictions)
# apply sigmoid
cml_predictions = map(lambda x: 1 / (1 + np.exp(-x)), cml_predictions)

items = np.fromiter(cml_predictions, dtype=np.float)

print("CoreML Min:",np.min(items))
print("CoreML Mean:",np.mean(items))
print("CoreML Max:",np.max(items)) # there's a hundredth error from this and XGB? somehow?

Actual Min: 0.0
Actual Mean: 0.68875
Actual Max: 1.0
XGB Min: 0.19434265792369843
XGB Mean: 0.6646639428054913
XGB Max: 0.9188105463981628
CoreML Min: 0.18966411400767205
CoreML Mean: 0.6587796153428717
CoreML Max: 0.9165322236124129


In [15]:
pbp.roof.unique()

array(['outdoors', 'dome', 'closed', 'open'], dtype=object)

In [17]:
test_data[(test_data.home_team == "ATL")].retractable

2340    1
2341    1
2344    1
2348    1
2349    1
       ..
6217    1
6220    1
6221    1
6222    1
6223    1
Name: retractable, Length: 175, dtype: int64