In [52]:
import pandas as pd 

#Enter desired years of data
YEARS = [2019,2018,2017]

# Download every requested season first and concatenate once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# appending inside the loop re-copied the accumulated frame on every pass.
season_frames = [
    #low_memory=False eliminates a warning
    pd.read_csv('https://github.com/guga31bb/nflfastR-data/blob/master/data/'
                'play_by_play_{}.csv.gz?raw=True'.format(year),
                compression='gzip', low_memory=False)
    for year in YEARS
]

if season_frames:
    # ignore_index=True gives each row a unique index; sort_index(axis=1)
    # keeps the columns in alphabetical order, as append(sort=True) did.
    pbp = pd.concat(season_frames, ignore_index=True).sort_index(axis=1)
else:
    # pd.concat raises on an empty list; keep the old "empty frame" result.
    pbp = pd.DataFrame()

In [53]:
# first set of model mutations - https://github.com/guga31bb/nflfastR-data/blob/106adcf0033fa741fe8be11a2d52232aecc26a5e/models/train_xyac_model.R#L27

# new columns https://github.com/guga31bb/nflfastR-data/blob/106adcf0033fa741fe8be11a2d52232aecc26a5e/models/train_xyac_model.R#L30
import regex # regex that supports variable-length positive lookbehind https://bitbucket.org/mrabarnett/mrab-regex/src/hg/
import math

def create_model_mutations(dfa):
    """Return a copy of ``dfa`` with the derived model columns added.

    The input frame is not modified. The function reads the columns
    ``yardline_100``, ``air_yards``, ``ydstogo``, ``yards_after_catch``,
    ``pass_location``, ``desc``, ``down``, ``season``, ``roof``, ``posteam``,
    ``home_team``, ``complete_pass``, ``incomplete_pass`` and ``interception``.

    Columns written (0/1 indicators are stored as integers):

    * ``distance_to_goal``, ``distance_to_sticks`` -- arithmetic on air yards.
    * ``pass_middle``, ``air_is_zero``, ``down1``..``down4``,
      ``era1``..``era4``, ``dome``, ``outdoors``, ``retractable``, ``home``.
    * ``yards_after_catch`` -- overwritten with its value clipped to [-5, 70].
    * ``receiver_player_name`` -- overwritten with the name parsed out of
      ``desc`` as a string (missing when no name is found).
    * ``valid_pass`` -- 1 when the play is a pass attempt with usable air
      yards, a parsed receiver and a known pass location, else 0.

    Parameters
    ----------
    dfa : pandas.DataFrame
        Play-by-play rows containing the columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns plus the derived ones.
    """
    df = dfa.copy()

    df["distance_to_goal"] = df.yardline_100 - df.air_yards
    df["pass_middle"] = (df.pass_location == "middle").astype(int)
    df["air_is_zero"] = (df.air_yards == 0).astype(int)
    df["distance_to_sticks"] = df.air_yards - df.ydstogo
    df["yards_after_catch"] = df.yards_after_catch.clip(-5, 70)

    # Receiver name = "<Initials>.<Surname>" (optionally followed by II/III/IV)
    # that comes right after "to "/"for " and an optional jersey prefix such
    # as "87-". A capture group replaces the variable-length lookbehind, so
    # the column now holds the name itself instead of a Match object, and a
    # missing description yields a missing name instead of raising.
    # [A-Za-z] replaces [A-z], which also matched the characters [ \ ] ^ _ `.
    receiver_pattern = (
        r"(?:to|for)\s[0-9]{0,2}-?"
        r"([A-Z][A-Za-z]*\.\s?[A-Z][A-Za-z]+(?:\s(?:I{2,3}|IV))?)"
    )
    df["receiver_player_name"] = df.desc.str.extract(receiver_pattern, expand=False)

    df["down1"] = (df.down == 1).astype(int)
    df["down2"] = (df.down == 2).astype(int)
    df["down3"] = (df.down == 3).astype(int)
    df["down4"] = (df.down == 4).astype(int)

    # Season eras as encoded here:
    # era1 - before 2006, era2 - 2006-2013, era3 - 2014-2017, era4 - 2018 and beyond
    df["era1"] = (df.season < 2006).astype(int)
    df["era2"] = df.season.between(2006, 2013).astype(int)
    df["era3"] = df.season.between(2014, 2017).astype(int)
    df["era4"] = (df.season >= 2018).astype(int)

    # roof: "outdoors", "retractable", "dome"; values: ['dome', 'outdoors', 'closed', 'open']
    df["dome"] = (df.roof == "dome").astype(int)
    df["outdoors"] = (df.roof == "outdoors").astype(int)
    df["retractable"] = df.roof.isin(["closed", "open"]).astype(int)

    # "home" -- assuming that offense is home
    df["home"] = (df.posteam == df.home_team).astype(int)

    # checking pass validity: https://github.com/mrcaseb/nflfastR/blob/master/R/helper_add_cp_cpoe.R#L64
    # notna() is used for the missing-value checks: a missing pass_location is
    # NaN, and `NaN != None` is always True, so the old check never rejected it.
    is_pass_attempt = (
        (df.complete_pass == 1) | (df.incomplete_pass == 1) | (df.interception == 1)
    )
    df["valid_pass"] = (
        is_pass_attempt
        & df.air_yards.notna()
        & (df.air_yards >= -15)
        & (df.air_yards < 70)
        & df.receiver_player_name.notna()
        & df.pass_location.notna()
    ).astype(int)

    return df

# Add the derived model columns to the combined play-by-play frame and
# preview the first rows of the result.
model_data = create_model_mutations(pbp)
model_data.head()

Unnamed: 0,aborted_play,air_epa,air_wpa,air_yards,assist_tackle,assist_tackle_1_player_id,assist_tackle_1_player_name,assist_tackle_1_team,assist_tackle_2_player_id,assist_tackle_2_player_name,...,down4,era1,era2,era3,era4,dome,outdoors,retractable,home,valid_pass
0,0,,,,,,,,,,...,0,0,0,0,1,1,0,0,0,0
1,0,,,,0.0,,,,,,...,0,0,0,0,1,1,0,0,0,0
2,0,,,,0.0,,,,,,...,0,0,0,0,1,1,0,0,0,0
3,0,,,,0.0,,,,,,...,0,0,0,0,1,1,0,0,0,0
4,0,,,,0.0,,,,,,...,0,0,0,0,1,1,0,0,0,0


In [54]:
# Keep only plays with air yards present and inside [-15, 70), a parsed
# receiver and a known pass location.
filtered_data = model_data.loc[
    model_data.air_yards.notna()
    & model_data.air_yards.ge(-15)
    & model_data.air_yards.lt(70)
    & model_data.receiver_player_name.notna()
    & model_data.pass_location.notna()
]
filtered_data

Unnamed: 0,aborted_play,air_epa,air_wpa,air_yards,assist_tackle,assist_tackle_1_player_id,assist_tackle_1_player_name,assist_tackle_1_team,assist_tackle_2_player_id,assist_tackle_2_player_name,...,down4,era1,era2,era3,era4,dome,outdoors,retractable,home,valid_pass
7,0,-0.553484,0.000000,1.0,0.0,,,,,,...,0,0,0,0,1,1,0,0,1,1
8,0,0.974937,0.013921,11.0,0.0,,,,,,...,0,0,0,0,1,1,0,0,1,1
14,0,0.766806,0.014734,13.0,0.0,,,,,,...,0,0,0,0,1,1,0,0,0,1
15,0,-0.544798,-0.015032,1.0,0.0,,,,,,...,0,0,0,0,1,1,0,0,1,1
22,0,-0.738814,-0.002456,0.0,0.0,,,,,,...,0,0,0,0,1,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143897,0,0.591138,-0.017589,13.0,0.0,,,,,,...,1,0,0,1,0,1,0,0,1,1
143899,0,-0.035157,-0.005672,3.0,0.0,,,,,,...,0,0,0,1,0,1,0,0,1,1
143900,0,-0.094967,-0.001816,2.0,0.0,,,,,,...,0,0,0,1,0,1,0,0,1,1
143902,0,0.410537,0.000000,11.0,0.0,,,,,,...,0,0,0,1,0,1,0,0,1,1


In [56]:
# Columns carried into the completion-probability data set: the
# complete_pass label plus the model inputs.
cp_columns = [
    "season", "complete_pass", "air_yards", "yardline_100", "ydstogo",
    "down1", "down2", "down3", "down4", "air_is_zero", "pass_middle",
    "era2", "era3", "era4", "qb_hit", "home",
    "outdoors", "retractable", "dome", "distance_to_sticks",
]
prepare_cp_data = filtered_data.loc[:, cp_columns]

# Only the last expression of a cell is displayed, so this preview is not shown.
prepare_cp_data.head()
# Overall completion rate of the retained plays.
prepare_cp_data.complete_pass.mean()

0.6531403849022825

In [57]:
import xgboost as xgb
import coremltools as cml

# Training matrix: the feature columns as data, complete_pass as the label.
cp_feature_columns = [
    "season", "air_yards", "yardline_100", "ydstogo",
    "down1", "down2", "down3", "down4", "air_is_zero", "pass_middle",
    "era2", "era3", "era4", "qb_hit", "home",
    "outdoors", "retractable", "dome", "distance_to_sticks",
]
dtrain = xgb.DMatrix(
    prepare_cp_data[cp_feature_columns],
    label=prepare_cp_data.complete_pass,
)
dtrain

<xgboost.core.DMatrix at 0x7ff251817710>

In [58]:
# from https://github.com/apple/coremltools/issues/605

# XGBoost model: number of boosting rounds and booster parameters.
nrounds = 560
params = dict(
    objective='binary:logistic',
    booster='gbtree',
    eval_metric=['logloss'],
    eta=0.025,
    gamma=5,
    subsample=0.8,
    colsample_bytree=0.8,
    max_depth=4,
    min_child_weight=6,
    # start boosting from the observed completion rate of the training rows
    base_score=prepare_cp_data.complete_pass.mean(),
)

# Fit the booster on the training matrix.
xcp_model = xgb.train(
    params,
    dtrain,
    num_boost_round=nrounds,
    verbose_eval=2,
)
# save for debug: write the trees, with statistics, as JSON
xcp_model.dump_model('xgb_dump.json', with_stats=True, dump_format='json')

In [61]:
# testing the model with 2020 data

base_data = pd.read_csv('https://github.com/guga31bb/nflfastR-data/blob/master/data/' \
                         'play_by_play_2020.csv.gz?raw=True',
                         compression='gzip', low_memory=False)

# Same feature engineering and row filter as the training data.
test_data = create_model_mutations(base_data)
# .copy() makes the filtered rows an independent frame, so adding the xCP
# column below does not raise pandas' SettingWithCopyWarning.
test_data = test_data[(~test_data.air_yards.isna()) & (test_data.air_yards >= -15) & (test_data.air_yards < 70) & (~test_data.receiver_player_name.isna()) & (~test_data.pass_location.isna())].copy()
dtest = xgb.DMatrix(test_data[["season", "air_yards", "yardline_100", "ydstogo",
    "down1", "down2", "down3", "down4", "air_is_zero", "pass_middle",
    "era2", "era3", "era4", "qb_hit", "home",
    "outdoors", "retractable", "dome", "distance_to_sticks"]], label=test_data.complete_pass)

xgb_predictions = xcp_model.predict(dtest).tolist()
print("XGBoost results:")
# print(xgb_predictions)
test_data['xCP'] = xgb_predictions
# Mean predicted completion probability vs. observed completion rate.
print(test_data['xCP'].mean())
print(test_data.complete_pass.mean())

XGBoost results:
0.6646040575625375
0.68875


In [None]:
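# # CoreML model
# # NOTE: disabled draft. It refers to `xgb_model`, `n_features` and `np`, none of
# # which appear in the cells shown above (the trained booster there is
# # `xcp_model`); these names must be defined before this cell is re-enabled.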
# feature_names = list(map(lambda i: "f{}".format(i), range(0, n_features)))
# # force_32bit_float=False has no effect
# cml_model = cml.converters.xgboost.convert(xgb_model, force_32bit_float=False, feature_names=feature_names)
# # save for debug
# with open('cml_dump.txt', 'w') as txt_file:
#   txt_file.write(str(cml_model.get_spec()))

# cml_predictions = []
# for row in test_data:
#   named_data = dict(zip(feature_names, row))
#   cml_predictions.append(cml_model.predict(named_data)["target"])

# # fix basePredictionValue: 0.5
# cml_predictions = map(lambda x: x - 0.5, cml_predictions)
# # apply sigmoid
# cml_predictions = map(lambda x: 1 / (1 + np.exp(-x)), cml_predictions)

# print("CoreML:")
# print(list(cml_predictions))