In [22]:
# Based on https://github.com/akeaswaran/cfb_fourth_down but in Python

import pandas as pd
import numpy as np
import xgboost as xgb


def retrieveCfbDataFile(endpoint, year):
    return pd.read_csv(f"data/{endpoint}/{year}.csv", encoding='latin-1')

pbp = pd.DataFrame()
line_data = pd.DataFrame()

for x in range(2014, 2021):
    print(f"loading year: {x}")
    plys = pd.read_parquet(f"https://raw.githubusercontent.com/saiemgilani/cfbfastR-data/master/data/parquet/pbp_players_pos_{x}.parquet")
    ln = retrieveCfbDataFile('lines',x)
    print(f"loaded year: {x}")
    pbp = pbp.append(plys, sort=False)
    
    ln['year'] = x
    line_data = line_data.append(ln, sort=False)

print(f"Total Plays: {len(pbp)}")
print(f"Spreads imported: {len(line_data)}")

loading year: 2014


KeyboardInterrupt: 

In [2]:
model_vars = pbp[
    (pbp.down.isin([3,4]))
    & ((pbp["rush"] == 1) | (pbp["pass"] == 1))
    & (pbp.offense_play.notna())
    & (pbp.yards_to_goal.notna())
    & (pbp.score_diff.notna())
#     & (pbp.offense_conference.notna())
#     & (pbp.defense_conference.notna())
]
model_vars.head()

Unnamed: 0,year,week,id_play,game_id,game_play_number,half_play_number,drive_play_number,pos_team,def_pos_team,pos_team_score,...,lag_change_of_poss,lag_change_of_pos_team,lag_change_of_pos_team2,lag_kickoff_play,lag_punt,lag_punt2,lag_scoring_play,lag_turnover_vec,lag_downs_turnover,lag_defense_score_play
4,2014.0,1,4.005476e+17,400547640,4.0,4.0,4.0,Temple,Vanderbilt,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,2014.0,1,4.005476e+17,400547640,7.0,7.0,7.0,Temple,Vanderbilt,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,2014.0,1,4.005476e+17,400547640,11.0,11.0,3.0,Vanderbilt,Temple,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,2014.0,1,4.005476e+17,400547640,15.0,15.0,3.0,Temple,Vanderbilt,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21,2014.0,1,4.005476e+17,400547640,19.0,19.0,3.0,Vanderbilt,Temple,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# model_line_data = line_data[
#     (line_data.lineProvider == "consensus")
# ]
# model_line_data.head()
# line_data.head()
grouped_lines = line_data.groupby("id")
grouped_lines = grouped_lines.apply(lambda x: x[(x.overUnder.notna()) & (x.spread.notna())].head(1))
grouped_lines = grouped_lines.reset_index(drop=True)
# first_values.head()
grouped_lines.head()

Unnamed: 0,id,homeTeam,homeScore,awayTeam,awayScore,lineProvider,overUnder,spread,formattedSpread,year
0,400547640,Vanderbilt,7.0,Temple,37.0,numberfire,50.0,-8.5,Vanderbilt -8.5,2014
1,400547641,Connecticut,10.0,BYU,35.0,numberfire,54.0,15.5,BYU -15.5,2014
2,400547642,UCF,24.0,Penn State,26.0,numberfire,44.0,-2.0,UCF -2,2014
3,400547644,Houston,7.0,UT San Antonio,27.0,numberfire,56.0,-9.5,Houston -9.5,2014
4,400547647,Tulsa,38.0,Tulane,31.0,numberfire,46.5,-4.5,Tulsa -4.5,2014


In [4]:
merged_vars = pd.merge(model_vars, grouped_lines[["id","spread","overUnder"]], left_on="game_id", right_on="id", how='left')
merged_vars.head()

Unnamed: 0,year,week,id_play,game_id,game_play_number,half_play_number,drive_play_number,pos_team,def_pos_team,pos_team_score,...,lag_kickoff_play,lag_punt,lag_punt2,lag_scoring_play,lag_turnover_vec,lag_downs_turnover,lag_defense_score_play,id,spread,overUnder
0,2014.0,1,4.005476e+17,400547640,4.0,4.0,4.0,Temple,Vanderbilt,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,400547640.0,-8.5,50.0
1,2014.0,1,4.005476e+17,400547640,7.0,7.0,7.0,Temple,Vanderbilt,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,400547640.0,-8.5,50.0
2,2014.0,1,4.005476e+17,400547640,11.0,11.0,3.0,Vanderbilt,Temple,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,400547640.0,-8.5,50.0
3,2014.0,1,4.005476e+17,400547640,15.0,15.0,3.0,Temple,Vanderbilt,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,400547640.0,-8.5,50.0
4,2014.0,1,4.005476e+17,400547640,19.0,19.0,3.0,Vanderbilt,Temple,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,400547640.0,-8.5,50.0


In [7]:
model_vars.columns.to_list()

['year',
 'week',
 'id_play',
 'game_id',
 'game_play_number',
 'half_play_number',
 'drive_play_number',
 'pos_team',
 'def_pos_team',
 'pos_team_score',
 'def_pos_team_score',
 'half',
 'period',
 'clock.minutes',
 'clock.seconds',
 'play_type',
 'play_text',
 'down',
 'distance',
 'yards_to_goal',
 'yards_gained',
 'EPA',
 'ep_before',
 'ep_after',
 'wpa',
 'wp_before',
 'wp_after',
 'def_wp_before',
 'def_wp_after',
 'penalty_detail',
 'yds_penalty',
 'penalty_1st_conv',
 'new_series',
 'firstD_by_kickoff',
 'firstD_by_poss',
 'firstD_by_penalty',
 'firstD_by_yards',
 'def_EPA',
 'home_EPA',
 'away_EPA',
 'home_EPA_rush',
 'away_EPA_rush',
 'home_EPA_pass',
 'away_EPA_pass',
 'total_home_EPA',
 'total_away_EPA',
 'total_home_EPA_rush',
 'total_away_EPA_rush',
 'total_home_EPA_pass',
 'total_away_EPA_pass',
 'net_home_EPA',
 'net_away_EPA',
 'net_home_EPA_rush',
 'net_away_EPA_rush',
 'net_home_EPA_pass',
 'net_away_EPA_pass',
 'success',
 'epa_success',
 'rz_play',
 'scoring_opp',


In [8]:
merged_vars["first_down_penalty"] = merged_vars["firstD_by_penalty"]

merged_vars.yards_gained = merged_vars.yards_gained.clip(-10, 65)
merged_vars["home_total"] = (merged_vars.spread + merged_vars.overUnder) / 2
merged_vars["away_total"] = (merged_vars.overUnder - merged_vars.spread) / 2
merged_vars["posteam_total"] = np.where(merged_vars.offense_play == merged_vars.home, merged_vars.home_total, merged_vars.away_total)
merged_vars["posteam_spread"] = np.where(merged_vars.offense_play == merged_vars.home, merged_vars.spread, -1 * merged_vars.spread)

merged_vars.head()

Unnamed: 0,year,week,id_play,game_id,game_play_number,half_play_number,drive_play_number,pos_team,def_pos_team,pos_team_score,...,lag_downs_turnover,lag_defense_score_play,id,spread,overUnder,first_down_penalty,home_total,away_total,posteam_total,posteam_spread
0,2014.0,1,4.005476e+17,400547640,4.0,4.0,4.0,Temple,Vanderbilt,0,...,0.0,0.0,400547640.0,-8.5,50.0,0.0,20.75,29.25,29.25,8.5
1,2014.0,1,4.005476e+17,400547640,7.0,7.0,7.0,Temple,Vanderbilt,0,...,0.0,0.0,400547640.0,-8.5,50.0,0.0,20.75,29.25,29.25,8.5
2,2014.0,1,4.005476e+17,400547640,11.0,11.0,3.0,Vanderbilt,Temple,0,...,0.0,0.0,400547640.0,-8.5,50.0,0.0,20.75,29.25,20.75,-8.5
3,2014.0,1,4.005476e+17,400547640,15.0,15.0,3.0,Temple,Vanderbilt,0,...,0.0,0.0,400547640.0,-8.5,50.0,0.0,20.75,29.25,29.25,8.5
4,2014.0,1,4.005476e+17,400547640,19.0,19.0,3.0,Vanderbilt,Temple,0,...,0.0,0.0,400547640.0,-8.5,50.0,0.0,20.75,29.25,20.75,-8.5


In [15]:
filtered_vars = merged_vars[
    (((merged_vars["rush"] + merged_vars["pass"]) == 1) | (merged_vars.first_down_penalty == 1))
    & (merged_vars.distance > 0)
    & (merged_vars.yards_to_goal > 0)
    & (merged_vars.distance > merged_vars.yards_to_goal)
    & (merged_vars.posteam_total.notna())
    & (merged_vars.posteam_spread.notna())
]
filtered_vars["label"] = (filtered_vars.yards_gained.astype(float) + 10).astype(int)
filtered_vars = filtered_vars[["down","distance","yards_to_goal","posteam_total","posteam_spread","label"]]
filtered_vars.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0,down,distance,yards_to_goal,posteam_total,posteam_spread,label
1300,3.0,10.0,4.0,21.25,-7.5,14
21061,3.0,10.0,9.0,26.5,-4.0,19
32675,3.0,18.0,17.0,16.75,-32.0,10
41662,3.0,10.0,1.0,25.0,-10.0,11
69430,3.0,13.0,9.0,12.5,-18.0,19


In [16]:
nrounds = 157
params = {
    "booster": "gbtree",
    "objective": "multi:softprob",
    "eval_metric": "mlogloss",
    "num_class": 76,
    "eta": .07,
    "gamma": 4.325037e-09,
    "subsample": 0.5385424,
    "colsample_bytree": 0.6666667,
    'max_depth': 4,
    "min_child_weight": 7
}
full_train = xgb.DMatrix(filtered_vars[["down","distance","yards_to_goal","posteam_total","posteam_spread"]], label = filtered_vars.label)
fd_model = xgb.train(params, full_train, nrounds)
fd_model
# full_train = xgboost::xgb.DMatrix(model.matrix(~.+0, data = model_vars %>% dplyr::select(-label)), label = as.integer(model_vars$label))
# fd_model <- xgboost::xgboost(params = params, data = full_train, nrounds = nrounds, verbose = 2)

<xgboost.core.Booster at 0x7fc58346f3d0>

In [14]:
fd_model.save_model("fd_model.model")

In [21]:
import coremltools as cml
cml_model = cml.converters.xgboost.convert(fd_model, force_32bit_float=False, mode="classifier", feature_names=[
    "down","distance","yards_to_goal","posteam_total","posteam_spread"
], n_classes=76)
cml_model.author = 'Jason Lee for original R code; Akshay Easwaran for Python and CoreML conversion.'
cml_model.license = 'MIT'
cml_model.short_description = 'Projects number of yards gained on a fourth-down play.'
cml_model

# # Set feature descriptions manually
cml_model.input_description['posteam_spread'] = 'The spread for the game from the current offense\'s perspective. Note that a home favorite will have a negative spread value.'
cml_model.input_description['posteam_total'] = 'The over/under for the game from the current offense\'s perspective.'
cml_model.input_description['yards_to_goal'] = 'The yards left to gain towards the end zone.'
cml_model.input_description['distance'] = 'The number of yards to gain a first down.'
cml_model.input_description['down'] = 'The current down.'

# cml_model.output_description['target'] = "The projected number of yards gained on this fourth-down play."

# # # Save the model
cml_model.save('FourthDownYards.mlmodel')
cml_model

input {
  name: "down"
  shortDescription: "The current down."
  type {
    doubleType {
    }
  }
}
input {
  name: "distance"
  shortDescription: "The number of yards to gain a first down."
  type {
    doubleType {
    }
  }
}
input {
  name: "yards_to_goal"
  shortDescription: "The yards left to gain towards the end zone."
  type {
    doubleType {
    }
  }
}
input {
  name: "posteam_total"
  shortDescription: "The over/under for the game from the current offense\'s perspective."
  type {
    doubleType {
    }
  }
}
input {
  name: "posteam_spread"
  shortDescription: "The spread for the game from the current offense\'s perspective. Note that a home favorite will have a negative spread value."
  type {
    doubleType {
    }
  }
}
output {
  name: "target"
  type {
    int64Type {
    }
  }
}
output {
  name: "classProbability"
  type {
    dictionaryType {
      int64KeyType {
      }
    }
  }
}
predictedFeatureName: "target"
predictedProbabilitiesName: "classProbability"
metad