In [15]:
# note: old version of sklearn in conda environment

import sklearn
sklearn.__version__

'0.19.2'

In [16]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
# import re
# import requests
import time
from sklearn.ensemble import GradientBoostingClassifier
import joblib
# import statsmodels.api as sm
# import statsmodels.formula.api as smf
pd.options.display.max_columns = 999
pd.options.display.max_rows = 999

# first one here is from https://github.com/meysubb/cfbscrapR-MISC/blob/master/EPA_WPA/02-EPA-Model.R#L245
reg_equation_cfb = 'drive_point ~ time_remaining + adjusted_yardline + C(down) + log_distance + goal_to_go + under_two + log_distance*C(down) + adjusted_yardline*C(down) + goal_to_go*log_distance'
reg_equation_spl = 'drive_point ~ C(down) + distance + adjusted_yardline + C(down):distance + C(down):adjusted_yardline + period + margin'
reg_equation_ake = 'drive_point ~ C(down) + distance + adjusted_yardline + C(down):distance + C(down):adjusted_yardline + period + margin + time_remaining'
reg_equation = reg_equation_spl




In [17]:
drive_data = pd.DataFrame()
game_data = pd.DataFrame()
play_data = pd.DataFrame()

def retrieveCfbDataFile(endpoint, year):
    return pd.read_csv(f"data/{endpoint}/{year}.csv", encoding='latin-1')

for i in range(2012, 2020):
    drive = retrieveCfbDataFile('drives',i)
    drive['year'] = i
    drive_data = drive_data.append(drive, sort=False)
    
    gm = retrieveCfbDataFile('games',i)
    gm['year'] = i
    game_data = game_data.append(gm, sort=False)
    
    plys = retrieveCfbDataFile('pbp',i)
    plys['year'] = i
    play_data = play_data.append(plys, sort=False)

print(f"Total Games: {len(game_data)}")
print(f"Total Drives: {len(drive_data)}")
print(f"Total Plays: {len(play_data)}")

Total Games: 6644
Total Drives: 171692
Total Plays: 1210147


In [18]:
# drop FCS games 
play_data = play_data[(play_data['offense_conference'].notna()) & (play_data['defense_conference'].notna())]
print(f"Total FBS Plays: {len(play_data)}")

Total FBS Plays: 1055257


In [19]:
game_data['game_id'] = game_data['id']
data = pd.merge(drive_data,game_data,on='game_id')
data['drive_id'] = data['id_x']

In [20]:
pbp_data = pd.merge(play_data,data[['home_team','drive_id']],how='left',on='drive_id')

In [21]:
pbp_data['coef'] = (pbp_data['home_team'] == pbp_data['defense']).astype(int)
pbp_data['adjusted_yardline'] = 100*(1-pbp_data['coef']) +  (2*pbp_data['coef']-1)*pbp_data['yard_line'] #yard_line is defined by home team in API
pbp_data['margin'] = pbp_data['offense_score'] - pbp_data['defense_score']
pbp_data.loc[pbp_data.down > 4, 'down'] = 4
pbp_data.loc[pbp_data.period > 4, 'period'] = 5

In [22]:
# from cfbscrapR

pbp_data["time_remaining"] = pbp_data.apply(lambda x: ((4 - x.period) * 60 * 15) + (60 * x["clock.minutes"]) + x["clock.seconds"], axis=1)
pbp_data["log_distance"] = np.log(pbp_data.distance)
pbp_data["goal_to_go"] = pbp_data.apply(lambda x: (x.distance >= (x.adjusted_yardline - 17)) if ("Field Goal" in x.play_type) else (x.distance >= x.adjusted_yardline), axis=1)
pbp_data["under_two"] = pbp_data.time_remaining.apply(lambda x: x <= 120)




In [23]:
data['drive_point'] = data.drive_result.apply(lambda x: 7 if (x == 'TD' or x == 'PUNT TD' or x == 'RUSHING TD' or x == 'PASSING TD') else (3 if (x == 'FG' or x == 'FG GOOD') else (-2 if x == 'SF' else -7 if ( x ==   'PUNT RETURN TD' or x == 'MISSED FG TD' or x == 'INT TD' or x == 'FUMBLE RETURN TD' or x == 'FUMBLE TD' or x == 'DOWNS TD' or x == 'INT RETURN TOUCH'  or x == 'FG MISSED TD' or x =='PUNT TD' or x == 'TURNOVER ON DOWNS TD' or x == 'DOWNS TD') else 0 )))

In [24]:
data['next_drive_point'] = -data['drive_point'].shift(-1).clip(lower=-2)

In [25]:
data.loc[data.drive_point == 0, 'drive_point'] = data['next_drive_point']

In [26]:
pbp_data = pbp_data.merge(data[['drive_id','drive_point','drive_result']])

In [27]:
exclude_playtype = ['Kickoff',  'End Period',
        'Kickoff Return (Offense)',
       'Kickoff Return Touchdown', 'End of Half', 'Defensive 2pt Conversion','Uncategorized', 'End of Game', 'Timeout','placeholder']

game_end_drive = ['END OF HALF', 'END OF GAME', 'Uncategorized','END OF 4TH QUARTER', 'DOWNS TD','POSSESSION (FOR OT DRIVES)',"END OF 4TH QUARTER"]

regression_df = pbp_data[~(pbp_data.play_type.isin(exclude_playtype)) & (pbp_data.adjusted_yardline < 100) & (pbp_data.down > 0) &(pbp_data.distance > 0) & (pbp_data.adjusted_yardline>0) & ~(pbp_data.drive_result.isin(game_end_drive))].dropna()

In [28]:
from patsy import dmatrices

# int_conv = regression_df.astype({"drive_point":int, "down": int, "distance":int, "adjusted_yardline": int, "period": int, "margin": int, "time_remaining": int})
y, X = dmatrices(reg_equation, regression_df, return_type='dataframe')

feature_cols = ["Intercept", "C(down)[T.2]", "C(down)[T.3]", "C(down)[T.4]", "distance", "C(down)[T.2]:distance", "C(down)[T.3]:distance", "C(down)[T.4]:distance", "adjusted_yardline", "C(down)[T.2]:adjusted_yardline", "C(down)[T.3]:adjusted_yardline", "C(down)[T.4]:adjusted_yardline", "period", "margin"]
for c in feature_cols:
    X[c] = X[c].astype(int)
y.drive_point = y.drive_point.astype(int)

In [29]:
# pd.merge(X, y, left_index=True,right_index=True).to_csv("./combined.csv", index=False,encoding="utf8")

In [30]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(n_estimators = 200)
clf.fit(X, y)

  y = column_or_1d(y, warn=True)


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [31]:
special_team_play_type = ['Kickoff','Punt','Kickoff Return (Offense)', 'Kickoff Return Touchdown','Field Goal Good', 'Field Goal Missed', 'Blocked Field Goal',
                          'Blocked Punt','Punt Return Touchdown','Blocked Punt Touchdown','Missed Field Goal Return','Uncategorized', 'Missed Field Goal Return Touchdown','Defensive 2pt Conversion']
timing_play_type = ['End Period','End of Game','Timeout','End of Half',"END OF GAME"]
turnover_play_type = ['Fumble Recovery (Opponent)','Pass Interception Return','Interception Return Touchdown','Fumble Return Touchdown','Safety','Interception','Pass Interception','Punt',
                     'Field Goal Missed', 'Blocked Field Goal', 'Blocked Punt','Punt Return Touchdown','Blocked Punt Touchdown','Missed Field Goal Return', 'Missed Field Goal Return Touchdown']
regular_play_type = ['Pass', 'Rush', 'Sack', 'Pass Reception', 'Passing Touchdown','Pass Incompletion', 'Fumble Recovery (Own)','Rushing Touchdown','Pass Interception','Pass Completion']
time_play = ['End Period','Timeout','End of Half','End of Game',"END OF GAME"]
PAT_miss_type= [ 'PAT MISSED','PAT failed', 'PAT blocked', 'PAT BLOCKED']
off_TD = ['Passing Touchdown','Rushing Touchdown']
def_TD = ['Interception Return Touchdown','Fumble Return Touchdown', 'Missed Field Goal Return Touchdown','Blocked Punt Touchdown','Punt Return Touchdown']

In [32]:
regular_play = pbp_data[~pbp_data.play_type.str.contains('Kickoff') & ~(pbp_data.play_type.isin(time_play)) &(pbp_data.down > 0) & (pbp_data.distance > 0)]
# regular_play.to_csv("./regular_pbp.csv", index=False,encoding="utf8")

In [33]:
CFB_teams_list = pd.read_csv('https://raw.githubusercontent.com/903124/CFB_EPA_data/master/.ipynb_checkpoints/cfb_teams_list-checkpoint.csv',encoding='utf-8')

In [34]:
CFB_teams_list.full_name.unique()

array(['Abilene Christian', 'Air Force', 'Akron', 'Alabama',
       'Alabama A&M', 'Albany', 'Alcorn State', 'Appalachian State',
       'Arizona', 'Arizona State', 'Arkansas', 'Arkansas State',
       'Arkansas-Pine Bluff', 'Army', 'Auburn', 'Austin Peay', 'BYU',
       'Ball State', 'Baylor', 'Bethune-Cookman', 'Boise State',
       'Boston College', 'Bowling Green', 'Buffalo', 'California',
       'Campbell', 'Central Arkansas', 'Central Connecticut',
       'Central Michigan', 'Charleston Southern', 'Charlotte',
       'Cincinnati', 'Clemson', 'Coastal Carolina', 'Colorado',
       'Colorado State', 'Connecticut', 'Delaware State', 'Drake', 'Duke',
       'Duquesne', 'East Carolina', 'Eastern Illinois',
       'Eastern Kentucky', 'Eastern Michigan', 'Eastern Washington',
       'Elon', 'Florida', 'Florida Atlantic', 'Florida International',
       'Florida State', 'Fordham', 'Fresno State', 'Gardner-Webb',
       'Georgia', 'Georgia Southern', 'Georgia State', 'Georgia Tech',
     

In [35]:
regular_play = pd.merge(regular_play,CFB_teams_list,left_on=['offense'],right_on=['full_name'])
regular_play.rename(columns={'abbreviation':'off_abbr', 'full_name': 'off_full_name'}, inplace=True)
regular_play = pd.merge(regular_play,CFB_teams_list,left_on=['defense'],right_on=['full_name'])
regular_play.rename(columns={'abbreviation':'def_abbr', 'full_name': 'def_full_name'}, inplace=True)

In [36]:
# there are some plays in prev seasons that have no drive_point set (and ended up being at the end of a game); this broke the produced dmatrices bc it drops NAs in the outcome var
# regular_play[regular_play.drive_point.isna() == True]
regular_play.loc[regular_play.drive_point.isna() == True, "drive_point"] = 0.0
# regular_play[regular_play.drive_result == "END OF GAME"].head()

In [37]:
y_test, X_test = dmatrices(reg_equation, regular_play, return_type='dataframe')
EP_predict = clf.predict_proba(X_test)


test_df = pd.merge(X_test, y_test, left_index=True,right_index=True)
# test_df.to_csv("./test_df.csv", index=False,encoding="utf8")

In [38]:
# see above cell, we were having some issues with NAs for drive_point for a couple of plays that got dropped by dmatrices

#len(X_test)
# test_reg_play = regular_play[['down','distance','adjusted_yardline', 'period', 'margin']]
# len(regular_play) - len(X_test)
# test_x_play = X_test[['down','distance','adjusted_yardline', 'period', 'margin']]
# pd.concat([test_x_play,test_reg_play]).drop_duplicates(keep=False)

In [39]:
EP = EP_predict[:,0]* -7 + EP_predict[:,1] * -3 + EP_predict[:,2] * -2 + EP_predict[:,4] * 2 + EP_predict[:,5] * 3 + EP_predict[:,6] * 7
# print(len(EP))
regular_play['EP_start'] = EP

In [40]:
regular_play['new_yardline']= 0
regular_play['new_down']= 0
regular_play['new_distance']= 0
regular_play['turnover'] = 0

In [41]:
regular_play = regular_play[~pd.isna(regular_play.play_text) & (regular_play.play_type != 'Interception')]

In [42]:

regular_play.loc[regular_play.play_type.isin(turnover_play_type),'new_down'] = 1
regular_play.loc[regular_play.play_type.isin(turnover_play_type),'new_distance'] = 10

regular_play.loc[regular_play.play_text.str.contains('1ST'), 'new_down'] = 1
regular_play.loc[regular_play.play_text.str.contains('1ST'), 'new_distance'] = 10

regular_play.loc[~regular_play.play_type.isin(turnover_play_type) & ~regular_play.play_text.str.contains('1ST'), 'new_down'] = regular_play.down + 1
regular_play.loc[~regular_play.play_type.isin(turnover_play_type) & ~regular_play.play_text.str.contains('1ST'), 'new_distance'] = regular_play.distance - regular_play.yards_gained

regular_play.loc[regular_play.play_text.str.contains('50 yard line'), 'new_yardline'] = 50

In [43]:
regular_play.loc[regular_play.play_type == 'Fumble Recovery (Opponent)', 'new_yardline'] = 100- (regular_play.yard_line + regular_play.yards_gained) 
regular_play.loc[regular_play.play_type == 'Fumble Recovery (Opponent)', 'new_down'] = 1
regular_play.loc[regular_play.play_type == 'Fumble Recovery (Opponent)', 'new_distance'] = 10

regular_play.loc[regular_play.play_type == 'Sack', 'new_yardline'] = 100- (regular_play.yard_line - regular_play.yards_gained)
regular_play.loc[regular_play.play_type == 'Sack', 'new_down'] = regular_play.down + 1
regular_play.loc[regular_play.play_type == 'Sack', 'new_distance'] = regular_play.distance - regular_play.yards_gained

In [44]:

#Collect end of play yardline information (e.g. Alab 38 = Alabama own 38) from play_text and match the team abbreviation

temp_df = regular_play.iloc[np.char.find(regular_play.play_text.values.astype(str), regular_play.off_abbr.values.astype(str)) >= 0] 
temp_df['split_string'] =  [x[1] for x in list(np.char.split(temp_df.play_text.values.astype(str),sep =temp_df.off_abbr.values.astype(str)))]
regular_play.loc[temp_df[temp_df.play_text.str.contains('\d+', regex=True)].index, 'new_yardline'] = 100-np.array(temp_df[temp_df.play_text.str.contains('\d+', regex=True)].split_string.str.extract(r'(\d+)').astype(float)).ravel()

temp_df = regular_play.iloc[np.char.find(regular_play.play_text.values.astype(str), regular_play.def_abbr.values.astype(str)) >= 0]
temp_df['split_string'] =  [x[1] for x in list(np.char.split(temp_df.play_text.values.astype(str),sep =temp_df.def_abbr.values.astype(str)))]
regular_play.loc[temp_df[temp_df.play_text.str.contains('\d+', regex=True)].index, 'new_yardline'] = np.array(temp_df[temp_df.play_text.str.contains('\d+', regex=True)].split_string.str.extract(r'(\d+)').astype(float)).ravel()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [45]:
regular_play.loc[pd.isna(regular_play.new_yardline),'new_distance'] = regular_play.distance - regular_play.yards_gained 
regular_play.loc[pd.isna(regular_play.new_yardline),'new_yardline'] = regular_play.adjusted_yardline - regular_play.yards_gained

regular_play.loc[regular_play.play_type == 'Pass Incompletion', 'new_yardline'] = regular_play.adjusted_yardline

regular_play.loc[regular_play.play_text.str.contains('touchback'), 'new_yardline'] = 80
regular_play.loc[regular_play.play_text.str.contains('touchback'), 'new_down'] = 1

#Fake data for model prediction, EP will be changed after processing the data

regular_play.loc[regular_play.play_type == 'Safety', 'new_yardline'] = 99 #Dummy yardline for Safety

regular_play.loc[regular_play.play_type.isin(off_TD),'new_down'] = 1 #Dummy new down for Offensive tocuhdown play
regular_play.loc[regular_play.play_type.isin(off_TD),'new_distance']  = 10 #Dummy new yards to go for Offensive tocuhdown play

regular_play.loc[(regular_play.play_type.isin(off_TD) | regular_play.play_type.isin(def_TD)),'new_yardline'] = 99  #Dummy yardline for Offensive tocuhdown play

regular_play.loc[(regular_play.play_type == 'Field Goal Good'), 'new_down'] = 1

regular_play.loc[(regular_play.new_down > 4) & ~(regular_play.play_type.isin(off_TD)),'turnover'] = 1 #Turnover on down
regular_play.loc[(regular_play.new_down > 4) & ~(regular_play.play_type.isin(off_TD)),'new_down'] = 1 
regular_play.loc[(regular_play.new_down > 4) & ~(regular_play.play_type.isin(off_TD)),'new_distance'] = 10
regular_play.loc[(regular_play.new_down > 4) & ~(regular_play.play_type.isin(off_TD)),'new_yardline'] = 100-regular_play.new_yardline


regular_play.loc[((regular_play.new_yardline <= 0) |(regular_play.new_distance <= 0))  & (regular_play.play_type == 'Sack') & (regular_play.play_text.str.contains('return')), 'new_down' ] = 1 #Strip sack
regular_play.loc[((regular_play.new_yardline <= 0) |(regular_play.new_distance <= 0)) & (regular_play.play_type == 'Sack') & (regular_play.play_text.str.contains('return')), 'new_distance' ] = 10 
regular_play.loc[((regular_play.new_yardline <= 0) |(regular_play.new_distance <= 0)) & (regular_play.play_text.str.contains('return') & ~(regular_play.play_type.isin(special_team_play_type))), 'new_yardline' ] = 100-(regular_play.adjusted_yardline - regular_play.yards_gained)
regular_play.loc[ regular_play.play_text.str.contains('return'), 'turnover' ] = 1 
regular_play.loc[regular_play.play_type.isin(turnover_play_type),'turnover'] = 1

regular_play.loc[regular_play.new_distance <= 0, 'new_down'] = 1 #First down not in API
regular_play.loc[regular_play.new_distance <= 0, 'new_distance'] = 10

regular_play.loc[regular_play.play_type == 'Field Goal Good', 'new_yardline'] = 20 #Dummy yardline after success field goal
regular_play.loc[regular_play.play_type == 'Field Goal Missed', 'new_yardline'] = (100-regular_play.adjusted_yardline).clip(upper=80)
regular_play.loc[regular_play.play_type == 'Blocked Field Goal', 'new_yardline'] = (100-regular_play.adjusted_yardline)

regular_play.loc[regular_play.play_type == 'Punt', 'new_yardline'] = (100-regular_play.new_yardline)

In [46]:

regular_play.loc[regular_play.new_yardline <= 0 ,'new_yardline'] = regular_play.adjusted_yardline - regular_play.yards_gained

In [47]:
regular_play.loc[regular_play.play_text.str.contains('TOUCHDOWN'),'new_yardline'] = 99

In [48]:

regular_play = regular_play[(regular_play.new_yardline > 0) & (regular_play.new_yardline < 100) & (regular_play.adjusted_yardline > 0) & (regular_play.adjusted_yardline < 100)]

In [49]:
out_df = pd.DataFrame({'down':regular_play['new_down'],'distance':regular_play['new_distance'],'adjusted_yardline':regular_play['new_yardline'],'margin':regular_play['margin'], 'period':regular_play['period'],'drive_point':regular_play['drive_point'], 'log_distance':regular_play['log_distance'],'goal_to_go':regular_play['goal_to_go'],'time_remaining':regular_play['time_remaining'], 'under_two':regular_play['under_two']})




In [50]:
y_end, X_end = dmatrices(reg_equation, out_df, return_type='dataframe')

# end_df = pd.merge(X_end, y_end, left_index=True,right_index=True)
# end_df.to_csv("./out_df.csv", index=False,encoding="utf8")


EP_predict = clf.predict_proba(X_end)
EP = EP_predict[:,0]* -7 + EP_predict[:,1] * -3 + EP_predict[:,2] * -2 + EP_predict[:,4] * 2 + EP_predict[:,5] * 3 + EP_predict[:,6] * 7

In [51]:
regular_play['EP_end'] = EP
# regular_play.to_csv('./regular_pbp.csv', index=False, encoding="utf8")

In [52]:
regular_play.loc[(regular_play.play_type.isin(off_TD) | regular_play.play_type.isin(def_TD) | regular_play.play_text.str.contains('TOUCHDOWN') | regular_play.play_text.str.contains(' TD ')  ),'EP_end'] = 7
regular_play.loc[(regular_play.play_type.isin(PAT_miss_type)),'EP_end'] = 6
regular_play.loc[regular_play.play_type == 'Field Goal Good','EP_end'] = 3

In [53]:
regular_play.loc[(regular_play.play_type.isin(turnover_play_type)| regular_play.turnover == 1),'EP_end'] *= -1

In [54]:
regular_play.loc[regular_play.play_type == 'Safety','EP_end'] = -2

In [55]:
regular_play.loc[(regular_play.play_type == 'Extra Point Missed') | (regular_play.play_type == 'Extra Point Good') |(regular_play.play_type == '2pt Conversion') ,'EP_start'] = 0
regular_play.loc[(regular_play.play_type == 'Extra Point Missed'),'EP_end'] = -1
regular_play.loc[(regular_play.play_type == 'Extra Point Good'),'EP_end'] = 0
regular_play.loc[(regular_play.play_type == '2pt Conversion'),'EP_end'] = 1

In [56]:
regular_play['EPA'] = regular_play['EP_end'] - regular_play['EP_start']

In [57]:
pass_play_type = ["Interception Return Touchdown","Pass Interception","Pass Incompletion","Pass Interception Return","Pass Reception","Pass Completion","Pass","Passing Touchdown","Sack"]
rush_play_type = ['Fumble Recovery (Opponent)','Fumble Recovery (Own)','Fumble Return Touchdown','Rush','Rushing Touchdown']

In [58]:
regular_play[regular_play.play_type.isin(pass_play_type)]['EPA'].mean()

0.17814310169263328

In [59]:
regular_play[regular_play.play_type.isin(rush_play_type)]['EPA'].mean()

0.011649864567422858

In [60]:
regular_play.groupby('play_type')['EPA'].mean()

play_type
Blocked Field Goal                   -1.538043
Blocked Punt                         -0.153293
Blocked Punt Touchdown               -5.284150
Defensive 2pt Conversion             -8.527475
Field Goal Good                       1.568960
Field Goal Missed                    -1.590957
Fumble Recovery (Opponent)           -4.206185
Fumble Recovery (Own)                -0.944642
Fumble Return Touchdown              -7.312777
Interception Return Touchdown        -7.018139
Missed Field Goal Return             -4.898325
Missed Field Goal Return Touchdown   -8.408731
Pass                                 -1.724100
Pass Completion                       1.117568
Pass Incompletion                    -0.826096
Pass Interception                    -4.092013
Pass Interception Return             -3.110134
Pass Reception                        0.945858
Passing Touchdown                     3.486300
Penalty                              -0.722662
Punt                                 -0.270699
Pun

In [61]:
regular_play[regular_play.play_type.isin(pass_play_type)]['ppa'].mean()

0.1744561805473702

In [62]:
regular_play[regular_play.play_type.isin(rush_play_type)]['ppa'].mean()

0.07111946962674275

In [63]:
regular_play[(regular_play.play_type.isin(rush_play_type))].sort_values(by="EPA",ascending=True).head()

Unnamed: 0,id,offense,offense_conference,defense,defense_conference,home,away,offense_score,defense_score,drive_id,period,clock.minutes,clock.seconds,yard_line,down,distance,yards_gained,play_type,play_text,ppa,year,home_team,coef,adjusted_yardline,margin,time_remaining,log_distance,goal_to_go,under_two,drive_point,drive_result,off_full_name,off_abbr,def_full_name,def_abbr,EP_start,new_yardline,new_down,new_distance,turnover,EP_end,EPA
732426,400547867101899001,TCU,Big 12,Oklahoma,Big 12,TCU,Oklahoma,7,0,4005478671,1,10,9,99,3,1,0,Fumble Return Touchdown,Trevone Boykin run for no gain Trevone Boykin ...,-11.546002,2014,TCU,0,1,7,3309,0.0,True,False,-7.0,FUMBLE RETURN TD,TCU,TCU,Oklahoma,Okla,6.178864,99.0,1,10,1,-7.0,-13.178864
686948,400934512102885711,Texas Tech,Big 12,Arizona State,Pac-12,Texas Tech,Arizona State,21,10,40093451210,2,11,42,98,1,2,1,Fumble Recovery (Opponent),Desmond Nisby run for 1 yd Desmond Nisby fumbl...,,2017,Texas Tech,0,2,11,2502,0.693147,True,False,-7.0,FUMBLE,Texas Tech,TexTc,Arizona State,ArzSt,6.278721,1.0,1,1,1,-6.382686,-12.661407
125827,401020775102944002,Buffalo,Mid-American,Kent State,Mid-American,Buffalo,Kent State,27,0,4010207759,2,5,59,98,2,2,-2,Fumble Recovery (Opponent),Jaret Patterson run for a loss of 2 yards to t...,,2018,Buffalo,0,2,27,2159,0.693147,True,False,-0.0,FUMBLE,Buffalo,Buff,Kent State,KntSt,6.345551,2.0,1,10,1,-6.311297,-12.656849
377072,400941820103918505,UCF,American Athletic,East Carolina,American Athletic,UCF,East Carolina,42,14,40094182020,3,8,14,99,2,1,0,Fumble Recovery (Opponent),Taj McGowan run for no gain to the ECaro 1 Taj...,,2017,UCF,0,1,28,1394,0.0,True,False,2.0,FUMBLE,UCF,UCF,East Carolina,ECaro,6.376109,1.0,1,10,1,-6.278234,-12.654343
283466,400547999103907301,Michigan State,Big Ten,Rutgers,Big Ten,Michigan State,Rutgers,35,0,40054799914,3,9,26,99,1,1,-2,Fumble Recovery (Opponent),Nick Hill run for a loss of 2 yards to the Rut...,,2014,Michigan State,0,1,35,1466,0.0,True,False,-0.0,FUMBLE,Michigan State,MchSt,Rutgers,Rutgr,6.502301,3.0,1,10,1,-6.144772,-12.647074


In [64]:
# regular_play['passing_player_name'] = np.nan
# regular_play['receiving_player_name'] = np.nan
# regular_play['rushing_player_name'] = np.nan
# regular_play['pass_rush_player_name_1'] = np.nan
# # regular_play['pass_rush_player_name_2'] = np.nan
# regular_play['force_fumble_player'] = np.nan
# regular_play['sacked_player_name'] = np.nan
# regular_play['intecept_player_name'] = np.nan
# regular_play['deflect_player_name'] = np.nan

In [65]:

# pass_play_type = ['slant','screen','deep','middle','sideline','crossing']

In [66]:
# regular_play.loc[regular_play.play_text.str.contains(' run for ') ,'rushing_player_name'] = regular_play.play_text.str.split(' run for ').str[0]
# regular_play.loc[regular_play.play_text.str.contains(' rush ') ,'rushing_player_name'] = regular_play.play_text.str.split(' rush ').str[0]
# regular_play.loc[regular_play.play_text.str.contains(' pass ') ,'passing_player_name'] =  regular_play.play_text.str.split(' pass ').str[0].str.split('(crossing|screen|sideline|middle|deep|slant)').str[0]
# regular_play.loc[regular_play.play_text.str.contains(' sacked by ') ,'sacked_player_name'] = regular_play.play_text.str.split(' sacked by ').str[0]
# regular_play.loc[regular_play.play_text.str.contains(' sacked by ') ,'pass_rush_player_name_1'] = regular_play.play_text.str.split(' sacked by ').str[1].str.split(' for ').str[0].str.split(' and ').str[0]
# regular_play.loc[regular_play.play_text.str.contains(' sacked by ') & regular_play.play_text.str.contains(' and '),'pass_rush_player_name_2'] = regular_play.play_text.str.split(' and ').str[1].str.split(' for ').str[0]
# regular_play.loc[regular_play.play_text.str.contains(' pass complete to ') ,'receiving_player_name'] = regular_play.play_text.str.split(' pass complete to ').str[1].str.split(' for ').str[0]
# regular_play.loc[regular_play.play_text.str.contains(' pass incomplete to ') ,'receiving_player_name'] = regular_play.play_text.str.split(' pass incomplete to ').str[1].str.split(', broken up').str[0].str.replace(r'\b\.$', '', regex=True).str.strip().str.split(', hurried by ').str[0]
# regular_play.loc[regular_play.play_text.str.contains(' forced by ') ,'force_fumble_player'] = regular_play.play_text.str.split(' forced by ').str[1].str.split(', ').str[0]
# regular_play.loc[regular_play.play_text.str.contains(' pass intercepted ') & ~regular_play.play_text.str.contains(' for a TD '),'intecept_player_name'] = regular_play.play_text.str.split(' pass intercepted ').str[1].str.split(' return ').str[0]
# regular_play.loc[regular_play.play_text.str.contains(' pass intercepted ') & ~regular_play.play_text.str.contains(' for a TD ') & regular_play.play_text.str.contains(' at the '),'intecept_player_name'] = regular_play.intecept_player_name.str.split('by ').str[1].str.split(' at the ').str[0]
# regular_play.loc[regular_play.play_text.str.contains(' broken up by '), 'deflect_player_name'] = regular_play.play_text.str.split('broken up by ').str[1].str.split('.')[0]
# try:
#     regular_play.loc[regular_play.play_text.str.contains(' pass intercepted for a TD ') ,'intecept_play_name'] = regular_play.play_text.str.split(' pass intercepted for a TD ').str[1].str.split(' return ').str[0]
# except  AttributeError:
#     pass

In [67]:
# regular_play.to_csv('CFB_regular_play_19.csv')

In [68]:
import coremltools

y_end, X_end = dmatrices(reg_equation, out_df, return_type='dataframe')

coreml_model = coremltools.converters.sklearn.convert(clf, X_end.columns.to_list(), "drive_point")
coreml_model


input {
  name: "Intercept"
  type {
    doubleType {
    }
  }
}
input {
  name: "C(down)[T.2]"
  type {
    doubleType {
    }
  }
}
input {
  name: "C(down)[T.3]"
  type {
    doubleType {
    }
  }
}
input {
  name: "C(down)[T.4]"
  type {
    doubleType {
    }
  }
}
input {
  name: "distance"
  type {
    doubleType {
    }
  }
}
input {
  name: "C(down)[T.2]:distance"
  type {
    doubleType {
    }
  }
}
input {
  name: "C(down)[T.3]:distance"
  type {
    doubleType {
    }
  }
}
input {
  name: "C(down)[T.4]:distance"
  type {
    doubleType {
    }
  }
}
input {
  name: "adjusted_yardline"
  type {
    doubleType {
    }
  }
}
input {
  name: "C(down)[T.2]:adjusted_yardline"
  type {
    doubleType {
    }
  }
}
input {
  name: "C(down)[T.3]:adjusted_yardline"
  type {
    doubleType {
    }
  }
}
input {
  name: "C(down)[T.4]:adjusted_yardline"
  type {
    doubleType {
    }
  }
}
input {
  name: "period"
  type {
    doubleType {
    }
  }
}
input {
  name: "margin"
  typ

In [70]:
# Set model metadata
coreml_model.author = 'Akshay Easwaran, @spfleming and @903124 on Github'
coreml_model.license = 'MIT'
coreml_model.short_description = 'Projects the expected point value of a football play based on its down, distance, distance from goal, quarter, and scoring margin.'

# Set feature descriptions manually
coreml_model.input_description['Intercept'] = 'Constant of 1'
coreml_model.input_description['C(down)[T.2]'] = '1 when on 2nd down'
coreml_model.input_description['C(down)[T.3]'] = '1 when on 3rd down'
coreml_model.input_description['C(down)[T.4]'] = '1 when on 4th down'
coreml_model.input_description['distance'] = 'distance to 1st down'
coreml_model.input_description['C(down)[T.2]:distance'] = 'distance to 1st down when on 2nd down'
coreml_model.input_description['C(down)[T.3]:distance'] = 'distance to 1st down when on 3rd down'
coreml_model.input_description['C(down)[T.4]:distance'] = 'distance to 1st down when on 4th down'
coreml_model.input_description['adjusted_yardline'] = 'distance to end zone'
coreml_model.input_description['C(down)[T.2]:adjusted_yardline'] = 'distance to end zone when on 2nd down'
coreml_model.input_description['C(down)[T.3]:adjusted_yardline'] = 'distance to end zone when on 3rd down'
coreml_model.input_description['C(down)[T.4]:adjusted_yardline'] = 'distance to end zone when on 4th down'
coreml_model.input_description['period'] = 'current game quarter'
coreml_model.input_description['margin'] = 'scoring margin from perspective of offense'
# model.input_description['time_remaining'] = 'second'

# Set the output descriptions
coreml_model.output_description['drive_point'] = 'Point value of the current offensive drive'

# Save the model
coreml_model.save('CFBEPA.mlmodel')

In [None]:
# pd.merge(out_df, X_end, left_index=True, right_index=True)