In [1]:
# note: old version of sklearn in conda environment

import sklearn
sklearn.__version__

'0.23.0'

In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
import time
from sklearn.ensemble import GradientBoostingClassifier
import joblib
# import statsmodels.api as sm
# import statsmodels.formula.api as smf
pd.options.display.max_columns = 999
pd.options.display.max_rows = 999

# first one here is from https://github.com/meysubb/cfbscrapR-MISC/blob/master/EPA_WPA/02-EPA-Model.R#L245
reg_equation_cfb = 'next_drive_point ~ time_remaining + adjusted_yardline + C(down) + log_distance + goal_to_go + under_two + log_distance*C(down) + adjusted_yardline*C(down) + goal_to_go*log_distance'
reg_equation_spl = 'drive_point ~ C(down) + distance + adjusted_yardline + C(down):distance + C(down):adjusted_yardline + period + margin'
reg_equation_ake = 'drive_point ~ C(down) + distance + adjusted_yardline + C(down):distance + C(down):adjusted_yardline + period + margin + time_remaining'
reg_equation = reg_equation_cfb




In [3]:
drive_data = pd.DataFrame()
game_data = pd.DataFrame()
play_data = pd.DataFrame()

# downloaded files from collegefootballdata.com
def retrieveCfbDataFile(endpoint, year):
    return pd.read_csv(f"data/{endpoint}/{year}.csv", encoding='latin-1')

for i in range(2012, 2020):
    drive = retrieveCfbDataFile('drives',i)
    drive['year'] = i
    drive_data = drive_data.append(drive, sort=False)
    
    gm = retrieveCfbDataFile('games',i)
    gm['year'] = i
    game_data = game_data.append(gm, sort=False)
    
    plys = retrieveCfbDataFile('pbp',i)
    plys['year'] = i
    plys["play_id"] = plys.id.astype(int)
    plys.id = plys.id.astype(int)
    play_data = play_data.append(plys, sort=False)

print(f"Total Games: {len(game_data)}")
print(f"Total Drives: {len(drive_data)}")
print(f"Total Plays: {len(play_data)}")

Total Games: 6644
Total Drives: 171692
Total Plays: 1210147


In [4]:
# drop FCS games 
play_data = play_data[(play_data['offense_conference'].notna()) & (play_data['defense_conference'].notna())]
print(f"Total FBS Plays: {len(play_data)}")

Total FBS Plays: 1055257


In [5]:
game_data['game_id'] = game_data['id']
data = pd.merge(drive_data,game_data,on='game_id')
data['drive_id'] = data['id_x']

In [6]:
pbp_data = pd.merge(play_data,data[['home_team','drive_id']],how='left',on='drive_id')

In [7]:
pbp_data['coef'] = (pbp_data['home_team'] == pbp_data['defense']).astype(int)
pbp_data['adjusted_yardline'] = 100*(1-pbp_data['coef']) +  (2*pbp_data['coef']-1)*pbp_data['yard_line'] #yard_line is defined by home team in API
pbp_data['margin'] = pbp_data['offense_score'] - pbp_data['defense_score']
pbp_data.loc[pbp_data.down > 4, 'down'] = 4
pbp_data.loc[pbp_data.period > 4, 'period'] = 5
pbp_data = pbp_data.sort_values(by="play_id", ascending=True)

In [8]:
# from cfbscrapR
pbp_data["clock.minutes"] = pbp_data.apply(lambda x: 15 + x["clock.minutes"] if (x.period in [1,3]) else x["clock.minutes"], axis=1)
pbp_data["time_remaining"] = pbp_data.apply(lambda x: (60 * x["clock.minutes"] + x["clock.seconds"]) if (x.period <= 4) else 0, axis=1)
pbp_data["log_distance"] = pbp_data.distance.apply(lambda x: np.log(0.5) if x == 0 else np.log(x))
pbp_data["goal_to_go"] = pbp_data.apply(lambda x: (x.distance >= (x.adjusted_yardline - 17)) if ("Field Goal" in x.play_type) else (x.distance >= x.adjusted_yardline), axis=1)
pbp_data["under_two"] = pbp_data.time_remaining.apply(lambda x: x <= 120)

pbp_data["abs_diff"] = abs(pbp_data.offense_score - pbp_data.defense_score)
pbp_data["ScoreDiff_W"] = (max(pbp_data.abs_diff) - pbp_data.abs_diff) / (max(pbp_data.abs_diff) - min(pbp_data.abs_diff))



In [9]:
data['drive_point'] = data.drive_result.apply(lambda x: 7 if (x == 'TD' or x == 'PUNT TD' or x == 'RUSHING TD' or x == 'PASSING TD') else (3 if (x == 'FG' or x == 'FG GOOD') else (-2 if x == 'SF' else -7 if ( x ==   'PUNT RETURN TD' or x == 'MISSED FG TD' or x == 'INT TD' or x == 'FUMBLE RETURN TD' or x == 'FUMBLE TD' or x == 'DOWNS TD' or x == 'INT RETURN TOUCH'  or x == 'FG MISSED TD' or x =='PUNT TD' or x == 'TURNOVER ON DOWNS TD' or x == 'DOWNS TD') else 0 )))

In [10]:
print(data.columns)

Index(['offense', 'offense_conference', 'defense', 'defense_conference',
       'game_id', 'id_x', 'scoring', 'start_period', 'start_yardline',
       'start_time.minutes', 'start_time.seconds', 'end_period',
       'end_yardline', 'end_time.minutes', 'end_time.seconds',
       'elapsed.minutes', 'elapsed.seconds', 'plays', 'yards', 'drive_result',
       'year_x', 'id_y', 'season', 'week', 'season_type', 'start_date',
       'neutral_site', 'conference_game', 'attendance', 'venue_id', 'venue',
       'home_team', 'home_conference', 'home_points', 'home_line_scores[0]',
       'home_line_scores[1]', 'home_line_scores[2]', 'home_line_scores[3]',
       'home_post_win_prob', 'away_team', 'away_conference', 'away_points',
       'away_line_scores[0]', 'away_line_scores[1]', 'away_line_scores[2]',
       'away_line_scores[3]', 'away_post_win_prob', 'year_y', 'drive_id',
       'drive_point'],
      dtype='object')


In [11]:
def find_game_next_score_half(drive_df):
    drive_df.drive_id = drive_df.drive_id.astype(int)
    drive_df = drive_df.sort_values(by="drive_id", ascending=True).reset_index()
    score_plays = drive_df[
        (drive_df.scoring == True)
        & (~drive_df.drive_result.str.contains("END OF"))
    ].drop_duplicates('drive_id').index.to_series()
    
    final_df = pd.DataFrame()
    for x in range(0, len(drive_df)):
        tmp_df = find_next_score(x, score_plays, drive_df)
        final_df = final_df.append(tmp_df)
#     print(final_df)
    
    final_df2 = pd.merge(drive_df, final_df, left_index=True, right_index=True)
    final_df2['Drive_Score_Dist'] = final_df2.DSH - final_df2.drive_id.astype(int)
    final_df2['Drive_Score_Dist_W'] = (max(final_df2.Drive_Score_Dist) - final_df2.Drive_Score_Dist) / (max(final_df2.Drive_Score_Dist) - min(final_df2.Drive_Score_Dist))
    return final_df2

def find_next_score(play_i, score_plays, dat_drive):
    defense_tds = ["FUMBLE RETURN TD", "FUMBLE TD", "INT RETURN TOUCH", ""]
    next_score_i = None
    try:
        next_score_i = score_plays[
            score_plays >= play_i
        ].iloc[0]
    except IndexError:
        next_score_i = None
        
#     print(f"Checking play index: {play_i}")
#     print(f"Checking next score index: {next_score_i}")
    try:
        test = dat_drive.iloc[play_i]
    except IndexError:
        print(f"play_i {play_i} out of bounds in dat_drive (size: {len(dat_drive)})")
        
    if (next_score_i != None):
        try:
            test = dat_drive.iloc[next_score_i]
        except IndexError:
            print(f"next_score_i {next_score_i} out of bounds in dat_drive (size: {len(dat_drive)})")
        
        
    if ((next_score_i == None) 
        or (dat_drive.iloc[play_i].start_period <= 2 and dat_drive.iloc[next_score_i].start_period in [3,4])
       or (dat_drive.iloc[play_i].start_period in [3,4] and dat_drive.iloc[next_score_i].start_period > 4)):
        score_drive = dat_drive.iloc[play_i].drive_id
        next_score = 0
        return pd.DataFrame({"NSH" : next_score, "DSH" : score_drive}, index=[play_i])
    else:
        next_score = 0
        score_drive = dat_drive.iloc[next_score_i].drive_id
        current_team = dat_drive.iloc[play_i].offense
        
        next_score_team = dat_drive.iloc[next_score_i].offense
        if (dat_drive.iloc[next_score_i].drive_result in defense_tds):
            next_score_team = dat_drive.iloc[next_score_i].defense
        
        if ("RETURN TD" in dat_drive.iloc[next_score_i].drive_result):
            if (current_team == next_score_team):
                next_score = -1 * dat_drive.iloc[next_score_i].drive_point
            else:
                next_score = dat_drive.iloc[next_score_i].drive_point
        else:
            if (current_team == next_score_team):
                next_score = dat_drive.iloc[next_score_i].drive_point
            else:
                next_score = -1 * dat_drive.iloc[next_score_i].drive_point
        return pd.DataFrame({"NSH" : next_score, "DSH" : score_drive}, index=[play_i])
    
data = find_game_next_score_half(data)
# data19

In [12]:
print(len(data))
print(len(data[data.Drive_Score_Dist_W < 1.0]))
print(len(data[data.NSH == -3]))

171692
97359
17557


In [13]:
# data.loc[data.drive_point == 0, 'drive_point'] = data['NSH']

In [14]:
drive_stuff = data[['drive_id','game_id','DSH','drive_point','drive_result','NSH','Drive_Score_Dist_W']]
drive_stuff.columns = ['drive_id','game_id','DSH','drive_point','drive_result','next_drive_point','Drive_Score_Dist_W']

pbp_data = pd.merge(pbp_data, drive_stuff, left_on='drive_id',right_on='drive_id')
pbp_data['Total_W'] = pbp_data.Drive_Score_Dist_W + pbp_data.ScoreDiff_W
pbp_data['Total_W_Scaled'] = (pbp_data.Total_W - min(pbp_data.Total_W)) / (max(pbp_data.Total_W) - min(pbp_data.Total_W))
pbp_data.head()

Unnamed: 0,id,offense,offense_conference,defense,defense_conference,home,away,offense_score,defense_score,drive_id,period,clock.minutes,clock.seconds,yard_line,down,distance,yards_gained,play_type,play_text,ppa,year,play_id,home_team,coef,adjusted_yardline,margin,time_remaining,log_distance,goal_to_go,under_two,abs_diff,ScoreDiff_W,game_id,DSH,drive_point,drive_result,next_drive_point,Drive_Score_Dist_W,Total_W,Total_W_Scaled
0,4005478320,Wake Forest,ACC,Army,FBS Independents,Wake Forest,Army,14,21,40054783210,2,2,14,39,2,10,15,Penalty,"ARMY Penalty, personal foul (N/A) to the WFrst...",,2014,4005478320,Wake Forest,0,61,-7,134,2.302585,False,False,7,0.915663,400547832,40054783210,0,INT,0,1.0,1.915663,0.936364
1,400547832102977603,Army,FBS Independents,Wake Forest,ACC,Wake Forest,Army,21,14,40054783210,2,2,23,65,1,0,38,Kickoff,"Daniel Grochowski kickoff for 64 yds , Tyler H...",,2014,400547832102977603,Wake Forest,1,65,7,143,-0.693147,False,False,7,0.915663,400547832,40054783210,0,INT,0,1.0,1.915663,0.936364
2,400547832102978501,Wake Forest,ACC,Army,FBS Independents,Wake Forest,Army,14,21,40054783210,2,2,14,39,1,10,0,Pass Incompletion,John Wolford pass incomplete to E.J. Scott,-1.107768,2014,400547832102978501,Wake Forest,0,61,-7,134,2.302585,False,False,7,0.915663,400547832,40054783210,0,INT,0,1.0,1.915663,0.936364
3,4005478910,Texas,Big 12,Oklahoma State,Big 12,Oklahoma State,Texas,13,0,4005478915,1,23,7,65,1,65,10,Penalty,"OKLAHOMA ST Penalty, illegal block (10 Yards) ...",,2014,4005478910,Oklahoma State,1,65,13,1387,4.174387,True,False,13,0.843373,400547891,4005478916,0,PUNT,-3,1.0,1.843373,0.881818
4,400547891101976604,Oklahoma State,Big 12,Texas,Big 12,Oklahoma State,Texas,0,13,4005478915,1,17,33,9,1,10,21,Pass Reception,Daxx Garman pass complete to Brandon Sheperd f...,1.129399,2014,400547891101976604,Oklahoma State,0,91,-13,1053,2.302585,False,False,13,0.843373,400547891,4005478916,0,PUNT,-3,1.0,1.843373,0.881818


In [15]:
exclude_playtype = [
#     'Kickoff',  
#     'End Period',
#     'Kickoff Return (Offense)',
#     'Kickoff Return Touchdown', 
#     'End of Half', 
#     'Defensive 2pt Conversion',
#     'Uncategorized', 
#     'End of Game', 
#     'Timeout',
#     'placeholder'
    "Extra Point Missed",
    "Extra Point Good",
    "Timeout",
    "End of Half",
    "End of Game",
    "Uncategorized",
    #"Kickoff",
    "Penalty"
    #"Kickoff Return (Offense)",
    #"Kickoff Return Touchdown"
]

# game_end_drive = ['END OF HALF', 'END OF GAME', 'Uncategorized','END OF 4TH QUARTER', 'DOWNS TD','POSSESSION (FOR OT DRIVES)',"END OF 4TH QUARTER"]

regression_df = pbp_data[
    (pbp_data.down > 0)
    & (pbp_data.year >= 2014)
    & ~(pbp_data.play_type.isin(exclude_playtype)) 
    & (pbp_data.down.notna())
    & (pbp_data.time_remaining.notna())
    & (pbp_data.game_id.notna())
    & (pbp_data.log_distance.notna() & pbp_data.log_distance != -float('inf'))
    & (pbp_data.game_id != 400603838)
].dropna()

In [16]:
from patsy import dmatrices
# int_conv = regression_df.astype({"drive_point":int, "down": int, "distance":int, "adjusted_yardline": int, "period": int, "margin": int, "time_remaining": int})
y, X = dmatrices(reg_equation, regression_df, return_type='dataframe')
X.columns

Index(['Intercept', 'C(down)[T.2]', 'C(down)[T.3]', 'C(down)[T.4]',
       'goal_to_go[T.True]', 'under_two[T.True]', 'time_remaining',
       'adjusted_yardline', 'adjusted_yardline:C(down)[T.2]',
       'adjusted_yardline:C(down)[T.3]', 'adjusted_yardline:C(down)[T.4]',
       'log_distance', 'log_distance:C(down)[T.2]',
       'log_distance:C(down)[T.3]', 'log_distance:C(down)[T.4]',
       'goal_to_go[T.True]:log_distance'],
      dtype='object')

In [17]:
# feature_cols = ["Intercept", "C(down)[T.2]", "C(down)[T.3]", "C(down)[T.4]", "distance", "C(down)[T.2]:distance", "C(down)[T.3]:distance", "C(down)[T.4]:distance", "adjusted_yardline", "C(down)[T.2]:adjusted_yardline", "C(down)[T.3]:adjusted_yardline", "C(down)[T.4]:adjusted_yardline", "period", "margin"]
for c in X.columns:
    X[c] = X[c].astype(int)
y.next_drive_point = y.next_drive_point.astype(int)


In [18]:
X.columns = ['Intercept', 'down2', 'down3', 'down4',
       'goal_to_go', 'under_two', 'time_remaining',
       'adjusted_yardline', 'adjusted_yardline_down_2',
       'adjusted_yardline_down_3', 'adjusted_yardline_down_4',
       'log_distance', 'log_distance_down_2',
       'log_distance_down_3', 'log_distance_down_4',
       'goal_to_go_log_distance']

In [19]:
print("Base drive point\n",y.next_drive_point.value_counts())

Base drive point
  7    125425
-7     78990
 3     46858
-3     27196
 0     15874
 2       929
-2       569
Name: next_drive_point, dtype: int64


In [20]:
class_to_score_mapping = {
    0: 7,
    1: -7,
    2: 3,
    3: -3,
    4: 0,
    5: 2,
    6: -2
}
score_to_class_mapping = {
    7: 0,
    -7: 1,
    3: 2,
    -3: 3,
    0: 4,
    2: 5,
    -2: 6
}
class_to_name_mapping = {
    0: "TD",
    1: "Opp_TD",
    2: "FG",
    3: "Opp_FG",
    4: "No_Score",
    5: "Safety",
    6: "Opp_Safety"
}
score_to_name_mapping = {
    7: "TD",
    -7: "Opp_TD",
    3: "FG",
    -3: "Opp_FG",
    0: "No_Score",
    2: "Safety",
    -2: "Opp_Safety"
}
y.next_drive_point = y.next_drive_point.apply(lambda x: score_to_class_mapping[x])
print("Mapped drive point\n",y.next_drive_point.value_counts())

Mapped drive point
 0    125425
1     78990
2     46858
3     27196
4     15874
5       929
6       569
Name: next_drive_point, dtype: int64


In [21]:
nrounds = 300
params = {
    'objective': 'multi:softprob',
    'booster' : 'gbtree',
    'eval_metric' : ['logloss'],
    "num_class" : y.next_drive_point.nunique(),
    "eta": 0.025,
    "gamma": 1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "max_depth": 5,
    "min_child_weight": 1
}

dtrain = xgb.DMatrix(X, weight=regression_df.Total_W_Scaled, label=y.next_drive_point)

xgb_model = xgb.train(params, dtrain, num_boost_round=nrounds, verbose_eval=2)
# save for debug
xgb_model.dump_model('xgb_dump.json', with_stats=True, dump_format='json')

In [22]:
special_team_play_type = ['Kickoff','Punt','Kickoff Return (Offense)', 'Kickoff Return Touchdown','Field Goal Good', 'Field Goal Missed', 'Blocked Field Goal',
                          'Blocked Punt','Punt Return Touchdown','Blocked Punt Touchdown','Missed Field Goal Return','Uncategorized', 'Missed Field Goal Return Touchdown','Defensive 2pt Conversion']
timing_play_type = ['End Period','End of Game','Timeout','End of Half',"END OF GAME"]
turnover_play_type = ['Fumble Recovery (Opponent)','Pass Interception Return','Interception Return Touchdown','Fumble Return Touchdown','Safety','Interception','Pass Interception','Punt',
                     'Field Goal Missed', 'Blocked Field Goal', 'Blocked Punt','Punt Return Touchdown','Blocked Punt Touchdown','Missed Field Goal Return', 'Missed Field Goal Return Touchdown']
regular_play_type = ['Pass', 'Rush', 'Sack', 'Pass Reception', 'Passing Touchdown','Pass Incompletion', 'Fumble Recovery (Own)','Rushing Touchdown','Pass Interception','Pass Completion']
time_play = ['End Period','Timeout','End of Half','End of Game',"END OF GAME"]
PAT_miss_type= [ 'PAT MISSED','PAT failed', 'PAT blocked', 'PAT BLOCKED']
off_TD = ['Passing Touchdown','Rushing Touchdown']
def_TD = ['Interception Return Touchdown','Fumble Return Touchdown', 'Missed Field Goal Return Touchdown','Blocked Punt Touchdown','Punt Return Touchdown']

In [23]:
regular_play = pbp_data[~pbp_data.play_type.str.contains('Kickoff') & ~(pbp_data.play_type.isin(time_play)) &(pbp_data.down > 0) & (pbp_data.distance > 0)]
# regular_play.to_csv("./regular_pbp.csv", index=False,encoding="utf8")
print(len(regular_play.Total_W_Scaled))
regular_play.head()

925438


Unnamed: 0,id,offense,offense_conference,defense,defense_conference,home,away,offense_score,defense_score,drive_id,period,clock.minutes,clock.seconds,yard_line,down,distance,yards_gained,play_type,play_text,ppa,year,play_id,home_team,coef,adjusted_yardline,margin,time_remaining,log_distance,goal_to_go,under_two,abs_diff,ScoreDiff_W,game_id,DSH,drive_point,drive_result,next_drive_point,Drive_Score_Dist_W,Total_W,Total_W_Scaled
0,4005478320,Wake Forest,ACC,Army,FBS Independents,Wake Forest,Army,14,21,40054783210,2,2,14,39,2,10,15,Penalty,"ARMY Penalty, personal foul (N/A) to the WFrst...",,2014,4005478320,Wake Forest,0,61,-7,134,2.302585,False,False,7,0.915663,400547832,40054783210,0,INT,0,1.0,1.915663,0.936364
2,400547832102978501,Wake Forest,ACC,Army,FBS Independents,Wake Forest,Army,14,21,40054783210,2,2,14,39,1,10,0,Pass Incompletion,John Wolford pass incomplete to E.J. Scott,-1.107768,2014,400547832102978501,Wake Forest,0,61,-7,134,2.302585,False,False,7,0.915663,400547832,40054783210,0,INT,0,1.0,1.915663,0.936364
3,4005478910,Texas,Big 12,Oklahoma State,Big 12,Oklahoma State,Texas,13,0,4005478915,1,23,7,65,1,65,10,Penalty,"OKLAHOMA ST Penalty, illegal block (10 Yards) ...",,2014,4005478910,Oklahoma State,1,65,13,1387,4.174387,True,False,13,0.843373,400547891,4005478916,0,PUNT,-3,1.0,1.843373,0.881818
4,400547891101976604,Oklahoma State,Big 12,Texas,Big 12,Oklahoma State,Texas,0,13,4005478915,1,17,33,9,1,10,21,Pass Reception,Daxx Garman pass complete to Brandon Sheperd f...,1.129399,2014,400547891101976604,Oklahoma State,0,91,-13,1053,2.302585,False,False,13,0.843373,400547891,4005478916,0,PUNT,-3,1.0,1.843373,0.881818
5,400547891101976605,Oklahoma State,Big 12,Texas,Big 12,Oklahoma State,Texas,0,13,4005478915,1,17,33,30,1,10,-15,Penalty,"Hill, Tyreek rush for 5 yards to the OSU35 (Ha...",,2014,400547891101976605,Oklahoma State,0,70,-13,1053,2.302585,False,False,13,0.843373,400547891,4005478916,0,PUNT,-3,1.0,1.843373,0.881818


In [24]:
CFB_teams_list = pd.read_csv('https://raw.githubusercontent.com/903124/CFB_EPA_data/master/.ipynb_checkpoints/cfb_teams_list-checkpoint.csv',encoding='utf-8')

In [25]:
CFB_teams_list.full_name.unique()

array(['Abilene Christian', 'Air Force', 'Akron', 'Alabama',
       'Alabama A&M', 'Albany', 'Alcorn State', 'Appalachian State',
       'Arizona', 'Arizona State', 'Arkansas', 'Arkansas State',
       'Arkansas-Pine Bluff', 'Army', 'Auburn', 'Austin Peay', 'BYU',
       'Ball State', 'Baylor', 'Bethune-Cookman', 'Boise State',
       'Boston College', 'Bowling Green', 'Buffalo', 'California',
       'Campbell', 'Central Arkansas', 'Central Connecticut',
       'Central Michigan', 'Charleston Southern', 'Charlotte',
       'Cincinnati', 'Clemson', 'Coastal Carolina', 'Colorado',
       'Colorado State', 'Connecticut', 'Delaware State', 'Drake', 'Duke',
       'Duquesne', 'East Carolina', 'Eastern Illinois',
       'Eastern Kentucky', 'Eastern Michigan', 'Eastern Washington',
       'Elon', 'Florida', 'Florida Atlantic', 'Florida International',
       'Florida State', 'Fordham', 'Fresno State', 'Gardner-Webb',
       'Georgia', 'Georgia Southern', 'Georgia State', 'Georgia Tech',
     

In [26]:
regular_play = pd.merge(regular_play,CFB_teams_list,left_on=['offense'],right_on=['full_name'])
regular_play.rename(columns={'abbreviation':'off_abbr', 'full_name': 'off_full_name'}, inplace=True)
regular_play = pd.merge(regular_play,CFB_teams_list,left_on=['defense'],right_on=['full_name'])
regular_play.rename(columns={'abbreviation':'def_abbr', 'full_name': 'def_full_name'}, inplace=True)

In [27]:
# there are some plays in prev seasons that have no drive_point set (and ended up being at the end of a game); this broke the produced dmatrices bc it drops NAs in the outcome var
# regular_play[regular_play.drive_point.isna() == True]
regular_play.loc[regular_play.drive_point.isna() == True, "drive_point"] = 0.0

invalid_types = ["Uncategorized", "placeholder"]

regular_play = regular_play[
    (regular_play.play_text.notna())
    & (regular_play.play_type.notna())
]
regular_play = regular_play[
    (regular_play.play_type != 'Penalty')
    & (regular_play.play_type != 'Interception') 
    & ~(regular_play.play_text.str.lower().str.contains('penalty')) 
    & ~(regular_play.play_type.str.lower().str.contains('uncategorized')) 
    & ~(regular_play.play_type.str.lower().str.contains('placeholder'))
]
regular_play.head()

Unnamed: 0,id,offense,offense_conference,defense,defense_conference,home,away,offense_score,defense_score,drive_id,period,clock.minutes,clock.seconds,yard_line,down,distance,yards_gained,play_type,play_text,ppa,year,play_id,home_team,coef,adjusted_yardline,margin,time_remaining,log_distance,goal_to_go,under_two,abs_diff,ScoreDiff_W,game_id,DSH,drive_point,drive_result,next_drive_point,Drive_Score_Dist_W,Total_W,Total_W_Scaled,off_full_name,off_abbr,def_full_name,def_abbr
1,400547832102978501,Wake Forest,ACC,Army,FBS Independents,Wake Forest,Army,14,21,40054783210,2,2,14,39,1,10,0,Pass Incompletion,John Wolford pass incomplete to E.J. Scott,-1.107768,2014,400547832102978501,Wake Forest,0,61,-7,134,2.302585,False,False,7,0.915663,400547832,40054783210,0,INT,0,1.0,1.915663,0.936364,Wake Forest,WFrst,Army,Army
2,322660154017,Wake Forest,ACC,Army,FBS Independents,Wake Forest,Army,0,6,32266015402,1,26,35,18,1,10,6,Pass Completion,Tanner Price pass complete to Michael Campanar...,0.271467,2012,322660154017,Wake Forest,0,82,-6,1595,2.302585,False,False,6,0.927711,322660154,32266015402,7,RUSHING TD,7,1.0,1.927711,0.945455,Wake Forest,WFrst,Army,Army
3,322660154018,Wake Forest,ACC,Army,FBS Independents,Wake Forest,Army,0,6,32266015402,1,25,56,24,2,4,0,Pass Incompletion,Tanner Price pass incomplete to Sherman Raglan...,-0.654563,2012,322660154018,Wake Forest,0,76,-6,1556,1.386294,False,False,6,0.927711,322660154,32266015402,7,RUSHING TD,7,1.0,1.927711,0.945455,Wake Forest,WFrst,Army,Army
4,322660154019,Wake Forest,ACC,Army,FBS Independents,Wake Forest,Army,0,6,32266015402,1,25,45,24,3,4,47,Pass Completion,Tanner Price pass complete to Brandon Terry fo...,4.083093,2012,322660154019,Wake Forest,0,76,-6,1545,1.386294,False,False,6,0.927711,322660154,32266015402,7,RUSHING TD,7,1.0,1.927711,0.945455,Wake Forest,WFrst,Army,Army
5,322660154020,Wake Forest,ACC,Army,FBS Independents,Wake Forest,Army,0,6,32266015402,1,25,25,71,1,10,10,Rush,Michael Campanaro rush for 10 yards to the Arm...,,2012,322660154020,Wake Forest,0,29,-6,1525,2.302585,False,False,6,0.927711,322660154,32266015402,7,RUSHING TD,7,1.0,1.927711,0.945455,Wake Forest,WFrst,Army,Army


In [28]:
turnover_plays = [
    "Blocked Field Goal",
    "Blocked Field Goal Touchdown",    
    "Blocked Punt",
    "Blocked Punt Touchdown",
    "Field Goal Missed",
    "Missed Field Goal Return",
    "Missed Field Goal Return Touchdown",    
    "Fumble Recovery (Opponent)",
    "Fumble Recovery (Opponent) Touchdown",
    "Fumble Return Touchdown",
    "Fumble Return Touchdown Touchdown",
    "Defensive 2pt Conversion",
    "Pass Interception",
    "Interception",
    "Interception Return Touchdown",
    "Pass Interception Return",
    "Pass Interception Return Touchdown",
    "Punt",
    "Punt Return Touchdown",
    "Sack Touchdown",
    "Uncategorized Touchdown"
]

defense_score_vec = [
    "Blocked Punt Touchdown",
    "Blocked Field Goal Touchdown",
    "Missed Field Goal Return Touchdown",
    "Punt Return Touchdown",
    "Fumble Recovery (Opponent) Touchdown",    
    "Fumble Return Touchdown",
    "Fumble Return Touchdown Touchdown",
    "Defensive 2pt Conversion",
    "Safety",
    "Sack Touchdown",    
    "Interception Return Touchdown",
    "Pass Interception Return Touchdown",
    "Uncategorized Touchdown"
]

normal_play = [
    "Rush",
    "Pass",
    "Pass Completion",
    "Pass Reception",
    "Pass Incompletion",
    "Sack",
    "Fumble Recovery (Own)"
]

score = [
    "Passing Touchdown", 
    "Rushing Touchdown", 
    "Field Goal Good",
    "Pass Reception Touchdown",
    "Fumble Recovery (Own) Touchdown",
    "Punt Touchdown",
    "Rushing Touchdown Touchdown"         
]

kickoff = [
    "Kickoff",
    "Kickoff Return (Offense)",
    "Kickoff Return Touchdown",
    "Kickoff Touchdown"
]

def determine_new_down(row):
    if ((row.play_type in score) or (row.play_type in kickoff) or (row.play_type in turnover_plays) or (row.play_type in defense_score_vec)):
        return 1
    elif ((row.play_type in normal_play) & (row.yards_gained >= row.distance)):
        return 1
    elif ((row.play_type in normal_play) & (row.yards_gained < row.distance) & (row.down <= 3)):
        return row.down + 1
    elif ((row.play_type in normal_play) & (row.yards_gained < row.distance) & (row.down == 4)):
        return 1
    else:
        return None
    
def determine_new_distance(row):
    if ((row.play_type in normal_play) & (row.yards_gained >= row.distance) & (row.adjusted_yardline - row.yards_gained >= 10)):
        return 10
    elif ((row.play_type in normal_play) & (row.yards_gained >= row.distance) & (row.adjusted_yardline - row.yards_gained <= 10)):
        return row.adjusted_yardline
    elif ((row.play_type in normal_play) & (row.yards_gained < row.distance) & (row.down <= 3)):
        return row.distance - row.yards_gained
    elif ((row.play_type in normal_play) & (row.yards_gained < row.distance) & (row.down == 4) & (100 - (row.adjusted_yardline - row.yards_gained) > 10)):
        return 10
    elif ((row.play_type in normal_play) & (row.yards_gained < row.distance) & (row.down == 4) & (100 - (row.adjusted_yardline - row.yards_gained) <= 10)):
        return 100 - row.adjusted_yardline
    elif (row.play_type in turnover_plays):
        return 10
    elif (row.play_type in defense_score_vec):
        return 0
    elif (row.play_type in score):
        return 0
    elif (row.play_type in kickoff):
        return 10
    else:
        return None

def determine_new_yardline(row):
    if (row.play_type in normal_play):
        return row.adjusted_yardline - row.yards_gained
    elif (row.play_type in score):
        return 0
    elif (row.play_type in defense_score_vec):
        return 0
    elif (row.play_type in kickoff):
        return 75
    elif (row.play_type in turnover_plays):
        return 100 - row.adjusted_yardline + row.yards_gained
    else:
        return None

def determine_turnover_indicator(row):
    return 1 if ((row.play_type in turnover_plays) or (row.play_type in defense_score_vec) or ((row.play_type in normal_play) & row.yards_gained < row.distance & row.down == 4)) else 0

import re
def determine_punt_yardline(row):
    try:
        yds_punted = re.search('(?<=for)[^,]+ (\d+)', row.play_text).group(1).astype(float)
    except:
        yds_punted = 0
    return 100 - ((row.yard_line - yds_punted) if (row.yard_line > 50) else (row.yard_line + yds_punted))

def prep_end_vars(dat):
    print("starting basic setup of end vars")
    dat.play_id = dat.play_id.astype(int)
    dat.id = dat.id.astype(int)
    dat = dat.sort_values(by=["game_id","play_id"], ascending=True)
    dat['half'] = dat.period.apply(lambda x: 1 if (x <= 2) else 2)
    dat['new_yardline'] = 0
    dat['new_down'] = 0
    dat['new_distance'] = 0
    dat['turnover_end'] = False
    dat['next_offense'] = dat.offense.shift(-1)

    dat.loc[((dat.play_type.isin(turnover_plays)) | (dat.next_offense != dat.offense)), "turnover_end"] = True
    dat['turnover_indicator'] = dat.apply(lambda x: determine_turnover_indicator(x), axis = 1)
    dat.loc[dat.play_type.str.contains("Kickoff"), 'down'] = 5
    dat['new_down'] = dat.apply(lambda row: determine_new_down(row), axis=1)
    dat['new_distance'] = dat.apply(lambda row: determine_new_distance(row), axis=1)
    dat['new_yardline'] = dat.apply(lambda row: determine_new_yardline(row), axis=1)
    dat['new_time_remaining'] = dat.time_remaining.shift(-1)
    dat.loc[dat.new_time_remaining.isna(), 'new_time_remaining'] = 0
    dat['new_log_distance'] = dat.new_distance.apply(lambda x: np.log(0.5) if x == 0 else np.log(x)) #np.log(0.5) if (dat.new_distance == 0) else np.log(dat.new_distance)
    dat['new_goal_to_go'] = (dat.new_yardline <= dat.new_distance)
    dat['new_under_two'] = (dat.new_time_remaining <= 120)
    dat["end_half_game"] = False
    print("done with basic setup")
    
    print("Updating punt stuff")
#     is_punt = dat.play_type.str.contains("Punt")
#     is_punt_touchback = is_punt & (dat.play_text.str.contains("touchback"))
#     yds_gained_more_0 = (dat.yards_gained > 0) & is_punt
    dat.loc[dat.play_type.str.contains("Punt"), "new_down"] = 1
    dat.loc[dat.play_type.str.contains("Punt"), "new_distance"] = 10
    dat.loc[dat.play_type.str.contains("Punt"), "new_log_distance"] = np.log(10)
    dat.loc[dat.play_type.str.contains("Punt"), "new_goal_to_go"] = False
    dat.loc[(dat.play_type.str.contains("Punt") & (dat.play_text.str.contains("touchback"))), "new_yardline"] = 80
    dat.loc[(dat.play_type.str.contains("Punt") & (dat.yards_gained > 0)), "new_yardline"] = 100 - (dat.adjusted_yardline - dat.yards_gained)
    
    dat.loc[(dat.yards_gained == 0) & (dat.play_type.str.contains("Punt")) & ~((dat.play_type.str.contains("Punt")) & (dat.play_text.str.contains("touchback"))), "new_yardline"] = dat.apply(lambda x: determine_punt_yardline(x), axis=1)

    print("Updating end of half stuff")
#     end_of_half_plays = (dat.new_time_remaining == 0)
#     if (end_of_half_plays.any()):
    dat.loc[(dat.new_time_remaining == 0), "new_yardline"] = 99
    dat.loc[(dat.new_time_remaining == 0), "new_down"] = 4
    dat.loc[(dat.new_time_remaining == 0), "new_distance"] = 99
    dat.loc[(dat.new_time_remaining == 0), "end_half_game"] = 1
    dat.loc[(dat.new_time_remaining == 0), "new_log_distance"] = np.log(99)
#         dat[end_of_half_plays, "new_goal_to_go"] = False
    dat.loc[(dat.new_time_remaining == 0), "new_under_two"] = (dat.new_time_remaining <= 120)
    dat.loc[(dat.new_time_remaining == 0), "end_half_game"] = True
    
    print("Fixing misc stuff with bad values")
#     na_yd_line = dat[(dat.new_yardline.isna()) or (dat.new_yardline >= 100)]
    dat.loc[(dat.new_yardline.isna()) | (dat.new_yardline >= 100), "new_yardline"] = dat.new_yardline.shift(-1)
    
#     neg_distance = dat[(dat.new_distance < 0)]
    dat.loc[(dat.new_distance < 0), "new_distance"] = dat.new_distance.shift(-1)
    dat.loc[(dat.new_distance < 0), "new_log_distance"] = dat.new_distance.apply(lambda x: np.log(0.5) if x == 0 else np.log(x))
    
#     missing_yd_line = (dat.new_yardline == 0) or (dat.new_yardline.isna())
    dat.loc[(dat.new_yardline == 0) | (dat.new_yardline.isna()), "new_yardline"] = 99
    dat.loc[(dat.new_yardline == 0) | (dat.new_yardline.isna()), "new_log_distance"] = np.log(99)
    
    print("Done!")
    return dat
#     return dat[["play_id","game_id", "drive_id", "new_time_remaining","new_down","new_distance","new_yardline","new_log_distance","new_goal_to_go","new_under_two","end_half_game","turnover","Total_W_Scaled"]]    

In [29]:
regular_play = prep_end_vars(regular_play)
regular_play #= pd.merge(regular_play, base_end, left_on="play_id", right_on="play_id")

starting basic setup of end vars
done with basic setup
Updating punt stuff
Updating end of half stuff
Fixing misc stuff with bad values
Done!


Unnamed: 0,id,offense,offense_conference,defense,defense_conference,home,away,offense_score,defense_score,drive_id,period,clock.minutes,clock.seconds,yard_line,down,distance,yards_gained,play_type,play_text,ppa,year,play_id,home_team,coef,adjusted_yardline,margin,time_remaining,log_distance,goal_to_go,under_two,abs_diff,ScoreDiff_W,game_id,DSH,drive_point,drive_result,next_drive_point,Drive_Score_Dist_W,Total_W,Total_W_Scaled,off_full_name,off_abbr,def_full_name,def_abbr,half,new_yardline,new_down,new_distance,turnover_end,next_offense,turnover_indicator,new_time_remaining,new_log_distance,new_goal_to_go,new_under_two,end_half_game
657094,322430041002,UMass,Mid-American,Connecticut,Big East,Connecticut,UMass,0,0,32243004101,1,29,45,80,1,10,1,Rush,Chris Burns rush for 1 yard to the UMass 21.,-0.468828,2012,322430041002,Connecticut,1,80,0,1785,2.302585,False,False,0,1.000000,322430041,32243004102,0,PUNT,-7,1.0,2.000000,1.000000,UMass,UMass,Connecticut,UConn,1,79.0,2,9,False,UMass,0,1760.0,2.197225,False,False,False
657095,322430041003,UMass,Mid-American,Connecticut,Big East,Connecticut,UMass,0,0,32243004101,1,29,20,79,2,9,0,Pass Incompletion,Mike Wegzyn pass incomplete.,-0.310093,2012,322430041003,Connecticut,1,79,0,1760,2.197225,False,False,0,1.000000,322430041,32243004102,0,PUNT,-7,1.0,2.000000,1.000000,UMass,UMass,Connecticut,UConn,1,79.0,3,9,False,UMass,0,1750.0,2.197225,False,False,False
657096,322430041005,UMass,Mid-American,Connecticut,Big East,Connecticut,UMass,0,0,32243004101,1,29,10,79,3,9,5,Pass Completion,Mike Wegzyn pass complete to Marken Michel for...,0.019784,2012,322430041005,Connecticut,1,79,0,1750,2.197225,False,False,0,1.000000,322430041,32243004102,0,PUNT,-7,1.0,2.000000,1.000000,UMass,UMass,Connecticut,UConn,1,74.0,4,4,False,UMass,0,1708.0,1.386294,False,False,False
657097,322430041006,UMass,Mid-American,Connecticut,Big East,Connecticut,UMass,0,0,32243004101,1,28,28,74,4,4,-2,Punt,"Jeff Strait punt for 47 yards, returned by Nic...",,2012,322430041006,Connecticut,1,74,0,1708,1.386294,False,False,0,1.000000,322430041,32243004102,0,PUNT,-7,1.0,2.000000,1.000000,UMass,UMass,Connecticut,UConn,1,24.0,1,10,True,Connecticut,1,1700.0,2.302585,False,False,False
570576,322430041008,Connecticut,Big East,UMass,Mid-American,Connecticut,UMass,0,0,32243004102,1,28,20,25,1,10,5,Rush,Lyle McCombs rush for 5 yards to the UConn 30.,0.094545,2012,322430041008,Connecticut,0,75,0,1700,2.302585,False,False,0,1.000000,322430041,32243004102,7,RUSHING TD,7,1.0,2.000000,1.000000,Connecticut,UConn,UMass,UMass,1,70.0,2,5,False,Connecticut,0,1666.0,1.609438,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540574,401183793104925609,New Mexico,Mountain West,Air Force,Mountain West,New Mexico,Air Force,16,44,40118379316,4,7,43,66,1,10,7,Rush,Chad Alexander run for 7 yds to the AFA 27,0.384582,2019,401183793104925609,New Mexico,0,34,-28,463,2.302585,False,False,28,0.662651,401183793,40118379316,7,TD,7,1.0,1.662651,0.745455,New Mexico,NMex,Air Force,AFA,2,27.0,2,3,False,New Mexico,0,117.0,1.098612,False,True,False
540575,401183793104984201,New Mexico,Mountain West,Air Force,Mountain West,New Mexico,Air Force,22,44,40118379316,4,1,57,73,2,3,27,Passing Touchdown,Trae Hall pass complete to Aaron Molina for 27...,2.552144,2019,401183793104984201,New Mexico,0,27,-22,117,1.098612,False,True,22,0.734940,401183793,40118379316,7,TD,7,1.0,1.734940,0.800000,New Mexico,NMex,Air Force,AFA,2,99.0,1,0,True,Air Force,0,117.0,-0.693147,True,True,False
182964,401183793104984204,Air Force,Mountain West,New Mexico,Mountain West,New Mexico,Air Force,44,22,40118379317,4,1,57,75,1,10,6,Rush,Mike Schmidt run for 6 yds to the AFA 31,0.333375,2019,401183793104984204,New Mexico,1,75,22,117,2.302585,False,True,22,0.734940,401183793,40118379317,0,END OF GAME,0,1.0,1.734940,0.800000,Air Force,AFA,New Mexico,NMex,2,69.0,2,4,False,Air Force,0,117.0,1.386294,False,True,False
182965,401183793104984205,Air Force,Mountain West,New Mexico,Mountain West,New Mexico,Air Force,44,22,40118379317,4,1,57,69,2,4,-2,Rush,TEAM run for a loss of 2 yards to the AFA 29,-1.098568,2019,401183793104984205,New Mexico,1,69,22,117,1.386294,False,True,22,0.734940,401183793,40118379317,0,END OF GAME,0,1.0,1.734940,0.800000,Air Force,AFA,New Mexico,NMex,2,71.0,3,6,False,Air Force,0,117.0,1.791759,False,True,False


In [30]:
len(regular_play.Total_W_Scaled)

859583

In [31]:
# regular_play[(regular_play.yards_gained == 0) & (regular_play.play_type.str.contains("Punt")) & ~((regular_play.play_type.str.contains("Punt")) & (regular_play.play_text.str.contains("touchback")))].new_yardline = 100 if (punt_yd_line > 50) else 


In [32]:
y_test, X_test = dmatrices(reg_equation, regular_play, return_type='dataframe')
X_test.columns = ['Intercept', 'down2', 'down3', 'down4',
       'goal_to_go', 'under_two', 'time_remaining',
       'adjusted_yardline', 'adjusted_yardline_down_2',
       'adjusted_yardline_down_3', 'adjusted_yardline_down_4',
       'log_distance', 'log_distance_down_2',
       'log_distance_down_3', 'log_distance_down_4',
       'goal_to_go_log_distance']

dtest = xgb.DMatrix(X_test, weight=regular_play.Total_W_Scaled, label=y_test)
EP_predict_start = xgb_model.predict(dtest)
EP_predict_start

EP_start = EP_predict_start[:,0] * class_to_score_mapping[0] + EP_predict_start[:,1] * class_to_score_mapping[1] + EP_predict_start[:,2] * class_to_score_mapping[2] + EP_predict_start[:,3] * class_to_score_mapping[3] + EP_predict_start[:,4] * class_to_score_mapping[4] + EP_predict_start[:,5] * class_to_score_mapping[5] + EP_predict_start[:,6] * class_to_score_mapping[6]
# print(len(EP))
regular_play['EP_start'] = EP_start

for c in range(0, 7):
    regular_play[class_to_name_mapping[c]] = EP_predict_start[:,c]
    

In [33]:
out_df = pd.DataFrame({'play_type':regular_play['play_type'],'play_text':regular_play['play_text'],'yards_gained':regular_play['yards_gained'], 'down':regular_play['new_down'],'distance':regular_play['new_distance'],'adjusted_yardline':regular_play['new_yardline'],'margin':regular_play['margin'], 'period':regular_play['period'],'drive_point':regular_play['drive_point'],'next_drive_point':regular_play['next_drive_point'], 'log_distance':regular_play['new_log_distance'],'goal_to_go':regular_play['new_goal_to_go'],'time_remaining':regular_play['new_time_remaining'], 'under_two':regular_play['new_under_two'], "Total_W_Scaled" : regular_play.Total_W_Scaled})
out_df[out_df.down.isna()].play_type.value_counts()

Series([], Name: play_type, dtype: int64)

In [34]:
y_end, X_end = dmatrices(reg_equation, out_df, return_type='dataframe')

X_end.columns = ['Intercept', 'down2', 'down3', 'down4',
       'goal_to_go', 'under_two', 'time_remaining',
       'adjusted_yardline', 'adjusted_yardline_down_2',
       'adjusted_yardline_down_3', 'adjusted_yardline_down_4',
       'log_distance', 'log_distance_down_2',
       'log_distance_down_3', 'log_distance_down_4',
       'goal_to_go_log_distance']
len(X_end)

859583

In [35]:
dtest_end = xgb.DMatrix(X_end, weight=out_df.Total_W_Scaled, label=y_end)
EP_predict_end = xgb_model.predict(dtest_end)

In [36]:
EP_end = EP_predict_end[:,0] * class_to_score_mapping[0] + EP_predict_end[:,1] * class_to_score_mapping[1] + EP_predict_end[:,2] * class_to_score_mapping[2] + EP_predict_end[:,3] * class_to_score_mapping[3] + EP_predict_end[:,4] * class_to_score_mapping[4] + EP_predict_end[:,5] * class_to_score_mapping[5] + EP_predict_end[:,6] * class_to_score_mapping[6]

regular_play['EP_end'] = EP_end

In [37]:
# turnover_ps = (regular_play.turnover_end == True) and ~(regular_play.play_type == 'Kickoff')
regular_play.loc[(regular_play.turnover_end == True), "EP_end"] = -1 * regular_play.EP_end

regular_play.loc[(regular_play.end_half_game == True), "EP_end"] = 0

In [38]:
regular_play.loc[(regular_play.play_type.isin(off_TD)),'EP_end'] = 7
regular_play.loc[(regular_play.play_type.isin(def_TD)),'EP_end'] = -7
regular_play.loc[regular_play.play_type == 'Safety','EP_end'] = -2
regular_play.loc[(regular_play.play_type.isin(PAT_miss_type)),'EP_end'] = 6
regular_play.loc[regular_play.play_type == 'Field Goal Good','EP_end'] = 3

In [39]:
regular_play['EPA'] = regular_play['EP_end'] - regular_play['EP_start']

In [40]:
pass_play_type = ["Interception Return Touchdown","Pass Interception","Pass Incompletion","Pass Interception Return","Pass Reception","Pass Completion","Pass","Passing Touchdown","Sack"]
rush_play_type = ['Fumble Recovery (Opponent)','Fumble Recovery (Own)','Fumble Return Touchdown','Rush','Rushing Touchdown']

In [41]:
# cfbscrapR value for this filter: 0.1202821
regular_play[regular_play.play_type.isin(pass_play_type)]['EPA'].mean()

0.09776391834020615

In [42]:
# cfbscrapR value for this filter: -0.04072027
regular_play[regular_play.play_type.isin(rush_play_type)]['EPA'].mean()

-0.13126854598522186

In [43]:
regular_play.groupby('play_type')['EPA'].mean()

play_type
Blocked Field Goal                   -0.369392
Blocked Field Goal Touchdown          0.927339
Blocked Punt                         -0.651105
Blocked Punt Touchdown               -5.162414
Defensive 2pt Conversion             -4.715701
Field Goal Good                       2.917672
Field Goal Missed                    -0.233752
Fumble Recovery (Opponent)           -2.966131
Fumble Recovery (Own)                -0.935137
Fumble Return Touchdown              -7.145353
Interception Return Touchdown        -6.973184
Missed Field Goal Return              0.512435
Missed Field Goal Return Touchdown   -6.531981
Pass                                 -3.158928
Pass Completion                       0.540609
Pass Incompletion                    -0.922729
Pass Interception                    -2.158391
Pass Interception Return             -2.210950
Pass Reception                        0.880414
Passing Touchdown                     4.127422
Punt                                 -0.692173
Pun

In [44]:
regular_play[regular_play.play_type.isin(pass_play_type)]['ppa'].mean()

0.16998962783460378

In [45]:
regular_play[regular_play.play_type.isin(rush_play_type)]['ppa'].mean()

0.06823612722365852

In [46]:
# turnovers
regular_play[regular_play.play_type.isin(["Fumble Recovery (Opponent)","Pass Interception Return","Sack"])].groupby('play_type')['EPA'].mean()

play_type
Fumble Recovery (Opponent)   -2.966131
Pass Interception Return     -2.210950
Sack                         -1.190471
Name: EPA, dtype: float32

In [47]:
# passes
regular_play[(regular_play.play_type.isin(pass_play_type)) & (~regular_play.play_type.isin(["Fumble Recovery (Own)", "Interception Return Touchdown"]))].groupby('play_type')['EPA'].mean()


play_type
Pass                       -3.158928
Pass Completion             0.540609
Pass Incompletion          -0.922729
Pass Interception          -2.158391
Pass Interception Return   -2.210950
Pass Reception              0.880414
Passing Touchdown           4.127422
Sack                       -1.190471
Name: EPA, dtype: float32

In [48]:
# Rushes
regular_play[(regular_play.play_type.isin(rush_play_type)) & (~regular_play.play_type.isin(["Fumble Recovery (Own)", "Fumble Return Touchdown"]))].groupby('play_type')['EPA'].mean()


play_type
Fumble Recovery (Opponent)   -2.966131
Rush                         -0.200240
Rushing Touchdown             2.681510
Name: EPA, dtype: float32

In [49]:
# in_df = pd.DataFrame({'down':regular_play['new_down'],'distance':regular_play['new_distance'],'adjusted_yardline':regular_play['new_yardline'],'margin':regular_play['margin'], 'period':regular_play['period'],'drive_point':regular_play['drive_point'], 'log_distance':regular_play['log_distance'],'goal_to_go':regular_play['goal_to_go'],'time_remaining':regular_play['time_remaining'], 'under_two':regular_play['under_two'], 'yards_gained':regular_play['yards_gained']})
# y_in, X_in = dmatrices(reg_equation, in_df[(in_df.yards_gained == 4)], return_type='dataframe')
# X_test = X_in[(X_in["C(down)[T.2]"] == 0) & (X_in["C(down)[T.3]"] == 0) & (X_in["C(down)[T.4]"] == 0) & (X_in.distance == 10) & (X_in.adjusted_yardline == 75) & (X_in.period == 1) & (X_in.margin == 0)]
# X_test


In [50]:
# regular_play[(regular_play.play_type.isin(rush_play_type))].sort_values(by="EPA",ascending=True).head()

In [51]:
# in_df = pd.DataFrame({'down':regular_play['new_down'],'distance':regular_play['new_distance'],'adjusted_yardline':regular_play['new_yardline'],'margin':regular_play['margin'], 'period':regular_play['period'],'drive_point':regular_play['drive_point'], 'log_distance':regular_play['log_distance'],'goal_to_go':regular_play['goal_to_go'],'time_remaining':regular_play['time_remaining'], 'under_two':regular_play['under_two'], 'yards_gained':regular_play['yards_gained']})
# y_in, X_in = dmatrices(reg_equation, in_df, return_type='dataframe')
# # X_test = X_in #[(X_in["C(down)[T.2]"] == 1) & (X_in["C(down)[T.3]"] == 0) & (X_in["C(down)[T.4]"] == 0) & (X_in.distance == 3) & (X_in.adjusted_yardline == 5) & (X_in.period == 1) & (X_in.margin == 10)]
# # X_test


In [52]:
# def row_op_update(row):
#     row["down"] = 2
#     row["C(down)[T.2]"] = 1
#     row["distance"] = 6
#     row["adjusted_yardline"] = 71
#     row["C(down)[T.2]:distance"] = 6
#     row["C(down)[T.2]:adjusted_yardline"] = 71
#     row['drive_point'] = 0
#     return row

# output_df = pd.DataFrame(X_test).apply(lambda x: row_op_update(x), axis=1)
# output_df

In [53]:
# y_out, X_out = dmatrices(reg_equation, X_test, return_type='dataframe')
# X_out

In [54]:
# EP_start = clf.predict_proba(X_test)
# epa_start = EP_start[:,0]* -7 + EP_start[:,1] * -3 + EP_start[:,2] * -2 + EP_start[:,4] * 2 + EP_start[:,5] * 3 + EP_start[:,6] * 7

# EP_end = clf.predict_proba(output_df[["Intercept","C(down)[T.2]","C(down)[T.3]","C(down)[T.4]","distance","C(down)[T.2]:distance","C(down)[T.3]:distance","C(down)[T.4]:distance","adjusted_yardline","C(down)[T.2]:adjusted_yardline",	"C(down)[T.3]:adjusted_yardline","C(down)[T.4]:adjusted_yardline","period","margin"]])
# epa_end = 7#EP_end[:,0]* -7 + EP_end[:,1] * -3 + EP_end[:,2] * -2 + EP_end[:,4] * 2 + EP_end[:,5] * 3 + EP_end[:,6] * 7

# epa = epa_end - epa_start
# pd.DataFrame(data={"start":epa_start,"end":epa_end,"epa":epa})

In [55]:
regular_play[regular_play.play_type.str.contains("Touchdown") & (regular_play.yard_line == 95) & (regular_play.play_text.str.contains("Greene"))].head()

Unnamed: 0,id,offense,offense_conference,defense,defense_conference,home,away,offense_score,defense_score,drive_id,period,clock.minutes,clock.seconds,yard_line,down,distance,yards_gained,play_type,play_text,ppa,year,play_id,home_team,coef,adjusted_yardline,margin,time_remaining,log_distance,goal_to_go,under_two,abs_diff,ScoreDiff_W,game_id,DSH,drive_point,drive_result,next_drive_point,Drive_Score_Dist_W,Total_W,Total_W_Scaled,off_full_name,off_abbr,def_full_name,def_abbr,half,new_yardline,new_down,new_distance,turnover_end,next_offense,turnover_indicator,new_time_remaining,new_log_distance,new_goal_to_go,new_under_two,end_half_game,EP_start,TD,Opp_TD,FG,Opp_FG,No_Score,EP_end,EPA
124604,400547917103965201,Bowling Green,Mid-American,Indiana,Big Ten,Bowling Green,Indiana,26,21,40054791721,3,18,47,95,2,5,5,Rushing Touchdown,"Travis Greene run for 5 yds for a TD, (Tyler T...",2.438578,2014,400547917103965201,Bowling Green,0,5,5,1127,1.609438,True,False,5,0.939759,400547917,40054791721,7,TD,7,1.0,1.939759,0.954545,Bowling Green,BwGrn,Indiana,Ind,2,99.0,1,0,True,Indiana,0,1113.0,-0.693147,True,False,False,4.813544,0.613056,0.047959,0.305986,0.020547,0.005462,7.0,2.186456


In [56]:
# regular_play[(~regular_play.play_text.str.contains("penalty")) & (regular_play.down == 1) & (regular_play.yard_line == 25) & (regular_play.distance == 10) & (regular_play.yards_gained >= 10)].head()


In [57]:
# import coremltools

# y_end, X_end = dmatrices(reg_equation, out_df, return_type='dataframe')

# coreml_model = coremltools.converters.sklearn.convert(clf, X_end.columns.to_list(), "drive_point")
# coreml_model


In [58]:
# # CoreML model    0: 7,-7,3,-3,0,2,-2

import coremltools as cml
cml_model = cml.converters.xgboost.convert(xgb_model, force_32bit_float=False, mode="classifier", class_labels=[0,7,3,-7,-2], n_classes=len(class_to_score_mapping), feature_names=['Intercept', 'down2', 'down3', 'down4',
       'goal_to_go', 'under_two', 'time_remaining',
       'adjusted_yardline', 'adjusted_yardline_down_2',
       'adjusted_yardline_down_3', 'adjusted_yardline_down_4',
       'log_distance', 'log_distance_down_2',
       'log_distance_down_3', 'log_distance_down_4',
       'goal_to_go_log_distance'], target="next_drive_point")
cml_model.author = 'Saiem Gilani, Meyappan Subbiah, and Parker Fleming for R code; Parker Fleming for original Python model; Akshay Easwaran for updates.'
cml_model.license = 'MIT'
cml_model.short_description = 'Predicts expected points added by a football play given its context and activity. Translated from R, original model available as part of https://github.com/saiemgilani/cfbscrapR/.'


# Set feature descriptions manually
cml_model.input_description['Intercept'] = 'Constant.'
cml_model.input_description['down2'] = 'Signifies second down.'
cml_model.input_description['down3'] = 'Signifies third down.'
cml_model.input_description['down4'] = 'Signifies fourth down.'
cml_model.input_description['goal_to_go'] = 'Signifies that the number of yards to gain for a first down is fewer than the number of yards left to gain for a touchdown.'
cml_model.input_description['under_two'] = 'Notes that there are under two minutes remaining in the game.'
cml_model.input_description['time_remaining'] = 'The time remaining in the game.'
cml_model.input_description['adjusted_yardline'] = 'The yards left to gain towards the end zone.'
cml_model.input_description['adjusted_yardline_down_2'] = 'The yards left to gain towards the end zone on second down.'
cml_model.input_description['adjusted_yardline_down_3'] = 'The yards left to gain towards the end zone on third down.'
cml_model.input_description['adjusted_yardline_down_4'] = 'The yards left to gain towards the end zone on fourth down.'
cml_model.input_description['log_distance'] = 'The logarithm of the number of yards to gain a first down.'
cml_model.input_description['log_distance_down_2'] = 'The logarithm of the number of yards to gain a first down on second down.'
cml_model.input_description['log_distance_down_3'] = 'The logarithm of the number of yards to gain a first down on third down'
cml_model.input_description['log_distance_down_4'] = 'The logarithm of the number of yards to gain a first down on fourth down'
cml_model.input_description['goal_to_go_log_distance'] = 'The logarithm of the number of yards to gain a first down when in goal-to-go situations.'

# Set the output descriptions
cml_model.output_description['next_drive_point'] = 'The outcome of the drive, in points.'

# Save the model
cml_model.save('CFBEPA.mlmodel')
cml_model


  Specified output dimension (7) does not match the given number of classes (5).;".


input {
  name: "Intercept"
  shortDescription: "Constant."
  type {
    doubleType {
    }
  }
}
input {
  name: "down2"
  shortDescription: "Signifies second down."
  type {
    doubleType {
    }
  }
}
input {
  name: "down3"
  shortDescription: "Signifies third down."
  type {
    doubleType {
    }
  }
}
input {
  name: "down4"
  shortDescription: "Signifies fourth down."
  type {
    doubleType {
    }
  }
}
input {
  name: "goal_to_go"
  shortDescription: "Signifies that the number of yards to gain for a first down is fewer than the number of yards left to gain for a touchdown."
  type {
    doubleType {
    }
  }
}
input {
  name: "under_two"
  shortDescription: "Notes that there are under two minutes remaining in the game."
  type {
    doubleType {
    }
  }
}
input {
  name: "time_remaining"
  shortDescription: "The time remaining in the game."
  type {
    doubleType {
    }
  }
}
input {
  name: "adjusted_yardline"
  shortDescription: "The yards left to gain towards the en

In [59]:
# pd.merge(out_df, X_end, left_index=True, right_index=True)

In [60]:
X_test.iloc[6311]

Intercept                     1.000000
down2                         0.000000
down3                         0.000000
down4                         0.000000
goal_to_go                    0.000000
under_two                     0.000000
time_remaining              160.000000
adjusted_yardline            75.000000
adjusted_yardline_down_2      0.000000
adjusted_yardline_down_3      0.000000
adjusted_yardline_down_4      0.000000
log_distance                  2.302585
log_distance_down_2           0.000000
log_distance_down_3           0.000000
log_distance_down_4           0.000000
goal_to_go_log_distance       0.000000
Name: 785837, dtype: float64

In [61]:
grouped_plays = regular_play[~((regular_play.play_text.str.contains("penalty")) | (regular_play.play_text.str.contains("PENALTY")))].groupby('play_type')
grouped_plays['EPA'].mean()

play_type
Blocked Field Goal                   -0.369392
Blocked Field Goal Touchdown          0.927339
Blocked Punt                         -0.651105
Blocked Punt Touchdown               -5.162414
Defensive 2pt Conversion             -4.715701
Field Goal Good                       2.917672
Field Goal Missed                    -0.233752
Fumble Recovery (Opponent)           -2.966131
Fumble Recovery (Own)                -0.935137
Fumble Return Touchdown              -7.145353
Interception Return Touchdown        -6.973184
Missed Field Goal Return              0.512435
Missed Field Goal Return Touchdown   -6.531981
Pass                                 -3.158928
Pass Completion                       0.540609
Pass Incompletion                    -0.922729
Pass Interception                    -2.158391
Pass Interception Return             -2.210950
Pass Reception                        0.880414
Passing Touchdown                     4.127422
Punt                                 -0.692173
Pun

In [62]:
# "Field Goal Good","Passing Touchdown","Pass Completion","Fumble Return Touchdown","Rushing Touchdown","Interception","Pass Incompletion","Rush"
# ignore: Punt,"Safety","Sack"
selected_groups = ["Passing Touchdown","Fumble Return Touchdown","Rushing Touchdown","Pass Completion","Field Goal Good","Pass Interception","Pass Incompletion","Rush","Field Goal Missed"]
composite_df = grouped_plays.apply(lambda x: x.sample(1)).reset_index(drop=True)
composite_df[composite_df.play_type.isin(selected_groups)]

Unnamed: 0,id,offense,offense_conference,defense,defense_conference,home,away,offense_score,defense_score,drive_id,period,clock.minutes,clock.seconds,yard_line,down,distance,yards_gained,play_type,play_text,ppa,year,play_id,home_team,coef,adjusted_yardline,margin,time_remaining,log_distance,goal_to_go,under_two,abs_diff,ScoreDiff_W,game_id,DSH,drive_point,drive_result,next_drive_point,Drive_Score_Dist_W,Total_W,Total_W_Scaled,off_full_name,off_abbr,def_full_name,def_abbr,half,new_yardline,new_down,new_distance,turnover_end,next_offense,turnover_indicator,new_time_remaining,new_log_distance,new_goal_to_go,new_under_two,end_half_game,EP_start,TD,Opp_TD,FG,Opp_FG,No_Score,EP_end,EPA
5,400869414101908901,UTEP,Conference USA,Rice,Conference USA,Rice,UTEP,3,0,4008694142,1,24,10,18,4,3,37,Field Goal Good,Jay Mattox 37 yd FG GOOD,,2016,400869414101908901,Rice,1,18,3,1450,1.098612,True,False,3,0.963855,400869414,4008694142,3,FG,3,1.0,1.963855,0.972727,UTEP,UTEP,Rice,Rice,1,99.0,1,0,True,Rice,0,1450.0,-0.693147,True,False,False,1.758048,0.51361,0.275591,0.11094,0.083246,0.005885,3.0,1.241952
6,322800150258,Virginia,ACC,Duke,ACC,Duke,Virginia,17,28,32280015023,3,17,12,28,4,2,45,Field Goal Missed,Drew Jarrett 45 yard field goal MISSED.,,2012,322800150258,Duke,1,28,-11,1032,0.693147,False,False,11,0.86747,322800150,32280015024,0,FG MISSED,-7,1.0,1.86747,0.9,Virginia,Virg,Duke,Duke,2,73.0,1,10,True,Duke,1,1022.0,2.302585,False,False,False,1.192804,0.44445,0.284864,0.13895,0.115083,0.007788,-0.473089,-1.665892
9,401012289101909801,Missouri,SEC,Georgia,SEC,Missouri,Georgia,0,7,4010122892,1,24,1,60,3,9,64,Fumble Return Touchdown,Tyson Campbell 64 Yd Fumble Return (Rodrigo Bl...,,2018,401012289101909801,Missouri,0,40,-7,1441,2.197225,False,False,7,0.915663,401012289,4010122892,-7,FUMBLE RETURN TD,-7,1.0,1.915663,0.936364,Missouri,Misso,Georgia,Geo,1,99.0,1,10,True,Missouri,1,1435.0,2.302585,True,False,False,0.795774,0.397421,0.313269,0.167811,0.101117,0.009022,-7.0,-7.795774
14,333130023070,San José State,Mountain West,San Diego State,Mountain West,San José State,San Diego State,10,6,33313002311,2,11,45,83,3,7,10,Pass Completion,David Fales pass complete to Jarrod Lawson for...,0.939572,2013,333130023070,San José State,0,17,4,705,1.94591,False,False,4,0.951807,333130023,33313002311,3,FG GOOD,3,1.0,1.951807,0.963636,San José State,SJSt,San Diego State,SDSt,1,7.0,1,17,False,San José State,0,705.0,2.833213,True,False,False,2.660062,0.337499,0.143136,0.465544,0.033261,0.011685,4.958576,2.298514
15,400763541101985101,Indiana,Big Ten,Rutgers,Big Ten,Indiana,Rutgers,10,7,4007635416,1,16,48,28,1,10,0,Pass Incompletion,Nate Sudfeld pass incomplete to Ricky Jones,-0.798201,2015,400763541101985101,Indiana,0,72,3,1008,2.302585,False,False,3,0.963855,400763541,4007635416,7,TD,7,1.0,1.963855,0.972727,Indiana,Ind,Rutgers,Rutgr,1,72.0,2,10,False,Indiana,0,1008.0,2.302585,False,False,False,0.41094,0.398224,0.343815,0.123199,0.114127,0.010378,-0.493864,-0.904804
16,332570254142,Utah,Pac-12,Oregon State,Pac-12,Utah,Oregon State,10,20,33257025412,3,29,6,19,3,9,27,Pass Interception,Travis Wilson pass intercepted by Sean Martin ...,0.193142,2013,332570254142,Utah,0,81,-10,1746,2.197225,False,False,10,0.879518,332570254,33257025412,-7,INT TD,-7,1.0,1.879518,0.909091,Utah,Utah,Oregon State,OrgSt,2,46.0,1,10,True,Utah,1,1746.0,2.302585,False,False,False,-1.344512,0.277195,0.446104,0.100871,0.155214,0.010033,-2.867097,-1.522585
19,400869561102969801,Navy,American Athletic,Tulsa,American Athletic,Navy,Tulsa,28,17,4008695619,2,3,1,89,3,11,11,Passing Touchdown,Will Worth pass complete to Tyler Carmona for ...,,2016,400869561102969801,Navy,0,11,11,181,2.397895,True,False,11,0.86747,400869561,4008695619,7,TD,7,1.0,1.86747,0.9,Navy,Navy,Tulsa,Tulsa,1,99.0,1,0,True,Tulsa,0,181.0,-0.693147,True,False,False,2.32957,0.352797,0.164261,0.383175,0.048492,0.040902,7.0,4.67043
22,400869696102997804,Northwestern,Big Ten,Minnesota,Big Ten,Minnesota,Northwestern,0,12,40086969616,2,0,21,65,2,10,-1,Rush,TEAM run for a loss of 1 yard to the Nwest 34,,2016,400869696102997804,Minnesota,1,65,-12,21,2.302585,False,True,12,0.855422,400869696,40086969616,0,END OF HALF,0,1.0,1.855422,0.890909,Northwestern,Nwest,Minnesota,Minn,1,66.0,3,11,False,Northwestern,0,1800.0,2.397895,False,False,False,-1.124329,0.039043,0.180435,0.042979,0.088553,0.642536,-0.141843,0.982486
23,401110851101869101,Alabama,SEC,Mississippi State,SEC,Mississippi State,Alabama,7,0,4011108511,1,28,8,10,2,2,10,Rushing Touchdown,"Najee Harris run for 10 yds for a TD, (Joseph ...",2.138737,2019,401110851101869101,Mississippi State,1,10,7,1688,0.693147,False,False,7,0.915663,401110851,4011108511,7,TD,7,1.0,1.915663,0.936364,Alabama,Alab,Mississippi State,MisSt,1,99.0,1,0,True,Mississippi State,0,1679.0,-0.693147,True,False,False,4.820768,0.626366,0.051674,0.287696,0.022053,0.005526,7.0,2.179232


In [63]:
composite_df.iloc[15].play_text

'Nate Sudfeld pass incomplete to Ricky Jones'

In [65]:
for c in range(0, 7):
    regular_play[class_to_name_mapping[c]] = EP_predict_start[:,c]
regular_play.head()

Unnamed: 0,id,offense,offense_conference,defense,defense_conference,home,away,offense_score,defense_score,drive_id,period,clock.minutes,clock.seconds,yard_line,down,distance,yards_gained,play_type,play_text,ppa,year,play_id,home_team,coef,adjusted_yardline,margin,time_remaining,log_distance,goal_to_go,under_two,abs_diff,ScoreDiff_W,game_id,DSH,drive_point,drive_result,next_drive_point,Drive_Score_Dist_W,Total_W,Total_W_Scaled,off_full_name,off_abbr,def_full_name,def_abbr,half,new_yardline,new_down,new_distance,turnover_end,next_offense,turnover_indicator,new_time_remaining,new_log_distance,new_goal_to_go,new_under_two,end_half_game,EP_start,TD,Opp_TD,FG,Opp_FG,No_Score,EP_end,EPA,Safety,Opp_Safety
657094,322430041002,UMass,Mid-American,Connecticut,Big East,Connecticut,UMass,0,0,32243004101,1,29,45,80,1,10,1,Rush,Chris Burns rush for 1 yard to the UMass 21.,-0.468828,2012,322430041002,Connecticut,1,80,0,1785,2.302585,False,False,0,1.0,322430041,32243004102,0,PUNT,-7,1.0,2.0,1.0,UMass,UMass,Connecticut,UConn,1,79.0,2,9,False,UMass,0,1760.0,2.197225,False,False,False,0.699295,0.405101,0.3176,0.142792,0.114353,0.010309,-0.237787,-0.937081,0.00529,0.004555
657095,322430041003,UMass,Mid-American,Connecticut,Big East,Connecticut,UMass,0,0,32243004101,1,29,20,79,2,9,0,Pass Incompletion,Mike Wegzyn pass incomplete.,-0.310093,2012,322430041003,Connecticut,1,79,0,1760,2.197225,False,False,0,1.0,322430041,32243004102,0,PUNT,-7,1.0,2.0,1.0,UMass,UMass,Connecticut,UConn,1,79.0,3,9,False,UMass,0,1750.0,2.197225,False,False,False,-0.237787,0.342944,0.377006,0.12994,0.130254,0.009771,-1.298674,-1.060887,0.005439,0.004647
657096,322430041005,UMass,Mid-American,Connecticut,Big East,Connecticut,UMass,0,0,32243004101,1,29,10,79,3,9,5,Pass Completion,Mike Wegzyn pass complete to Marken Michel for...,0.019784,2012,322430041005,Connecticut,1,79,0,1750,2.197225,False,False,0,1.0,322430041,32243004102,0,PUNT,-7,1.0,2.0,1.0,UMass,UMass,Connecticut,UConn,1,74.0,4,4,False,UMass,0,1708.0,1.386294,False,False,False,-1.298674,0.280342,0.448131,0.104936,0.146885,0.00945,-0.222502,1.076171,0.005552,0.004703
657097,322430041006,UMass,Mid-American,Connecticut,Big East,Connecticut,UMass,0,0,32243004101,1,28,28,74,4,4,-2,Punt,"Jeff Strait punt for 47 yards, returned by Nic...",,2012,322430041006,Connecticut,1,74,0,1708,1.386294,False,False,0,1.0,322430041,32243004102,0,PUNT,-7,1.0,2.0,1.0,UMass,UMass,Connecticut,UConn,1,24.0,1,10,True,Connecticut,1,1700.0,2.302585,False,False,False,-0.222502,0.368683,0.369151,0.07722,0.114588,0.006726,-4.141266,-3.918764,0.005036,0.058596
570576,322430041008,Connecticut,Big East,UMass,Mid-American,Connecticut,UMass,0,0,32243004102,1,28,20,25,1,10,5,Rush,Lyle McCombs rush for 5 yards to the UConn 30.,0.094545,2012,322430041008,Connecticut,0,75,0,1700,2.302585,False,False,0,1.0,322430041,32243004102,7,RUSHING TD,7,1.0,2.0,1.0,Connecticut,UConn,UMass,UMass,1,70.0,2,5,False,Connecticut,0,1666.0,1.609438,False,False,False,0.747652,0.40784,0.317276,0.147482,0.109608,0.008852,0.513041,-0.23461,0.004492,0.004449


In [159]:
score_to_name_mapping = {
    7: "TD",
    -7: "Opp_TD",
    3: "FG",
    -3: "Opp_FG",
    0: "No_Score",
    2: "Safety",
    -2: "Opp_Safety"
}

next_score_types = ["TD","Opp_TD","FG","Opp_FG","No_Score", "Safety","Opp_Safety"]
regular_play['next_score'] = regular_play.next_drive_point.apply(lambda x: score_to_name_mapping[x])
gathered = regular_play.melt(id_vars=["year","next_score"],value_vars=next_score_types, var_name="next_score_type",value_name="pred_prob")
gathered

  del sys.path[0]


Unnamed: 0,year,next_score,next_score_type,pred_prob
0,2012,Opp_TD,TD,0.405101
1,2012,Opp_TD,TD,0.342944
2,2012,Opp_TD,TD,0.280342
3,2012,Opp_TD,TD,0.368683
4,2012,TD,TD,0.407840
...,...,...,...,...
6017076,2019,TD,Opp_Safety,0.003310
6017077,2019,TD,Opp_Safety,0.003319
6017078,2019,No_Score,Opp_Safety,0.003706
6017079,2019,No_Score,Opp_Safety,0.003577


In [161]:
gathered['bin_pred_prob'] = (round(gathered.pred_prob / 0.05) * .05)
# gathered['n_plays'] = 
gathered

Unnamed: 0,year,next_score,next_score_type,pred_prob,bin_pred_prob,n_plays
0,2012,Opp_TD,TD,0.405101,0.40,6017081
1,2012,Opp_TD,TD,0.342944,0.35,6017081
2,2012,Opp_TD,TD,0.280342,0.30,6017081
3,2012,Opp_TD,TD,0.368683,0.35,6017081
4,2012,TD,TD,0.407840,0.40,6017081
...,...,...,...,...,...,...
6017076,2019,TD,Opp_Safety,0.003310,0.00,6017081
6017077,2019,TD,Opp_Safety,0.003319,0.00,6017081
6017078,2019,No_Score,Opp_Safety,0.003706,0.00,6017081
6017079,2019,No_Score,Opp_Safety,0.003577,0.00,6017081


In [162]:
grouped_score = gathered.groupby(['next_score_type','bin_pred_prob'])
grouped_score.groups.keys()

dict_keys([('FG', 0.0), ('FG', 0.05000000074505806), ('FG', 0.10000000149011612), ('FG', 0.15000000596046448), ('FG', 0.20000000298023224), ('FG', 0.25), ('FG', 0.30000001192092896), ('FG', 0.3499999940395355), ('FG', 0.4000000059604645), ('FG', 0.45000001788139343), ('FG', 0.5), ('FG', 0.550000011920929), ('FG', 0.6000000238418579), ('FG', 0.6500000357627869), ('FG', 0.699999988079071), ('FG', 0.75), ('No_Score', 0.0), ('No_Score', 0.05000000074505806), ('No_Score', 0.10000000149011612), ('No_Score', 0.15000000596046448), ('No_Score', 0.20000000298023224), ('No_Score', 0.25), ('No_Score', 0.30000001192092896), ('No_Score', 0.3499999940395355), ('No_Score', 0.4000000059604645), ('No_Score', 0.45000001788139343), ('No_Score', 0.5), ('No_Score', 0.550000011920929), ('No_Score', 0.6000000238418579), ('No_Score', 0.6500000357627869), ('No_Score', 0.699999988079071), ('No_Score', 0.75), ('Opp_FG', 0.0), ('Opp_FG', 0.05000000074505806), ('Opp_FG', 0.10000000149011612), ('Opp_FG', 0.150000005

In [166]:
# https://stackoverflow.com/questions/14529838/apply-multiple-functions-to-multiple-groupby-columns

def organize(grp):
    d = {}
    d['n_plays'] = len(grp)
    d['n_scoring_events'] = len(grp[grp.next_score == grp.next_score_type])
    d['bin_actual_prob'] = d['n_scoring_events'] / d['n_plays']
    return pd.Series(d, index=['n_plays','n_scoring_events','bin_actual_prob'])

organized = grouped_score.apply(organize)
organized = organized.reset_index()
organized

Unnamed: 0,next_score_type,bin_pred_prob,n_plays,n_scoring_events,bin_actual_prob
0,FG,0.0,7449.0,209.0,0.028057
1,FG,0.05,108105.0,11479.0,0.106184
2,FG,0.1,241686.0,25564.0,0.105774
3,FG,0.15,240226.0,34663.0,0.144293
4,FG,0.2,116023.0,22184.0,0.191203
5,FG,0.25,68012.0,16208.0,0.238311
6,FG,0.3,27137.0,7300.0,0.269005
7,FG,0.35,22454.0,7058.0,0.314332
8,FG,0.4,9662.0,3359.0,0.347651
9,FG,0.45,7962.0,3089.0,0.387968


In [167]:
organized['cal_diff'] = abs(organized.bin_pred_prob - organized.bin_actual_prob)
organized


Unnamed: 0,next_score_type,bin_pred_prob,n_plays,n_scoring_events,bin_actual_prob,cal_diff
0,FG,0.0,7449.0,209.0,0.028057,0.028057
1,FG,0.05,108105.0,11479.0,0.106184,0.056184
2,FG,0.1,241686.0,25564.0,0.105774,0.005774
3,FG,0.15,240226.0,34663.0,0.144293,0.005707
4,FG,0.2,116023.0,22184.0,0.191203,0.008797
5,FG,0.25,68012.0,16208.0,0.238311,0.011689
6,FG,0.3,27137.0,7300.0,0.269005,0.030995
7,FG,0.35,22454.0,7058.0,0.314332,0.035668
8,FG,0.4,9662.0,3359.0,0.347651,0.052349
9,FG,0.45,7962.0,3089.0,0.387968,0.062032


In [171]:
def organize_cal_err(grp):
    d = {}
    d['weight_cal_err'] = np.average(grp.cal_diff, weights=grp.n_plays)
    d['n_scoring_events'] = sum(grp.n_scoring_events)
    return pd.Series(d, index=['weight_cal_err', 'n_scoring_events'])

cv_cal_error = organized.groupby('next_score_type')
final_cal_error = cv_cal_error.apply(organize_cal_err)
final_cal_error = final_cal_error.reset_index()
final_cal_error

Unnamed: 0,next_score_type,weight_cal_err,n_scoring_events
0,FG,0.016252,136580.0
1,No_Score,0.012035,50814.0
2,Opp_FG,0.005068,79027.0
3,Opp_Safety,0.004134,1663.0
4,Opp_TD,0.006947,238917.0
5,Safety,0.002898,2523.0
6,TD,0.007636,350059.0


In [174]:
weighted_cal_err_final = np.average(final_cal_error.weight_cal_err, weights=final_cal_error.n_scoring_events)
print(f"Weighted calibration error: {weighted_cal_err_final}")

Weighted calibration error: 0.008816780647086464
