In [None]:
import math
import pandas as pd
from dateutil.parser import parse as parse_date
from sklearn.linear_model import LogisticRegression, Ridge, PoissonRegressor

from pbp.data import (
    CURRENT_SEASON,
    BASELINES_PATH,
    MODELS_PATH,
    LOGR_KWARGS,
    POIS_KWARGS,
    add_clock_stops,
    add_playcall_features,
    make_proe_data, 
    make_penalty_zs,
    load_pbp_data,
    common_timeout_features,
    only_def_timeout_features,
    only_off_timeout_features,
    off_timeout_features,
    def_timeout_features,
    all_timeout_features,
    state_features,
)

# Remove pandas' row/column display limits so displayed frames are never truncated.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [None]:
# Load the raw play-by-play frame only once per kernel session.
# The guard probes `pbp_copy` itself (the name this cell defines) instead of the
# derived `pbp` frame, and it catches only NameError so that real loading errors
# or a KeyboardInterrupt are no longer swallowed by a bare `except`.
try:
    pbp_copy
except NameError:
    pbp_copy = load_pbp_data(range(2017, CURRENT_SEASON + 1))

In [None]:
# Work on a fresh copy so the cached raw frame is never modified.
pbp = pbp_copy.copy()

drive_keys = ['game_id','drive','posteam','defteam', 'posteam_score', 'defteam_score']

# Rows that lack either team or either score cannot describe a drive.
missing_drive_info = (
    pbp['posteam'].isna()
    | pbp['defteam'].isna()
    | pbp['posteam_score'].isna()
    | pbp['defteam_score'].isna()
)
drive_renames = {
    'posteam': 'drive_offense',
    'defteam': 'drive_defense',
    'posteam_score': 'off_score',
    'defteam_score': 'def_score',
}
drive_teams = (
    pbp[~missing_drive_info][drive_keys]
    .groupby(drive_keys, as_index=False)
    .nth(0)
    .rename(columns=drive_renames)
)

pbp['playcall'] = ''

# The lookup can hold several rows per (game_id, drive); the merge therefore fans
# rows out and the first match per play is kept afterwards.
pbp = pbp.merge(drive_teams, on=['game_id', 'drive'], how='left')
pbp = pbp.drop_duplicates(subset=['play_id','game_id'])

# Fall back to the play's own scores where the drive lookup had no match.
no_off_score = pbp['off_score'].isna()
no_def_score = pbp['def_score'].isna()
pbp.loc[no_off_score, 'off_score'] = pbp['posteam_score']
pbp.loc[no_def_score, 'def_score'] = pbp['defteam_score']


In [None]:
# Preview the first rows of the per-drive offense/defense/score lookup.
drive_teams.head()

In [None]:
# make_proe_data returns four frames; they are left-merged onto the modelling
# dataset on (posteam | defteam, season) in a later cell.
(offense_proes, defense_proes, offense_rz_proes, defense_rz_proes) = make_proe_data(pbp)

In [None]:
# Penalty types that label a `no_play` row as an offensive-penalty playcall.
OFFENSIVE_PENALTIES_NOPLAY = set().union(
    # presnap
    [
        'Delay of Game',
        'False Start',
        'Illegal Formation',
        'Illegal Motion',
        'Illegal Shift',
        'Illegal Substitution',
        'Offensive 12 On-field',
        'Offensive Offside',
        'Offensive Too Many Men on Field',
    ],
    # no play
    [
        'Chop Block',
        'Clipping',
        'Disqualification',
        'Face Mask',
        'Illegal Blindside Block',
        'Illegal Block Above the Waist',
        'Illegal Crackback',
        'Illegal Formation',
        'Illegal Forward Pass',
        'Illegal Peelback',
        'Illegal Touch Kick',
        'Illegal Touch Pass',
        'Ineligible Downfield Kick',
        'Ineligible Downfield Pass',
        'Low Block',
        'Lowering the Head to Initiate Contact',
        'Offensive Holding',
        'Offensive Pass Interference',
        'Player Out of Bounds on Kick',
        'Player Out of Bounds on Punt',
        'Taunting',
        'Tripping',
        'Unnecessary Roughness',
        'Unsportsmanlike Conduct',
    ],
)

# Penalty types that label a `no_play` row as a defensive-penalty playcall.
DEFENSIVE_PENALTIES_NOPLAY = set().union(
    # pre snap
    [
        'Neutral Zone Infraction',
        'Defensive Offside',
        'Defensive 12 On-field',
        'Defensive Delay of Game',
        'Defensive Too Many Men on Field',
        'Delay of Game',
        'Encroachment',
        'Illegal Formation',
        'Illegal Substitution',
    ],
    # kicker
    [
        'Roughing the Kicker',
        'Running Into the Kicker',
    ],
    # post snap
    [
        'Roughing the Passer',
        'Defensive Holding',
        'Defensive Pass Interference',
        'Disqualification',
        'Face Mask',
        'Horse Collar Tackle',
        'Illegal Contact',
        'Illegal Use of Hands',
        'Leverage',
        'Low Block',
        'Lowering the Head to Initiate Contact',
        'Lowering the Head to Make Forcible Contact',
        'Taunting',
        'Tripping',
        'Unnecessary Roughness',
        'Unsportsmanlike Conduct',
    ],
)


# Row masks shared with later cells: which side committed the penalty.
penalties = (pbp['play_type'] == 'no_play') & (pbp['penalty'] == 1)
offensive_penalty = pbp['penalty_team'] == pbp['posteam']
defensive_penalty = pbp['penalty_team'] == pbp['defteam']

# A `no_play` penalty counts only when its type is listed for the offending side
# and the play was not a kneel-down.
listed_for_offense = offensive_penalty & pbp['penalty_type'].isin(OFFENSIVE_PENALTIES_NOPLAY)
listed_for_defense = defensive_penalty & pbp['penalty_type'].isin(DEFENSIVE_PENALTIES_NOPLAY)
not_a_kneel = pbp['qb_kneel'] == 0

presnap_penalties = penalties & (listed_for_offense | listed_for_defense) & not_a_kneel

pbp.loc[presnap_penalties & offensive_penalty, 'playcall'] = 'PENALTY_OFFENSE'
pbp.loc[presnap_penalties & defensive_penalty, 'playcall'] = 'PENALTY_DEFENSE'

In [None]:
"""
FG / PUNT / QB KNEELS / SPIKES / TIMEOUTS
"""
punt = pbp['play_type'] == 'punt'

# One-to-one playcall labels. Each assignment overwrites whatever an earlier one
# wrote, so the order below is significant.
is_kneel = pbp['qb_kneel'] == 1
is_spike = pbp['qb_spike'] == 1
is_field_goal = pbp['play_type'] == 'field_goal'

pbp.loc[is_kneel, 'playcall'] = 'QB_KNEEL'
pbp.loc[is_spike, 'playcall'] = 'QB_SPIKE'
# field goal outcomes already exist in 'field_goal_result':
#   - 'made'
#   - 'missed'
#   - 'blocked'
pbp.loc[is_field_goal, 'playcall'] = 'FG_ATTEMPT'
pbp.loc[punt, 'playcall'] = 'PUNT'

# Punt outcomes, from the default label to progressively more specific ones.
punt_no_return_td = punt & (pbp['return_touchdown'] == 0)
punt_return_td = punt & (pbp['return_touchdown'] == 1)
punt_penalty_no_fumble = punt_no_return_td & (pbp['fumble_lost'] == 0) & (pbp['penalty'] == 1)

pbp['punt_result'] = ''
pbp.loc[punt, 'punt_result'] = 'RETURNED'
pbp.loc[punt_penalty_no_fumble & offensive_penalty, 'punt_result'] = 'RETURNED_OFFENSIVE_PENALTY'
pbp.loc[punt_penalty_no_fumble & defensive_penalty, 'punt_result'] = 'RETURNED_DEFENSIVE_PENALTY'
pbp.loc[punt_no_return_td & (pbp['fumble_lost'] == 1), 'punt_result'] = 'FUMBLE_LOST'
pbp.loc[punt_return_td & (pbp['fumble_lost'] == 1), 'punt_result'] = 'FUMBLE_LOST_TD'
pbp.loc[punt_return_td & (pbp['fumble_lost'] == 0), 'punt_result'] = 'PUNT_RETURN_TD'

# Timeouts are `no_play` rows with the timeout flag set; they are attributed to the
# drive's offense or defense, and anything left over is kept in `tv_timeout`.
timeouts = (pbp['play_type'] == 'no_play') & (pbp['timeout'] == 1)
off_timeout = timeouts & (pbp['timeout_team'] == pbp['drive_offense'])
def_timeout = timeouts & (pbp['timeout_team'] == pbp['drive_defense'])
tv_timeout = timeouts & ~(off_timeout | def_timeout)

# pbp.loc[tv_timeout, 'playcall'] = 'NEUTRAL_TIMEOUT'
pbp.loc[off_timeout, 'playcall'] = 'OFFENSIVE_TIMEOUT'
pbp.loc[def_timeout, 'playcall'] = 'DEFENSIVE_TIMEOUT'

In [None]:
"""
DESIGNED RUSH OUTCOMES

'fumble_lost' = 1
    'return_touchdown' = 1
'safety' = 1
'rush_touchdown' = 1

'yards_gained' (non-everything else)
"""

# A designed run is a run play that is not a scramble, a kneel or a two-point try.
is_run_play = pbp['play_type'] == 'run'
not_scramble = pbp['qb_scramble'] == 0
not_kneel = pbp['qb_kneel'] == 0
not_two_point = pbp['two_point_attempt'] == 0
designed_runs = is_run_play & not_scramble & not_kneel & not_two_point

# designed run & sub-outcomes
pbp.loc[designed_runs, 'playcall'] = 'DESIGNED_RUN'

# Outcome labels are written from least to most specific; later lines override
# earlier ones for rows matching several conditions.
run_with_penalty = designed_runs & (pbp['penalty'] == 1)

pbp['designed_run'] = ''
pbp.loc[designed_runs, 'designed_run'] = 'YARDS'
pbp.loc[run_with_penalty & offensive_penalty, 'designed_run'] = 'OFFENSIVE_PENALTY'
pbp.loc[run_with_penalty & defensive_penalty, 'designed_run'] = 'DEFENSIVE_PENALTY'
pbp.loc[designed_runs & (pbp['fumble_lost'] == 1), 'designed_run'] = 'FUMBLE_LOST'
pbp.loc[designed_runs & (pbp['rush_touchdown'] == 1), 'designed_run'] = 'RUSHING_TD'
pbp.loc[designed_runs & (pbp['safety'] == 1), 'designed_run'] = 'SAFETY'

In [None]:
# A dropback is any pass play or QB scramble that is not a two-point attempt.
dropback = ((pbp['play_type'] == 'pass') | (pbp['qb_scramble'] == 1)) & (pbp['two_point_attempt'] != 1)
pbp.loc[dropback, 'playcall'] = 'DROPBACK'

# `dropback` column: what happened on the dropback. The assignments below overwrite
# each other, so the LAST matching line decides the final label for a row.
pbp['dropback'] = ''
pbp.loc[dropback, 'dropback'] = 'UNLABELED'
pbp.loc[dropback & (pbp['pass_attempt'] == 1) & ~pbp['receiver_player_id'].isna(), 'dropback'] = 'TARGETED_RECEIVER'

# NOTE: this mask is not restricted to `dropback`, so every row flagged as a
# scramble (two-point attempts included) receives the label.
pbp.loc[pbp['qb_scramble'] == 1, 'dropback'] = 'QB_SCRAMBLE'
# A pass attempt with no recorded receiver is treated as a throwaway; this also
# overrides QB_SCRAMBLE on rows that match both conditions.
pbp.loc[dropback & (pbp['pass_attempt'] == 1) & pbp['receiver_player_id'].isna(), 'dropback'] = 'THROWAWAY'


# Sacks take precedence over every label written so far.
pbp.loc[dropback & (pbp['sack'] == 1), 'dropback'] = 'SACK'

# `sack_result` column: outcome of a sack. These masks use `sack` alone and are not
# restricted to `dropback`. The fumble labels are written after SAFETY and win on overlap.
pbp['sack_result'] = ''
pbp.loc[pbp['sack'] == 1, 'sack_result'] = 'LOSS_OF_YARDS'
pbp.loc[(pbp['sack'] == 1) & (pbp['safety'] == 1), 'sack_result'] = 'SAFETY'
pbp.loc[(pbp['sack'] == 1) & (pbp['fumble_lost'] == 1) & (pbp['return_touchdown'] == 0), 'sack_result'] = 'FUMBLE_LOST'
pbp.loc[(pbp['sack'] == 1) & (pbp['fumble_lost'] == 1) & (pbp['return_touchdown'] == 1), 'sack_result'] = 'FUMBLE_LOST_TD'

# Penalties on dropbacks that were not completed override the label, including
# TARGETED_RECEIVER and SACK; such rows therefore get no `target_result` below.
pbp.loc[dropback & offensive_penalty & (pbp['penalty'] == 1) & (pbp['complete_pass'] == 0), 'dropback'] = 'OFFENSIVE_PENALTY'
pbp.loc[dropback & defensive_penalty & (pbp['penalty'] == 1) & (pbp['complete_pass'] == 0), 'dropback'] = 'DEFENSIVE_PENALTY'

# `target_result` column: outcome of a targeted pass. Only rows still labelled
# TARGETED_RECEIVER at this point are considered; later lines override earlier ones.
pbp['target_result'] = ''
target = pbp['dropback'] == 'TARGETED_RECEIVER'
pbp.loc[target, 'target_result'] = 'UNLABELED'
pbp.loc[target & (pbp['pass_touchdown'] == 1), 'target_result'] = 'TD'
pbp.loc[target & (pbp['interception'] == 1) & (pbp['return_touchdown'] == 0), 'target_result'] = 'INT'
pbp.loc[target & (pbp['interception'] == 1) & (pbp['return_touchdown'] == 1), 'target_result'] = 'PICK_SIX'
pbp.loc[target & (pbp['incomplete_pass'] == 1) & (pbp['fumble_lost'] == 0), 'target_result'] = 'INCOMPLETE'
# CATCH also overrides TD for completed passes without a lost fumble.
pbp.loc[target & (pbp['complete_pass'] == 1) & (pbp['fumble_lost'] == 0), 'target_result'] = 'CATCH'
# Only defensive penalties are distinguished on completed passes.
pbp.loc[target & (pbp['complete_pass'] == 1) & (pbp['fumble_lost'] == 0) & (pbp['penalty'] == 1) & defensive_penalty, 'target_result'] = 'CATCH_AND_PENALTY'
pbp.loc[target & (pbp['complete_pass'] == 1) & (pbp['fumble_lost'] == 1), 'target_result'] = 'CATCH_AND_FUMBLE'


In [None]:
# Frequency of each playcall label assigned so far ('' marks rows that received no label).
pbp['playcall'].value_counts()

In [None]:
# Per-row flags: 1 when the home / away team is charged with a timeout on this row.
pbp['home_timeout'] = 0
pbp['away_timeout'] = 0

# FIX: the OFFENSIVE_TIMEOUT / DEFENSIVE_TIMEOUT labels live in `playcall` at this
# point. `timeout` is still the raw flag (it is compared against 1 when the timeout
# masks are built) and only receives string labels in the following cell, so
# comparing it to these strings never matched and every flag below stayed 0.
offense_called_timeout = pbp['playcall'] == 'OFFENSIVE_TIMEOUT'
defense_called_timeout = pbp['playcall'] == 'DEFENSIVE_TIMEOUT'

pbp.loc[offense_called_timeout, 'home_timeout'] = (pbp['drive_offense'] == pbp['home_team']).astype(int)
pbp.loc[defense_called_timeout, 'home_timeout'] = (pbp['drive_defense'] == pbp['home_team']).astype(int)

pbp.loc[offense_called_timeout, 'away_timeout'] = (pbp['drive_offense'] == pbp['away_team']).astype(int)
pbp.loc[defense_called_timeout, 'away_timeout'] = (pbp['drive_defense'] == pbp['away_team']).astype(int)

# A timeout row directly following another timeout row by the same side is
# treated as a duplicate record and must not be counted twice.
pbp['home_timeout_prev'] = pbp['home_timeout'].shift(1)
pbp['away_timeout_prev'] = pbp['away_timeout'].shift(1)
pbp['duplicate_timeout'] = (
    ((pbp['home_timeout_prev'] == 1) & (pbp['home_timeout'] == 1))
    | ((pbp['away_timeout_prev'] == 1) & (pbp['away_timeout'] == 1))
)
pbp.loc[pbp['home_timeout_prev'] == 1, 'home_timeout'] = 0
pbp.loc[pbp['away_timeout_prev'] == 1, 'away_timeout'] = 0

# Duplicates are marked through `playcall` only. The raw numeric `timeout` flag is
# left untouched here (writing a string into it would mix dtypes) because the
# categorical `timeout` label is rebuilt from `playcall` in the next cell.
pbp.loc[pbp['duplicate_timeout'], 'playcall'] = 'DUPLICATE_TIMEOUT'

# Running count of timeouts used by each side within a half.
pbp['total_home_timeouts'] = pbp.groupby(['game_id','home_team', 'game_half'])['home_timeout'].cumsum()
pbp['total_away_timeouts'] = pbp.groupby(['game_id','away_team', 'game_half'])['away_timeout'].cumsum()

# Derived "timeouts left" for the drive's offense and defense (3 minus timeouts used).
pbp['off_timeouts_left'] = 3
pbp['def_timeouts_left'] = 3

pbp.loc[pbp['drive_offense'] == pbp['home_team'], 'off_timeouts_left'] = 3 - pbp['total_home_timeouts']
pbp.loc[pbp['drive_offense'] == pbp['away_team'], 'off_timeouts_left'] = 3 - pbp['total_away_timeouts']

pbp.loc[pbp['drive_defense'] == pbp['home_team'], 'def_timeouts_left'] = 3 - pbp['total_home_timeouts']
pbp.loc[pbp['drive_defense'] == pbp['away_team'], 'def_timeouts_left'] = 3 - pbp['total_away_timeouts']

# Prefer the source data's own remaining-timeout columns; the derived counts only
# fill rows where those are missing. Values are forced into the 0..3 range.
pbp['off_timeouts_remaining'] = pbp['posteam_timeouts_remaining'].combine_first(pbp['off_timeouts_left']).astype(int).clip(0, 3)
pbp['def_timeouts_remaining'] = pbp['defteam_timeouts_remaining'].combine_first(pbp['def_timeouts_left']).astype(int).clip(0, 3)

# Convert down / quarter to string categories (None when missing) so they can be
# one-hot encoded later.
pbp['down'] = pbp['down'].apply(lambda x: str(int(x)) if not math.isnan(x) else None)
pbp['qtr'] = pbp['qtr'].apply(lambda x: str(int(x)) if not math.isnan(x) else None)


In [None]:
# Rebuild `timeout` as a categorical label: rows whose playcall is one of the
# timeout labels copy it, every other row is an ordinary 'PLAY'.
timeout_labels = ['NEUTRAL_TIMEOUT', 'DEFENSIVE_TIMEOUT', 'OFFENSIVE_TIMEOUT']
is_timeout_call = pbp['playcall'].isin(timeout_labels)

pbp['timeout'] = 'PLAY'
pbp.loc[is_timeout_call, 'timeout'] = pbp.loc[is_timeout_call, 'playcall']

In [None]:
# Called for its effect on `pbp`; the return value is not captured.
# NOTE(review): presumably this adds the clock_runs_after / clock_pauses_after /
# clock_stops_after columns that are selected from `pbp` later -- confirm in pbp.data.
add_clock_stops(pbp)

In [None]:
# Label / identifier columns carried into the modelling dataset.
responses = [
    'timeout', 'playcall', 'designed_run', 'dropback', 'punt_result', 'field_goal_result',
    # dropback outcomes
    'target_result', 'sack_result',
    # other
    'extra_point_attempt', 'two_point_attempt', 'play_id', 'game_id', 'location',
]

# Penalty detail columns.
penalties = ['penalty_yards', 'penalty_team']

# Game-state columns used to derive model features.
raw_features = [
    'down', 'ydstogo', 'goal_to_go', 'qtr', 'clock_running',
    # 'quarter_seconds_remaining',
    'yardline_100', 'off_score', 'def_score',
    # 'play_clock',
    'game_seconds_remaining', 'half_seconds_remaining',
    # 'wp',
    'qb_scramble', 'off_timeouts_remaining', 'def_timeouts_remaining',
    # 'total', 'spread_line',
]

# Full column selection, in the order: responses, team/season keys, features, penalties.
cols = [*responses, 'posteam', 'defteam', 'season', 'home_team', *raw_features, *penalties]

In [None]:
# One-hot encode the categorical state columns, then attach the four team-season
# frames by offense / defense team and season.
one_hot_columns = ['down', 'qtr', 'off_timeouts_remaining', 'def_timeouts_remaining']
dataset = pd.get_dummies(pbp[cols], columns=one_hot_columns)

dataset = dataset.merge(offense_proes, how='left', on=['posteam','season'])
dataset = dataset.merge(defense_proes, how='left', on=['defteam','season'])
dataset = dataset.merge(offense_rz_proes, how='left', on=['posteam','season'])
dataset = dataset.merge(defense_rz_proes, how='left', on=['defteam','season'])

add_playcall_features(dataset)


In [None]:
# Every playcall that is NOT an offensive timeout; these form the negative class.
NON_OFFENSIVE_TIMEOUTS = {
    'PUNT',
    'PENALTY_OFFENSE',
    'PENALTY_DEFENSE',
    'DEFENSIVE_TIMEOUT',
    'FG_ATTEMPT',
    'QB_KNEEL',
    'QB_SPIKE',
    'DESIGNED_RUN',
    'DROPBACK',
}

off_calls_timeout = dataset['timeout'] == 'OFFENSIVE_TIMEOUT'

# Negative rows are kept only when the "0 timeouts remaining" dummy is off,
# i.e. the offense still had a timeout it could have used.
offense_could_call = dataset['off_timeouts_remaining_0'] == 0
other_offense_action = dataset['playcall'].isin(NON_OFFENSIVE_TIMEOUTS)
off_timeout_rows = off_calls_timeout | (other_offense_action & offense_could_call)

off_timeout_dataset = dataset.loc[off_timeout_rows, ['timeout', *off_timeout_features]].dropna()
# Assignment aligns on the index, so only the retained rows receive a target value.
off_timeout_dataset['offensive_timeout'] = off_calls_timeout.astype(int)

In [None]:
# Fit the offensive-timeout classifier and print the mean in-sample class probabilities.
off_timeout_X = off_timeout_dataset[off_timeout_features]
off_timeout_y = off_timeout_dataset['offensive_timeout']

off_timeout_model = LogisticRegression(**LOGR_KWARGS)
off_timeout_model.fit(X=off_timeout_X, y=off_timeout_y)

ins_pred = off_timeout_model.predict_proba(off_timeout_X)

print(ins_pred.mean(axis=0))

In [None]:
# print("pub struct OffensiveTimeoutModel {")
# print("    pub intercept: f32,")
# for f in off_timeout_features:
#     print(f'    pub {f}: f32,')
# print("}\n\n")

# Render the fitted coefficients as a Rust `impl` block and write it to the
# offensive_timeout model directory.
ot_model_str_parts = [
    "use crate::models::offensive_timeout::OffensiveTimeoutModel;\n",
    "impl OffensiveTimeoutModel {",
    "    pub fn coefs() -> OffensiveTimeoutModel {",
    "        OffensiveTimeoutModel {",
    f"            intercept: {off_timeout_model.intercept_[0]:.4f},",
]
# One struct field per feature, in the order the model was fitted with.
ot_model_str_parts.extend(
    f'            {feature_name}: {coefficient:.4f},'
    for feature_name, coefficient in zip(off_timeout_features, off_timeout_model.coef_[0])
)
ot_model_str_parts.extend(["        }", "    }", "}"])

ot_model_str = "\n".join(ot_model_str_parts)

with open(f'{MODELS_PATH}/offensive_timeout/coef.rs', 'w') as f:
    f.write(ot_model_str)

In [None]:
# Every playcall that is neither kind of timeout; these form the negative class here
# and the set is reused when the playcall dataset is built.
NON_DEFENSIVE_TIMEOUTS = {
    'PUNT',
    'PENALTY_OFFENSE',
    'PENALTY_DEFENSE',
    'FG_ATTEMPT',
    'QB_KNEEL',
    'QB_SPIKE',
    'DESIGNED_RUN',
    'DROPBACK',
}

def_calls_timeout = dataset['timeout'] == 'DEFENSIVE_TIMEOUT'

# Negative rows are kept only when the "0 timeouts remaining" dummy is off,
# i.e. the defense still had a timeout it could have used.
defense_could_call = dataset['def_timeouts_remaining_0'] == 0
non_timeout_action = dataset['playcall'].isin(NON_DEFENSIVE_TIMEOUTS)
def_timeout_rows = def_calls_timeout | (non_timeout_action & defense_could_call)

def_timeout_dataset = dataset.loc[def_timeout_rows, ['timeout', *def_timeout_features]].dropna()
# Assignment aligns on the index, so only the retained rows receive a target value.
def_timeout_dataset['defensive_timeout'] = def_calls_timeout.astype(int)

In [None]:
# Fit the defensive-timeout classifier and print the mean in-sample class probabilities.
def_timeout_X = def_timeout_dataset[def_timeout_features]
def_timeout_y = def_timeout_dataset['defensive_timeout']

def_timeout_model = LogisticRegression(**LOGR_KWARGS)
def_timeout_model.fit(X=def_timeout_X, y=def_timeout_y)

ins_pred = def_timeout_model.predict_proba(def_timeout_X)

print(ins_pred.mean(axis=0))

In [None]:
# print("pub struct DefensiveTimeoutModel {")
# print("    pub intercept: f32,")
# for f in def_timeout_features:
#     print(f'    pub {f}: f32,')
# print("}\n\n")

# Render the fitted coefficients as a Rust `impl` block and write it to the
# defensive_timeout model directory.
dt_model_parts_str = ["use crate::models::defensive_timeout::DefensiveTimeoutModel;\n", "impl DefensiveTimeoutModel {"]
# FIX: the `pub fn` and struct-literal openers are now indented 4 / 8 spaces so they
# line up with their closing braces and with the offensive-timeout generator.
dt_model_parts_str.append("    pub fn coefs() -> DefensiveTimeoutModel {")
dt_model_parts_str.append("        DefensiveTimeoutModel {")
dt_model_parts_str.append(f"            intercept: {def_timeout_model.intercept_[0]:.4f},")
# One struct field per feature, in the order the model was fitted with.
for f, c in zip(def_timeout_features, def_timeout_model.coef_[0]):
    dt_model_parts_str.append(f'            {f}: {c:.4f},')
dt_model_parts_str.append("        }")
dt_model_parts_str.append("    }")
dt_model_parts_str.append("}")

dt_model_str = "\n".join(dt_model_parts_str)

with open(f'{MODELS_PATH}/defensive_timeout/coef.rs', 'w') as f:
    f.write(dt_model_str)

In [None]:
# make_penalty_zs returns two frames that are later left-merged onto the playcall
# dataset by (posteam, season) and (defteam, season); the offense_penalty_z /
# defense_penalty_z columns used there are expected to come from them.
off_penalties, def_penalties = make_penalty_zs(pbp)

In [None]:
# Features available straight from the dataset ...
state_playcall_features = (
    list(state_features)
    + list(common_timeout_features)
    + list(only_off_timeout_features)
    + list(only_def_timeout_features)
)
# ... plus the penalty z-scores that are merged in / derived below.
playcall_features = state_playcall_features + [
    'offense_penalty_z',
    'defense_penalty_z',
    'off_def_penalty_z',
]

# Keep rows with exactly one down dummy set (i.e. a known down) and a non-timeout playcall.
is_down = (dataset['down_1'] + dataset['down_2'] + dataset['down_3'] + dataset['down_4']) == 1
is_modelled_playcall = dataset['playcall'].isin(NON_DEFENSIVE_TIMEOUTS)
playcall_columns = ['playcall','posteam','defteam','season'] + state_playcall_features

playcall_dataset = dataset[is_modelled_playcall & is_down][playcall_columns]
playcall_dataset = playcall_dataset.merge(off_penalties, how='left', on=['posteam','season'])
playcall_dataset = playcall_dataset.merge(def_penalties, how='left', on=['defteam','season'])
playcall_dataset = playcall_dataset.dropna()

# Interaction term and the two binary penalty targets.
playcall_dataset['off_def_penalty_z'] = playcall_dataset['offense_penalty_z'] * playcall_dataset['defense_penalty_z']
playcall_dataset['offensive_penalty'] = (playcall_dataset['playcall'] == 'PENALTY_OFFENSE').astype(int)
playcall_dataset['defensive_penalty'] = (playcall_dataset['playcall'] == 'PENALTY_DEFENSE').astype(int)

In [None]:
# print("pub struct PlaycallModel {")
# print("    pub intercept: f32,")
# for f in playcall_features:
#     print(f'    pub {f}: f32,')
# print("}\n\n")

In [None]:
def format_playcall_coefs(pc_model, function_name: str) -> str:
    """Render a fitted model as a Rust `pub fn <function_name>() -> PlaycallModel` block.

    The intercept is read from `pc_model.intercept_[0]` and each name in the
    module-level `playcall_features` is paired, in order, with the matching entry
    of `pc_model.coef_[0]`. All numbers are formatted with four decimals. The
    returned text starts with a newline and ends with one.
    """
    header = [
        f'\n    pub fn {function_name}() -> PlaycallModel {{',
        "        PlaycallModel {",
        f"            intercept: {pc_model.intercept_[0]:.4f},",
    ]
    fields = [
        f'            {feature_name}: {coefficient:.4f},'
        for feature_name, coefficient in zip(playcall_features, pc_model.coef_[0])
    ]
    footer = ["        }", "    }\n"]
    return '\n'.join(header + fields + footer)

In [None]:
# Stage 1 of the playcall cascade: offensive penalty vs. everything else.
off_penalty_X = playcall_dataset[playcall_features]
off_penalty_y = playcall_dataset['offensive_penalty']

off_penalty_model = LogisticRegression(**LOGR_KWARGS)
off_penalty_model.fit(X=off_penalty_X, y=off_penalty_y)

# Mean in-sample class probabilities (displayed as the cell output).
ins_preds = off_penalty_model.predict_proba(off_penalty_X)
ins_preds.mean(axis=0)

In [None]:
# Stage 2: defensive penalty, fitted only on rows that are not offensive penalties.
not_offensive_penalty = playcall_dataset['offensive_penalty'] == 0
def_penalty_dataset = playcall_dataset[not_offensive_penalty].reset_index(drop=True)

def_penalty_X = def_penalty_dataset[playcall_features]
def_penalty_y = def_penalty_dataset['defensive_penalty']

def_penalty_model = LogisticRegression(**LOGR_KWARGS)
def_penalty_model.fit(X=def_penalty_X, y=def_penalty_y)

# Mean in-sample class probabilities (displayed as the cell output).
ins_preds = def_penalty_model.predict_proba(def_penalty_X)
ins_preds.mean(axis=0)

In [None]:
def _fit_playcall_stage(candidates, positive, label):
    """Fit one binary stage of the playcall cascade.

    Keeps the rows of `playcall_dataset` whose playcall is in `candidates`, adds a
    0/1 column named `label` that is 1 where the playcall equals `positive`, and
    fits a LogisticRegression on `playcall_features` against that column.

    Returns `(stage_dataset, fitted_model)`.
    """
    stage_dataset = playcall_dataset[playcall_dataset['playcall'].isin(candidates)].reset_index(drop=True)
    stage_dataset[label] = (stage_dataset['playcall'] == positive).astype(int)
    stage_model = LogisticRegression(**LOGR_KWARGS)
    stage_model.fit(X=stage_dataset[playcall_features], y=stage_dataset[label])
    return stage_dataset, stage_model


# Each stage removes the playcall decided by the previous stage from the candidate
# set, so every model is conditional on "none of the earlier options was chosen".
REAL_PLAYCALLS = {'PUNT', 'QB_KNEEL', 'QB_SPIKE', 'FG_ATTEMPT', 'DESIGNED_RUN', 'DROPBACK'}
qb_kneel_dataset, qb_kneel_model = _fit_playcall_stage(REAL_PLAYCALLS, 'QB_KNEEL', 'qb_kneel_playcall')

QB_SPIKE_PLAYCALLS = REAL_PLAYCALLS - {'QB_KNEEL'}
spike_dataset, qb_spike_model = _fit_playcall_stage(QB_SPIKE_PLAYCALLS, 'QB_SPIKE', 'qb_spike_playcall')

FGA_PLAYCALLS = QB_SPIKE_PLAYCALLS - {'QB_SPIKE'}
# FIX: this stage previously filtered on REAL_PLAYCALLS, so kneel and spike rows
# leaked back into the field-goal training set and FGA_PLAYCALLS was never applied.
fg_dataset, fga_model = _fit_playcall_stage(FGA_PLAYCALLS, 'FG_ATTEMPT', 'fg_attempted')

PUNT_PLAYCALLS = FGA_PLAYCALLS - {'FG_ATTEMPT'}
punt_dataset, punt_model = _fit_playcall_stage(PUNT_PLAYCALLS, 'PUNT', 'punt_playcall')

# Final stage: dropback vs. designed run.
dropback_dataset, dropback_model = _fit_playcall_stage({'DROPBACK', 'DESIGNED_RUN'}, 'DROPBACK', 'is_qb_dropback')

# To inspect a stage in-sample, e.g.:
# ins_pred = punt_model.predict_proba(punt_dataset[playcall_features])
# ins_pred.mean(axis=0), punt_dataset['punt_playcall'].mean(), ins_pred.max(axis=0)
dropback_model

In [None]:
# ins_pred = dropback_model.predict_proba(dropback_dataset[playcall_features])
# ins_pred.mean(axis=0), dropback_dataset['is_qb_dropback'].mean(), ins_pred.max(axis=0)

In [None]:
# features = {
#     "clock_running": 0.0,
#     "inv_half_minutes": 0.03337041,
#     "inv_game_minutes": 0.01667593,
#     "possession_diff": 0.0,
#     "fg_possession_diff": 1.0,
#     "possdiff_per_minute": 0.0,
#     "clock_runs_pdpm": 0.0,
#     "clock_runs_fgpdpm": 0.0,
#     "clock_runs_pdpm2": 0.0,
#     "clock_runs_fgpdpm2": 0.0,
#     "off_timeouts_remaining_0": 0.0,
#     "off_timeouts_remaining_1": 0.0,
#     "off_timeouts_remaining_2": 0.0,
#     "off_timeouts_remaining_3": 1.0,
#     "def_timeouts_remaining_0": 0.0,
#     "def_timeouts_remaining_1": 0.0,
#     "def_timeouts_remaining_2": 0.0,
#     "def_timeouts_remaining_3": 1.0,
#     "clock_runs_pdpm_off0to": 0.0,
#     "clock_runs_pdpm_off1to": 0.0,
#     "clock_runs_pdpm_off2to": 0.0,
#     "clock_runs_pdpm_off3to": 0.0,
#     "clock_runs_pdpm_def0to": 0.0,
#     "clock_runs_pdpm_def1to": 0.0,
#     "clock_runs_pdpm_def2to": 0.0,
#     "clock_runs_pdpm_def3to": 0.0,
#     "is_offense_home": 0.0,
#     "offense_proe": -0.11,
#     "defense_proe": -0.02,
#     "offense_penalty_z": 0.68,
#     "defense_penalty_z": -0.56,
#     "down_1": 1.0,
#     "down_2": 0.0,
#     "down_3": 0.0,
#     "down_4": 0.0,
#     "goal_to_go": 0.0,
#     "z_ydstogo": 0.0,
#     "ydstogo_pct": 1.0,
#     "to_go_1st": 0.36787945,
#     "to_go_2nd": 0.0,
#     "to_go_3rd": 0.0,
#     "to_go_4th": 0.0,
#     "fp_1st": 1.0,
#     "fp_2nd": 0.0,
#     "fp_3rd": 0.0,
#     "fp_4th": 0.0,
#     "yardline_pct": 0.75,
#     "yardline_pct_sq": 0.5625,
#     "fgpossdiff_per_minute": 0.01667593,
#     "goal_to_go_yardline": 0.0,
#     "yards_to_go_yardline": 0.75,
#     "yardline_4th": 0.0,
#     "yardline_not_4th": 0.75,
#     "inside_2m_warning": 0.0,
#     "garbage_time_win": 0.0,
#     "garbage_time_loss": 0.0,
#     "oline_rushing_z": 0.49,
#     "dline_rushing_z": 0.83,
#     "offense_pace_z": -0.02,
#     "offense_pass_rush_z": -2.02,
#     "defense_pass_rush_z": 0.84,
#     "qb_scramble_rate": 0.125,
#     "qb_prob_sack_given_hit": 0.56
# }

# contribs = sorted([
#     (c * features[f], f)
#     for f, c in zip(playcall_features, dropback_model.coef_[0])
#     if c * features[f]
# ], reverse=True)
# z = sum(c for c, f in contribs) + dropback_model.intercept_[0]
# print(z, 1 / (1 + math.exp(-z)))
# contribs

In [None]:
# (fitted model, generated Rust function name) for each cascade stage, in output order.
playcall_coef_functions = [
    (off_penalty_model, "offensive_penalty_coefs"),
    (def_penalty_model, "defensive_penalty_coefs"),
    (fga_model, "fg_attempt_coefs"),
    (punt_model, "punt_coefs"),
    (qb_spike_model, "qb_spike_coefs"),
    (qb_kneel_model, "qb_kneel_coefs"),
    (dropback_model, "qb_dropback_coefs"),
]

model_str_parts = [
    "use crate::models::playcall::PlaycallModel;\n",
    "impl PlaycallModel {",
    *[format_playcall_coefs(fitted, rust_name) for fitted, rust_name in playcall_coef_functions],
    "}",
]

model_str = '\n'.join(model_str_parts)

with open(f'{MODELS_PATH}/playcall/coef.rs', 'w') as f:
    f.write(model_str)

In [None]:
# print("\n".join(list(pbp_copy.columns)))

In [None]:
# Play types kept when the play-clock frame is built.
RUN_CLOCK_PLAY_TYPES = {'run', 'pass', 'punt', 'field_goal', 'kickoff', 'no_play'}

# Bounds used when clipping / validating the next play's play-clock value.
MIN_NEXT_PLAY_CLOCK, MAX_NEXT_PLAY_CLOCK = 1, 36

def _is_game_clock_format(t):
    return len(t.split(':')) == 2

def _parse_quarter_seconds(t: str) -> int:
    mins, secs = t.split(':')
    return 60 * int(mins) + int(secs)

def parse_play_duration(r):
    """Return the play's duration in seconds, or None when it cannot be determined.

    `r` is a row-like mapping with 'end_clock_time', 'time_of_day' and
    'quarter_seconds_remaining'.

    * If 'end_clock_time' has one ':' (game-clock format), the duration is
      'quarter_seconds_remaining' minus the parsed end clock.
    * Otherwise both 'end_clock_time' and 'time_of_day' are parsed as datetimes and
      the duration is their difference in seconds.

    Durations outside the interval (0, 40] are discarded (None is returned).
    Missing values are detected with `pd.isna`, so NaN is handled the same way as
    None instead of raising AttributeError on `.split`.
    """
    end_clock = r['end_clock_time']
    if pd.isna(end_clock):
        return None

    if _is_game_clock_format(end_clock):
        val = r['quarter_seconds_remaining'] - _parse_quarter_seconds(end_clock)
        return val if 0 < val <= 40 else None

    # The end is a datetime value, so the start ('time_of_day') must also be a
    # datetime for the difference to be meaningful.
    start_clock = r['time_of_day']
    if pd.isna(start_clock) or _is_game_clock_format(start_clock):
        return None

    val = (parse_date(end_clock) - parse_date(start_clock)).total_seconds()
    return val if 0 < val <= 40 else None

In [None]:
# One row per play: the modelling dataset joined back to the raw clock / timing
# columns of `pbp`, restricted to the play types in RUN_CLOCK_PLAY_TYPES.
play_clock_df = (
    dataset.drop_duplicates(subset=['play_id', 'game_id'])
    .merge(pbp[['play_id','game_id','time_of_day', 'end_clock_time', 'play_type','touchback','yards_gained','clock_runs_after','clock_pauses_after','clock_stops_after','quarter_seconds_remaining','next_game_seconds_remaining','next_half_seconds_remaining', 'play_clock', 'next_quarter_seconds_remaining','penalty_type']],
           how='left', on=['play_id','game_id'])
)
play_clock_df = play_clock_df[play_clock_df['play_type'].isin(RUN_CLOCK_PLAY_TYPES)].reset_index(drop=True)

play_type_dummies = pd.get_dummies(play_clock_df['play_type'], prefix='play_type')
play_clock_df[play_type_dummies.columns] = play_type_dummies

# FIX: iterate an ordered tuple instead of the set literal {'run', 'pass'}. Set
# iteration order for strings can differ between interpreter sessions (hash
# randomisation), which made the feature order -- and with it the line order of the
# generated coefficient file -- change from run to run.
yardage_play_types = ('run', 'pass')

play_duration_features = [
    *list(play_type_dummies.columns),
    'yards_gained_pct',
    'yards_gained_pct_sq',
    *[f'play_type_{pt}_yards_pct' for pt in yardage_play_types],
    *[f'play_type_{pt}_yards_pct_sq' for pt in yardage_play_types],
    *all_timeout_features,
]

# Yardage as a fraction of 100 yards, its square, and play-type interactions.
play_clock_df['yards_gained_pct'] = play_clock_df['yards_gained'] / 100.0
play_clock_df['yards_gained_pct_sq'] = play_clock_df['yards_gained_pct'] ** 2
for play_type in yardage_play_types:
    play_clock_df[f'play_type_{play_type}_yards_pct'] = play_clock_df[f'play_type_{play_type}'] * play_clock_df['yards_gained_pct']
    play_clock_df[f'play_type_{play_type}_yards_pct_sq'] = play_clock_df[f'play_type_{play_type}'] * play_clock_df['yards_gained_pct_sq']

# The following row's play clock / play type (the last row falls back to 0 -> clipped, 'no_play').
play_clock_df['next_play_clock'] = play_clock_df['play_clock'].shift(-1).fillna(0).astype(int).clip(MIN_NEXT_PLAY_CLOCK, MAX_NEXT_PLAY_CLOCK)
# play_clock_df['next_play_delay_of_game'] = (play_clock_df['penalty_type'].shift(-1) == 'Delay of Game').astype(int)
play_clock_df['next_play_type'] = play_clock_df['play_type'].shift(-1).fillna('no_play')

# use only times the play clock stops after,
# so we can capture the play length and not the runoff afterwards
play_clock_df['play_duration'] = play_clock_df.apply(parse_play_duration, axis=1)
timed_plays = (
    (play_clock_df['play_type'].isin(RUN_CLOCK_PLAY_TYPES))
    & (play_clock_df['quarter_seconds_remaining'] != 0)
    & (play_clock_df['next_quarter_seconds_remaining'] != 0)
    & (play_clock_df['next_half_seconds_remaining'] != 120)
    # make sure we are in the same quarter
    & (play_clock_df['quarter_seconds_remaining'] >= play_clock_df['next_quarter_seconds_remaining'])
    & ~play_clock_df['play_duration'].isna()
    # next_play_clock was clipped to [MIN_NEXT_PLAY_CLOCK, MAX_NEXT_PLAY_CLOCK] above,
    # so these two bounds only act as a safeguard.
    & (play_clock_df['next_play_clock'] < 40)
    & (play_clock_df['next_play_clock'] >= 0)
)

# Safeguard: a zero play clock is bumped to 1.
play_clock_df.loc[(play_clock_df['next_play_clock'] == 0), 'next_play_clock'] = 1

In [None]:
# Column selections for the three training frames (feature columns are appended last).
duration_columns = ['play_duration'] + play_duration_features
run_columns = [
    'play_duration', 'next_play_type', 'next_play_clock', 'season', 'posteam',
    'quarter_seconds_remaining', 'next_quarter_seconds_remaining',
] + play_duration_features
pause_columns = [
    'play_duration', 'next_play_type', 'next_play_clock',
    'quarter_seconds_remaining', 'next_quarter_seconds_remaining',
    'time_of_day', 'end_clock_time', 'season', 'posteam',
] + play_duration_features

# All timed plays, and the subsets after which the clock keeps running / is paused.
clock_keeps_running = timed_plays & (play_clock_df['clock_runs_after'] == 1)
clock_is_paused = timed_plays & (play_clock_df['clock_pauses_after'] == 1)

play_duration_df = play_clock_df.loc[timed_plays, duration_columns].dropna().reset_index(drop=True)
run_time_df = play_clock_df.loc[clock_keeps_running, run_columns].dropna().reset_index(drop=True)
pause_time_df = play_clock_df.loc[clock_is_paused, pause_columns].dropna().reset_index(drop=True)

In [None]:
# Poisson regression of the parsed play duration on the play-duration features.
play_duration_X = play_duration_df[play_duration_features]
play_duration_y = play_duration_df['play_duration']

play_duration_model = PoissonRegressor(**POIS_KWARGS)
play_duration_model.fit(X=play_duration_X, y=play_duration_y)

In [None]:
# Inputs of the spot-time / next-play-clock models: the play-duration features plus
# the play duration itself.
time_to_spot_features = [*play_duration_features, 'play_duration']

"""
(qtr - nextQtr) + timeBeforeSpot = playDuration + (40 - nextPlayClock)
timeBeforeSpot = playDuration + (40 - nextPlayClock) - (qtr - nextQtr)

runoff = 40 - nextPlayClock - timeBeforeSpot

nextPlayClock = 40 + playDuration - timeBeforeSpot - (qtr - nextQtr)
"""

# Replace the parsed duration with the model's prediction, limited to [2, 15].
pause_time_df['play_duration'] = play_duration_model.predict(pause_time_df[play_duration_features]).clip(2, 15)

# TODO: see if they add 2023 eventually
# no play clock data for 2023+
BEFORE_HAS_PLAY_CLOCK = 2023

# For seasons with play-clock data, derive time_before_spot from the identity in
# the string above; rows of later seasons are filled from the fitted model below.
pause_has_play_clock = (pause_time_df['season'] < BEFORE_HAS_PLAY_CLOCK)
pause_time_df.loc[pause_has_play_clock, 'time_before_spot'] = (
    pause_time_df['play_duration'] 
    + (40 - pause_time_df['next_play_clock'])
    - (pause_time_df['quarter_seconds_remaining'] - pause_time_df['next_quarter_seconds_remaining'])
)
# Training rows: derived value strictly between 1 and 10, and a next play of a listed type.
valid_tts = (
    pause_has_play_clock
    & (1 < pause_time_df['time_before_spot'])
    & (pause_time_df['time_before_spot'] < 10)
    & (pause_time_df['next_play_type'].isin({'pass','run','punt','field_goal', 'no_play'}))
)

# NOTE(review): Ridge is constructed with POIS_KWARGS -- presumably those are
# options both estimators accept; confirm in pbp.data.
time_to_spot_model = Ridge(**POIS_KWARGS)
time_to_spot_model.fit(X=pause_time_df[valid_tts][time_to_spot_features], y=pause_time_df[valid_tts]['time_before_spot'])
tts_pred = time_to_spot_model.predict(pause_time_df[valid_tts][time_to_spot_features])

# Seasons without play-clock data get a predicted time_before_spot instead.
pause_time_df.loc[~pause_has_play_clock, 'time_before_spot'] = time_to_spot_model.predict(pause_time_df[~pause_has_play_clock][time_to_spot_features])

# Recompute next_play_clock for EVERY row from the identity above (this overwrites
# the observed value) and clip it to the allowed range.
pause_time_df['next_play_clock'] = (
    40
    + pause_time_df['play_duration']
    - pause_time_df['time_before_spot']
    - (pause_time_df['quarter_seconds_remaining'] - pause_time_df['next_quarter_seconds_remaining'])
).clip(MIN_NEXT_PLAY_CLOCK, MAX_NEXT_PLAY_CLOCK)

# After the clip the range checks hold for every non-missing value, so this mask
# mainly filters on the next play's type.
valid_pauses = (
    (pause_time_df['next_play_clock'] >= MIN_NEXT_PLAY_CLOCK)
    & (pause_time_df['next_play_clock'] <= MAX_NEXT_PLAY_CLOCK)
    & (pause_time_df['next_play_type'].isin({'pass','run','punt','field_goal', 'no_play'}))
)

In [None]:
# Seasons before BEFORE_HAS_PLAY_CLOCK keep their observed next_play_clock; later
# seasons get it from the play duration and the game-clock time that elapsed.
run_has_play_clock = run_time_df['season'] < BEFORE_HAS_PLAY_CLOCK

run_implied_next_play_clock = (
    40
    + run_time_df['play_duration']
    - (run_time_df['quarter_seconds_remaining'] - run_time_df['next_quarter_seconds_remaining'])
).clip(MIN_NEXT_PLAY_CLOCK, MAX_NEXT_PLAY_CLOCK)
run_time_df.loc[~run_has_play_clock, 'next_play_clock'] = run_implied_next_play_clock

# Rows usable for the running-clock models: play clock in range and a next play of a listed type.
run_clock_in_range = (
    (run_time_df['next_play_clock'] >= MIN_NEXT_PLAY_CLOCK)
    & (run_time_df['next_play_clock'] <= MAX_NEXT_PLAY_CLOCK)
)
run_next_play_listed = run_time_df['next_play_type'].isin({'pass','run','punt','field_goal', 'no_play'})
valid_runoffs = run_clock_in_range & run_next_play_listed

In [None]:
# Expected next-play clock after a paused clock (in-sample predictions limited to [1, 32]).
paused_fit_rows = pause_time_df[valid_pauses]
paused_pace_model = Ridge(**POIS_KWARGS)
paused_pace_model.fit(X=paused_fit_rows[time_to_spot_features], y=paused_fit_rows['next_play_clock'])
paused_expected = paused_pace_model.predict(paused_fit_rows[time_to_spot_features]).clip(1, 32)
pause_time_df.loc[valid_pauses, 'expected_next_play_clock'] = paused_expected

# Same for plays after which the clock keeps running.
running_fit_rows = run_time_df[valid_runoffs]
running_pace_model = Ridge(**POIS_KWARGS)
running_pace_model.fit(X=running_fit_rows[time_to_spot_features], y=running_fit_rows['next_play_clock'])
running_expected = running_pace_model.predict(running_fit_rows[time_to_spot_features]).clip(1, 32)
run_time_df.loc[valid_runoffs, 'expected_next_play_clock'] = running_expected
# runoff_resid = (runoff_pred - runoff_df['est_runoff']).var()
# print(f'const RUNOFF_RESID: f32 = {runoff_resid:.4f};')
# runoff_resid = (runoff_pred - runoff_df['est_runoff']).var()
# print(f'const RUNOFF_RESID: f32 = {runoff_resid:.4f};')

In [None]:
# Mean observed and expected next-play clock per team-season over all valid rows.
pace_rows = pd.concat([run_time_df[valid_runoffs], pause_time_df[valid_pauses]])
team_paces = pace_rows.groupby(['season','posteam'], as_index=False).aggregate(
    {'expected_next_play_clock': 'mean', 'next_play_clock': 'mean'}
)
# .rename(columns={'next_play_clock': 'pace'}) was considered here and left disabled.
team_paces = team_paces.sort_values('next_play_clock', ascending=False).reset_index(drop=True)

# "Over expected" play clock and its z-score across all team-seasons.
next_play_clock_oe = team_paces['next_play_clock'] - team_paces['expected_next_play_clock']
team_paces['next_play_clock_oe'] = next_play_clock_oe
team_paces['pace_z'] = (next_play_clock_oe - next_play_clock_oe.mean()) / next_play_clock_oe.std()

# team_paces['log_next_play_clock'] = np.log(team_paces['expected_next_play_clock'])
# team_paces['log_pace_z'] = (team_paces['log_next_play_clock'] - team_paces['log_next_play_clock'].mean()) / team_paces['log_next_play_clock'].std()

In [None]:
extra_clock_time_features = [
    'pace_z',
    # 'next_play_clock',
    # 'log_pace_z'
]
# Full feature list of the final clock models: spot-time features plus team pace.
clock_time_features = time_to_spot_features + extra_clock_time_features

# Team-season pace columns to attach to both training frames.
pace_lookup = team_paces[['season','posteam', *extra_clock_time_features]]

# Paused-clock model.
pause_df = pause_time_df.merge(pace_lookup, how='left', on=['season','posteam'])
pause_fit_rows = pause_df[valid_pauses]
paused_next_play_clock_model = Ridge(**POIS_KWARGS)
paused_next_play_clock_model.fit(X=pause_fit_rows[clock_time_features], y=pause_fit_rows['next_play_clock'])
pause_pred = paused_next_play_clock_model.predict(pause_fit_rows[clock_time_features])

# Running-clock model.
runoff_df = run_time_df.merge(pace_lookup, how='left', on=['season','posteam'])
runoff_fit_rows = runoff_df[valid_runoffs]
running_next_play_clock_model = Ridge(**POIS_KWARGS)
running_next_play_clock_model.fit(X=runoff_fit_rows[clock_time_features], y=runoff_fit_rows['next_play_clock'])
runoff_pred = running_next_play_clock_model.predict(runoff_fit_rows[clock_time_features])
# runoff_resid = (runoff_pred - runoff_df['est_runoff']).var()
# print(f'const RUNOFF_RESID: f32 = {runoff_resid:.4f};')

In [None]:
# Mean in-sample predicted next-play clock of the paused-clock and running-clock models.
pause_pred.mean(), runoff_pred.mean()

In [None]:
# print('pub struct ClockModel {')
# print('    intercept: f32,')
# for ft in [*play_duration_features, 'play_duration', 'pace_z']:
#     print(f'    {ft}: f32,')
# print('}')

In [None]:
def format_clock_model(model: PoissonRegressor | Ridge, name: str) -> str:
    """Render a fitted model as the Rust source of a `ClockModel` constructor.

    Coefficients are paired with `clock_time_features` by position. Any
    features beyond the number of coefficients the model has are emitted
    with a literal `0.0`, so every generated constructor lists the same fields.

    Args:
        model: Fitted estimator exposing `intercept_` and `coef_`.
        name: Name of the generated Rust function.

    Returns:
        The text of one `pub fn {name}() -> ClockModel` definition, with a
        leading and a trailing newline.
    """
    field_indent = ' ' * 12
    n_coefs = len(model.coef_)

    # Fields backed by a fitted coefficient, in feature order.
    fitted_fields = [
        f'{field_indent}{feature}: {coef:.4f},'
        for feature, coef in zip(clock_time_features, model.coef_)
    ]
    # Remaining features the model has no coefficient for.
    zero_fields = [
        f'{field_indent}{feature}: 0.0,'
        for feature in clock_time_features[n_coefs:]
    ]

    lines = [
        f'\n    pub fn {name}() -> ClockModel {{',
        "        ClockModel {",
        f"{field_indent}intercept: {model.intercept_:.4f},",
        *fitted_fields,
        *zero_fields,
        "        }",
        "    }\n",
    ]
    return '\n'.join(lines)

In [None]:
# Each fitted clock model paired with the name of the Rust function generated for it.
clock_model_exports = [
    (play_duration_model, "play_duration_coefs"),
    (time_to_spot_model, "time_to_spot_coefs"),
    (paused_next_play_clock_model, "paused_next_play_clock_coefs"),
    (running_next_play_clock_model, "running_next_play_clock_coefs"),
]

# Rust source: a `use` line, then one `impl ClockModel` block holding every constructor.
clock_model_str_parts = [
    "use crate::models::clock::ClockModel;\n",
    "impl ClockModel {",
    *(format_clock_model(fitted_model, fn_name) for fitted_model, fn_name in clock_model_exports),
    "}",
]
clock_model_str = '\n'.join(clock_model_str_parts)

# Overwrite the generated coefficient file under MODELS_PATH.
with open(f'{MODELS_PATH}/clock/coef.rs', 'w') as coef_file:
    coef_file.write(clock_model_str)

In [None]:
def team_proj_season_weight(season: int) -> float:
    """Return the recency weight given to `season` in the team projections.

    Seasons more than one year before `CURRENT_SEASON` get a weight of 0.0.
    Any other season gets `exp(season - CURRENT_SEASON)`, i.e. 1.0 for the
    current season and `exp(-1)` for the previous one.
    """
    seasons_back = CURRENT_SEASON - season
    if seasons_back > 1:
        return 0.0
    return math.exp(-1.0 * seasons_back)

In [None]:
# Count the distinct game_ids in which each team appears as `posteam`, per season.
games_per_season = (
    pbp
    .groupby(['season', 'posteam'], as_index=False)
    .agg(games=('game_id', 'nunique'))
    .rename(columns={'posteam': 'team'})
)
games_per_season.head()

In [None]:
# Join every per-(season, team) table on explicit keys. The pace merge previously
# relied on pandas' implicit "all common columns" key selection; spelling the keys
# out keeps it consistent with the other merges and safe if columns are added later.
season_team_keys = ['season', 'team']
team_projs = (
    games_per_season
    .merge(team_paces.rename(columns={'posteam': 'team'}), how='inner', on=season_team_keys)
    .merge(offense_proes.rename(columns={'posteam': 'team'}), how='inner', on=season_team_keys)
    .merge(defense_proes.rename(columns={'defteam': 'team'}), how='inner', on=season_team_keys)
    .merge(offense_rz_proes.rename(columns={'posteam': 'team'}), how='inner', on=season_team_keys)
    .merge(defense_rz_proes.rename(columns={'defteam': 'team'}), how='inner', on=season_team_keys)
    .merge(off_penalties.rename(columns={'posteam': 'team'}), how='inner', on=season_team_keys)
    .merge(def_penalties.rename(columns={'defteam': 'team'}), how='inner', on=season_team_keys)
)

# A team-season's weight is its game count times the recency weight of the season
# (team_proj_season_weight returns 0.0 for seasons older than the previous one).
team_projs['season_weight'] = team_projs['games'] * team_projs['season'].apply(team_proj_season_weight)
avg_cols = ['pace_z', 'offense_proe', 'offense_rz_proe', 'defense_proe', 'defense_rz_proe', 'offense_penalty_z', 'defense_penalty_z']
# Pre-multiply each metric by its weight so a per-team sum yields the weighted numerator.
for c in avg_cols:
    team_projs[c] = team_projs[c] * team_projs['season_weight']

# Aggregate only team-seasons that carry weight. Zero-weight rows add nothing to the
# sums, but a team with *only* zero-weight rows would otherwise end up as 0 / 0 = NaN
# in every column and be written out as an all-empty row.
weighted_team_projs = team_projs[team_projs['season_weight'] > 0]
team_season_projs = (
    weighted_team_projs
    .groupby('team', as_index=False)
    .aggregate({c: 'sum' for c in ['season_weight', *avg_cols]})
)

# Weighted mean = weighted sum / total weight; z-score columns keep 2 decimals, the rest 3.
for c in avg_cols:
    round_to = 2 if c.endswith('_z') else 3
    team_season_projs[c] = (team_season_projs[c] / team_season_projs['season_weight']).round(decimals=round_to)

# Reassign rather than dropping in place so the cell has no hidden mutation step.
team_season_projs = team_season_projs.drop(columns=['season_weight'])
team_season_projs

In [None]:
# Write the per-team projections to teams.csv under BASELINES_PATH, omitting the index column.
team_season_projs.to_csv(f'{BASELINES_PATH}/teams.csv', index=False)