In [None]:
from typing import List
import pandas as pd
from sklearn.linear_model import LogisticRegression

from pbp.data import (
    CURRENT_SEASON,
    MODELS_PATH,
    LOGR_KWARGS,
    add_possdiff_features,
    load_pbp_data,
    possdiff_plus_n,
)

# Lift pandas' display truncation: with both limits set to None, frames are
# rendered with every row and every column.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [None]:
# Load the raw play-by-play data only once per kernel session.
# `pbp_copy` is the cached object, so its presence is checked directly rather
# than probing the derived `pbp` frame behind a bare `except`, which would also
# have swallowed unrelated errors and KeyboardInterrupt.
if 'pbp_copy' not in globals():
    pbp_copy = load_pbp_data(range(2019, CURRENT_SEASON + 1))

In [None]:
# Start from a fresh copy of the cached load so this cell can be re-run.
pbp = pbp_copy.copy()

# Build the drive -> (offense, defense) lookup from rows where both teams and
# both scores are populated; one row is kept per distinct
# (game_id, drive, posteam, defteam) combination.
drive_keys = ['game_id','drive','posteam','defteam']
has_teams_and_scores = (
    pbp[['posteam', 'defteam', 'posteam_score', 'defteam_score']]
    .notna()
    .all(axis=1)
)
drive_teams = (
    pbp.loc[has_teams_and_scores, drive_keys]
    .groupby(drive_keys, as_index=False)
    .nth(0)
    .rename(columns={'posteam': 'drive_offense', 'defteam': 'drive_defense'})
)

# `playcall` starts empty and is filled in by later cells.
pbp['playcall'] = ''

# Attach the drive-level teams to every play. A drive that matched more than
# one lookup row fans out in the merge, so only the first match per play is
# kept. The score columns are then aliased from the possession team's
# perspective.
pbp = (
    pbp
    .merge(drive_teams, on=['game_id', 'drive'], how='left')
    .drop_duplicates(subset=['play_id','game_id'])
    .assign(
        off_score=lambda plays: plays['posteam_score'],
        def_score=lambda plays: plays['defteam_score'],
    )
)

In [None]:
# Classify timeouts and derive how many timeouts each side has left.

# A timeout row is a 'no_play' whose `timeout` flag equals 1; it is attributed
# to the drive's offense or defense by comparing `timeout_team` with the
# drive-level teams.
timeouts = (pbp['play_type'] == 'no_play') & (pbp['timeout'] == 1)
off_timeout = timeouts & (pbp['timeout_team'] == pbp['drive_offense'])
def_timeout = timeouts & (pbp['timeout_team'] == pbp['drive_defense'])
# Timeouts charged to neither team on the drive (not used further below).
tv_timeout = timeouts & (~off_timeout) & (~def_timeout)

pbp.loc[off_timeout, 'playcall'] = 'OFFENSIVE_TIMEOUT'
pbp.loc[def_timeout, 'playcall'] = 'DEFENSIVE_TIMEOUT'

pbp['home_timeout'] = 0
pbp['away_timeout'] = 0

# The OFFENSIVE_/DEFENSIVE_TIMEOUT labels live in `playcall`. `timeout` is the
# numeric flag compared with 1 above, so matching it against these strings
# would never select a row and every counter below would stay at zero.
is_off_timeout = pbp['playcall'] == 'OFFENSIVE_TIMEOUT'
is_def_timeout = pbp['playcall'] == 'DEFENSIVE_TIMEOUT'

# The right-hand Series are full length; `.loc` assignment aligns them on the
# index, so only the masked rows receive a value.
pbp.loc[is_off_timeout, 'home_timeout'] = (pbp['drive_offense'] == pbp['home_team']).astype(int)
pbp.loc[is_def_timeout, 'home_timeout'] = (pbp['drive_defense'] == pbp['home_team']).astype(int)

pbp.loc[is_off_timeout, 'away_timeout'] = (pbp['drive_offense'] == pbp['away_team']).astype(int)
pbp.loc[is_def_timeout, 'away_timeout'] = (pbp['drive_defense'] == pbp['away_team']).astype(int)

# A timeout by the same side on two consecutive rows is treated as a duplicate
# record: the flag is computed first, then the repeated row is zeroed so it is
# not counted twice.
pbp['home_timeout_prev'] = pbp['home_timeout'].shift(1)
pbp['away_timeout_prev'] = pbp['away_timeout'].shift(1)
pbp['duplicate_timeout'] = (
    ((pbp['home_timeout_prev'] == 1) & (pbp['home_timeout'] == 1))
    | ((pbp['away_timeout_prev'] == 1) & (pbp['away_timeout'] == 1))
)
pbp.loc[pbp['home_timeout_prev'] == 1, 'home_timeout'] = 0
pbp.loc[pbp['away_timeout_prev'] == 1, 'away_timeout'] = 0

# Only `playcall` is relabelled. The raw `timeout` flag is left untouched:
# writing a string into it would change its dtype and make the `timeout == 1`
# test at the top of this cell miss those rows on a re-run.
pbp.loc[pbp['duplicate_timeout'], 'playcall'] = 'DUPLICATE_TIMEOUT'

# Running count of timeouts used by each side within each half of each game.
pbp['total_home_timeouts'] = pbp.groupby(['game_id','home_team', 'game_half'])['home_timeout'].cumsum()
pbp['total_away_timeouts'] = pbp.groupby(['game_id','away_team', 'game_half'])['away_timeout'].cumsum()

# Default to a full allotment of 3, then subtract the timeouts used by
# whichever side is on offense / defense for the drive.
pbp['off_timeouts_left'] = 3
pbp['def_timeouts_left'] = 3

pbp.loc[pbp['drive_offense'] == pbp['home_team'], 'off_timeouts_left'] = 3 - pbp['total_home_timeouts']
pbp.loc[pbp['drive_offense'] == pbp['away_team'], 'off_timeouts_left'] = 3 - pbp['total_away_timeouts']

pbp.loc[pbp['drive_defense'] == pbp['home_team'], 'def_timeouts_left'] = 3 - pbp['total_home_timeouts']
pbp.loc[pbp['drive_defense'] == pbp['away_team'], 'def_timeouts_left'] = 3 - pbp['total_away_timeouts']

# Prefer the timeouts-remaining columns already present in the data and fall
# back to the reconstructed counts only where they are missing; the result is
# clipped to the 0..3 range.
pbp['off_timeouts_remaining'] = pbp['posteam_timeouts_remaining'].combine_first(pbp['off_timeouts_left']).astype(int).clip(0, 3)
pbp['def_timeouts_remaining'] = pbp['defteam_timeouts_remaining'].combine_first(pbp['def_timeouts_left']).astype(int).clip(0, 3)


In [None]:
# Keep only point-after-touchdown plays: rows flagged as an extra-point
# attempt or as a two-point attempt.
pat_filter = (
    pbp[['extra_point_attempt', 'two_point_attempt']]
    .eq(1.0)
    .any(axis=1)
)
pat_df = pbp.loc[pat_filter].copy()

In [None]:
# Columns carried from the PAT plays into the modelling frame.
# `two_point_attempt` is the target; the rest feed feature construction.
pat_cols = [
    'two_point_attempt',
    # clock
    'game_seconds_remaining',
    'half_seconds_remaining',
    # timeouts (one-hot encoded later)
    'off_timeouts_remaining',
    'def_timeouts_remaining',
    # teams
    'home_team',
    'posteam',
    'defteam',
    # score
    'off_score',
    'def_score',
    'location',
]

# Offsets for which "possession diff plus n" features are generated.
possdiff_n = [1, 2]

# Model inputs, in the exact order the coefficients are exported.
pat_features = [
    'inv_half_minutes',
    'log_inv_half_minutes',
    'inv_game_minutes',
    'log_inv_game_minutes',
    'possession_diff',
    'fg_possession_diff',
    'possdiff_per_minute',
    'fgpossdiff_per_minute',
    # one-hot timeout counts: offense 0..3, then defense 0..3
    *(
        f'{side}_timeouts_remaining_{count}'
        for side in ('off', 'def')
        for count in range(4)
    ),
    'garbage_time_win',
    'garbage_time_loss',
    # one pair of columns per offset in `possdiff_n`
    *(
        name
        for n in possdiff_n
        for name in (f'possdiff_plus_{n}', f'possdiff_p{n}_per_minute')
    ),
]

In [None]:
# One-hot encode the two timeout counts; the dummy columns are named
# '<column>_<value>', which is what `pat_features` refers to.
timeout_columns = ['off_timeouts_remaining', 'def_timeouts_remaining']
dataset = pd.get_dummies(pat_df[pat_cols], columns=timeout_columns)

# The return value is ignored, so this presumably adds its feature columns to
# `dataset` in place (including `game_minutes_left`, read below) -- TODO confirm
# against pbp.data.
add_possdiff_features(dataset)

# For each offset n: a row-wise possdiff_plus_n value and the same value
# divided by the game minutes left.
for n in possdiff_n:
    plus_col = f'possdiff_plus_{n}'
    dataset[plus_col] = dataset.apply(possdiff_plus_n, axis=1, args=(n,))
    dataset[f'possdiff_p{n}_per_minute'] = dataset[plus_col] / dataset['game_minutes_left']

In [None]:
# Fit the go-for-two classifier: features are the `pat_features` columns,
# target is the `two_point_attempt` flag.
train_X = dataset[pat_features]
train_y = dataset['two_point_attempt']

go_for_2_model = LogisticRegression(**LOGR_KWARGS)
go_for_2_model.fit(X=train_X, y=train_y)

In [None]:
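# Disabled scratch snippet: uncomment to print a Rust struct definition with
# an `intercept` field plus one f32 field per entry in `pat_features`.
# print('pub struct TwoPointAttemptModel {')
# print('    intercept: f32,')
# for feature in pat_features:
#     print(f'    {feature}: f32,')
# print('}')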

In [None]:

def format_go_for_2_coefs(model: LogisticRegression) -> List[str]:
    """Render a fitted model as the lines of a Rust `coefs()` constructor.

    Args:
        model: Fitted logistic regression; `intercept_[0]` and `coef_[0]`
            are read.

    Returns:
        Source lines (without trailing newlines) for a
        `pub fn coefs() -> TwoPointAttemptModel` body. The intercept comes
        first, followed by one field per coefficient. Coefficients are
        paired positionally with the module-level `pat_features` list and
        all values are formatted to four decimal places.
    """
    field_lines = [
        f"            {name}: {coef:.4f},"
        for name, coef in zip(pat_features, model.coef_[0])
    ]
    return [
        "    pub fn coefs() -> TwoPointAttemptModel {",
        "        TwoPointAttemptModel {",
        f"            intercept: {model.intercept_[0]:.4f},",
        *field_lines,
        "        }",
        "    }",
    ]

In [None]:

# Assemble the Rust source: the `use` line, then an `impl` block wrapping the
# generated `coefs()` constructor.
model_str_parts = [
    "use crate::models::two_point_attempt::TwoPointAttemptModel;",
    "\nimpl TwoPointAttemptModel {",
    *format_go_for_2_coefs(go_for_2_model),
    "}",
]
rust_source = "\n".join(model_str_parts)

# Overwrite the coefficient file under MODELS_PATH.
with open(f'{MODELS_PATH}/two_point_attempt/coef.rs', 'w') as coef_file:
    coef_file.write(rust_source)