In [None]:
from typing import List, Optional

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, PoissonRegressor

from pbp.data import (
    CURRENT_SEASON,
    MODELS_PATH,
    LOGR_KWARGS,
    POIS_KWARGS,
    add_possdiff_features,
    load_pbp_data,
    possdiff_plus_n,
)

# Lift pandas' display truncation: passing None removes the row/column cap.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [None]:
# Yardage offset subtracted from `kick_distance` when building the target of
# the kickoff distance model fitted later in this notebook.
MIN_KICKOFF_DISTANCE = 25

In [None]:
# Load the raw play-by-play once per kernel session and keep it in `pbp_copy`,
# so re-running the notebook does not repeat the load.
# The guard checks the name this cell actually defines (`pbp_copy`) instead of
# probing `pbp` inside a bare `except:`, which hid every kind of error
# (including KeyboardInterrupt) behind a reload.
if 'pbp_copy' not in globals():
    pbp_copy = load_pbp_data(range(2019, CURRENT_SEASON + 1))

In [None]:
# Work on a fresh copy so this cell can be re-run without compounding edits.
pbp = pbp_copy.copy()

drive_keys = ['game_id','drive','posteam','defteam']

# Only rows with both teams and both scores present are used to identify the
# offense/defense of a drive.
has_teams_and_scores = (
    pbp[['posteam', 'defteam', 'posteam_score', 'defteam_score']]
    .notna()
    .all(axis=1)
)
drive_rows = pbp.loc[has_teams_and_scores, drive_keys]
first_row_per_key = drive_rows.groupby(drive_keys, as_index=False).nth(0)
drive_teams = first_row_per_key.rename(
    columns={'posteam': 'drive_offense', 'defteam': 'drive_defense'}
)

pbp['playcall'] = ''

# The merge can fan a play out into several rows when a (game_id, drive) pair
# has more than one team combination; keep the first row per play.
merged_plays = pbp.merge(drive_teams, on=['game_id', 'drive'], how='left')
pbp = merged_plays.drop_duplicates(subset=['play_id','game_id'])

# Score aliases used by the downstream feature columns.
pbp['off_score'] = pbp['posteam_score']
pbp['def_score'] = pbp['defteam_score']

In [None]:
# --- Classify timeout rows -------------------------------------------------
# A timeout row is a `no_play` whose `timeout` indicator equals 1. It is
# attributed to the drive's offense or defense via `timeout_team`; anything
# else is treated as a TV/official timeout.
timeouts = (pbp['play_type'] == 'no_play') & (pbp['timeout'] == 1)
off_timeout = timeouts & (pbp['timeout_team'] == pbp['drive_offense'])
def_timeout = timeouts & (pbp['timeout_team'] == pbp['drive_defense'])
tv_timeout = timeouts & (~off_timeout) & (~def_timeout)

pbp.loc[off_timeout, 'playcall'] = 'OFFENSIVE_TIMEOUT'
pbp.loc[def_timeout, 'playcall'] = 'DEFENSIVE_TIMEOUT'

# --- Per-team timeout flags ------------------------------------------------
pbp['home_timeout'] = 0
pbp['away_timeout'] = 0

# BUG FIX: these rows used to be selected with
# `pbp['timeout'] == 'OFFENSIVE_TIMEOUT'` / `'DEFENSIVE_TIMEOUT'`. `timeout` is
# the indicator compared with 1 above, while those labels are written to
# `playcall`, so the selection never matched and every flag stayed 0. The
# boolean masks computed above are used directly instead.
offense_is_home = (pbp['drive_offense'] == pbp['home_team']).astype(int)
defense_is_home = (pbp['drive_defense'] == pbp['home_team']).astype(int)
offense_is_away = (pbp['drive_offense'] == pbp['away_team']).astype(int)
defense_is_away = (pbp['drive_defense'] == pbp['away_team']).astype(int)

pbp.loc[off_timeout, 'home_timeout'] = offense_is_home
pbp.loc[def_timeout, 'home_timeout'] = defense_is_home

pbp.loc[off_timeout, 'away_timeout'] = offense_is_away
pbp.loc[def_timeout, 'away_timeout'] = defense_is_away

# --- De-duplicate back-to-back timeout rows --------------------------------
# Two consecutive rows charging the same side are counted once: the second row
# is flagged as a duplicate and its per-team flag is cleared.
pbp['home_timeout_prev'] = pbp['home_timeout'].shift(1)
pbp['away_timeout_prev'] = pbp['away_timeout'].shift(1)
pbp['duplicate_timeout'] = (
    ((pbp['home_timeout_prev'] == 1) & (pbp['home_timeout'] == 1))
    | ((pbp['away_timeout_prev'] == 1) & (pbp['away_timeout'] == 1))
)
pbp.loc[pbp['home_timeout_prev'] == 1, 'home_timeout'] = 0
pbp.loc[pbp['away_timeout_prev'] == 1, 'away_timeout'] = 0

# Duplicates are tagged through `playcall` (and `duplicate_timeout`) only. The
# former write of the string 'DUPLICATE' into `timeout` is dropped: it never
# matched any row before the fix above, and it would now put strings into the
# indicator column that the first line of this cell compares with 1.
pbp.loc[pbp['duplicate_timeout'], 'playcall'] = 'DUPLICATE_TIMEOUT'

# --- Timeouts remaining ----------------------------------------------------
# Running count of charged timeouts per team within each half of each game.
pbp['total_home_timeouts'] = pbp.groupby(['game_id','home_team', 'game_half'])['home_timeout'].cumsum()
pbp['total_away_timeouts'] = pbp.groupby(['game_id','away_team', 'game_half'])['away_timeout'].cumsum()

# Default to a full allotment of 3, then subtract what each side has used.
pbp['off_timeouts_left'] = 3
pbp['def_timeouts_left'] = 3

pbp.loc[pbp['drive_offense'] == pbp['home_team'], 'off_timeouts_left'] = 3 - pbp['total_home_timeouts']
pbp.loc[pbp['drive_offense'] == pbp['away_team'], 'off_timeouts_left'] = 3 - pbp['total_away_timeouts']

pbp.loc[pbp['drive_defense'] == pbp['home_team'], 'def_timeouts_left'] = 3 - pbp['total_home_timeouts']
pbp.loc[pbp['drive_defense'] == pbp['away_team'], 'def_timeouts_left'] = 3 - pbp['total_away_timeouts']

# Prefer the counts shipped with the data; the derived counts only fill rows
# where those are missing. Values are clamped to the valid 0..3 range.
pbp['off_timeouts_remaining'] = pbp['posteam_timeouts_remaining'].combine_first(pbp['off_timeouts_left']).astype(int).clip(0, 3)
pbp['def_timeouts_remaining'] = pbp['defteam_timeouts_remaining'].combine_first(pbp['def_timeouts_left']).astype(int).clip(0, 3)


In [None]:
# Restrict the play-by-play to kickoffs; copy so later edits don't touch `pbp`.
kickoff_filter = pbp['play_type'].eq('kickoff')
kickoff_df = pbp.loc[kickoff_filter].copy()

In [None]:
# Raw play-by-play columns carried into the modelling frame.
kickoff_cols = [
    # kick outcome
    'kick_distance', 'touchback', 'own_kickoff_recovery',
    'return_yards', 'return_touchdown',
    # field position and clock
    'yardline_100', 'game_seconds_remaining', 'half_seconds_remaining',
    # timeouts (one-hot encoded later)
    'off_timeouts_remaining', 'def_timeouts_remaining',
    # teams, score and venue
    'home_team', 'posteam', 'defteam',
    'off_score', 'def_score',
    'location',
]

# Look-ahead offsets for the `possdiff_plus_{n}` feature family.
possdiff_n = [1, 2]

# One dummy column per timeout count (0..3), offense first, then defense.
timeout_dummy_features = [
    f'{side}_timeouts_remaining_{count}'
    for side in ('off', 'def')
    for count in range(4)
]

# For each offset: the raw value followed by its per-minute rate.
possdiff_lookahead_features = [
    feature
    for offset in possdiff_n
    for feature in (f'possdiff_plus_{offset}', f'possdiff_p{offset}_per_minute')
]

# Order matters: fitted coefficients are later paired with feature names by
# position when the Rust source is generated.
kickoff_features = [
    'inv_half_minutes',
    'log_inv_half_minutes',
    'inv_game_minutes',
    'log_inv_game_minutes',
    'possession_diff',
    'fg_possession_diff',
    'possdiff_per_minute',
    'fgpossdiff_per_minute',
    *timeout_dummy_features,
    'garbage_time_win',
    'garbage_time_loss',
    *possdiff_lookahead_features,
    'kicked_from_pct',
    'log_kicked_from_pct',
]

In [None]:
# One-hot encode the two timeout counts; every other selected column is kept as is.
timeout_count_cols = ['off_timeouts_remaining', 'def_timeouts_remaining']
dataset = pd.get_dummies(kickoff_df[kickoff_cols], columns=timeout_count_cols)

# Kick-type labels are derived from kick distance: out of bounds kicks = 25 yards.
# NOTE(review): a kick of exactly 24 yards receives none of the three labels
# below — confirm that `< 24` (rather than `< 25`) is intended.
kick_distance = dataset['kick_distance']
dataset['onside_kick'] = kick_distance.lt(24).astype(int)
dataset['out_of_bounds_kick'] = kick_distance.eq(25).astype(int)
dataset['deep_kick'] = (kick_distance.gt(25) & dataset['touchback'].eq(0)).astype(int)
dataset['kicked_from_pct'] = dataset['yardline_100'].div(100.0)

# Adds the possession-difference / clock features in place.
# NOTE(review): `game_minutes_left`, `inv_half_minutes` and `inv_game_minutes`
# are presumably created by this call, since they are read below — confirm.
add_possdiff_features(dataset)
for n in possdiff_n:
    plus_col = f'possdiff_plus_{n}'
    dataset[plus_col] = dataset.apply(possdiff_plus_n, axis=1, args=(n,))
    dataset[f'possdiff_p{n}_per_minute'] = dataset[plus_col] / dataset['game_minutes_left']

# Log transforms; the kick spot is floored at 0.01 so the log stays finite.
dataset['log_kicked_from_pct'] = np.log(dataset['kicked_from_pct'].clip(lower=0.01, upper=1))
dataset['log_inv_half_minutes'] = np.log(dataset['inv_half_minutes'])
dataset['log_inv_game_minutes'] = np.log(dataset['inv_game_minutes'])

In [None]:
# Binary classifier over all kickoffs: was the kick labelled as an onside kick?
onside_features = dataset[kickoff_features]
onside_labels = dataset['onside_kick']
onside_kick_model = LogisticRegression(**LOGR_KWARGS)
onside_kick_model.fit(onside_features, onside_labels)

In [None]:
# Despite its name, `is_onside` is True for kicks that are NOT onside kicks
# (onside_kick == 0); the out-of-bounds model is trained on those rows only.
is_onside = dataset['onside_kick'].eq(0)
non_onside_kicks = dataset.loc[is_onside]

out_of_bounds_kick_model = LogisticRegression(**LOGR_KWARGS)
out_of_bounds_kick_model.fit(
    non_onside_kicks[kickoff_features],
    non_onside_kicks['out_of_bounds_kick'],
)

In [None]:
# Training rows: any kick flagged as deep, a touchback, or out of bounds.
deep_kicks = dataset[['deep_kick', 'touchback', 'out_of_bounds_kick']].eq(1).any(axis=1)
touchback_candidates = dataset.loc[deep_kicks]

touchback_model = LogisticRegression(**LOGR_KWARGS)
touchback_model.fit(touchback_candidates[kickoff_features], touchback_candidates['touchback'])

In [None]:
# The distance model uses the same inputs as the kick-type classifiers.
distance_features = list(kickoff_features)
deep_kicks_df = dataset.loc[dataset['deep_kick'].eq(1)].copy()

# Landing spot as a fraction of the field: kick spot plus kick distance.
kicked_to_pct = deep_kicks_df['kicked_from_pct'] + deep_kicks_df['kick_distance'] / 100.0
deep_kicks_df['kicked_to_pct'] = kicked_to_pct
# Clamp into [0.01, 1.0] before taking the log so it stays finite.
deep_kicks_df['log_kicked_to_pct'] = np.log(kicked_to_pct.clip(0.01, 1.0))

In [None]:
# Poisson regression on the yards kicked beyond MIN_KICKOFF_DISTANCE.
# NOTE(review): unlike the return-yards regressors later in this notebook, this
# model is constructed without POIS_KWARGS — confirm that is intentional.
extra_kick_yards = deep_kicks_df['kick_distance'] - MIN_KICKOFF_DISTANCE
kickoff_distance_model = PoissonRegressor()
kickoff_distance_model.fit(deep_kicks_df[distance_features], extra_kick_yards)

In [None]:
# Return models also see where the kick landed, on top of the distance features.
kickoff_dist_features = distance_features + ['kicked_to_pct', 'log_kicked_to_pct']

return_td_inputs = deep_kicks_df[kickoff_dist_features]
kickoff_return_td_model = LogisticRegression(**LOGR_KWARGS)
kickoff_return_td_model.fit(return_td_inputs, deep_kicks_df['return_touchdown'])

In [None]:
# Return-yardage models are fitted on non-negative returns that did not score
# (always assume it's for >= 0 yardage).
non_negative_return = deep_kicks_df['return_yards'].ge(0)
no_return_td = deep_kicks_df['return_touchdown'].eq(0)
returned_kickoffs_no_td = deep_kicks_df.loc[non_negative_return & no_return_td].copy()

return_inputs = returned_kickoffs_no_td[kickoff_dist_features]
return_yards = returned_kickoffs_no_td['return_yards']

# Mean model: expected return yards.
kick_return_yards_model = PoissonRegressor(**POIS_KWARGS)
kick_return_yards_model.fit(return_inputs, return_yards)
ins_preds = kick_return_yards_model.predict(return_inputs)

# Variance model: regress the squared in-sample residuals on the same inputs.
squared_residuals = (return_yards - ins_preds) ** 2
kick_return_yards_var_model = PoissonRegressor(**POIS_KWARGS)
kick_return_yards_var_model.fit(return_inputs, squared_residuals)

In [None]:
# print('pub struct KickoffModel {')
# print('    intercept: f32,')
# for feature in kickoff_dist_features:
#     print(f'    {feature}: f32,')
# print('}')

In [None]:
def format_logr_coefs(model: "LogisticRegression", name: str, features: Optional[List[str]] = None) -> List[str]:
    """Render a fitted logistic model as a Rust `KickoffModel` constructor.

    Args:
        model: Fitted binary classifier. Only `intercept_[0]` and `coef_[0]`
            are read, so any object exposing those attributes works (the
            annotation is quoted for that reason).
        name: Prefix of the generated Rust function, emitted as `{name}_coef`.
        features: Ordered field names of the Rust struct. Defaults to the
            notebook-level `kickoff_dist_features`, looked up at call time.
            Coefficients are paired with the leading entries by position;
            fields beyond the model's coefficients are emitted as `0.0`.

    Returns:
        The lines of Rust source for one `pub fn {name}_coef()` item.

    Raises:
        ValueError: If the model has more coefficients than there are field
            names (previously the surplus was silently dropped by `zip`).
    """
    if features is None:
        features = kickoff_dist_features

    coefs = model.coef_[0]
    if len(coefs) > len(features):
        raise ValueError(
            f"{name}: model has {len(coefs)} coefficients but only {len(features)} feature names"
        )

    lines = [
        f"    pub fn {name}_coef() -> KickoffModel {{",
        "        KickoffModel {",
        f"            intercept: {model.intercept_[0]:.4f},",
    ]
    # The loop variable no longer reuses `name`, which shadowed the parameter.
    lines.extend(f"            {feature}: {coef:.4f}," for feature, coef in zip(features, coefs))
    # Struct fields this model was not trained on get a zero weight.
    lines.extend(f"            {feature}: 0.0," for feature in features[len(coefs):])
    lines.extend(["        }", "    }"])
    return lines

def format_linr_coefs(model: "PoissonRegressor", name: str, features: Optional[List[str]] = None) -> List[str]:
    """Render a fitted regressor as a Rust `KickoffModel` constructor.

    Args:
        model: Fitted regressor with a scalar `intercept_` and a 1-D `coef_`.
            Only those two attributes are read, so any object exposing them
            works (the annotation is quoted for that reason).
        name: Prefix of the generated Rust function, emitted as `{name}_coef`.
        features: Ordered field names of the Rust struct. Defaults to the
            notebook-level `kickoff_dist_features`, looked up at call time.
            Coefficients are paired with the leading entries by position;
            fields beyond the model's coefficients are emitted as `0.0`.

    Returns:
        The lines of Rust source for one `pub fn {name}_coef()` item.

    Raises:
        ValueError: If the model has more coefficients than there are field
            names (previously the surplus was silently dropped by `zip`).
    """
    if features is None:
        features = kickoff_dist_features

    coefs = model.coef_
    if len(coefs) > len(features):
        raise ValueError(
            f"{name}: model has {len(coefs)} coefficients but only {len(features)} feature names"
        )

    lines = [
        f"    pub fn {name}_coef() -> KickoffModel {{",
        "        KickoffModel {",
        f"            intercept: {model.intercept_:.4f},",
    ]
    # The loop variable no longer reuses `name`, which shadowed the parameter.
    lines.extend(f"            {feature}: {coef:.4f}," for feature, coef in zip(features, coefs))
    # Struct fields this model was not trained on get a zero weight.
    lines.extend(f"            {feature}: 0.0," for feature in features[len(coefs):])
    lines.extend(["        }", "    }"])
    return lines

In [None]:

# Assemble the generated Rust module: a `use` line plus an `impl KickoffModel`
# holding one `*_coef()` constructor per fitted model, in this fixed order.
coef_sections = (
    (format_logr_coefs, onside_kick_model, "onside_kick"),
    (format_logr_coefs, out_of_bounds_kick_model, "out_of_bounds_kick"),
    (format_logr_coefs, touchback_model, "touchback"),
    (format_linr_coefs, kickoff_distance_model, "kickoff_distance"),
    (format_logr_coefs, kickoff_return_td_model, "kickoff_return_td"),
    (format_linr_coefs, kick_return_yards_model, "kickoff_return_yards"),
    (format_linr_coefs, kick_return_yards_var_model, "kickoff_return_yards_var"),
)

model_str_parts = ["use crate::models::kickoff::KickoffModel;", "\nimpl KickoffModel {"]
for formatter, fitted_model, fn_prefix in coef_sections:
    model_str_parts.extend(formatter(fitted_model, fn_prefix))
model_str_parts.append("}")

# Overwrite the generated source file with the freshly fitted coefficients.
coef_path = f'{MODELS_PATH}/kickoff/coef.rs'
with open(coef_path, 'w') as f:
    f.write("\n".join(model_str_parts))