In [None]:
import pandas as pd
import numpy as np
import nfl_data_py
import math
import os
from typing import List
from sklearn.linear_model import LogisticRegression, Ridge, PoissonRegressor

from pbp.data import (
    CURRENT_SEASON,
    BASELINES_PATH,
    MODELS_PATH,
    LOGR_KWARGS,
    POIS_KWARGS,
    state_features,
    add_clock_stops,
    add_playcall_features,
    load_pbp_data,
    make_penalty_zs,
    make_proe_data,
)

# Never truncate frames when they are displayed in this notebook.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [None]:
# Seasons covered by the analysis: 2017 through CURRENT_SEASON, inclusive.
SEASONS = [*range(2017, CURRENT_SEASON + 1)]

# Baseline yards per play type; make_run_result_df adds a rusher's yards-over-expected
# to these before taking the log.
YARDS_PER_DESIGNED_RUN = 4.25
YARDS_PER_SCRAMBLE = 7.35
# Lower clip bound applied to projected yards so the log stays finite.
RUSHING_EPSILON = 0.01

In [None]:
# Only these positions are kept when building the current-season roster table.
CARRY_ELIGIBLE_POSITIONS = {'QB','RB','FB','WR','TE'}
this_season_rosters = nfl_data_py.import_seasonal_rosters(years=[CURRENT_SEASON])
rosters = (
    this_season_rosters
    .loc[lambda roster: roster['position'].isin(CARRY_ELIGIBLE_POSITIONS)]
    .rename(columns={'player_id': 'rusher_id'})
)

# Lookup of display name and position keyed by `rusher_id` (the players table's gsis_id).
players = nfl_data_py.import_players()
rusher_names = (
    players[['gsis_id','display_name','position']]
    .rename(columns={'gsis_id': 'rusher_id', 'display_name': 'name'})
)

In [None]:
# Load the play-by-play data only once per kernel; re-running this cell is a no-op.
# The guard checks `pbp_copy` itself (the object this cell creates) instead of
# catching every exception raised while probing the derived `pbp` frame, so real
# errors and KeyboardInterrupt are no longer swallowed.
if 'pbp_copy' not in globals():
    pbp_copy = load_pbp_data(SEASONS)

In [None]:
# Start from a fresh copy of the loaded data so re-running this cell does not
# accumulate changes from earlier runs.
pbp = pbp_copy.copy()

# Build a per-drive lookup of offense/defense and score, using only rows where
# all four of those fields are present.
drive_keys = ['game_id','drive','posteam','defteam', 'posteam_score', 'defteam_score']
drive_teams = (
    pbp[~pbp['posteam'].isna() & ~pbp['defteam'].isna() & ~pbp['posteam_score'].isna() & ~pbp['defteam_score'].isna()]
    [drive_keys]
    # one row per distinct combination of the key columns
    .groupby(drive_keys, as_index=False)
    .nth(0)
    .rename(columns={'posteam': 'drive_offense', 'defteam': 'drive_defense', 'posteam_score': 'off_score', 'defteam_score': 'def_score'})
)
# `playcall` starts empty and is filled in by later cells.
pbp['playcall'] = ''
# A (game_id, drive) pair can match several lookup rows, so the merge may fan out;
# drop_duplicates keeps the first match for each play.
pbp = pbp.merge(drive_teams, on=['game_id', 'drive'], how='left').drop_duplicates(subset=['play_id','game_id'])

# Where the drive lookup had no match, fall back to the play's own score columns.
pbp.loc[pbp['off_score'].isna(), 'off_score'] = pbp['posteam_score']
pbp.loc[pbp['def_score'].isna(), 'def_score'] = pbp['defteam_score']

In [None]:
# The four frames returned here are left-merged onto the modelling dataset by
# (posteam | defteam, season) in a later cell.
(offense_proes, defense_proes, offense_rz_proes, defense_rz_proes) = make_proe_data(pbp)

In [None]:
"""
DESIGNED RUSH OUTCOMES

'fumble_lost' = 1
    'return_touchdown' = 1
'safety' = 1
'rush_touchdown' = 1

'yards_gained' (non-everything else)
"""
offensive_penalty = pbp['penalty_team'] == pbp['posteam']
defensive_penalty = pbp['penalty_team'] == pbp['defteam']

# A designed run is a run play that is not a scramble, a kneel or a two-point try.
designed_runs = (
    (pbp['play_type'] == 'run')
    & (pbp[['qb_scramble', 'qb_kneel', 'two_point_attempt']] == 0).all(axis=1)
)

# designed run & sub-outcomes
pbp.loc[designed_runs, 'playcall'] = 'DESIGNED_RUN'

# Outcomes are applied in order, so a later entry overrides an earlier one
# when several apply to the same play.
pbp['designed_run'] = ''
for _outcome, _applies in (
    ('YARDS', designed_runs),
    ('OFFENSIVE_PENALTY', designed_runs & (pbp['penalty'] == 1) & offensive_penalty),
    ('DEFENSIVE_PENALTY', designed_runs & (pbp['penalty'] == 1) & defensive_penalty),
    ('FUMBLE_LOST', designed_runs & (pbp['fumble_lost'] == 1)),
    ('RUSHING_TD', designed_runs & (pbp['rush_touchdown'] == 1)),
    ('SAFETY', designed_runs & (pbp['safety'] == 1)),
):
    pbp.loc[_applies, 'designed_run'] = _outcome

In [None]:
# Flag, per row, whether the home / away team called a timeout.
pbp['home_timeout'] = 0
pbp['away_timeout'] = 0

# NOTE(review): this compares pbp['timeout'] against string labels; in this notebook
# such labels are only assigned in a later cell, so this assumes load_pbp_data
# already provides them — TODO confirm.
pbp.loc[pbp['timeout'] == 'OFFENSIVE_TIMEOUT', 'home_timeout'] = (pbp['drive_offense'] == pbp['home_team']).astype(int)
pbp.loc[pbp['timeout'] == 'DEFENSIVE_TIMEOUT', 'home_timeout'] = (pbp['drive_defense'] == pbp['home_team']).astype(int)

pbp.loc[pbp['timeout'] == 'OFFENSIVE_TIMEOUT', 'away_timeout'] = (pbp['drive_offense'] == pbp['away_team']).astype(int)
pbp.loc[pbp['timeout'] == 'DEFENSIVE_TIMEOUT', 'away_timeout'] = (pbp['drive_defense'] == pbp['away_team']).astype(int)

# A timeout flagged on two consecutive rows for the same team is treated as a
# duplicate record. The shift runs over the whole frame, not per game.
pbp['home_timeout_prev'] = pbp['home_timeout'].shift(1)
pbp['away_timeout_prev'] = pbp['away_timeout'].shift(1)
pbp['duplicate_timeout'] = (
    ((pbp['home_timeout_prev'] == 1) & (pbp['home_timeout'] == 1))
    | ((pbp['away_timeout_prev'] == 1) & (pbp['away_timeout'] == 1))
)
# Clear the flag on any row that directly follows a flagged row for that team.
pbp.loc[pbp['home_timeout_prev'] == 1, 'home_timeout'] = 0
pbp.loc[pbp['away_timeout_prev'] == 1, 'away_timeout'] = 0

pbp.loc[pbp['duplicate_timeout'], 'timeout'] = 'DUPLICATE'
pbp.loc[pbp['duplicate_timeout'], 'playcall'] = 'DUPLICATE_TIMEOUT'

# Running count of timeouts used by each side within a game half.
pbp['total_home_timeouts'] = pbp.groupby(['game_id','home_team', 'game_half'])['home_timeout'].cumsum()
pbp['total_away_timeouts'] = pbp.groupby(['game_id','away_team', 'game_half'])['away_timeout'].cumsum()


# Timeouts left for the drive's offense / defense: 3 minus the count used so far.
pbp['off_timeouts_left'] = 3
pbp['def_timeouts_left'] = 3

pbp.loc[pbp['drive_offense'] == pbp['home_team'], 'off_timeouts_left'] = 3 - pbp['total_home_timeouts']
pbp.loc[pbp['drive_offense'] == pbp['away_team'], 'off_timeouts_left'] = 3 - pbp['total_away_timeouts']

pbp.loc[pbp['drive_defense'] == pbp['home_team'], 'def_timeouts_left'] = 3 - pbp['total_home_timeouts']
pbp.loc[pbp['drive_defense'] == pbp['away_team'], 'def_timeouts_left'] = 3 - pbp['total_away_timeouts']

# Prefer the timeouts-remaining columns already in the data; use the computed
# counts only where those are missing. The result is clipped to 0..3.
pbp['off_timeouts_remaining'] = pbp['posteam_timeouts_remaining'].combine_first(pbp['off_timeouts_left']).astype(int).clip(0, 3)
pbp['def_timeouts_remaining'] = pbp['defteam_timeouts_remaining'].combine_first(pbp['def_timeouts_left']).astype(int).clip(0, 3)

# Turn down / quarter into string categories (None when missing); both are
# one-hot encoded when the modelling dataset is built.
# NOTE(review): not idempotent — math.isnan fails on the str/None values this
# produces, so the cell relies on `pbp` being rebuilt from `pbp_copy` before a re-run.
pbp['down'] = pbp['down'].apply(lambda x: str(int(x)) if not math.isnan(x) else None)
pbp['qtr'] = pbp['qtr'].apply(lambda x: str(int(x)) if not math.isnan(x) else None)


In [None]:
# Rebuild `timeout` from `playcall`: timeout play-calls keep their label and every
# other row becomes 'PLAY'. This overwrites any value the column held before.
pbp['timeout'] = pbp['playcall'].where(
    pbp['playcall'].isin(['NEUTRAL_TIMEOUT', 'DEFENSIVE_TIMEOUT', 'OFFENSIVE_TIMEOUT']),
    'PLAY',
)

In [None]:
# Called for its effect on `pbp` (the return value is discarded).
# Presumably adds the `clock_running` / `clock_runs_after` columns that later
# cells select — TODO confirm in pbp.data.
add_clock_stops(pbp)

In [None]:
# Outcome / bookkeeping columns carried through to the modelling dataset.
responses = [
    'timeout',
    'rushing_yards',
    'rush_touchdown',
    'return_touchdown',
    'playcall',
    'designed_run',
    'play_type',
    # other
    'extra_point_attempt',
    'two_point_attempt',
    'penalty_team',
    'rusher_id',
    'passer_id',
    'clock_runs_after',
    'safety',
    'fumble_lost',
    'location',
]

# Game-state columns taken straight from the play-by-play frame.
raw_features = [
    'down',
    'ydstogo',
    'goal_to_go',
    'qtr',
    'clock_running',
    # 'quarter_seconds_remaining',
    'yardline_100',
    'off_score',
    'def_score',
    # 'play_clock',
    'game_seconds_remaining',
    'half_seconds_remaining',
    # 'wp',
    'qb_scramble',
    'off_timeouts_remaining',
    'def_timeouts_remaining',
    # 'total', 'spread_line',
]

# Full column selection: responses, team/season keys, then raw features.
cols = [*responses, 'posteam', 'defteam', 'season', 'home_team', *raw_features]

In [None]:
# Both frames are merged later on (posteam, season) and (defteam, season) respectively.
off_penalties, def_penalties = make_penalty_zs(pbp)

In [None]:
# One-hot encode the categorical state columns, then attach every team-season
# table with a left join on the matching team column.
dataset = pd.get_dummies(
    pbp[cols],
    columns=['down', 'qtr', 'off_timeouts_remaining', 'def_timeouts_remaining'],
)
for _team_stats, _team_col in (
    (offense_proes, 'posteam'),
    (defense_proes, 'defteam'),
    (offense_rz_proes, 'posteam'),
    (defense_rz_proes, 'defteam'),
    (off_penalties, 'posteam'),
    (def_penalties, 'defteam'),
):
    dataset = dataset.merge(_team_stats, how='left', on=[_team_col, 'season'])

# Plays without a rusher fall back to the passer as the ball carrier.
dataset['rusher_id'] = dataset['rusher_id'].where(dataset['rusher_id'].notna(), dataset['passer_id'])
add_playcall_features(dataset)

In [None]:
# Extra feature columns selected alongside `state_features` for the rushing models.
# NOTE(review): the two `*_per_minute` columns are not created in this notebook;
# presumably add_playcall_features adds them — TODO confirm.
common_timeout_features = [
    'clock_running',
    'possdiff_per_minute',
    'fgpossdiff_per_minute',
]

In [None]:
# Exactly one of the one-hot down columns is set on a real down.
is_down = (dataset['down_1'] + dataset['down_2'] + dataset['down_3'] + dataset['down_4']) == 1

# Keep every designed run, plus scrambles that happened on a real down and were
# not wiped out as a no-play.
# NOTE(review): `&` binds tighter than `|`, so the down / no_play conditions only
# restrict the scramble branch; the parentheses below spell out that existing
# behaviour. Confirm that designed runs were not meant to be restricted as well.
runs_df = (
    dataset.loc[
        (dataset['playcall'] == 'DESIGNED_RUN')
        | ((dataset['qb_scramble'] == 1) & is_down & (dataset['play_type'] != 'no_play')),
        [
            'playcall', 'play_type', 'posteam', 'defteam', 'season', 'qb_scramble',
            'rushing_yards', 'rusher_id', 'rush_touchdown', 'return_touchdown',
            'clock_runs_after', 'safety', 'fumble_lost', 'ydstogo', 'yardline_100',
        ] + state_features + common_timeout_features,
    ]
    .dropna()
)

# Team-season keys such as 'KC_2021', used as dummy-variable labels for the line model.
runs_df = runs_df.assign(
    off_season=runs_df['posteam'] + '_' + runs_df['season'].astype(str),
    def_season=runs_df['defteam'] + '_' + runs_df['season'].astype(str),
)

In [None]:
def unpack_line(f):
    """Split a dummy-column name like 'off_KC_2021' into its side, team and season.

    Returns a dict with keys 'side', 'team' and 'season' (season as int).
    Raises ValueError if the name does not have exactly three '_'-separated parts.
    """
    parts = f.split('_')
    side, team, season = parts
    return dict(side=side, team=team, season=int(season))

In [None]:
# For each season, regress designed-run yards on offense and defense dummies;
# the fitted coefficients act as per-team line ratings.
all_line_coefs = []
for season in SEASONS:
    season_df = runs_df[
        (runs_df['season'] == season) & (runs_df['qb_scramble'] == 0)
    ].reset_index(drop=True)
    offenses = pd.get_dummies(season_df['off_season'], prefix='off')
    # rushing yards is good for offense but bad for defense
    defenses = -1 * pd.get_dummies(season_df['def_season'], prefix='def')
    season_df = pd.concat([season_df, offenses, defenses], axis=1)

    line_features = [*offenses.columns, *defenses.columns]
    line_models = Ridge(**POIS_KWARGS).fit(season_df[line_features], season_df['rushing_yards'])
    line_coefs = [
        dict(coef=coef, **unpack_line(feature))
        for feature, coef in zip(line_features, line_models.coef_)
    ]
    all_line_coefs += line_coefs

# Standardise the coefficients across all teams, sides and seasons at once.
line_coefs_df = pd.DataFrame(all_line_coefs).assign(
    line_z=lambda coefs: (coefs['coef'] - coefs['coef'].mean()) / coefs['coef'].std()
)

In [None]:
line_cols = ['team','season','line_z']

# Attach the offense's line rating (ol_z), then the defense's (dl_z), to every run.
rushing_skills = runs_df
for _side, _team_col, _z_name in (('off', 'posteam', 'ol_z'), ('def', 'defteam', 'dl_z')):
    rushing_skills = rushing_skills.merge(
        line_coefs_df.loc[line_coefs_df['side'] == _side, line_cols].rename(columns={'line_z': _z_name}),
        how='left',
        left_on=['season', _team_col],
        right_on=['season', 'team'],
    )
rushing_skills = rushing_skills.dropna()

# Interaction between the two line ratings.
rushing_skills = rushing_skills.assign(ol_dl_z=rushing_skills['ol_z'] * rushing_skills['dl_z'])

In [None]:
# Inputs to the expected-yards models: game state plus the line ratings.
rushing_exp_features = [
    *state_features,
    *common_timeout_features,
    'ol_z', 'dl_z', 'ol_dl_z',
]

# Scrambles and designed runs get their own frames and models.
is_scramble = rushing_skills['qb_scramble'] == 1
designed_rush_df = rushing_skills.loc[~is_scramble].reset_index(drop=True)
scramble_df = rushing_skills.loc[is_scramble].reset_index(drop=True)


In [None]:
# Expected yards on a designed run; `ryoe` is the actual minus expected yards.
rushing_exp_model = Ridge(**POIS_KWARGS).fit(
    designed_rush_df[rushing_exp_features], designed_rush_df['rushing_yards']
)
designed_rush_df = designed_rush_df.assign(
    exp_rushing_yards=rushing_exp_model.predict(designed_rush_df[rushing_exp_features]),
    ryoe=lambda runs: runs['rushing_yards'] - runs['exp_rushing_yards'],
)

In [None]:
# Expected yards on a scramble; `syoe` is the actual minus expected yards.
scramble_exp_model = Ridge(**POIS_KWARGS).fit(
    scramble_df[rushing_exp_features], scramble_df['rushing_yards']
)
scramble_df = scramble_df.assign(
    exp_rushing_yards=scramble_exp_model.predict(scramble_df[rushing_exp_features]),
    syoe=lambda runs: runs['rushing_yards'] - runs['exp_rushing_yards'],
)

In [None]:
def add_fake_data(df_: pd.DataFrame, col: str) -> pd.DataFrame:
    """Summarise yards-over-expected per (season, rusher), regressed toward the league.

    Every distinct (season, rusher_id) in ``df_`` is padded with 49 synthetic
    observations: the 2%, 4%, ..., 98% quantiles of ``df_[col]`` over the whole
    frame. Real and synthetic rows are then aggregated together, so rushers with
    few real plays end up close to the overall distribution.

    Args:
        df_: play-level frame with ``season``, ``rusher_id`` and ``col`` columns.
            It is not modified.
        col: name of the yards-over-expected column to summarise.

    Returns:
        A frame with columns ``season``, ``rusher_id``, ``yoe_mean``, ``yoe_std``
        and ``count`` (real plus synthetic rows), sorted by ``yoe_mean`` descending.
    """
    quantiles = [i / 50 for i in range(1, 50)]
    yoe_quantiles = df_[col].quantile(q=quantiles).to_numpy()
    distinct_rushers = df_[['season', 'rusher_id']].drop_duplicates()

    # Cross every rusher-season with every quantile value: each rusher's block of
    # synthetic rows lists the quantiles in ascending order.
    n_quantiles = len(yoe_quantiles)
    fake_data = pd.DataFrame({
        'season': np.repeat(distinct_rushers['season'].to_numpy(), n_quantiles),
        'rusher_id': np.repeat(distinct_rushers['rusher_id'].to_numpy(), n_quantiles),
        'yoe': np.tile(yoe_quantiles, len(distinct_rushers)),
    })

    real_data = df_.rename(columns={col: 'yoe'})[['season', 'rusher_id', 'yoe']]
    yoe_regressed = pd.concat([real_data, fake_data])
    yoe_regressed['count'] = 1

    yoe_df = (
        yoe_regressed
        .groupby(['season', 'rusher_id'], as_index=False)
        .aggregate({'yoe': ['mean', 'std'], 'count': 'count'})
        .sort_values(('yoe', 'mean'), ascending=False)
    )
    # Flatten the two-level column index: ('yoe', 'mean') -> 'yoe_mean'; every
    # other column keeps just its top-level name.
    yoe_df.columns = [
        '_'.join(levels).strip() if levels[0] == 'yoe' else levels[0]
        for levels in yoe_df.columns.values
    ]
    return yoe_df

In [None]:
# Per (season, rusher) yards-over-expected summaries, padded with league quantiles:
# one for designed runs, one for scrambles.
ryoe_df = add_fake_data(designed_rush_df, 'ryoe')
syoe_df = add_fake_data(scramble_df, 'syoe')

In [None]:
# Preview: top designed-run rusher-seasons (ryoe_df is sorted by yoe_mean, descending).
ryoe_df.merge(rusher_names, how='left', on='rusher_id').head(5)

In [None]:
# Preview: top scrambling rusher-seasons (syoe_df is sorted by yoe_mean, descending).
syoe_df.merge(rusher_names, how='left', on='rusher_id').head(5)

In [None]:
def scramble_rate_season_weight(season: int) -> float:
    """Exponential recency weight: 1.0 for CURRENT_SEASON, e**-1 per season before it."""
    seasons_from_current = season - CURRENT_SEASON
    return math.exp(1.0 * seasons_from_current)

In [None]:
# Dropbacks: pass plays plus scrambles.
possible_scrambles = (dataset['play_type'] == 'pass') | (dataset['qb_scramble'] == 1)
scramble_rate_df = dataset[possible_scrambles].reset_index(drop=True)

# Weight, in dropbacks, of the overall mean rate used to regress each player-season.
PRIOR_DROPBACKS = 10
mean_scramble_rate = scramble_rate_df['qb_scramble'].mean()

# Credit every dropback to the passer, falling back to the rusher when no passer is
# recorded; rows with neither are dropped.
scramble_rate_df['passer_id'] = scramble_rate_df['passer_id'].where(
    scramble_rate_df['passer_id'].notna(), scramble_rate_df['rusher_id']
)
scramble_rate_df['rusher_id'] = scramble_rate_df['passer_id']
scramble_rate_df['count'] = 1
scramble_rate_df = scramble_rate_df.dropna(subset=['rusher_id'])

# Raw and regressed scramble rate per player-season.
scramble_rates = (
    scramble_rate_df
    .groupby(['season','rusher_id'], as_index=False)
    .agg(qb_scramble=('qb_scramble', 'mean'), count=('count', 'count'))
)
scramble_rates['regr_scramble_rate'] = (
    scramble_rates['count'] * scramble_rates['qb_scramble'] + PRIOR_DROPBACKS * mean_scramble_rate
) / (scramble_rates['count'] + PRIOR_DROPBACKS)
scramble_rates = scramble_rates.sort_values('regr_scramble_rate', ascending=False)

# Recency-weighted samples and scrambles per player-season.
pred_rates = scramble_rates.merge(rusher_names, on='rusher_id', how='left')
pred_rates['season_weight'] = pred_rates['season'].apply(scramble_rate_season_weight)
pred_rates['weighted_samples'] = pred_rates['count'] * pred_rates['season_weight']
pred_rates['weighted_scrambles'] = pred_rates['regr_scramble_rate'] * pred_rates['weighted_samples']

# Collapse seasons into a single projected rate per player.
scramble_rate_preds = (
    pred_rates
    .groupby(['rusher_id','name','position'], as_index=False)
    [['weighted_samples', 'weighted_scrambles']]
    .sum()
)
scramble_rate_preds['scramble_rate'] = (
    scramble_rate_preds['weighted_scrambles'] / scramble_rate_preds['weighted_samples']
)
scramble_rate_preds = scramble_rate_preds.sort_values('scramble_rate', ascending=False)
scramble_rate_preds[scramble_rate_preds['position'] == 'QB'].head(10)

In [None]:
# Feature order matters: format_logr_model / format_linr_model zip these names
# against the fitted coefficient arrays.
run_result_features = [
    *rushing_exp_features,
    'log_mean_yards',
    'log_std_yards',
    'yoe_mean',
    'yoe_std',
    'togo_std',
    'yardline_std',
    # 'yoe_var',
    # 'togo_var',
    # 'yardline_var',
]
# Same features plus whether the clock keeps running after the play.
run_result_yards_features = [*run_result_features, 'clock_runs_after']

def make_run_result_df(df_: pd.DataFrame, yoe_df: pd.DataFrame, scrambling: bool) -> pd.DataFrame:
    """Attach rusher skill features to a play-level frame.

    Plays from seasons up to CURRENT_SEASON are left-joined with the per
    (season, rusher_id) ``yoe_mean`` / ``yoe_std`` from ``yoe_df``, and the derived
    columns ``log_mean_yards``, ``log_std_yards``, ``yardline_std`` and ``togo_std``
    are added.

    Args:
        df_: play-level frame; must contain ``season``, ``rusher_id``,
            ``yardline_pct`` and ``z_ydstogo``.
        yoe_df: output of add_fake_data for the matching play type.
        scrambling: selects YARDS_PER_SCRAMBLE (True) or YARDS_PER_DESIGNED_RUN
            (False) as the baseline yardage.
    """
    baseline_yards = YARDS_PER_SCRAMBLE if scrambling else YARDS_PER_DESIGNED_RUN
    skill_cols = ['season', 'rusher_id', 'yoe_mean', 'yoe_std']

    res_df = df_.loc[df_['season'] <= CURRENT_SEASON].merge(
        yoe_df[skill_cols], how='left', on=['season', 'rusher_id']
    )

    # Clip before the log so a strongly negative yoe_mean cannot produce log(<= 0).
    projected_yards = (baseline_yards + res_df['yoe_mean']).clip(RUSHING_EPSILON)
    res_df['log_mean_yards'] = np.log(projected_yards)
    res_df['log_std_yards'] = np.log(res_df['yoe_std'])

    # add interactions
    res_df['yardline_std'] = res_df['yardline_pct'] * res_df['yoe_std']
    res_df['togo_std'] = res_df['z_ydstogo'] * res_df['yoe_std']
    return res_df

In [None]:
def format_logr_model(model: LogisticRegression, name: str, scrambling: bool, include_clock: bool = True) -> str:
    """Render a fitted logistic model as a Rust `RushingModel` constructor function.

    Args:
        model: fitted binary LogisticRegression (row 0 of `coef_` / `intercept_` is used).
        name: model name embedded in the generated function name.
        scrambling: selects the 'scrambling' or 'designed_run' function-name prefix.
        include_clock: when True the model is taken to be fit on `run_result_features`
            and a zero `clock_runs_after` field is appended; when False the
            coefficients are paired with `run_result_yards_features`, which already
            ends with `clock_runs_after`.

    Returns:
        The Rust source for one `pub fn {prefix}_{name}_coef()` function.
    """
    prefix = 'scrambling' if scrambling else 'designed_run'
    feature_names = run_result_features if include_clock else run_result_yards_features

    lines = [
        f'\n    pub fn {prefix}_{name}_coef() -> RushingModel {{',
        '        RushingModel {',
        f'            intercept: {model.intercept_[0]:.4f},',
    ]
    lines.extend(
        f'            {feature}: {coef:.4f},'
        for feature, coef in zip(feature_names, model.coef_[0])
    )
    if include_clock:
        # The struct always needs a clock field; models fit without it get 0.0.
        lines.append('            clock_runs_after: 0.0,')
    lines += ['        }', '    }\n']
    return '\n'.join(lines)

def format_linr_model(model: PoissonRegressor, name: str, scrambling: bool) -> str:
    """Render a fitted regressor as a Rust `RushingModel` constructor function.

    The coefficients are paired, in order, with `run_result_yards_features`.

    Args:
        model: fitted regressor with a scalar `intercept_` and a 1-D `coef_`.
        name: model name embedded in the generated function name.
        scrambling: selects the 'scrambling' or 'designed_run' function-name prefix.

    Returns:
        The Rust source for one `pub fn {prefix}_{name}_coef()` function.
    """
    prefix = 'scrambling' if scrambling else 'designed_run'
    header = [
        f'\n    pub fn {prefix}_{name}_coef() -> RushingModel {{',
        '        RushingModel {',
        f'            intercept: {model.intercept_:.4f},',
    ]
    fields = [
        f'            {feature}: {coef:.4f},'
        for feature, coef in zip(run_result_yards_features, model.coef_)
    ]
    footer = ['        }', '    }\n']
    return '\n'.join(header + fields + footer)


In [None]:
def build_models(df_: pd.DataFrame, scrambling: bool) -> List[str]:
    """Fit the chain of run-outcome models and return them as Rust source snippets.

    The outcome of a run is modelled as a sequence of conditional steps, each fit
    on the plays that reach it:

    1. fumble lost (all plays)
    2. return touchdown, given a lost fumble
    3. rushing touchdown, given no lost fumble
    4. safety, given no lost fumble and no touchdown (skipped when the data has
       only one class)
    5. clock keeps running, given none of the above
    6. designed runs only: whether the gain is positive
    7. size of a positive gain (Poisson on yards - 1) and its squared residual
    8. designed runs only: size of a non-positive gain (Poisson on -yards)

    Args:
        df_: output of make_run_result_df for one play type.
        scrambling: True for scrambles, False for designed runs; controls which
            models are fit and the prefix of the generated function names.

    Returns:
        One formatted Rust function string per fitted model, in the order above.
    """
    # Return-TD model is fit on lost fumbles only.
    fl_df_ = df_[df_['fumble_lost'] == 1].copy()
    prob_fl_td_model = LogisticRegression(**LOGR_KWARGS)
    prob_fl_td_model.fit(fl_df_[run_result_features], fl_df_['return_touchdown'])

    # Progressively narrower subsets for the later steps of the chain.
    safety_df = df_[(df_['fumble_lost'] == 0) & (df_['rush_touchdown'] == 0)].copy()
    ruyards_df = df_[(df_['fumble_lost'] == 0) & (df_['rush_touchdown'] == 0) & (df_['safety'] == 0)].copy()
    ruyards_df['pos_yards'] = (ruyards_df['rushing_yards'] > 0).astype(int)

    nonpos_yards_df = ruyards_df[ruyards_df['pos_yards'] == 0].copy()
    pos_yards_df = ruyards_df[ruyards_df['pos_yards'] == 1].copy()

    models: List[str] = []

    proj_fumble_lost_model = LogisticRegression(**LOGR_KWARGS)
    proj_fumble_lost_model.fit(df_[run_result_features], df_['fumble_lost'])
    models.append(format_logr_model(proj_fumble_lost_model, 'fumble_lost', scrambling))

    models.append(format_logr_model(prob_fl_td_model, 'prob_fl_td', scrambling))

    proj_run_td_model = LogisticRegression(**LOGR_KWARGS)
    rtd_df = df_[df_['fumble_lost'] == 0].copy()
    proj_run_td_model.fit(rtd_df[run_result_features], rtd_df['rush_touchdown'])
    models.append(format_logr_model(proj_run_td_model, 'rush_td', scrambling))

    # safeties can't happen on a scramble because it would be a sack
    if len(set(safety_df['safety'])) > 1:
        proj_safety_model = LogisticRegression(**LOGR_KWARGS)
        proj_safety_model.fit(safety_df[run_result_features], safety_df['safety'])
        models.append(format_logr_model(proj_safety_model, 'safety', scrambling))

    clock_runs_model = LogisticRegression(**LOGR_KWARGS)
    clock_runs_model.fit(ruyards_df[run_result_features], ruyards_df['clock_runs_after'])
    models.append(format_logr_model(clock_runs_model, 'clock_runs', scrambling))

    if not scrambling:
        # Fit with the clock feature, so the formatter must not pad a zero clock field.
        ruyds_pos_model = LogisticRegression(**LOGR_KWARGS)
        ruyds_pos_model.fit(ruyards_df[run_result_yards_features], ruyards_df['pos_yards'])
        models.append(format_logr_model(ruyds_pos_model, 'is_yards_pos_sign', scrambling, include_clock=False))

    # Positive gains start at 1 yard; shift by one so the Poisson target starts at 0.
    posyards_model = PoissonRegressor(**POIS_KWARGS)
    posyards_model.fit(pos_yards_df[run_result_yards_features], pos_yards_df['rushing_yards'] - 1)
    models.append(format_linr_model(posyards_model, 'pos_yards', scrambling))

    # Second-stage model: squared residual of the mean model above.
    posyards_mean_preds = posyards_model.predict(pos_yards_df[run_result_yards_features])

    posyards_var_model = PoissonRegressor(**POIS_KWARGS)
    posyards_var_model.fit(pos_yards_df[run_result_yards_features], (pos_yards_df['rushing_yards'] - 1 - posyards_mean_preds) ** 2)
    models.append(format_linr_model(posyards_var_model, 'pos_yards_var', scrambling))

    if not scrambling:
        # Non-positive gains are negated so the Poisson target is non-negative.
        negyards_model = PoissonRegressor(**POIS_KWARGS)
        negyards_model.fit(nonpos_yards_df[run_result_yards_features], -1 * nonpos_yards_df['rushing_yards'])
        models.append(format_linr_model(negyards_model, 'neg_yards', scrambling))

    return models

In [None]:
# Model-ready frames: plays joined with the matching rusher skill summaries.
proj_scramble_result = make_run_result_df(scramble_df, syoe_df, scrambling=True)
proj_dr_result = make_run_result_df(designed_rush_df, ryoe_df, scrambling=False)

In [None]:
# Assemble the Rust `impl RushingModel` block: scrambling models first, then
# designed-run models.
model_str_parts = [
    "use crate::models::rushing::RushingModel;\n",
    "impl RushingModel {",
    *build_models(proj_scramble_result, True),
    *build_models(proj_dr_result, False),
    "}",
]
model_str = "\n".join(model_str_parts)

with open(f'{MODELS_PATH}/rushing/coef.rs', 'w') as f:
    f.write(model_str)


In [None]:
# test whether negative binomial distribution is valid enough (it is)

# mu = np.mean(nonneg_yards_df['rushing_yards'])
# sigma_sqr = np.var(nonneg_yards_df['rushing_yards'])

# n = mu**2 / (sigma_sqr - mu)
# p = mu / sigma_sqr

# distribution = nbinom(n=n, p=p)

# values = pd.DataFrame([
#     {'x': x, 'y': distribution.pmf(x)}
#     for x in nonneg_ruyds_bins
# ])
# values['cdf'] = values['y'].cumsum()
# values['ev'] = values['x'] * values['y']
# print(values['ev'].sum())
# values.plot(x='x', y='y')

In [None]:
def mscar_season_weight(season: int) -> float:
    """Weight of a season's carries when projecting carry share.

    1.0 for CURRENT_SEASON, 0.1 for the season before it, 0.0 otherwise.
    """
    weight_by_seasons_ago = {0: 1.0, 1: 0.1}
    return weight_by_seasons_ago.get(CURRENT_SEASON - season, 0.0)

In [None]:
SNEAK_YARDS = 1
proj_dr_result['ytg_1'] = ((proj_dr_result['yardline_100'] <= SNEAK_YARDS) | (proj_dr_result['ydstogo'] <= SNEAK_YARDS)).astype(int)
proj_dr_result['green_zone'] = ((proj_dr_result['yardline_100'] <= 10) & (proj_dr_result['ytg_1'] == 0)).astype(int)

In [None]:
# Designed-run carries per team-season.
team_carries = (
    proj_dr_result
    .groupby(['season','posteam'], as_index=False)
    .aggregate({'playcall': 'count'})
    .rename(columns={'playcall': 'team_carries'})
)

# Carries per rusher within a team-season, with counts of the two special situations.
ms_carries = (
    proj_dr_result
    .groupby(['season','posteam','rusher_id'], as_index=False)
    .aggregate({'playcall': 'count', 'green_zone': 'sum', 'ytg_1': 'sum'})
    .rename(columns={'playcall': 'carries'})
    .merge(team_carries, how='left', on=['season', 'posteam'])
)

# Overall share of carries that fall in each situation.
prior_ytg_1 = ms_carries['ytg_1'].sum() / ms_carries['carries'].sum()
prior_gz = ms_carries['green_zone'].sum() / ms_carries['carries'].sum()

# Per-rusher situation mix and share of team carries.
ms_carries['P(ytg1|car)'] = ms_carries['ytg_1'] / ms_carries['carries']
ms_carries['P(gz|car)'] = ms_carries['green_zone'] / ms_carries['carries']
# Column name fixed from 'P(normal}car)' to match its two siblings.
ms_carries['P(normal|car)'] = 1 - ms_carries['P(gz|car)'] - ms_carries['P(ytg1|car)']
ms_carries['ms'] = ms_carries['carries'] / ms_carries['team_carries']

# Season-weighted counts used for the projections.
ms_carries['season_weight'] = ms_carries['season'].apply(mscar_season_weight)
ms_carries['carries_weight'] = ms_carries['season_weight'] * ms_carries['carries']
ms_carries['ytg1_weight'] = ms_carries['season_weight'] * ms_carries['ytg_1']
ms_carries['gz_weight'] = ms_carries['season_weight'] * ms_carries['green_zone']

ms_carries = ms_carries.sort_values(['ms', 'P(ytg1|car)','P(gz|car)'], ascending=False)

In [None]:
# Overall situation mix as percentages: (short yardage, green zone, everything else).
(f'{prior_ytg_1:.1%}', f'{prior_gz:.1%}', f'{1 - (prior_ytg_1 + prior_gz):.1%}')

In [None]:
# ms_carries.merge(rusher_names, how='left', on='rusher_id').head(1)

In [None]:
# Derivation notes only, kept as a string; it is bound to `_` so the cell shows no output.
_ = """
P(_|carry) = 1 - P(ytg1|carry) - P(gz|carry)

P(carry|1ytg) = P(1ytg|rush) * P(rush) / P(1ytg)
P(carry|gz) = P(gz|rush) * P(rush) / P(gz)
P(carry|_) = P(_|rush) * P(rush) / P(_)
"""

In [None]:
# Season-weighted carry counts per rusher, kept only for players on a current
# roster (inner join) and assigned to that roster's team.
weight_cols = ['carries_weight', 'ytg1_weight', 'gz_weight']
weighted_carries = (
    ms_carries
    .loc[lambda carries: carries['carries_weight'] > 0]
    .groupby('rusher_id', as_index=False)[weight_cols]
    .sum()
    .merge(rosters[['rusher_id','team']], how='inner', on='rusher_id')
    .dropna()
)

# Team totals act as the denominator for each rusher's projected share.
team_weighted_carries = (
    weighted_carries
    .groupby('team', as_index=False)[['carries_weight']]
    .sum()
    .rename(columns={'carries_weight': 'team_carries_weight'})
)

proj_ms_carries = weighted_carries.merge(team_weighted_carries, how='left', on='team')
proj_ms_carries = proj_ms_carries.assign(
    prob_1ytg_given_carry=proj_ms_carries['ytg1_weight'] / proj_ms_carries['carries_weight'],
    prob_gz_given_carry=proj_ms_carries['gz_weight'] / proj_ms_carries['carries_weight'],
    ms_carries=proj_ms_carries['carries_weight'] / proj_ms_carries['team_carries_weight'],
).sort_values(['ms_carries'], ascending=False)

In [None]:
# Output table of projected carry shares. `proj_ms_carries` already carries `team`
# (from the roster merge) and has no `posteam` column, so no rename is needed, and
# the column selection below already discards the intermediate weight columns.
write_ms_carries = (
    proj_ms_carries
    .merge(rusher_names, how='left', on='rusher_id')
    [['team','rusher_id','name','position','ms_carries', 'prob_1ytg_given_carry', 'prob_gz_given_carry']]
    .sort_values('ms_carries', ascending=False)
    .reset_index(drop=True)
    # round only the three rate columns, to 3 decimals
    .round({'ms_carries': 3, 'prob_1ytg_given_carry': 3, 'prob_gz_given_carry': 3})
)

In [None]:
def yoe_season_weight(season: int) -> float:
    """Exponential recency weight: 1.0 for CURRENT_SEASON, e**-0.5 per season before it."""
    seasons_from_current = season - CURRENT_SEASON
    return math.exp(0.5 * seasons_from_current)

In [None]:
def make_yoe_proj(df_: pd.DataFrame):
    """Collapse per-season yards-over-expected summaries into one projection per rusher.

    Each (season, rusher) row is weighted by its sample count times
    yoe_season_weight(season). The projected mean is the weighted average of the
    seasonal means; the projected std is the square root of the weighted average
    of the seasonal variances.

    Args:
        df_: frame with ``season``, ``rusher_id``, ``yoe_mean``, ``yoe_std`` and
            ``count`` columns (the output of add_fake_data). It is not modified.

    Returns:
        Frame with ``rusher_id``, ``yoe_mean`` (2 decimals), ``yoe_std`` (1 decimal)
        and ``yoe_samples`` (1 decimal), sorted by ``yoe_mean`` descending.
    """
    weighted = df_.copy()
    weighted['season_weight'] = weighted['season'].apply(yoe_season_weight)
    weighted['weighted_samples'] = weighted['count'] * weighted['season_weight']
    weighted['weighted_mean'] = weighted['yoe_mean'] * weighted['weighted_samples']
    weighted['weighted_var'] = weighted['weighted_samples'] * weighted['yoe_std'] ** 2

    totals = (
        weighted
        .groupby(['rusher_id'], as_index=False)
        [['weighted_mean', 'weighted_var', 'weighted_samples']]
        .sum()
    )
    projs = totals.assign(
        yoe_mean=(totals['weighted_mean'] / totals['weighted_samples']).round(decimals=2),
        yoe_std=np.sqrt(totals['weighted_var'] / totals['weighted_samples']).round(decimals=1),
        yoe_samples=totals['weighted_samples'].round(decimals=1),
    )
    output_cols = ['rusher_id', 'yoe_mean', 'yoe_std', 'yoe_samples']
    return projs[output_cols].sort_values('yoe_mean', ascending=False)

In [None]:
# Carry-share table joined with each rusher's projected yards over expected.
ryoe_proj = make_yoe_proj(ryoe_df)
write_carry_projections = (
    write_ms_carries
    .merge(ryoe_proj, how='left', on='rusher_id')
    .drop(columns=['yoe_samples'])
    .rename(columns={'rusher_id': 'player_id', 'yoe_mean': 'ryoe', 'yoe_std': 'ryoe_std'})
)
write_carry_projections.head(1)

In [None]:
# Scramble-rate table joined with scramble yards over expected, limited to
# quarterbacks on a current roster (inner join on rusher_id).
syoe_proj = make_yoe_proj(syoe_df)
write_scramble_proj = (
    scramble_rate_preds
    .merge(syoe_proj, on='rusher_id', how='left')
    .merge(
        rosters.loc[rosters['position'] == 'QB', ['rusher_id','team']],
        how='inner',
        on='rusher_id',
    )
    .rename(columns={'rusher_id': 'player_id', 'yoe_mean': 'syoe', 'yoe_std': 'syoe_std'})
    [['team','player_id','name','position','scramble_rate','syoe','syoe_std']]
    .round({'scramble_rate': 3})
)
write_scramble_proj.head(1)

In [None]:
# Write the player-level baselines under BASELINES_PATH (index column omitted).
write_carry_projections.to_csv(f'{BASELINES_PATH}/carries.csv', index=False)
write_scramble_proj.to_csv(f'{BASELINES_PATH}/scrambling.csv', index=False)

In [None]:
# Number of distinct games per team for the current and previous season.
game_counts = (
    pbp.loc[pbp['season'] >= CURRENT_SEASON - 1]
    .groupby(['season','posteam'], as_index=False)
    .agg(n_games=('game_id', 'nunique'))
    .rename(columns={'posteam': 'team'})
    .reset_index(drop=True)
)

In [None]:
# Per team-season line ratings, joined with the penalty tables.
# NOTE(review): the two penalty merges use the default inner join and none of their
# columns are written out; they can only remove team-seasons — confirm that is intended.
team_projections = (
    game_counts
    .merge(
        right=line_coefs_df.loc[line_coefs_df['side'] == 'off', line_cols].rename(columns={'line_z': 'oline_rushing_z'}),
        how='left',
        on=['season','team'],
    )
    .merge(
        line_coefs_df.loc[line_coefs_df['side'] == 'def', line_cols].rename(columns={'line_z': 'dline_rushing_z'}),
        how='left',
        on=['season','team'],
    )
    .merge(off_penalties.rename(columns={'posteam': 'team'}), on=['season','team'])
    .merge(def_penalties.rename(columns={'defteam': 'team'}), on=['season','team'])
)

# Weight each team-season by games played and recency, then take the weighted
# average of the ratings per team.
team_projections['weight'] = team_projections['n_games'] * team_projections['season'].apply(yoe_season_weight)
tp_cols = ['oline_rushing_z', 'dline_rushing_z']
team_projections[tp_cols] = team_projections[tp_cols].mul(team_projections['weight'], axis=0)

write_projections = (
    team_projections
    .groupby(['team'], as_index=False)
    [tp_cols + ['weight']]
    .sum()
)
write_projections[tp_cols] = (
    write_projections[tp_cols].div(write_projections['weight'], axis=0).round(decimals=2)
)
write_projections = write_projections.drop(columns=['weight'])

write_projections.to_csv(f'{BASELINES_PATH}/team_rushing.csv', index=False)
