In [None]:
import math
import numpy as np
import pandas as pd
from scipy.stats import norm
from sklearn.linear_model import LogisticRegression, PoissonRegressor, Ridge
import nfl_data_py

from pbp.data import (
    CURRENT_SEASON,
    BASELINES_PATH,
    MODELS_PATH,
    LOGR_KWARGS,
    POIS_KWARGS,
    PROB_COMPLETION,
    PROB_INTERCEPTION,
    common_timeout_features,
    state_features,
    add_playcall_features,
    add_clock_stops,
    load_pbp_data,
    make_proe_data,
)

# Lift pandas' display truncation so displayed frames show every row and column.
# Displaying a large frame will therefore produce very long output.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [None]:
# Seasons of play-by-play used throughout: 2017 through CURRENT_SEASON inclusive.
SEASONS = list(range(2017, CURRENT_SEASON + 1))

# Small positive value added to (or used as a floor for) rates before taking logs.
_EPSILON = 1e-6

# MEAN_AIR_YARDS is the baseline that a QB's air-yards-over-expected is added to;
# EPSILON_AIR_YARDS is the floor applied before taking the log of an air-yards value.
MEAN_AIR_YARDS = 8.0
EPSILON_AIR_YARDS = 0.01

In [None]:
# Positions kept from the current-season roster pull.
OFFENSIVE_POSITIONS = {'QB', 'RB', 'FB', 'WR', 'TE'}

this_season_rosters = nfl_data_py.import_seasonal_rosters(years=[CURRENT_SEASON])
rosters = this_season_rosters.loc[
    this_season_rosters['position'].isin(OFFENSIVE_POSITIONS)
]

# Lookup from player id to display name and position.
players = nfl_data_py.import_players()
player_names = (
    players[['gsis_id', 'display_name', 'position']]
    .rename(columns={'gsis_id': 'player_id', 'display_name': 'name'})
)

In [None]:
# Load the raw play-by-play only once per kernel session: re-running this cell is a
# no-op when `pbp_copy` already exists. The guard checks the name this cell actually
# defines and catches only NameError, so real failures (and KeyboardInterrupt) are
# no longer swallowed by a bare `except`.
try:
    pbp_copy
except NameError:
    pbp_copy = load_pbp_data(SEASONS, force_reload=True)

In [None]:
# Work on a copy so the frame loaded above stays untouched and this cell can be re-run.
pbp = pbp_copy.copy()

# Per-drive lookup of possession and score, built only from rows where all four values are present.
drive_keys = ['game_id','drive','posteam','defteam', 'posteam_score', 'defteam_score']
drive_teams = (
    pbp[~pbp['posteam'].isna() & ~pbp['defteam'].isna() & ~pbp['posteam_score'].isna() & ~pbp['defteam_score'].isna()]
    [drive_keys]
    .groupby(drive_keys, as_index=False)
    .nth(0)
    .rename(columns={'posteam': 'drive_offense', 'defteam': 'drive_defense', 'posteam_score': 'off_score', 'defteam_score': 'def_score'})
)
# NOTE(review): the group keys include teams and scores, so drive_teams can hold several rows
# per (game_id, drive). The merge then fans each play out and drop_duplicates keeps the first
# copy; which copy is "first" depends on drive_teams' row order — confirm this is intended.
pbp = pbp.merge(drive_teams, on=['game_id', 'drive'], how='left').drop_duplicates(subset=['play_id','game_id'])

# Plays whose drive found no match fall back to their own play-level scores.
pbp.loc[pbp['off_score'].isna(), 'off_score'] = pbp['posteam_score']
pbp.loc[pbp['def_score'].isna(), 'def_score'] = pbp['defteam_score']

In [None]:
# Build the four PROE tables returned by make_proe_data (offense/defense, plain and "rz" variants).
# They are merged onto dropbacks by season and team when dropback_dataset is assembled.
(offense_proes, defense_proes, offense_rz_proes, defense_rz_proes) = make_proe_data(pbp)

In [None]:
# Boolean mask of dropbacks: pass plays or QB scrambles, excluding two-point attempts.
# Computed once here and reused as a row selector on `pbp` in later cells.
dropback = ((pbp['play_type'] == 'pass') | (pbp['qb_scramble'] == 1)) & (pbp['two_point_attempt'] != 1)

In [None]:
# Called for its effect on `pbp`; the return value is discarded.
# NOTE(review): presumably adds the clock columns ('clock_running', 'clock_runs_after') selected
# as features/responses later — confirm against pbp.data.add_clock_stops.
add_clock_stops(pbp)

In [None]:
# Per-play flags: 1 when the home / away team is charged a timeout on this row.
pbp['home_timeout'] = 0
pbp['away_timeout'] = 0

# Attribute each offensive/defensive timeout to home or away using the drive-level teams.
# The right-hand sides are full-length Series; .loc aligns them to the masked rows by index.
pbp.loc[pbp['timeout'] == 'OFFENSIVE_TIMEOUT', 'home_timeout'] = (pbp['drive_offense'] == pbp['home_team']).astype(int)
pbp.loc[pbp['timeout'] == 'DEFENSIVE_TIMEOUT', 'home_timeout'] = (pbp['drive_defense'] == pbp['home_team']).astype(int)

pbp.loc[pbp['timeout'] == 'OFFENSIVE_TIMEOUT', 'away_timeout'] = (pbp['drive_offense'] == pbp['away_team']).astype(int)
pbp.loc[pbp['timeout'] == 'DEFENSIVE_TIMEOUT', 'away_timeout'] = (pbp['drive_defense'] == pbp['away_team']).astype(int)

# A timeout flagged on two consecutive rows for the same team is treated as one timeout
# recorded twice: the second row is zeroed and relabelled as a duplicate.
# NOTE(review): shift(1) runs over the whole frame, so the first row of a game is compared
# with the last row of the previous game — confirm this is harmless.
pbp['home_timeout_prev'] = pbp['home_timeout'].shift(1)
pbp['away_timeout_prev'] = pbp['away_timeout'].shift(1)
pbp['duplicate_timeout'] = (
    ((pbp['home_timeout_prev'] == 1) & (pbp['home_timeout'] == 1))
    | ((pbp['away_timeout_prev'] == 1) & (pbp['away_timeout'] == 1))
)
pbp.loc[pbp['home_timeout_prev'] == 1, 'home_timeout'] = 0
pbp.loc[pbp['away_timeout_prev'] == 1, 'away_timeout'] = 0

pbp.loc[pbp['duplicate_timeout'], 'timeout'] = 'DUPLICATE'
pbp.loc[pbp['duplicate_timeout'], 'playcall'] = 'DUPLICATE_TIMEOUT'

# Running count of timeouts used by each team within each half of each game.
# Relies on `pbp` rows being in chronological order within a game.
pbp['total_home_timeouts'] = pbp.groupby(['game_id','home_team', 'game_half'])['home_timeout'].cumsum()
pbp['total_away_timeouts'] = pbp.groupby(['game_id','away_team', 'game_half'])['away_timeout'].cumsum()

# ugh, probably fine. if the teams are null, then it's probably first play
pbp['off_timeouts_left'] = 3
pbp['def_timeouts_left'] = 3

# Derived timeouts left = 3 minus the timeouts used so far, by whichever side the team is on.
pbp.loc[pbp['drive_offense'] == pbp['home_team'], 'off_timeouts_left'] = 3 - pbp['total_home_timeouts']
pbp.loc[pbp['drive_offense'] == pbp['away_team'], 'off_timeouts_left'] = 3 - pbp['total_away_timeouts']

pbp.loc[pbp['drive_defense'] == pbp['home_team'], 'def_timeouts_left'] = 3 - pbp['total_home_timeouts']
pbp.loc[pbp['drive_defense'] == pbp['away_team'], 'def_timeouts_left'] = 3 - pbp['total_away_timeouts']

# Prefer the feed's own remaining-timeout columns; use the derived counts only where those are missing.
pbp['off_timeouts_remaining'] = pbp['posteam_timeouts_remaining'].combine_first(pbp['off_timeouts_left']).astype(int).clip(0, 3)
pbp['def_timeouts_remaining'] = pbp['defteam_timeouts_remaining'].combine_first(pbp['def_timeouts_left']).astype(int).clip(0, 3)

# Turn down and quarter into string labels (None when missing); both are one-hot encoded later.
pbp['down'] = pbp['down'].apply(lambda x: str(int(x)) if not math.isnan(x) else None)
pbp['qtr'] = pbp['qtr'].apply(lambda x: str(int(x)) if not math.isnan(x) else None)

In [None]:
# Dropback rows used to estimate team pass-rush / pass-protection strength.
# A missing rusher count is filled with 4.3 before rows with any other missing value are dropped.
line_strength_df = (
    pbp.loc[dropback, ['season', 'posteam', 'defteam', 'qb_hit', 'sack', 'number_of_pass_rushers']]
    .fillna({'number_of_pass_rushers': 4.3})
    .dropna()
    .reset_index(drop=True)
)

In [None]:
def get_line_features(season: int, response: str):
    """Fit a one-season logistic model of `response` and return its team coefficients.

    The model uses the number of pass rushers plus one indicator column per offense
    and per defense. Offense indicators are coded -1/0 rather than 1/0, so a positive
    coefficient means "less of `response`" for an offense and "more of it" for a defense.

    Args:
        season: season to select from the module-level `line_strength_df`.
        response: binary column of `line_strength_df` to model (e.g. 'sack', 'qb_hit').

    Returns:
        A list of dicts, one per team indicator, with keys 'season', 'side'
        ('off' or 'def'), 'team' and f'{response}_coef'.
    """
    plays = line_strength_df.loc[line_strength_df['season'] == season].reset_index(drop=True)

    # more likely to be sacked is bad for offense, good for defense
    offense_cols = -1 * pd.get_dummies(plays['posteam'], prefix='off')
    defense_cols = pd.get_dummies(plays['defteam'], prefix='def')
    plays[offense_cols.columns] = offense_cols
    plays[defense_cols.columns] = defense_cols

    team_columns = [*offense_cols.columns, *defense_cols.columns]
    features = ['number_of_pass_rushers', *team_columns]

    model = LogisticRegression(**LOGR_KWARGS)
    model.fit(plays[features], plays[response])

    # Skip the first coefficient: it belongs to number_of_pass_rushers, not to a team.
    records = []
    for column, coef in zip(team_columns, model.coef_[0][1:]):
        pieces = column.split('_')
        records.append({
            'season': season,
            'side': pieces[0],
            'team': pieces[1],
            f'{response}_coef': coef,
        })
    return records

In [None]:
# Pseudo-count of league-average hits blended into every passer's sack-given-hit rate.
PRIOR_HITS = 10

qb_sack_df = pbp.loc[dropback, ['season', 'passer_id', 'qb_hit', 'sack']].copy()
# Every sack is also counted as a hit.
qb_sack_df.loc[qb_sack_df['sack'] == 1, 'qb_hit'] = 1

# League-wide P(sack | hit), used as the prior mean.
prior_prob = qb_sack_df['sack'].sum() / qb_sack_df['qb_hit'].sum()

qb_sack_avoidance = (
    qb_sack_df
    .dropna()
    .groupby(['season', 'passer_id'], as_index=False)
    .agg(qb_hit_count=('qb_hit', 'sum'), sack_count=('sack', 'sum'))
)

# Shrink each passer-season rate toward the league rate with PRIOR_HITS pseudo-hits.
qb_sack_avoidance['qb_prob_sack_given_hit'] = (
    (qb_sack_avoidance['sack_count'] + PRIOR_HITS * prior_prob)
    / (qb_sack_avoidance['qb_hit_count'] + PRIOR_HITS)
)
qb_sack_avoidance = qb_sack_avoidance.drop(columns=['qb_hit_count', 'sack_count'])

In [None]:
# One flat list of per-team coefficient records across all seasons, per response.
sack_features = [
    record
    for season in SEASONS
    for record in get_line_features(season, 'sack')
]

qbhit_features = [
    record
    for season in SEASONS
    for record in get_line_features(season, 'qb_hit')
]

In [None]:
# Combine the QB-hit and sack coefficients per (season, side, team), average them into a
# single line-strength score and standardise it across all rows (both sides, all seasons).
line_features = pd.merge(
    pd.DataFrame(qbhit_features),
    pd.DataFrame(sack_features),
    on=['season', 'side', 'team'],
).assign(
    line_strength=lambda d: 0.5 * d['qb_hit_coef'] + 0.5 * d['sack_coef'],
    z=lambda d: (d['line_strength'] - d['line_strength'].mean()) / d['line_strength'].std(),
)

# One row per (season, team) carrying both the offensive and defensive z-scores.
joined_line_features = pd.merge(
    line_features.loc[line_features['side'] == 'off', ['season', 'team', 'z']]
    .rename(columns={'z': 'offense_pass_rush_z'}),
    line_features.loc[line_features['side'] == 'def', ['season', 'team', 'z']]
    .rename(columns={'z': 'defense_pass_rush_z'}),
    on=['season', 'team'],
    how='inner',
)

In [None]:

# Label every dropback with its outcome category. Assignments run in order and later
# ones overwrite earlier ones, so the statement order below defines label precedence.
pbp['dropback'] = ''
pbp.loc[dropback, 'dropback'] = 'UNLABELED'
pbp.loc[dropback & (pbp['pass_attempt'] == 1) & ~pbp['receiver_player_id'].isna(), 'dropback'] = 'TARGETED_RECEIVER'

pbp.loc[pbp['qb_scramble'] == 1, 'dropback'] = 'QB_SCRAMBLE'
# A pass attempt with no recorded receiver is treated as a throwaway.
pbp.loc[dropback & (pbp['pass_attempt'] == 1) & pbp['receiver_player_id'].isna(), 'dropback'] = 'THROWAWAY'


pbp.loc[dropback & (pbp['sack'] == 1), 'dropback'] = 'SACK'

# Outcome of a sack; empty string for non-sacks. A lost fumble overrides SAFETY/LOSS_OF_YARDS.
pbp['sack_result'] = ''
pbp.loc[pbp['sack'] == 1, 'sack_result'] = 'LOSS_OF_YARDS'
pbp.loc[(pbp['sack'] == 1) & (pbp['safety'] == 1), 'sack_result'] = 'SAFETY'
pbp.loc[(pbp['sack'] == 1) & (pbp['fumble_lost'] == 1) & (pbp['return_touchdown'] == 0), 'sack_result'] = 'FUMBLE_LOST'
pbp.loc[(pbp['sack'] == 1) & (pbp['fumble_lost'] == 1) & (pbp['return_touchdown'] == 1), 'sack_result'] = 'FUMBLE_LOST_TD'

offensive_penalty = pbp['penalty_team'] == pbp['posteam']
defensive_penalty = pbp['penalty_team'] == pbp['defteam']

# might mess with targeted if both occurred
# Penalties on plays without a completed pass replace whatever label was set above.
pbp.loc[dropback & offensive_penalty & (pbp['penalty'] == 1) & (pbp['complete_pass'] == 0), 'dropback'] = 'OFFENSIVE_PENALTY'
pbp.loc[dropback & defensive_penalty & (pbp['penalty'] == 1) & (pbp['complete_pass'] == 0), 'dropback'] = 'DEFENSIVE_PENALTY'

# Outcome of a targeted pass; empty string for every other row.
pbp['target_result'] = ''
target = pbp['dropback'] == 'TARGETED_RECEIVER'
pbp.loc[target, 'target_result'] = 'UNLABELED'
pbp.loc[target & (pbp['pass_touchdown'] == 1), 'target_result'] = 'TD'
pbp.loc[target & (pbp['interception'] == 1) & (pbp['return_touchdown'] == 0), 'target_result'] = 'INT'
pbp.loc[target & (pbp['interception'] == 1) & (pbp['return_touchdown'] == 1), 'target_result'] = 'PICK_SIX'
pbp.loc[target & (pbp['incomplete_pass'] == 1) & (pbp['fumble_lost'] == 0), 'target_result'] = 'INCOMPLETE'
# NOTE(review): CATCH is assigned after TD, so a completed touchdown pass without a lost
# fumble ends up labelled CATCH rather than TD — confirm this precedence is intended.
pbp.loc[target & (pbp['complete_pass'] == 1) & (pbp['fumble_lost'] == 0), 'target_result'] = 'CATCH'
pbp.loc[target & (pbp['complete_pass'] == 1) & (pbp['fumble_lost'] == 0) & (pbp['penalty'] == 1) & defensive_penalty, 'target_result'] = 'CATCH_AND_PENALTY'
pbp.loc[target & (pbp['complete_pass'] == 1) & (pbp['fumble_lost'] == 1), 'target_result'] = 'CATCH_AND_FUMBLE'

# Scrambles with no recorded passer are credited to the rusher so they can be grouped by passer_id.
pbp.loc[(pbp['qb_scramble'] == 1) & pbp['passer_id'].isna(), 'passer_id'] = pbp['rusher_id']

In [None]:
# Pseudo-count of league-average dropbacks blended into every passer's scramble rate.
PRIOR_DROPBACKS = 10

scramble_rate_df = (
    pbp.loc[dropback, ['season', 'passer_id', 'qb_scramble']]
    .dropna()
    .reset_index(drop=True)
)
scramble_rate_df['count'] = 1

# League-wide scramble rate per dropback, used as the prior mean.
mean_scramble_rate = scramble_rate_df['qb_scramble'].mean()

# Scrambles and dropbacks per passer-season.
scramble_rates = (
    scramble_rate_df[['season', 'passer_id', 'qb_scramble', 'count']]
    .groupby(['season', 'passer_id'], as_index=False)
    .agg({'qb_scramble': 'sum', 'count': 'count'})
)

# Shrink each passer-season rate toward the league rate.
scramble_rates['scramble_rate'] = (
    (scramble_rates['qb_scramble'] + mean_scramble_rate * PRIOR_DROPBACKS)
    / (scramble_rates['count'] + PRIOR_DROPBACKS)
)
scramble_rates = scramble_rates.drop(columns=['qb_scramble', 'count'])

In [None]:
# Identifier and outcome columns carried along with every dropback row.
responses = [
    'play_id',
    'game_id',
    'posteam',
    'defteam',
    'season',
    'home_team',
    'passer_id',
    'dropback',
    'sack',
    'target_result',
    'sack_result',
    'qb_scramble',
    'yards_gained',
    'clock_runs_after',
    'location',
]

# Game-state columns taken straight from `pbp`; commented-out entries are deliberately excluded.
raw_features = [
    'down',
    'ydstogo',
    'goal_to_go',
    'qtr', 
    'clock_running',
    # 'quarter_seconds_remaining',
    'yardline_100',
    'off_score',
    'def_score',
    # 'play_clock',
    'game_seconds_remaining',
    'half_seconds_remaining',
    # 'wp', 
    'off_timeouts_remaining',
    'def_timeouts_remaining',
    # 'total', 'spread_line',
]

# Full column selection used when the dropback dataset is built from `pbp`.
cols = responses + raw_features

In [None]:
# Non-team features for the completion / interception models: air-yards transforms
# plus the pass-rush z-scores and their product.
# get_completion_features relies on these coming first in its feature list.
nondummy_completion_features = [
    # including P(completion) is probably wrong bc correlates with what we want to extract
    # 'log_prob_completion',
    # is air yards factored into 'cp'? probably, but it's unclear and can't hurt
    'air_yards_clipped',
    'neg_log_air_yards',
    'pos_log_air_yards',
    'pos_log_air_yards_sq',
    # 'log_air_yards',
    # 'air_yards',
    'offense_pass_rush_z',
    'defense_pass_rush_z',
    'off_def_pass_rush_z',
]

def get_completion_features(df_: pd.DataFrame, season: int):
    """Fit one-season completion and interception models and return team coefficients.

    Both models are logistic regressions on `nondummy_completion_features` followed by
    one indicator column per offense and per defense. Only the team-indicator
    coefficients are returned, sign-adjusted so that larger is better for that unit.

    Args:
        df_: targeted-pass rows containing 'season', 'posteam', 'defteam',
            'complete_pass', 'interception' and every nondummy completion feature.
        season: season to select from `df_`.

    Returns:
        A list of dicts with keys 'season', 'side' ('off' or 'def'), 'team',
        'comp_coef' and 'int_coef'.
    """
    plays = df_.loc[df_['season'] == season].reset_index(drop=True)

    offense_cols = pd.get_dummies(plays['posteam'], prefix='off')
    defense_cols = pd.get_dummies(plays['defteam'], prefix='def')
    plays[offense_cols.columns] = offense_cols
    plays[defense_cols.columns] = defense_cols

    team_columns = sorted(offense_cols.columns) + sorted(defense_cols.columns)
    features = [*nondummy_completion_features, *team_columns]

    comp_model = LogisticRegression(**LOGR_KWARGS)
    comp_model.fit(plays[features], plays['complete_pass'])

    int_model = LogisticRegression(**LOGR_KWARGS)
    int_model.fit(plays[features], plays['interception'])

    # Team coefficients start right after the non-team features.
    first_team = len(nondummy_completion_features)
    comp_coefs = comp_model.coef_[0, first_team:]
    int_coefs = int_model.coef_[0, first_team:]

    def _record(column, comp_coef, int_coef):
        side, team = column.split('_')
        # completions are good for offense and bad for defense. vice versa for INTs
        comp_sign = -1 if side == 'def' else 1
        int_sign = -1 if side == 'off' else 1
        return {
            'season': season,
            'side': side,
            'team': team,
            'comp_coef': comp_sign * comp_coef,
            'int_coef': int_sign * int_coef,
        }

    return [
        _record(column, comp_coef, int_coef)
        for column, comp_coef, int_coef in zip(team_columns, comp_coefs, int_coefs)
    ]

In [None]:
# TODO: remove this and project them too
# TODO: remove this and project them too
has_penalty = pbp['dropback'].isin({'OFFENSIVE_PENALTY','DEFENSIVE_PENALTY'})

# One row per non-penalty dropback with a known passer: one-hot encoded game state joined
# with passer-season rates, team PROE tables and team pass-rush z-scores.
# The trailing dropna() removes any row that is missing any of these joined values.
dropback_dataset = (
    pd.get_dummies(pbp[(~has_penalty) & (~pbp['passer_id'].isna()) & dropback][cols], columns=['down', 'qtr', 'off_timeouts_remaining', 'def_timeouts_remaining'])
    .merge(scramble_rates, on=['season','passer_id'], how='left').rename(columns={'scramble_rate': 'qb_scramble_rate'})
    .merge(qb_sack_avoidance, on=['season','passer_id'], how='left')
    .merge(offense_proes, on=['season','posteam'], how='left')
    .merge(defense_proes, on=['season','defteam'], how='left')
    .merge(offense_rz_proes, how='left', on=['posteam','season'])
    .merge(defense_rz_proes, how='left', on=['defteam','season'])
    .merge(joined_line_features[['season','team','offense_pass_rush_z']].rename(columns={'team': 'posteam'}), on=['season','posteam'], how='left')
    .merge(joined_line_features[['season','team','defense_pass_rush_z']].rename(columns={'team': 'defteam'}), on=['season','defteam'], how='left')
    .dropna()
)

# Called for its effect on dropback_dataset; the return value is discarded.
add_playcall_features(dropback_dataset)

# Log-transformed passer rates; _EPSILON keeps the log finite for a zero rate.
dropback_dataset['log_qb_scramble'] = np.log(dropback_dataset['qb_scramble_rate'] + _EPSILON)
dropback_dataset['log_qbps'] = np.log(dropback_dataset['qb_prob_sack_given_hit'] + _EPSILON)

# Interaction terms between line strength and passer tendencies.
# (off_def_pass_rush_z used to be computed a second time at the end of this cell;
# the redundant copy is removed.)
dropback_dataset["off_def_pass_rush_z"] = dropback_dataset["offense_pass_rush_z"] * dropback_dataset["defense_pass_rush_z"]
dropback_dataset["olpz_qbps"] = (
    dropback_dataset["offense_pass_rush_z"] * dropback_dataset["log_qbps"]
)
dropback_dataset["dlpz_qbps"] = (
    dropback_dataset["defense_pass_rush_z"] * dropback_dataset["log_qbps"]
)
dropback_dataset["olpz_scramble"] = (
    dropback_dataset["offense_pass_rush_z"] * dropback_dataset["log_qb_scramble"]
)
dropback_dataset["dlpz_scramble"] = (
    dropback_dataset["defense_pass_rush_z"] * dropback_dataset["log_qb_scramble"]
)

In [None]:
def apply_ay_transforms(df_: pd.DataFrame):
    """Add the air-yards feature columns to `df_` in place.

    Reads 'air_yards' and writes four columns:
      * 'air_yards_clipped'    - air yards limited to the range [-5, 50]
      * 'neg_log_air_yards'    - log of the clipped yards behind the line (0 at or past it)
      * 'pos_log_air_yards'    - log of the clipped yards downfield (0 for one yard or less)
      * 'pos_log_air_yards_sq' - square of 'pos_log_air_yards'

    Args:
        df_: frame with a numeric 'air_yards' column; it is modified in place.

    Returns:
        None.
    """
    clipped = df_['air_yards'].clip(lower=-5, upper=50)
    # Flooring at 1 before the log makes each one-sided term exactly 0 on the other side.
    log_downfield = np.log(clipped.clip(lower=1))

    df_['air_yards_clipped'] = clipped
    df_['neg_log_air_yards'] = np.log((-1 * clipped).clip(lower=1))
    df_['pos_log_air_yards'] = log_downfield
    df_['pos_log_air_yards_sq'] = log_downfield ** 2

In [None]:
# Targeted passes only, re-joined with the per-play passing columns from `pbp`.
# Missing return_yards / touchdown are set to 0; rows with any other missing value are dropped.
target_df = (
    dropback_dataset[dropback_dataset['dropback'] == 'TARGETED_RECEIVER']
    .merge(pbp[['play_id','game_id','air_yards','cp','complete_pass','interception','receiver_id','return_yards','touchdown']], how='left', on=['play_id','game_id'])
    .fillna({'return_yards': 0, 'touchdown': 0})
    .dropna()
)
# Adds the air-yards transform columns to target_df in place.
apply_ay_transforms(target_df)

In [None]:
# ay_cp_df = target_df.groupby(['air_yards_clipped'], as_index=False).aggregate({'cp': 'mean'}).rename(columns={'cp': 'ay_cp'})

# prob_cp_features = ['air_yards_clipped', 'neg_log_air_yards', 'pos_log_air_yards', 'pos_log_air_yards_sq']
# prob_cp_model = LogisticRegression(**LOGR_KWARGS)
# prob_cp_model.fit(target_df[prob_cp_features], target_df['complete_pass'])

# to_pred = pd.DataFrame([{'air_yards': ay} for ay in range(-5, 51)])
# apply_ay_transforms(to_pred)
# to_pred['prob'] = prob_cp_model.predict_proba(to_pred[prob_cp_features])[:,1]

# ay_cp_df.plot(x='air_yards_clipped', y='ay_cp', kind='scatter')
# to_pred.plot(x='air_yards_clipped', y='prob', kind='scatter')

In [None]:
# Collect per-team completion / interception coefficients for every season.
all_season_features = []
for season in SEASONS:
    all_season_features.extend(get_completion_features(target_df, season))

# Standardise each coefficient across all rows (both sides, all seasons), then drop the raw values.
completion_features = pd.DataFrame(all_season_features)
completion_features['comp_z'] = (completion_features['comp_coef'] - completion_features['comp_coef'].mean()) / completion_features['comp_coef'].std()
completion_features['int_z'] = (completion_features['int_coef'] - completion_features['int_coef'].mean()) / completion_features['int_coef'].std()
completion_features.drop(columns=['comp_coef','int_coef'], inplace=True)

# Pivot to one row per (season, team) with separate offense and defense z-score columns.
team_completion_features = (
    (
        completion_features[completion_features['side'] == 'off']
        .rename(columns={'comp_z': 'off_comp_z', 'int_z': 'off_int_z'})
        .drop(columns=['side'])
    )
    .merge(
        right=(
            completion_features[completion_features['side'] == 'def']
            .rename(columns={'comp_z': 'def_comp_z', 'int_z': 'def_int_z'})
            .drop(columns=['side'])
        ),
        how='inner',
        on=['season','team']
    )
)

In [None]:
# target_df.groupby(['air_yards'], as_index=False).aggregate({'cp': 'mean'}).plot(x='air_yards', y='cp', kind='scatter')

In [None]:
# Targeted passes joined with the offense's and the defense's completion / interception z-scores.
# Both merges bring in a 'team' key column from the right-hand side.
pass_skill_df = (
    target_df
    .merge(
        right=team_completion_features[['season','team','off_comp_z','off_int_z']],
        how='left',
        left_on=['season','posteam'],
        right_on=['season','team']
    ).merge(
        right=team_completion_features[['season','team','def_comp_z','def_int_z']],
        how='left',
        left_on=['season','defteam'],
        right_on=['season','team'],
    ).rename(columns={
        'off_comp_z': 'offense_completion_z',
        'off_int_z': 'offense_interception_z',
        'def_comp_z': 'defense_completion_z',
        'def_int_z': 'defense_interception_z',
    })
)

# NOTE(review): both columns below are computed from defense_pass_rush_z, so they are identical.
# The names suggest defense_completion_z / defense_interception_z were intended. The same
# expressions are repeated when `dataset` is built — confirm (and check the consumer of the
# exported coefficients) before changing.
pass_skill_df['def_comp_scramble'] = pass_skill_df['defense_pass_rush_z'] * pass_skill_df['log_qb_scramble']
pass_skill_df['def_int_scramble'] = pass_skill_df['defense_pass_rush_z'] * pass_skill_df['log_qb_scramble']

In [None]:
# team_completion_features[team_completion_features['season'] == CURRENT_SEASON]

In [None]:
# Passer / line / defense features specific to the dropback models.
dropback_specific_features = [
    "qb_scramble_rate",
    "log_qb_scramble",
    "qb_prob_sack_given_hit",
    "log_qbps",
    "offense_pass_rush_z",
    "defense_pass_rush_z",
    "off_def_pass_rush_z",
    "defense_completion_z",
    'defense_interception_z',
    'def_comp_scramble',
    'def_int_scramble',
    "olpz_qbps",
    "dlpz_qbps",
    "olpz_scramble",
    "dlpz_scramble",
]
# Full, ordered feature list. The coefficient formatters pair these names positionally
# with model coefficients, so the order matters.
dropback_features = [*common_timeout_features, *state_features, *dropback_specific_features]

In [None]:
# Expectation models that exclude passer identity; their in-sample predictions are the
# baselines that passer "over expected" metrics are measured against.
def_z_features = ['defense_completion_z','defense_interception_z']
ps_exp_features = [*nondummy_completion_features, *def_z_features]
qb_compl_model = LogisticRegression(**LOGR_KWARGS)
qb_compl_model.fit(pass_skill_df[ps_exp_features], pass_skill_df['complete_pass'])
pass_skill_df['prob_completion'] = qb_compl_model.predict_proba(pass_skill_df[ps_exp_features])[:,1]

qb_int_model = LogisticRegression(**LOGR_KWARGS)
qb_int_model.fit(pass_skill_df[ps_exp_features], pass_skill_df['interception'])
pass_skill_df['prob_interception'] = qb_int_model.predict_proba(pass_skill_df[ps_exp_features])[:,1]

# NOTE(review): Ridge is configured with POIS_KWARGS (the PoissonRegressor settings) — confirm
# every key is a valid Ridge argument with the intended meaning.
# NOTE(review): dropback_features already contains both def_z_features, so those two columns
# are selected twice in the design matrix below.
qb_airyards_model = Ridge(**POIS_KWARGS)
qb_airyards_model.fit(pass_skill_df[dropback_features + def_z_features], pass_skill_df['air_yards'])
pass_skill_df['exp_air_yards'] = qb_airyards_model.predict(pass_skill_df[dropback_features + def_z_features])

In [None]:
# Per-throw residuals against the passer-agnostic expectations:
#   comp_oe - completion over expected
#   int_ue  - interceptions *under* expected (sign flipped so larger is better)
#   ay_oe   - air yards over expected
#   ay_std  - copy of ay_oe so the same values can be aggregated with a different function
qb_pass_skill_df = pass_skill_df.assign(
    comp_oe=lambda d: d['complete_pass'] - d['prob_completion'],
    int_ue=lambda d: -1 * (d['interception'] - d['prob_interception']),
    ay_oe=lambda d: d['air_yards'] - d['exp_air_yards'],
    ay_std=lambda d: d['ay_oe'],  # hack to aggregate cleaner
)

# League-wide spread of air yards over expected, and league-wide average depth of target.
avg_ayoe_std = qb_pass_skill_df['ay_oe'].std()
avg_adot = qb_pass_skill_df['air_yards'].mean()

In [None]:
# Pseudo-count of league-average attempts used to shrink every passer-season estimate.
PRIOR_ATT_COUNT = 50

# Per passer-season: attempt count, mean residuals, and the spread of air yards over expected.
qb_features_df = (
    qb_pass_skill_df
    .groupby(['season','passer_id'], as_index=False)
    .aggregate({
        'play_id': 'count',
        'comp_oe': 'mean',
        'int_ue': 'mean',
        'ay_oe': 'mean',
        'ay_std': 'std',
    })
    .rename(columns={'play_id': 'count'})
)
# The standard deviation is undefined for a single attempt; fall back to the league-wide value.
qb_features_df.loc[qb_features_df['ay_std'].isna(), 'ay_std'] = avg_ayoe_std

# Shrink the mean residuals toward 0 (league average) by count / (count + PRIOR_ATT_COUNT).
shrinkage = qb_features_df['count'] / (qb_features_df['count'] + PRIOR_ATT_COUNT)
for residual in ('comp_oe', 'int_ue', 'ay_oe'):
    qb_features_df[residual] = qb_features_df[residual] * shrinkage

# add variances properly
qb_features_df['ay_std'] = np.sqrt(
    (qb_features_df['count'] * qb_features_df['ay_std'] ** 2 + PRIOR_ATT_COUNT * avg_ayoe_std ** 2)
    / (qb_features_df['count'] + PRIOR_ATT_COUNT)
)

# Log-scale passer features. Every quantity is floored before the log so the result is
# always finite; log_qb_comp_prob previously lacked the floor its siblings have.
qb_features_df['log_qb_mean_ay'] = np.log((MEAN_AIR_YARDS + qb_features_df['ay_oe']).clip(EPSILON_AIR_YARDS))
qb_features_df['log_qb_std_ay'] = np.log(qb_features_df['ay_std'])
qb_features_df['log_qb_comp_prob'] = np.log((PROB_COMPLETION + qb_features_df['comp_oe']).clip(_EPSILON))
qb_features_df['log_qb_int_prob'] = np.log((PROB_INTERCEPTION - qb_features_df['int_ue']).clip(_EPSILON))


In [None]:
# qb_features_df[qb_features_df['season'] == CURRENT_SEASON].merge(player_names, left_on=['passer_id'], right_on=['player_id']).sort_values('count', ascending=False)

In [None]:
# Passer-skill features appended to the dropback features for model fitting.
qb_skill_features = [
    'log_qb_comp_prob',
    'log_qb_int_prob',
    'qb_ay_oe',
    'log_qb_mean_ay',
    'qb_ay_std',
    'log_qb_std_ay',
    'qb_def_comp',
    'qb_def_int',
]
dropback_qb_features = [*dropback_features, *qb_skill_features]

# Every dropback joined with the defense's completion / interception z-scores and the
# passer-season skill features; rows missing any joined value are dropped.
dataset = (
    dropback_dataset
    .merge(
        right=team_completion_features[['season','team','def_comp_z','def_int_z']],
        how='left',
        left_on=['season','defteam'],
        right_on=['season','team'],
    )
    .merge(
        right=qb_features_df,
        how='left',
        on=['season','passer_id'],
    )
    .rename(columns={
        'def_comp_z': 'defense_completion_z',
        'def_int_z': 'defense_interception_z',
        'comp_oe': 'qb_comp_oe',
        'int_ue': 'qb_int_ue',
        'ay_oe': 'qb_ay_oe',
        'ay_std': 'qb_ay_std',
    })
    .dropna()
)

# Passer x defense interaction terms. These are products, matching how the same columns
# are built for the receiver training frame later in this notebook. They were previously
# sums, which are exact linear combinations of two features already in
# dropback_qb_features and therefore add no information.
dataset['qb_def_comp'] = dataset['log_qb_comp_prob'] * dataset['defense_completion_z']
dataset['qb_def_int'] = dataset['log_qb_int_prob'] * dataset['defense_interception_z']
# NOTE(review): both columns below use defense_pass_rush_z and are therefore identical; the
# names suggest defense_completion_z / defense_interception_z. Left unchanged because
# pass_skill_df is built the same way — confirm.
dataset['def_comp_scramble'] = dataset['defense_pass_rush_z'] * dataset['log_qb_scramble']
dataset['def_int_scramble'] = dataset['defense_pass_rush_z'] * dataset['log_qb_scramble']

In [None]:
def _format_coef_fn(name: str, intercept: float, coefs, features=None) -> str:
    """Render one Rust `pub fn {name}_coef() -> DropbackModel` constructor.

    Feature names and coefficients are paired positionally. When their counts differ,
    only the leading pairs are written (the historical behaviour) and a warning is
    emitted so the truncation is no longer silent.

    Args:
        name: prefix of the generated Rust function name.
        intercept: scalar model intercept.
        coefs: 1-D sequence of coefficients in feature order.
        features: feature names matching `coefs`; defaults to the module-level
            `dropback_features`.

    Returns:
        The Rust source text for the constructor, with a leading and a trailing newline.
    """
    import warnings  # local import: only needed for the mismatch warning

    names = list(dropback_features if features is None else features)
    values = list(coefs)
    if len(names) != len(values):
        warnings.warn(
            f'{name}: model has {len(values)} coefficients but {len(names)} feature names '
            f'were given; only the first {min(len(names), len(values))} are written',
            stacklevel=3,
        )

    lines = [
        f'\n    pub fn {name}_coef() -> DropbackModel {{',
        '        DropbackModel {',
        f'            intercept: {intercept:.4f},',
    ]
    lines.extend(f'            {f}: {c:.4f},' for f, c in zip(names, values))
    lines.append('        }')
    lines.append('    }\n')
    return '\n'.join(lines)


def format_logr_model(model: "LogisticRegression", name: str, features=None) -> str:
    """Format a fitted binary logistic regression as a Rust `DropbackModel` constructor.

    Args:
        model: fitted estimator exposing `intercept_` (length-1 array) and `coef_`
            (array of shape (1, n_features)).
        name: prefix of the generated Rust function name (`{name}_coef`).
        features: names of the features the model was fit on, in order. Defaults to
            `dropback_features`; pass the real list when the model was fit on a
            different one, otherwise trailing coefficients are dropped (with a warning).

    Returns:
        Rust source text for the constructor.
    """
    return _format_coef_fn(name, model.intercept_[0], model.coef_[0], features)


def format_linr_model(model: "PoissonRegressor", name: str, features=None):
    """Format a fitted single-output regressor as a Rust `DropbackModel` constructor.

    Args:
        model: fitted estimator exposing a scalar `intercept_` and a 1-D `coef_`.
        name: prefix of the generated Rust function name (`{name}_coef`).
        features: names of the features the model was fit on, in order. Defaults to
            `dropback_features`.

    Returns:
        Rust source text for the constructor.
    """
    return _format_coef_fn(name, model.intercept_, model.coef_, features)


In [None]:
# print('pub struct DropbackModel {')
# print('    pub intercept: f32,')
# for f in dropback_features:
#     print(f'    pub {f}: f32,')
# print('}')

In [None]:
# NOTE: clock always runs after a sack
sack_outcome_df = dataset[dataset['sack'] == 1].reset_index(drop=True)
sack_outcome_df['sack_safety'] = (sack_outcome_df['sack_result'] == 'SAFETY')
sack_outcome_df['sack_fumble_lost_notd'] = (sack_outcome_df['sack_result'] == 'FUMBLE_LOST').astype(int)
sack_outcome_df['sack_fumble_lost_td'] = (sack_outcome_df['sack_result'] == 'FUMBLE_LOST_TD').astype(int)
sack_outcome_df['sack_fumble_lost'] = sack_outcome_df['sack_fumble_lost_notd'] + sack_outcome_df['sack_fumble_lost_td']
sack_outcome_df['loss_of_yards'] = 1 - sack_outcome_df['sack_safety'] - sack_outcome_df['sack_fumble_lost']

sack_yards_lost_df = sack_outcome_df[sack_outcome_df['loss_of_yards'] == 1].reset_index(drop=True)
sack_yards_lost_df['yards_lost'] = -1 * sack_yards_lost_df['yards_gained']

nonsack_df = dataset[dataset['sack'] == 0].reset_index(drop=True)
throwaway_prob_df = dataset[(dataset['sack'] == 0) & (dataset['qb_scramble'] == 0)].reset_index(drop=True)

In [None]:
# Staged dropback outcome models, each fit on the rows where its decision applies.
# All are fit on dropback_qb_features (dropback features plus the passer-skill features).

# P(sack) over all dropbacks.
sack_model = LogisticRegression(**LOGR_KWARGS)
sack_model.fit(dataset[dropback_qb_features], dataset['sack'])

# P(safety | sack).
sack_safety_model = LogisticRegression(**LOGR_KWARGS)
sack_safety_model.fit(sack_outcome_df[dropback_qb_features], sack_outcome_df['sack_safety'])

# P(fumble lost | sack).
sack_fumblelost_model = LogisticRegression(**LOGR_KWARGS)
sack_fumblelost_model.fit(sack_outcome_df[dropback_qb_features], sack_outcome_df['sack_fumble_lost'])

# P(return touchdown | sack with fumble lost).
sack_fl_td_model = LogisticRegression(**LOGR_KWARGS)
sack_fl_td_model.fit(sack_outcome_df[sack_outcome_df['sack_fumble_lost'] == 1][dropback_qb_features], sack_outcome_df[sack_outcome_df['sack_fumble_lost'] == 1]['sack_fumble_lost_td'])

# Expected yards lost on a plain sack.
sack_yards_lost_model = PoissonRegressor(**POIS_KWARGS)
sack_yards_lost_model.fit(sack_yards_lost_df[dropback_qb_features], sack_yards_lost_df['yards_lost'])
# sack_yards_lost_pred = sack_yards_lost_model.predict(sack_yards_lost_df[dropback_features])
# sack_yards_lost_resid = (sack_yards_lost_pred - sack_yards_lost_df['yards_lost']).var()
# print(f'const SACK_YARDS_LOST_RESID: f32 = {sack_yards_lost_resid:.4f};')

# P(scramble | no sack).
scramble_model = LogisticRegression(**LOGR_KWARGS)
scramble_model.fit(nonsack_df[dropback_qb_features], nonsack_df['qb_scramble'])

# P(throwaway | no sack, no scramble).
throwaway_model = LogisticRegression(**LOGR_KWARGS)
throwaway_model.fit(throwaway_prob_df[dropback_qb_features], throwaway_prob_df['dropback'] == 'THROWAWAY')

In [None]:
# In-sample sanity check: the mean predicted class probabilities of the scramble model
# (displayed as the cell output).
ins_scrambles = scramble_model.predict_proba(nonsack_df[dropback_qb_features])
ins_scrambles.mean(axis=0)

In [None]:
# Assemble the Rust `impl DropbackModel` block, one constructor function per fitted model.
# NOTE(review): the formatters pair coefficients with dropback_features, but these models
# were fit on dropback_qb_features, which has more columns — the trailing passer-skill
# coefficients are not written out. Confirm the consumer expects that.
models_str_parts = ["use crate::models::dropback::DropbackModel;\n", "impl DropbackModel {"]

models_str_parts.append(format_logr_model(sack_model, 'prob_sack'))
models_str_parts.append(format_logr_model(sack_safety_model, 'prob_sack_safety'))
models_str_parts.append(format_logr_model(sack_fumblelost_model, 'prob_sack_fumble_lost'))
models_str_parts.append(format_logr_model(sack_fl_td_model, 'prob_sack_fumble_lost_td'))
models_str_parts.append(format_linr_model(sack_yards_lost_model, 'sack_yards_lost'))

models_str_parts.append(format_logr_model(scramble_model, 'prob_scramble'))
models_str_parts.append(format_logr_model(throwaway_model, 'prob_throwaway'))
models_str_parts.append("}")

models_str = '\n'.join(models_str_parts)

# Overwrite the generated coefficient file under MODELS_PATH.
with open(f'{MODELS_PATH}/dropback/coef.rs', 'w') as f:
    f.write(models_str)


In [None]:
# Targeted passes joined with the passer-season skill features; the training frame for the
# receiver catch model and the air-yards models.
wr_train_df = (
    pass_skill_df
    .merge(qb_features_df.drop(columns=['count']), how='left', on=['season','passer_id'])
    .rename(columns={'ay_oe': 'qb_ay_oe', 'ay_std': 'qb_ay_std', 'comp_oe': 'qb_comp_oe', 'int_ue': 'qb_int_ue'})
)
# hack to aggregate cleaner
wr_train_df['air_yards_std'] = wr_train_df['air_yards']
# Passer x defense interaction terms (products).
wr_train_df['qb_def_comp'] = wr_train_df['log_qb_comp_prob'] * wr_train_df['defense_completion_z']
wr_train_df['qb_def_int'] = wr_train_df['log_qb_int_prob'] * wr_train_df['defense_interception_z']

# Feature groups for the models below.
qb_acc_features = ['log_qb_comp_prob', 'log_qb_int_prob']
def_x_features = ['qb_def_comp', 'qb_def_int']
qb_ay_features = ['qb_ay_oe','qb_ay_std', 'log_qb_mean_ay', 'log_qb_std_ay']
qb_throw_features = ['prob_completion', *qb_acc_features, *def_z_features, *def_x_features]

prod_airyards_features = [*dropback_features, *qb_ay_features]
prod_target_features = [*dropback_features, *qb_acc_features, *def_x_features, 'air_yards', 'neg_log_air_yards', 'pos_log_air_yards', 'pos_log_air_yards_sq']

# Completion model that accounts for passer and defense but not the receiver; its residual
# (catch_oe) is attributed to the receiver.
wr_compl_model = LogisticRegression(**LOGR_KWARGS)
wr_compl_model.fit(wr_train_df[qb_throw_features], wr_train_df['complete_pass'])
wr_train_df['prob_catch'] = wr_compl_model.predict_proba(wr_train_df[qb_throw_features])[:,1]
wr_train_df['catch_oe'] = wr_train_df['complete_pass'] - wr_train_df['prob_catch']

# Air yards are modelled in two parts: whether the throw travels past the line of scrimmage,
# then a count model for the magnitude on each side.
wr_train_df['is_pos_air_yards'] = (wr_train_df['air_yards'] > 0).astype(int)
pos_ay_df = wr_train_df[wr_train_df['is_pos_air_yards'] == 1].reset_index(drop=True)
neg_ay_df = wr_train_df[wr_train_df['is_pos_air_yards'] == 0].reset_index(drop=True)

is_pos_air_yards_model = LogisticRegression(**LOGR_KWARGS)
is_pos_air_yards_model.fit(wr_train_df[prod_airyards_features], wr_train_df['is_pos_air_yards'])

# Non-positive air yards, negated so the target is non-negative.
neg_air_yards_model = PoissonRegressor(**POIS_KWARGS)
neg_air_yards_model.fit(neg_ay_df[prod_airyards_features], -1 * neg_ay_df['air_yards'])
# neg_ay_pred = neg_air_yards_model.predict(neg_ay_df[prod_airyards_features])
# neg_ay_resid = (-1 * neg_ay_df['air_yards'] - neg_ay_pred).var()
# print(f'const NEG_AIR_YARDS_RESID: f32 = {neg_ay_resid:.4f};')

# Positive air yards, shifted down by 1 so the target starts at 0.
pos_air_yards_model = PoissonRegressor(**POIS_KWARGS)
pos_air_yards_model.fit(pos_ay_df[prod_airyards_features], pos_ay_df['air_yards'] - 1)
pos_ay_pred = pos_air_yards_model.predict(pos_ay_df[prod_airyards_features])
# pos_ay_resid = (pos_ay_df['air_yards'] - pos_ay_pred).var()
# print(f'const POS_AIR_YARDS_RESID: f32 = {pos_ay_resid:.4f};')

# Second-stage model of the squared residual of the shifted positive-air-yards model.
pos_air_yards_var_model = PoissonRegressor(**POIS_KWARGS)
pos_air_yards_var_model.fit(pos_ay_df[prod_airyards_features], (pos_ay_df['air_yards'] - 1 - pos_ay_pred) ** 2)
pos_ay_var_pred = pos_air_yards_var_model.predict(pos_ay_df[prod_airyards_features])
# NOTE(review): this residual omits the "- 1" shift used for the fit above and subtracts an
# unlogged prediction from a logged squared residual. It is only consumed by the
# commented-out print below — fix or remove.
pos_ay_resid = (np.log((pos_ay_df['air_yards'] - pos_ay_pred) ** 2) - pos_ay_var_pred).var()
# print(f'const POS_AIR_YARDS_VAR_RESID: f32 = {pos_ay_resid:.4f};')

In [None]:
# from scipy.stats import nbinom

# # test whether negative binomial distribution is valid enough (it is)

# mu = np.mean(pos_ay_df['air_yards'])
# sigma_sqr = np.var(pos_ay_df['air_yards'])

# n = mu**2 / (sigma_sqr - mu)
# p = mu / sigma_sqr

# distribution = nbinom(n=n, p=p)

# nonneg_airyds_bins = [i for i in range(int(pos_ay_df['air_yards'].min()), 1 + int(pos_ay_df['air_yards'].max()))]
# values = pd.DataFrame([
#     {'x': x, 'y': distribution.pmf(x)}
#     for x in nonneg_airyds_bins
# ])
# values['cdf'] = values['y'].cumsum()
# values['ev'] = values['x'] * values['y']
# print(values['ev'].sum())
# values.plot(x='x', y='y')

In [None]:
# mean_features = wr_train_df[prod_target_features].mean(axis=0).to_dict()
# coefs = dict(zip(prod_target_features, is_pos_air_yards_model.coef_[0]))

# contribs = {
#     f: mean_features[f] * coefs[f]
#     for f in prod_target_features
# }
# # sorted(contribs.items(), key=lambda x: x[1], reverse=True)

In [None]:
# Pseudo-count of league-average targets used to shrink per-receiver season estimates.
PRIOR_TARGET_COUNT = 30

# Targeted passes that have both yards-after-catch columns present.
# The dropna() also removes any row missing any other column.
xyac_pbp_df = (
    target_df
    .merge(pbp[['play_id','game_id','yards_after_catch','xyac_mean_yardage']], how='left', on=['play_id','game_id'])
    .dropna()
)

# Mean of xyac_mean_yardage over those plays, used as the prior mean.
mean_xyac = xyac_pbp_df['xyac_mean_yardage'].mean()

# Per receiver-season mean of xyac_mean_yardage and the number of such targets.
xyac_season_df = (
    xyac_pbp_df
    .groupby(['season', 'receiver_id'], as_index=False)
    .aggregate({'xyac_mean_yardage': 'mean', 'play_id': 'count'})
    .rename(columns={'play_id': 'targets', 'receiver_id': 'player_id'})
)
# Shrink each receiver-season mean toward the overall mean with PRIOR_TARGET_COUNT pseudo-targets.
xyac_season_df['xyac'] = (
    (xyac_season_df['xyac_mean_yardage'] * xyac_season_df['targets'] + PRIOR_TARGET_COUNT * mean_xyac)
    / (xyac_season_df['targets'] + PRIOR_TARGET_COUNT)
)

In [None]:
# Pseudo-count of league-average receptions used to shrink per-receiver YAC over expected.
PRIOR_REC_COUNT = 20

# Completed passes with yards after catch and the xyac_mean_yardage column available.
yac_oe_pbp_df = (
    target_df[(target_df['complete_pass'] == 1)]
    .merge(pbp[['play_id','game_id','yards_after_catch','xyac_mean_yardage','pass_touchdown']], how='left', on=['play_id','game_id'])
    .dropna()
)
# Yards after catch minus xyac_mean_yardage, per reception.
yac_oe_pbp_df['yac_oe'] = yac_oe_pbp_df['yards_after_catch'] - yac_oe_pbp_df['xyac_mean_yardage']

# Mean yac_oe allowed per defense-season (raw mean, no shrinkage applied).
def_yac_oe_df = (
    yac_oe_pbp_df
    .groupby(['season', 'defteam'], as_index=False)
    .aggregate({'yac_oe': 'mean', 'play_id': 'count'})
    .rename(columns={'play_id': 'catches', 'defteam': 'team', 'yac_oe': 'def_yac_oe'})
)

# Mean yac_oe per receiver-season ...
yac_oe_season_df = (
    yac_oe_pbp_df
    .groupby(['season', 'receiver_id'], as_index=False)
    .aggregate({'yac_oe': 'mean', 'play_id': 'count'})
    .rename(columns={'play_id': 'catches', 'receiver_id': 'player_id'})
)
# ... shrunk toward 0 by catches / (catches + PRIOR_REC_COUNT).
yac_oe_season_df['yac_oe'] = yac_oe_season_df['yac_oe'] * yac_oe_season_df['catches'] / (yac_oe_season_df['catches'] + PRIOR_REC_COUNT)

In [None]:
# Per-receiver, per-season skill features: catch rate over expected, aDOT and its
# spread, xYAC and YAC over expected.
# NOTE(review): 'air_yards_std' is aggregated with 'std', so it is presumably a
# copy of air_yards on wr_train_df -- confirm where that column is created.
wr_rec_skills_df = (
    wr_train_df
    .groupby(['season','receiver_id'], as_index=False)
    .aggregate({'catch_oe': 'mean', 'air_yards': 'mean', 'air_yards_std': 'std', 'play_id': 'count'})
    .rename(columns={'play_id': 'targets', 'air_yards': 'adot', 'air_yards_std': 'adot_std', 'receiver_id': 'player_id'})
)

# A group with a single target has an undefined sample std; fall back to avg_ayoe_std.
wr_rec_skills_df.loc[wr_rec_skills_df['adot_std'].isna(), 'adot_std'] = avg_ayoe_std

# Shrinkage denominators: observed targets plus PRIOR_TARGET_COUNT pseudo-targets.
prior_tgt_count = PRIOR_TARGET_COUNT + wr_rec_skills_df['targets']
# catch_oe is shrunk toward 0, adot toward avg_adot, and the variance toward avg_ayoe_std ** 2.
wr_rec_skills_df['catch_oe'] = (wr_rec_skills_df['catch_oe'] * wr_rec_skills_df['targets']) / prior_tgt_count
wr_rec_skills_df['adot'] = (PRIOR_TARGET_COUNT * avg_adot + wr_rec_skills_df['adot'] * wr_rec_skills_df['targets']) / prior_tgt_count
wr_rec_skills_df['adot_std'] = np.sqrt((PRIOR_TARGET_COUNT * avg_ayoe_std ** 2 + wr_rec_skills_df['targets'] * wr_rec_skills_df['adot_std'] ** 2) / prior_tgt_count)

# Left-join the YAC features so receivers without YAC rows are kept.
# xyac_season_df's 'targets' is dropped to avoid clashing with the column already present.
wr_features = (
    wr_rec_skills_df
    .merge(xyac_season_df.drop(columns=['targets']), on=['season','player_id'], how='left')
    .merge(yac_oe_season_df, on=['season', 'player_id'], how='left')
)

# Fill receivers with no match: zero catches, overall-mean xYAC, neutral YAC over expected.
wr_features.loc[wr_features['catches'].isna(), 'catches'] = 0
wr_features.loc[wr_features['xyac'].isna(), 'xyac'] = mean_xyac
wr_features.loc[wr_features['yac_oe'].isna(), 'yac_oe'] = 0.0

# Clip at EPSILON_AIR_YARDS before taking logs so non-positive values do not produce -inf/NaN.
wr_features['log_wr_adot'] = np.log(wr_features['adot'].clip(EPSILON_AIR_YARDS))
wr_features['log_wr_xyac'] = np.log(wr_features['xyac'].clip(EPSILON_AIR_YARDS))
wr_features['wr_yac_oe_sq'] = wr_features['yac_oe'] ** 2

In [None]:
# Training frames for the target-outcome models (interception, completion,
# fumble, touchdown, YAC). Each frame below is a progressively narrower subset
# of target_outcomes_df.

# Feature columns shared by every target-outcome model.
tgt_outcome_features = [
    *prod_target_features,
    'qb_ay_oe', 'log_qb_mean_ay',
    'wr_adot', 'log_wr_adot',
    'wr_xyac', 'log_wr_xyac',
    'wr_yac_oe', 'wr_yac_oe_sq', 'def_yac_oe',
    'log_wr_catch_prob', 'qb_wr_lcp',
]
# The YAC-stage models additionally use 'clock_runs_after' as the last feature.
yac_outcome_features = [*tgt_outcome_features, 'clock_runs_after']
# target_result values that count as a completion / an interception.
CATCH_OUTCOMES = {'CATCH', 'CATCH_AND_FUMBLE', 'CATCH_AND_PENALTY'}
INT_OUTCOMES = {'INT', 'PICK_SIX'}

# Join receiver-season skills and defense-season YAC allowed onto each target.
# The play-level catch_oe is dropped first so it cannot clash with the
# receiver-level column, which is renamed to wr_catch_oe.
target_outcomes_df = (
    wr_train_df
    .drop(columns=['catch_oe'])
    .merge(
        wr_features.rename(columns={
            'adot': 'wr_adot',
            'catch_oe': 'wr_catch_oe',
            'yac_oe': 'wr_yac_oe',
            'xyac': 'wr_xyac',
        }),
        how='left',
        left_on=['season','receiver_id'],
        right_on=['season', 'player_id']
    )
    .merge(
        right=def_yac_oe_df.rename(columns={'team': 'defteam'}),
        how='left',
        on=['season','defteam']
    )
)

# Log air yards split by sign; clipping at 1 makes the log 0 on the "other" side.
target_outcomes_df['pos_log_air_yards'] = np.log(target_outcomes_df['air_yards'].clip(1))
target_outcomes_df['pos_log_air_yards_sq'] = target_outcomes_df['pos_log_air_yards'] ** 2
target_outcomes_df['neg_log_air_yards'] = np.log((-1 * target_outcomes_df['air_yards']).clip(1))
target_outcomes_df['log_wr_catch_prob'] = np.log(PROB_COMPLETION + target_outcomes_df['wr_catch_oe'])
# Interaction between QB and receiver log completion probabilities.
target_outcomes_df['qb_wr_lcp'] = target_outcomes_df['log_qb_comp_prob'] * target_outcomes_df['log_wr_catch_prob']
# Use the named outcome sets so the labels cannot drift from the constants above.
target_outcomes_df['interception'] = target_outcomes_df['target_result'].isin(INT_OUTCOMES).astype(int)

# Interceptions only; label whether the return scored.
int_outcome_df = target_outcomes_df[target_outcomes_df['interception'] == 1].reset_index(drop=True)
int_outcome_df['pick_six'] = (int_outcome_df['target_result'] == 'PICK_SIX').astype(int)

target_outcomes_df['completion'] = target_outcomes_df['target_result'].isin(CATCH_OUTCOMES).astype(int)
# no_int_df is taken after 'completion' is added so it carries that label.
no_int_df = target_outcomes_df[target_outcomes_df['interception'] == 0].reset_index(drop=True)
completion_outcome_df = target_outcomes_df[target_outcomes_df['completion'] == 1].reset_index(drop=True)
completion_outcome_df['fumble_lost'] = (completion_outcome_df['target_result'] == 'CATCH_AND_FUMBLE').astype(int)

# Completions split by whether the ball was fumbled away.
comp_fumlost_df = completion_outcome_df[completion_outcome_df['fumble_lost'] == 1].reset_index(drop=True)
comp_no_fumlost_df = completion_outcome_df[completion_outcome_df['fumble_lost'] == 0].reset_index(drop=True)

# Non-scoring, non-fumbled completions with their actual YAC attached.
# NOTE(review): .dropna() drops rows with a null in ANY column -- confirm intended.
comp_yards_df = (
    comp_no_fumlost_df[comp_no_fumlost_df['touchdown'] == 0]
    .merge(pbp[['play_id','game_id','yards_after_catch']], how='left', on=['play_id','game_id'])
    .dropna()
)

comp_yards_df['is_positive_yac'] = (comp_yards_df['yards_after_catch'] > 0).astype(int)

# Zero YAC falls into the "non-positive" frame.
neg_yac_df = comp_yards_df[comp_yards_df['is_positive_yac'] == 0].reset_index(drop=True)
pos_yac_df = comp_yards_df[comp_yards_df['is_positive_yac'] == 1].reset_index(drop=True)

In [None]:
# target_outcomes_df.groupby(['air_yards'], as_index=False).aggregate({'completion': 'mean'}).plot(x='air_yards', y='completion', kind='scatter')

In [None]:
# Models for a target's air yards conditional on the receiver's aDOT profile:
# P(air yards > 0), the mean magnitude of non-positive air yards, and the
# variance of positive air yards.

# Work on a copy with the receiver columns renamed back to the un-prefixed names.
ay_cond_df = target_outcomes_df.copy().rename(columns={'wr_adot': 'adot', 'wr_xyac': 'xyac', 'wr_yac_oe': 'yac_oe'})
ay_cond_df['is_ay_pos'] = (ay_cond_df['air_yards'] > 0).astype(int)

ay_prob_features = ['zero_sigma', 'neg1_sigma', 'neg2_sigma', 'xyac', 'yac_oe']
ay_cond_features = ['adot', 'adot_std', 'xyac', 'yac_oe']
# Normal CDF evaluated at 0 with scale adot_std and the mean placed at adot,
# adot - adot_std and adot - 2 * adot_std respectively.
ay_cond_df['zero_sigma'] = norm.cdf(0, loc=ay_cond_df['adot'], scale=ay_cond_df['adot_std'])
ay_cond_df['neg1_sigma'] = norm.cdf(0, ay_cond_df['adot'] - ay_cond_df['adot_std'], scale=ay_cond_df['adot_std'])
ay_cond_df['neg2_sigma'] = norm.cdf(0, ay_cond_df['adot'] - 2 * ay_cond_df['adot_std'], scale=ay_cond_df['adot_std'])

# Probability that a target has strictly positive air yards.
is_pos_ay_tgt = LogisticRegression(**LOGR_KWARGS)
is_pos_ay_tgt.fit(ay_cond_df[ay_prob_features], ay_cond_df['is_ay_pos'])

ay_neg_cond_df = ay_cond_df[ay_cond_df['is_ay_pos'] == 0].reset_index(drop=True)

# Mean of -air_yards on non-positive targets; negated so the Poisson target is non-negative.
ay_neg_cond_mean_model = PoissonRegressor(**POIS_KWARGS)
ay_neg_cond_mean_model.fit(ay_neg_cond_df[ay_cond_features], -1 * ay_neg_cond_df['air_yards'])
# ay_neg_cond_mean_pred = ay_neg_cond_mean_model.predict(ay_neg_cond_df[ay_cond_features])
# ay_neg_cond_mean_resid = (-1 * ay_neg_cond_df['air_yards'] - ay_neg_cond_mean_pred).var()
# print(f'const AY_NEG_COND_MEAN_RESID: f32 = {ay_neg_cond_mean_resid:.4f};')

ay_pos_cond_df = ay_cond_df[ay_cond_df['is_ay_pos'] == 1].reset_index(drop=True)
prob_pos_ay_tgt = is_pos_ay_tgt.predict_proba(ay_pos_cond_df[ay_prob_features])[:,1]
neg_cond_mean = ay_neg_cond_mean_model.predict(ay_pos_cond_df[ay_cond_features])
# Solve adot = p * pos_mean - (1 - p) * neg_mean for the positive-side mean, so no
# separate positive-mean model is fitted.
pos_cond_mean = (ay_pos_cond_df['adot'] + neg_cond_mean * (1 - prob_pos_ay_tgt)) / prob_pos_ay_tgt

# Variance of positive air yards: regress the squared deviation from the implied mean.
ay_pos_cond_var_model = PoissonRegressor(**POIS_KWARGS)
ay_pos_cond_var_model.fit(ay_pos_cond_df[ay_cond_features], (ay_pos_cond_df['air_yards'] - pos_cond_mean) ** 2)
# ay_pos_cond_var_pred = ay_pos_cond_var_model.predict(ay_pos_cond_df[ay_cond_features])
# ay_pos_cond_var_resid = ((ay_pos_cond_df['air_yards'] - pos_cond_mean) ** 2 - ay_pos_cond_var_pred).var()
# print(f'const AY_POS_COND_VAR_RESID: f32 = {ay_pos_cond_var_resid:.4f};')

In [None]:
def format_is_pos_ay_tgt_model(model: LogisticRegression, name: str) -> str:
    """Render a fitted logistic model as a Rust `PositiveAirYardsTargetModel` constructor.

    Args:
        model: Fitted classifier; `intercept_[0]` and `coef_[0]` are read.
        name: Prefix of the generated Rust function, emitted as `{name}_coef`.

    Returns:
        Rust source text with one struct field per entry of `ay_prob_features`,
        paired positionally with the coefficients and formatted to 4 decimals.
    """
    opening = [
        f'\n    pub fn {name}_coef() -> PositiveAirYardsTargetModel {{',
        '        PositiveAirYardsTargetModel {',
        f'            intercept: {model.intercept_[0]:.4f},',
    ]
    fields = [
        f'            {feature}: {weight:.4f},'
        for feature, weight in zip(ay_prob_features, model.coef_[0])
    ]
    closing = ['        }', '    }\n']
    return '\n'.join(opening + fields + closing)

def format_cond_ay_tgt_model(model: PoissonRegressor, name: str) -> str:
    """Render a fitted Poisson model as a Rust `ConditionalAirYardsModel` constructor.

    Args:
        model: Fitted regressor; the scalar `intercept_` and 1-D `coef_` are read.
        name: Prefix of the generated Rust function, emitted as `{name}_coef`.

    Returns:
        Rust source text with one struct field per entry of `ay_cond_features`,
        paired positionally with the coefficients and formatted to 4 decimals.
    """
    opening = [
        f'\n    pub fn {name}_coef() -> ConditionalAirYardsModel {{',
        '        ConditionalAirYardsModel {',
        f'            intercept: {model.intercept_:.4f},',
    ]
    fields = [
        f'            {feature}: {weight:.4f},'
        for feature, weight in zip(ay_cond_features, model.coef_)
    ]
    closing = ['        }', '    }\n']
    return '\n'.join(opening + fields + closing)

In [None]:
# Interception branch: P(interception), P(pick six | interception), then the
# return-yard distribution for interceptions that were not returned for a score.

# P(interception) over all targets.
tgt_int_model = LogisticRegression(**LOGR_KWARGS)
tgt_int_model.fit(target_outcomes_df[tgt_outcome_features], target_outcomes_df['interception'])

# P(pick six | interception).
tgt_int_td_model = LogisticRegression(**LOGR_KWARGS)
tgt_int_td_model.fit(int_outcome_df[tgt_outcome_features], int_outcome_df['pick_six'])

# Return yards are only modelled for interceptions that did not score.
no_pick6 = int_outcome_df['pick_six'] == 0
is_int_pos_yards_df = int_outcome_df[no_pick6].copy()
is_int_pos_yards_df['is_pos_yards'] = (is_int_pos_yards_df['return_yards'] > 0).astype(int)

# P(return yards > 0 | interception, no score).
tgt_int_yards_is_pos = LogisticRegression(**LOGR_KWARGS)
tgt_int_yards_is_pos.fit(is_int_pos_yards_df[tgt_outcome_features], is_int_pos_yards_df['is_pos_yards'])

# Non-positive returns: the target is negated so it is non-negative for Poisson regression.
neg_int_yards_df = is_int_pos_yards_df[is_int_pos_yards_df['is_pos_yards'] == 0].reset_index(drop=True)
tgt_int_neg_yards_model = PoissonRegressor(**POIS_KWARGS)
tgt_int_neg_yards_model.fit(neg_int_yards_df[tgt_outcome_features], -1 * neg_int_yards_df['return_yards'])
# tgt_int_neg_yards_pred = tgt_int_neg_yards_model.predict(neg_int_yards_df[tgt_outcome_features])
# tgt_int_neg_yards_resid = (-1 * neg_int_yards_df['return_yards'] - tgt_int_neg_yards_pred).var()
# print(f'const INT_NEG_YARDS_RESID: f32 = {tgt_int_neg_yards_resid:.4f};')

# Positive returns: the target is shifted by -1 so the smallest positive return maps to 0.
pos_int_yards_df = is_int_pos_yards_df[is_int_pos_yards_df['is_pos_yards'] == 1].reset_index(drop=True)
tgt_int_pos_yards_model = PoissonRegressor(**POIS_KWARGS)
tgt_int_pos_yards_model.fit(pos_int_yards_df[tgt_outcome_features], pos_int_yards_df['return_yards'] - 1)
# tgt_int_pos_yards_pred = tgt_int_pos_yards_model.predict(pos_int_yards_df[tgt_outcome_features])
# tgt_int_pos_yards_resid = (pos_int_yards_df['return_yards'] - tgt_int_pos_yards_pred).var()
# print(f'const INT_POS_YARDS_RESID: f32 = {tgt_int_pos_yards_resid:.4f};')

# Variance model: regress the squared deviation of the shifted target from its predicted mean.
tgt_int_pos_mean = tgt_int_pos_yards_model.predict(pos_int_yards_df[tgt_outcome_features])
tgt_int_pos_yards_var_model = PoissonRegressor(**POIS_KWARGS)
tgt_int_pos_yards_var_model.fit(pos_int_yards_df[tgt_outcome_features], (pos_int_yards_df['return_yards'] - 1 - tgt_int_pos_mean) ** 2)
# tgt_int_pos_yards_var_pred = tgt_int_pos_yards_var_model.predict(pos_int_yards_df[tgt_outcome_features])
# tgt_int_pos_yards_var_resid = (np.log((pos_int_yards_df['return_yards'] - tgt_int_pos_mean) ** 2) - tgt_int_pos_yards_var_pred).var()
# print(f'const INT_POS_YARDS_VAR_RESID: f32 = {tgt_int_pos_yards_var_resid:.4f};')

In [None]:
# Completion branch: P(completion | no interception), then fumble-lost and
# touchdown outcomes on completed passes.

# P(completion), fitted on targets that were not intercepted.
tgt_completion_model = LogisticRegression(**LOGR_KWARGS)
tgt_completion_model.fit(no_int_df[tgt_outcome_features], no_int_df['completion'])

# NOTE: if caught in the end zone, we terminate early in production

# see if there's a fumble lost
tgt_fumlost_model = LogisticRegression(**LOGR_KWARGS)
tgt_fumlost_model.fit(completion_outcome_df[tgt_outcome_features], completion_outcome_df['fumble_lost'])

# see if it's returned for a TD
tgt_fumlost_td_model = LogisticRegression(**LOGR_KWARGS)
tgt_fumlost_td_model.fit(comp_fumlost_df[tgt_outcome_features], comp_fumlost_df['touchdown'])

# see if the catch goes for a TD
tgt_td_model = LogisticRegression(**LOGR_KWARGS)
tgt_td_model.fit(comp_no_fumlost_df[tgt_outcome_features], comp_no_fumlost_df['touchdown'])

In [None]:
# YAC branch, fitted on non-scoring, non-fumbled completions (comp_yards_df).

# 'clock_runs_after' is the label here, so this model uses tgt_outcome_features
# (which excludes it) rather than yac_outcome_features.
tgt_clock_runs_after_model = LogisticRegression(**LOGR_KWARGS)
tgt_clock_runs_after_model.fit(comp_yards_df[tgt_outcome_features], comp_yards_df['clock_runs_after'])

# P(YAC > 0).
tgt_is_pos_yac_model = LogisticRegression(**LOGR_KWARGS)
tgt_is_pos_yac_model.fit(comp_yards_df[yac_outcome_features], comp_yards_df['is_positive_yac'])

# Non-positive YAC: the target is negated so it is non-negative for Poisson regression.
tgt_neg_yac_model = PoissonRegressor(**POIS_KWARGS)
tgt_neg_yac_model.fit(neg_yac_df[yac_outcome_features], -1 * neg_yac_df['yards_after_catch'])
# tgt_neg_yac_pred = tgt_neg_yac_model.predict(neg_yac_df[yac_outcome_features])
# tgt_neg_yac_resid = (-1 * neg_yac_df['yards_after_catch'] - tgt_neg_yac_pred).var()
# print(f'const NEG_YAC_RESID: f32 = {tgt_neg_yac_resid:.4f};')

# Positive YAC: the target is shifted by -1 so the smallest positive value maps to 0.
tgt_pos_yac_model = PoissonRegressor(**POIS_KWARGS)
tgt_pos_yac_model.fit(pos_yac_df[yac_outcome_features], pos_yac_df['yards_after_catch'] - 1)
tgt_pos_yac_mean = tgt_pos_yac_model.predict(pos_yac_df[yac_outcome_features])
# tgt_pos_yac_resid = (pos_yac_df['yards_after_catch'] - tgt_pos_yac_mean).var()
# print(f'const POS_YAC_RESID: f32 = {tgt_pos_yac_resid:.4f};')

# Variance model: regress the squared deviation of the shifted target from its predicted mean.
tgt_pos_yac_var_model = PoissonRegressor(**POIS_KWARGS)
tgt_pos_yac_var_model.fit(pos_yac_df[yac_outcome_features], (pos_yac_df['yards_after_catch'] - 1 - tgt_pos_yac_mean) ** 2)
# tgt_pos_yac_var_pred = tgt_pos_yac_var_model.predict(pos_yac_df[yac_outcome_features])
# tgt_pos_yac_var_resid = ((pos_yac_df['yards_after_catch'] - tgt_pos_yac_mean) ** 2 - tgt_pos_yac_var_pred).var()
# print(f'const POS_YAC_VAR_RESID: f32 = {tgt_pos_yac_var_resid:.4f};')

In [None]:
def format_ay_linr_model(model_: PoissonRegressor, name: str):
    """Render a fitted Poisson model as a Rust `AirYardsModel` constructor.

    Args:
        model_: Fitted regressor; the scalar `intercept_` and 1-D `coef_` are read.
        name: Prefix of the generated Rust function, emitted as `{name}_coef`.

    Returns:
        Rust source text with one struct field per entry of
        `prod_airyards_features`, paired positionally with the coefficients.
    """
    field_lines = [
        f'            {feature}: {weight:.4f},'
        for feature, weight in zip(prod_airyards_features, model_.coef_)
    ]
    return '\n'.join([
        f'\n    pub fn {name}_coef() -> AirYardsModel {{',
        '        AirYardsModel {',
        f'            intercept: {model_.intercept_:.4f},',
        *field_lines,
        '        }',
        '    }\n',
    ])

def format_ay_logr_model(model_: LogisticRegression, name: str):
    """Render a fitted logistic model as a Rust `AirYardsModel` constructor.

    Args:
        model_: Fitted classifier; `intercept_[0]` and `coef_[0]` are read.
        name: Prefix of the generated Rust function, emitted as `{name}_coef`.

    Returns:
        Rust source text with one struct field per entry of
        `prod_airyards_features`, paired positionally with the coefficients.
    """
    field_lines = [
        f'            {feature}: {weight:.4f},'
        for feature, weight in zip(prod_airyards_features, model_.coef_[0])
    ]
    return '\n'.join([
        f'\n    pub fn {name}_coef() -> AirYardsModel {{',
        '        AirYardsModel {',
        f'            intercept: {model_.intercept_[0]:.4f},',
        *field_lines,
        '        }',
        '    }\n',
    ])

def format_tgt_logr_model(model_: LogisticRegression, name: str):
    """Render a fitted logistic model as a Rust `TargetModel` constructor.

    Coefficients are paired positionally with `yac_outcome_features`. A model
    fitted on fewer features gets the remaining trailing fields emitted as
    `0.0`, so every generated constructor fills the full struct.

    Args:
        model_: Fitted classifier; `intercept_[0]` and `coef_[0]` are read.
        name: Prefix of the generated Rust function, emitted as `{name}_coef`.

    Returns:
        Rust source text for the constructor function.
    """
    intercept_line = f'            intercept: {model_.intercept_[0]:.4f},'
    weights = model_.coef_[0]
    fitted_lines = [
        f'            {feature}: {weight:.4f},'
        for feature, weight in zip(yac_outcome_features, weights)
    ]
    # Features beyond the fitted coefficient vector are written as zeros.
    padding_lines = [
        f'            {feature}: 0.0,'
        for feature in yac_outcome_features[weights.size:]
    ]
    return '\n'.join([
        f'\n    pub fn {name}_coef() -> TargetModel {{',
        '        TargetModel {',
        intercept_line,
        *fitted_lines,
        *padding_lines,
        '        }',
        '    }\n',
    ])

def format_tgt_linr_model(model_: PoissonRegressor, name: str):
    """Render a fitted Poisson model as a Rust `TargetModel` constructor.

    Coefficients are paired positionally with `yac_outcome_features`. A model
    fitted on fewer features gets the remaining trailing fields emitted as
    `0.0`, so every generated constructor fills the full struct.

    Args:
        model_: Fitted regressor; the scalar `intercept_` and 1-D `coef_` are read.
        name: Prefix of the generated Rust function, emitted as `{name}_coef`.

    Returns:
        Rust source text for the constructor function.
    """
    intercept_line = f'            intercept: {model_.intercept_:.4f},'
    weights = model_.coef_
    fitted_lines = [
        f'            {feature}: {weight:.4f},'
        for feature, weight in zip(yac_outcome_features, weights)
    ]
    # Features beyond the fitted coefficient vector are written as zeros.
    padding_lines = [
        f'            {feature}: 0.0,'
        for feature in yac_outcome_features[weights.size:]
    ]
    return '\n'.join([
        f'\n    pub fn {name}_coef() -> TargetModel {{',
        '        TargetModel {',
        intercept_line,
        *fitted_lines,
        *padding_lines,
        '        }',
        '    }\n',
    ])

In [None]:
# print("pub struct AirYardsModel {")
# print("    intercept: f32,")
# for f in prod_airyards_features:
#     print(f'    {f}: f32,')
# print("}")

# print("pub struct TargetModel {")
# print("    intercept: f32,")
# for f in prod_target_features:
#     print(f'    {f}: f32,')
# print("}")

# print("pub struct PositiveAirYardsTargetModel {")
# print("    intercept: f32,")
# for f in ay_prob_features:
#     print(f'    {f}: f32,')
# print("}")

# print("pub struct ConditionalAirYardsModel {")
# print("    intercept: f32,")
# for f in ay_cond_features:
#     print(f'    {f}: f32,')
# print("}")

In [None]:
# Assemble the Rust `impl AirYardsModel` source from the fitted air-yards models
# and write it to the models directory.
ay_model_str_parts = [
    "use crate::models::air_yards::AirYardsModel;\n",
    "impl AirYardsModel {",
    format_ay_logr_model(is_pos_air_yards_model, 'is_pos_air_yards'),
    format_ay_linr_model(neg_air_yards_model, 'neg_air_yards'),
    format_ay_linr_model(pos_air_yards_model, 'pos_air_yards'),
    format_ay_linr_model(pos_air_yards_var_model, 'pos_air_yards_var'),
    "}",
]

ay_model_str = '\n'.join(ay_model_str_parts)

with open(f'{MODELS_PATH}/air_yards/coef.rs', 'w') as f:
    f.write(ay_model_str)

In [None]:
# Assemble the Rust source for the three target-model impl blocks and write it
# to the models directory.
tgt_model_str_parts = [
    "use crate::models::targets::{TargetModel, ConditionalAirYardsModel, PositiveAirYardsTargetModel};\n",
]

tgt_model_str_parts.extend([
    "impl TargetModel {",
    # interception? if so, predict pick 6 or return yards distribution
    format_tgt_logr_model(tgt_int_model, 'prob_int'),
    format_tgt_logr_model(tgt_int_td_model, 'prob_pick_six'),
    format_tgt_logr_model(tgt_int_yards_is_pos, 'is_int_return_yards_pos'),
    format_tgt_linr_model(tgt_int_neg_yards_model, 'int_neg_return_yards'),
    format_tgt_linr_model(tgt_int_pos_yards_model, 'int_pos_return_yards'),
    format_tgt_linr_model(tgt_int_pos_yards_var_model, 'int_pos_return_yards_var'),
    # completion? if so, predict result
    format_tgt_logr_model(tgt_completion_model, 'prob_completion'),
    # assume this is 0 in production. so rarely happens...
    # format_tgt_logr_model(tgt_fumlost_model, 'prob_catch_fumble_lost'),
    # format_tgt_logr_model(tgt_fumlost_td_model, 'prob_catch_fumble_lost_td'),
    format_tgt_logr_model(tgt_td_model, 'prob_catch_td'),
    # yards after the catch and whether the clock keeps running
    format_tgt_logr_model(tgt_clock_runs_after_model, 'clock_runs_after'),
    format_tgt_logr_model(tgt_is_pos_yac_model, 'is_pos_yac'),
    format_tgt_linr_model(tgt_neg_yac_model, 'neg_yac'),
    format_tgt_linr_model(tgt_pos_yac_model, 'pos_yac'),
    format_tgt_linr_model(tgt_pos_yac_var_model, 'pos_yac_var'),
    "}",
])

tgt_model_str_parts.extend([
    "impl PositiveAirYardsTargetModel {",
    format_is_pos_ay_tgt_model(is_pos_ay_tgt, 'is_positive_ay_target'),
    "}",
])

tgt_model_str_parts.extend([
    "impl ConditionalAirYardsModel {",
    format_cond_ay_tgt_model(ay_neg_cond_mean_model, 'neg_ay_target_mean'),
    format_cond_ay_tgt_model(ay_pos_cond_var_model, 'pos_ay_target_variance'),
    "}",
])

tgt_model_str = '\n'.join(tgt_model_str_parts)

with open(f'{MODELS_PATH}/targets/coef.rs', 'w') as f:
    f.write(tgt_model_str)

In [None]:
def weight_team_features(season: int) -> float:
    """Return the fixed recency weight applied to a season's team features.

    Args:
        season: Season year to weight, compared against `CURRENT_SEASON`.

    Returns:
        1.0 for the current season, 0.5 for the previous one, 0.1 for two
        seasons back, and 0.0 for anything older or in the future. The result
        is always a float, matching the annotation (the old-season branch
        previously returned the int 0).
    """
    weight_by_seasons_ago = {0: 1.0, 1: 0.5, 2: 0.1}
    return weight_by_seasons_ago.get(CURRENT_SEASON - season, 0.0)

def season_weight_decay(decay: float, cutoff_delta: int = 10):
    """Build a function mapping a season to an exponential recency weight.

    Args:
        decay: Exponent rate; the weight is `exp(decay * (season - CURRENT_SEASON))`.
        cutoff_delta: Seasons more than this many years before `CURRENT_SEASON`
            get a weight of exactly 0.0.

    Returns:
        A one-argument callable taking a season and returning its weight.
    """
    def _weight_for(season: int):
        too_old = season + cutoff_delta < CURRENT_SEASON
        return 0.0 if too_old else math.exp(decay * (season - CURRENT_SEASON))
    return _weight_for

In [None]:
# Team-level baseline features: a recency- and games-weighted average of each
# team's per-season features, written to team_dropbacks.csv.

# Distinct games per (season, team), counted from plays where the team is on offense.
n_games = (
    pbp
    .groupby(['season','posteam'], as_index=False)
    .aggregate({'game_id': 'nunique'})
    .rename(columns={'game_id': 'n_games', 'posteam': 'team'})
)

# Inner merges: only team-seasons present in every source frame are kept.
proj_team_df = (
    n_games
    .merge(team_completion_features, on=['season','team'])
    .merge(joined_line_features, on=['season','team'])
    .merge(def_yac_oe_df, on=['season','team'])
)

# Weight = games played times exp(season - CURRENT_SEASON) (decay rate 1.0).
proj_team_df['weight'] = proj_team_df['n_games'] * proj_team_df['season'].apply(season_weight_decay(1.0))
tm_ft_cols = ['offense_pass_rush_z','defense_pass_rush_z','def_comp_z','def_int_z', 'def_yac_oe']
# Pre-multiply by the weight so a plain groupby-sum yields weighted numerators.
for c in tm_ft_cols:
    proj_team_df[c] = proj_team_df[c] * proj_team_df['weight']

proj_team_features_df = (
    proj_team_df
    .groupby(['team'], as_index=False)
    .aggregate({
        'weight': 'sum',
        **{c: 'sum' for c in tm_ft_cols},
    })
    .sort_values('weight', ascending=False)
)
# Divide by the summed weight to finish the weighted mean, rounded to 2 decimals.
for c in tm_ft_cols:
    proj_team_features_df[c] = (proj_team_features_df[c] / proj_team_features_df['weight']).round(decimals=2)

# Drop the helper column and switch to the column names used in the exported CSV.
proj_team_features_df.drop(columns=['weight'], inplace=True)
proj_team_features_df.rename(inplace=True, columns={'def_comp_z': 'defense_completion_z', 'def_int_z': 'defense_interception_z', 'def_yac_oe': 'defense_yac_oe'})

proj_team_features_df.to_csv(f'{BASELINES_PATH}/team_dropbacks.csv', index=False)
proj_team_features_df.head()

In [None]:
# Display the full team feature table for inspection (display.max_rows is set to
# None at the top of the notebook, so every row is rendered).
proj_team_features_df

In [None]:
# QB baseline projections: recency-weighted averages of per-season QB features,
# joined to the current rosters and written to qbs.csv.

# Inner merge: only passer-seasons present in both frames are kept.
preproj_qb_features = qb_features_df.merge(qb_sack_avoidance, on=['passer_id','season'])
# Two weight schemes, both scaled by 'count': air-yards features decay at rate 0.8,
# accuracy and sack features at rate 1.0.
preproj_qb_features['airyards_weight'] = preproj_qb_features['count'] * preproj_qb_features['season'].apply(season_weight_decay(0.8))
preproj_qb_features['accuracy_weight'] = preproj_qb_features['count'] * preproj_qb_features['season'].apply(season_weight_decay(1.0))

# Weighted numerators; they are summed per passer and divided by the summed weights below.
preproj_qb_features['comp_oe_wt'] = preproj_qb_features['accuracy_weight'] * preproj_qb_features['comp_oe']
preproj_qb_features['int_ue_wt'] = preproj_qb_features['accuracy_weight'] * preproj_qb_features['int_ue']

preproj_qb_features['air_yards_oe_wt'] = preproj_qb_features['airyards_weight'] * preproj_qb_features['ay_oe']
# Spread is averaged as a variance (std squared) and converted back with sqrt below.
preproj_qb_features['air_yards_var_wt'] = preproj_qb_features['airyards_weight'] * preproj_qb_features['ay_std'] ** 2

preproj_qb_features['qb_prob_sack_given_hit_wt'] = preproj_qb_features['accuracy_weight'] * preproj_qb_features['qb_prob_sack_given_hit']

qb_projections = (
    preproj_qb_features
    .groupby(['passer_id'], as_index=False)
    .aggregate({
        'airyards_weight': 'sum',
        'accuracy_weight': 'sum',
        'comp_oe_wt': 'sum',
        'int_ue_wt': 'sum',
        'air_yards_oe_wt': 'sum',
        'air_yards_var_wt': 'sum',
        'qb_prob_sack_given_hit_wt': 'sum',
        'count': 'sum',
    })
)
# Weighted means, rounded to the precision written to the CSV.
qb_projections['comp_oe'] = (qb_projections['comp_oe_wt'] / qb_projections['accuracy_weight']).round(decimals=3)
qb_projections['int_ue'] = (qb_projections['int_ue_wt'] / qb_projections['accuracy_weight']).round(decimals=3)

qb_projections['ayoe'] = (qb_projections['air_yards_oe_wt'] / qb_projections['airyards_weight']).round(decimals=1)
qb_projections['ay_std'] = (np.sqrt(qb_projections['air_yards_var_wt'] / qb_projections['airyards_weight'])).round(decimals=1)
qb_projections['prob_sack_given_hit'] = (qb_projections['qb_prob_sack_given_hit_wt'] / qb_projections['accuracy_weight']).round(decimals=2)

# Keep only the final projection columns, under the exported names.
qbpr_df = (
    qb_projections
    .rename(columns={'comp_oe': 'cpoe', 'passer_id': 'player_id'})
    .drop(columns=['airyards_weight', 'accuracy_weight', 'comp_oe_wt', 'int_ue_wt', 'air_yards_oe_wt', 'air_yards_var_wt', 'qb_prob_sack_given_hit_wt', 'count'])
)
# TODO: target shares
# Inner merges restrict the output to rostered players that have a name record and a projection.
write_qb_projs = (
    rosters[['player_id','team']]
    .merge(player_names, on=['player_id'], how='inner')
    .merge(qbpr_df, on=['player_id'], how='inner')
    .sort_values('team', ascending=True)
)

write_qb_projs.to_csv(f'{BASELINES_PATH}/qbs.csv', index=False)

In [None]:
# Build placeholder current-season rows with zero targets for every rostered
# player, so that players without a target this season still contribute a 0
# share when these rows are concatenated with the real counts in the next cell.
each_teams_games = (
    target_df[target_df['season'] == CURRENT_SEASON]
    .groupby(['season','posteam'], as_index=False)
    .aggregate({'game_id': 'nunique'})
    .rename(columns={'posteam': 'team', 'game_id': 'n_games'})
)
# Each placeholder row carries its team's current-season game count as n_games.
this_season_fake = rosters[['player_id','team']].merge(each_teams_games, on=['team'])
this_season_fake['season'] = CURRENT_SEASON
this_season_fake['targets'] = 0
this_season_fake['is_redzone'] = 0

In [None]:
# Projected target market share and red-zone target rate per rostered player.

# Multiplier applied to the projected share of players with no targets this season.
CURRENT_SEASON_ZERO_MULT = 0.5
# Scales the exponent used when re-shaping shares within a team (see dpns below).
DISPERSION_COEF = 1.0
# Pseudo-targets at the overall red-zone rate used to shrink each player's rate.
PRIOR_TARGETS = 20

# Total targets per (season, team): the market-share denominator.
team_tgt_sums = (
    target_df
    .groupby(['season','posteam'], as_index=False)
    .aggregate({'play_id': 'count'})
    .rename(columns={'play_id': 'team_targets', 'posteam': 'team'})
)

# NOTE: this adds a column to target_df in place.
target_df['is_redzone'] = (target_df['yardline_100'] <= 20).astype(int)
prior_rz = target_df['is_redzone'].mean()
# Per (season, team, player): red-zone targets, total targets and games with a target.
prev_target_counts = (
    target_df
    .groupby(['season','posteam','receiver_id'], as_index=False)
    .aggregate({'is_redzone': 'sum', 'play_id': 'count', 'game_id': 'nunique'})
    .rename(columns={
        'play_id': 'targets',
        'posteam': 'team',
        'receiver_id': 'player_id',
        'game_id': 'n_games',
    })
)

# Real rows come first, so keep='first' only retains a zero-target placeholder for
# players that have no real row for that (season, team).
adj_target_counts = (
    pd.concat([prev_target_counts, this_season_fake])
    .drop_duplicates(subset=['season','team','player_id'], keep='first')
)

ts_groups = adj_target_counts.merge(team_tgt_sums, on=['season','team'])
ts_groups['ms_targets'] = ts_groups['targets'] / ts_groups['team_targets']
# Red-zone rate shrunk toward prior_rz; placeholder rows evaluate to exactly prior_rz.
ts_groups['prob_rz_given_target'] = (prior_rz * PRIOR_TARGETS + ts_groups['is_redzone']) / (ts_groups['targets'] + PRIOR_TARGETS)
# Weight = games times exp(1.2 * (season - CURRENT_SEASON)); seasons more than 4 years old get 0.
ts_groups['season_weight'] = ts_groups['n_games'] * ts_groups['season'].apply(season_weight_decay(1.2, 4))
ts_groups['weighted_ms_targets'] = ts_groups['ms_targets'] * ts_groups['season_weight']
ts_groups['weighted_prob_rz_given_target'] = ts_groups['prob_rz_given_target'] * ts_groups['season_weight']

# Players whose current-season share is exactly zero.
zero_this_year = ts_groups[(ts_groups['season'] == CURRENT_SEASON) & (ts_groups['ms_targets'] == 0)][['player_id']]
zero_this_year['current_season_zero'] = 1.0

# Sum the weighted values per player across seasons and teams; flag zero-this-year players.
proj_raw_shares = (
    ts_groups
    .groupby(['player_id',], as_index=False)
    .aggregate({
        'weighted_prob_rz_given_target': 'sum',
        'weighted_ms_targets': 'sum',
        'season_weight': 'sum',
    })
    .merge(zero_this_year, on=['player_id'], how='left')
    .fillna({'current_season_zero': 0.0})
)

# Weighted means; NaN if a player's summed season_weight is 0.
proj_raw_shares['ms_targets'] = proj_raw_shares['weighted_ms_targets'] / proj_raw_shares['season_weight']
proj_raw_shares['prob_rz_given_target'] = proj_raw_shares['weighted_prob_rz_given_target'] / proj_raw_shares['season_weight']
proj_raw_shares.loc[proj_raw_shares['current_season_zero'] == 1, 'ms_targets'] *= CURRENT_SEASON_ZERO_MULT
proj_raw_shares.drop(inplace=True, columns=[
    'weighted_prob_rz_given_target',
    'weighted_ms_targets',
    'season_weight',
])

# Assign shares to each player's current roster team and total the raw shares per team.
roster_shares = rosters[['player_id','team']].merge(proj_raw_shares, on=['player_id'], how='inner')
team_denoms_disp = roster_shares.groupby(['team'], as_index=False).aggregate({'ms_targets': 'sum'}).rename(columns={'ms_targets': 'team_ms_sum'})

# Raise each share to a power equal to DISPERSION_COEF times the team's summed raw share.
dpns = roster_shares.merge(team_denoms_disp, on=['team'], how='left')
dpns['ms_targets'] = dpns['ms_targets'] ** (DISPERSION_COEF * dpns['team_ms_sum'])

# Per-team totals of the transformed shares, used to renormalize below.
team_denoms = (
    dpns
    .groupby(['team'], as_index=False)
    .aggregate({'ms_targets': 'sum'})
    .rename(columns={'ms_targets': 'team_ms_sum'})
)

proj_normed_shares = (
    dpns
    .drop(columns=['current_season_zero','team_ms_sum'])
    .merge(team_denoms, on=['team'], how='left')
    .sort_values(['team', 'ms_targets'], ascending=[True, False])
)
# Normalize within each team; rounding afterwards means team sums may differ slightly from 1.
proj_normed_shares['ms_targets'] /= proj_normed_shares['team_ms_sum']
proj_normed_shares['ms_targets'] = proj_normed_shares['ms_targets'].round(decimals=3)
proj_normed_shares['prob_rz_given_target'] = proj_normed_shares['prob_rz_given_target'].round(decimals=3)
proj_normed_shares.drop(columns=['team_ms_sum'], inplace=True)
# proj_normed_shares.head()

In [None]:
# Pass-catcher baseline projections: recency-weighted averages of the
# per-season receiver skills, joined to rosters and projected target shares,
# then written to pass_catchers.csv.
preproj_wr_features = wr_features.copy()

# Three weight schemes: aDOT features weight targets with decay 1.0, per-target
# features weight targets with decay 0.8, and YAC over expected weights catches
# with decay 0.8.
preproj_wr_features['adot_weight'] = preproj_wr_features['targets'] * preproj_wr_features['season'].apply(season_weight_decay(1.0))
preproj_wr_features['catches_weight'] = preproj_wr_features['catches'] * preproj_wr_features['season'].apply(season_weight_decay(0.8))
preproj_wr_features['target_weight'] = preproj_wr_features['targets'] * preproj_wr_features['season'].apply(season_weight_decay(0.8))

# Weighted numerators; they are summed per player and divided by the summed weights below.
preproj_wr_features['xyac_wt'] = preproj_wr_features['target_weight'] * preproj_wr_features['xyac']
preproj_wr_features['yac_oe_wt'] = preproj_wr_features['catches_weight'] * preproj_wr_features['yac_oe']
preproj_wr_features['catch_oe_wt'] = preproj_wr_features['target_weight'] * preproj_wr_features['catch_oe']
preproj_wr_features['adot_wt'] = preproj_wr_features['adot_weight'] * preproj_wr_features['adot']
# Spread is averaged as a variance (std squared) and converted back with sqrt below.
preproj_wr_features['adot_var_wt'] = preproj_wr_features['adot_weight'] * preproj_wr_features['adot_std'] ** 2

wr_projections = (
    preproj_wr_features
    .groupby(['player_id'], as_index=False)
    .aggregate({
        'adot_weight': 'sum',
        'catches_weight': 'sum',
        'target_weight': 'sum',
        'xyac_wt': 'sum',
        'yac_oe_wt': 'sum',
        'catch_oe_wt': 'sum',
        'adot_wt': 'sum',
        'adot_var_wt': 'sum',
        'targets': 'sum',
        'catches': 'sum',
    })
)

wr_projections['xyac'] = (wr_projections['xyac_wt'] / wr_projections['target_weight']).round(decimals=2)
# A player with zero catches in every season has catches_weight == 0, so the
# ratio is 0/0 = NaN. Fill with the neutral 0.0 (the same value wr_features uses
# for missing yac_oe) so no blank value reaches the CSV.
wr_projections['yac_oe'] = (
    (wr_projections['yac_oe_wt'] / wr_projections['catches_weight'])
    .fillna(0.0)
    .round(decimals=2)
)
wr_projections['prob_catch_oe'] = (wr_projections['catch_oe_wt'] / wr_projections['target_weight']).round(decimals=2)
wr_projections['adot'] = (wr_projections['adot_wt'] / wr_projections['adot_weight']).round(decimals=1)
wr_projections['adot_std'] = (np.sqrt(wr_projections['adot_var_wt'] / wr_projections['adot_weight'])).round(decimals=1)

# Drop the helper columns. The rename is a no-op here because the frame is already
# keyed by 'player_id'; it is kept as a harmless safeguard.
wrsk_df = (
    wr_projections
    .rename(columns={'receiver_id': 'player_id'})
    .drop(columns=[
        'adot_weight','catches_weight','target_weight',
        'xyac_wt', 'yac_oe_wt','catch_oe_wt','adot_wt','adot_var_wt', 'catches'
    ])
)

# Inner merges restrict the output to rostered players that have a name record,
# a projected share on their roster team and skill projections.
write_wr_projs = (
    rosters[['player_id','team']]
    .merge(player_names, on=['player_id'], how='inner')
    .merge(proj_normed_shares, on=['player_id','team'], how='inner')
    .merge(wrsk_df, on=['player_id'], how='inner')
    .sort_values(['team', 'position'], ascending=True)
    [['player_id','team','name','position','ms_targets','prob_rz_given_target','prob_catch_oe','adot', 'adot_std', 'xyac', 'yac_oe', 'targets']]
)

write_wr_projs.to_csv(f'{BASELINES_PATH}/pass_catchers.csv', index=False)

In [None]:
# Sanity check: show the ten pass catchers with the largest projected target share.
write_wr_projs.sort_values('ms_targets', ascending=False).head(10)