In [1]:
import pandas as pd
import numpy as np
import nfl_data_py as nfl
import statsmodels.formula.api as smf
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import poisson

In [2]:
# Seasons 2016 through 2022 inclusive (range's upper bound is exclusive).
seasons = range(2016, 2023)

# Pull play-by-play for every season in the window (network download via nfl_data_py).
pbp_py = nfl.import_pbp_data(seasons)

2016 done.
2017 done.
2018 done.
2019 done.
2020 done.
2021 done.
2022 done.
Downcasting floats.


In [3]:
# use the standard library 're' module for text pattern matching (penalties in play descriptions).
import re
import pandas as pd
import numpy as np

# penalty regex (compiled, case-insensitive)
# we precompile a regex that matches *defensive* penalties that can manufacture an incompletion:
# - Defensive Pass Interference (DPI)
# - Illegal Contact
# - Defensive Holding

# for official pass attempts, if an incompletion only exists because of an ACCEPTED defensive foul
# like these, we drop that attempt (it’s not a "real" incomplete pass outcome).
# Compiled once at import time; matching ignores letter case so "Defensive Holding",
# "DEFENSIVE HOLDING", etc. are all recognised, and \s+ tolerates irregular spacing.
RE_DPI_LIKE = re.compile(
    r"(defensive\s+pass\s+interference|illegal\s+contact|defensive\s+holding)",
    flags=re.IGNORECASE,
)

# make sure a boolean "flag" column exists as 0/1 integers.
# many football feeds have missing columns or NaNs—this normalizes them to consistent 0/1 ints.
def _ensure_flag(df: pd.DataFrame, col: str) -> None:
    if col not in df.columns:
        df[col] = 0
    df[col] = df[col].fillna(0).astype(int)

# compute PFR/ESPN-style pass attempts attributable to a specific QB on a specific team
# in a given (season, week). Optionally return a details DataFrame of the plays counted.
def count_qb_pass_attempts_pfr(df: pd.DataFrame,
                               season: int,
                               week: int,
                               qb_name: str,
                               team: str,
                               return_details: bool = False):
    """
    PFR/ESPN-style official pass attempts for (season, week, qb_name, team).

    Parameters
    ----------
    df : pd.DataFrame
        Play-by-play rows. The columns 'season', 'week', 'game_id', 'play_id',
        'desc', 'play_type', 'posteam' and 'passer' are read directly and must
        exist; any missing 0/1 flag column is treated as all zeros.
        ``df`` itself is never modified (all work happens on a filtered copy).
    season, week : int
        The single (season, week) slice to evaluate.
    qb_name : str
        Passer label exactly as it appears in the 'passer' column.
    team : str
        Possession-team code exactly as it appears in the 'posteam' column.
    return_details : bool, default False
        When True, also return the plays that were counted.

    Returns
    -------
    int
        Number of counted attempts, when ``return_details`` is False.
    (int, pd.DataFrame)
        The count plus one row per counted play (with a 'bucket' label),
        sorted by (game_id, play_id), when ``return_details`` is True.

    Drop:
      - two_point_attempt == 1
      - play_type == 'no_play'
      - text contains 'no play'
      - offsetting penalties
      - accepted DPI / illegal contact / defensive holding ONLY when the pass is INCOMPLETE

    Keep:
      - all other accepted penalties (offense or defense), i.e., the attempt still counts.

    Team 'gold' attempts = UNION of:
      - (pass_attempt == 1 AND sack == 0)
      - OR (complete OR incomplete OR interception OR spike OR intentional_grounding OR throwaway)

    QB attribution:
      - (passer == qb_name)
      - OR (qb_spike == 1 AND posteam == team AND passer is missing/blank)

    Final = INTERSECTION(team gold, QB attribution) on (game_id, play_id).
    """

    # scope down to the requested (season, week); .copy() keeps the caller's frame untouched.
    base = df[(df['season'] == season) & (df['week'] == week)].copy()

    # normalize every flag column we reference to clean 0/1 integers.
    # missing columns are created as zeros, so the lookups below cannot raise KeyError.
    flag_cols = [
        'pass_attempt', 'two_point_attempt', 'penalty', 'complete_pass', 'incomplete_pass',
        'interception', 'qb_spike', 'sack', 'intentional_grounding', 'qb_throwaway', 'throwaway'
    ]
    for c in flag_cols:
        _ensure_flag(base, c)

    # some feeds use both 'qb_throwaway' and 'throwaway'; fold them into 'qb_throwaway'.
    # both columns are guaranteed to exist after the normalization loop above.
    base['qb_throwaway'] = ((base['qb_throwaway'] == 1) | (base['throwaway'] == 1)).astype(int)

    # readable aliases for the outcomes and text used in the boolean logic below.
    desc = base['desc'].fillna("")
    complete = base['complete_pass'] == 1
    incomp   = base['incomplete_pass'] == 1
    picked   = base['interception'] == 1
    spike    = base['qb_spike'] == 1
    ground   = base['intentional_grounding'] == 1
    throwwy  = base['qb_throwaway'] == 1
    on_team  = base['posteam'] == team

    # penalty/text guards.
    # accepted vs declined is inferred from the description text; this is a whole-description
    # check, so a play carrying one declined and one accepted foul is treated as declined.
    is_pen       = base['penalty'] == 1
    is_declined  = desc.str.contains(r"\bdeclined\b", case=False, na=False)
    offsetting   = desc.str.contains(r"\boffsetting\b", case=False, na=False)
    no_play_text = desc.str.contains(r"\bno play\b", case=False, na=False)

    # accepted DPI-like defensive fouls are dropped ONLY when the recorded outcome is an
    # incompletion (a "manufactured" incompletion should not count as an attempt).
    # .astype(bool) keeps the mask boolean even when the week slice is empty.
    dpi_like = desc.map(lambda text: RE_DPI_LIKE.search(text) is not None).astype(bool)
    drop_dpi_incomp = is_pen & (~is_declined) & dpi_like & incomp

    # global guard: plays that can never be official pass attempts.
    #  - two-point tries
    #  - any flavor of "no play" (field or text)
    #  - offsetting penalties
    #  - accepted DPI-like fouls on an incompletion
    # all other accepted penalties are kept on purpose.
    guards = (
        (base['two_point_attempt'] != 1) &
        (base['play_type'] != 'no_play') &
        (~no_play_text) &
        (~offsetting) &
        (~drop_dpi_incomp)
    )

    # team-level attempt opportunity (team_gold): either
    # (A) the feed flags a pass attempt and it was not a sack, or
    # (B) the outcome itself implies a pass (completion, incompletion, interception,
    #     spike, intentional grounding, or throwaway).
    outcome_attempt = (complete | incomp | picked | spike | ground | throwwy)
    feed_attempt_no_sack = (base['pass_attempt'] == 1) & (base['sack'] == 0)

    team_gold = base.loc[
        guards & (feed_attempt_no_sack | outcome_attempt) & on_team,
        ['game_id', 'play_id']
    ].drop_duplicates()

    # QB attribution. a play belongs to the QB if:
    #  - 'passer' matches the QB name, OR
    #  - it is a spike by the team whose 'passer' was left empty by the feed.
    # the spike fallback is restricted to rows with no passer so that a spike credited to
    # a different quarterback on the same team is not counted for qb_name.
    passer = base['passer']
    passer_missing = passer.isna() | (passer == "")
    qb_attr = base.loc[
        guards & ((passer == qb_name) | (spike & on_team & passer_missing)),
        ['game_id', 'play_id']
    ].drop_duplicates()

    # final counted attempts = plays that are both a valid team attempt and attributable
    # to this QB. both sides are de-duplicated, so each play appears at most once.
    final_ids = team_gold.merge(qb_attr, on=['game_id', 'play_id'], how='inner')

    if not return_details:
        return len(final_ids)

    # details frame for inspection: bring back all columns for the counted plays.
    det = base.merge(final_ids, on=['game_id', 'play_id'])

    # human-readable bucket for the attempt cause/outcome.
    # order matters in np.select: the first True condition wins.
    det['bucket'] = np.select(
        [
            det['qb_spike'] == 1,
            det['complete_pass'] == 1,
            det['incomplete_pass'] == 1,
            det['interception'] == 1,
            det['intentional_grounding'] == 1,
            det['qb_throwaway'] == 1,
            (det['pass_attempt'] == 1) & (det['sack'] == 0)
        ],
        [
            'spike',
            'complete',
            'incomplete',
            'interception',
            'intentional_grounding',
            'throwaway',
            'feed_attempt_no_sack'
        ],
        default='other'  # safety bucket in case of unexpected combos
    )

    # keep only the review columns that are actually present, ordered by play.
    cols = [
        'game_id', 'play_id', 'qtr', 'time', 'posteam', 'defteam', 'desc', 'passer', 'bucket',
        'complete_pass', 'incomplete_pass', 'interception', 'intentional_grounding', 'qb_throwaway',
        'qb_spike', 'sack', 'penalty', 'play_type', 'two_point_attempt', 'pass_attempt'
    ]
    det = det[[c for c in cols if c in det.columns]].sort_values(['game_id', 'play_id'])

    # return both the count and the detailed table for auditing
    return len(final_ids), det

# optional: tiny debug helper to spotlight which plays are counted (and, by comparison, which are excluded).
# re-runs the core function with return_details=True and brings back the count and details.
# you can compare det (what was counted) against base week data (what could have been)
# to understand why some plays are excluded by the guards.
def debug_mismatch(df, season, week, qb_name, team):
    """Return ``(count, details)`` for one QB-week to help audit the attempt logic.

    This is a thin wrapper around ``count_qb_pass_attempts_pfr`` called with
    ``return_details=True``, so the counting rules stay single-sourced.

    Parameters are forwarded unchanged: ``df`` is the play-by-play frame and
    ``season``, ``week``, ``qb_name``, ``team`` select the QB-week to inspect.

    Returns
    -------
    (int, pd.DataFrame)
        The attempt count and the frame of plays that were counted.

    To see why a play did NOT qualify, anti-join the week's plays against the
    returned details on (game_id, play_id) and look for:
      - two-point attempts
      - sacks with no pass attempt flag
      - offsetting penalties / no play
      - accepted DPI-like penalties on incompletions (manufactured outcomes)
    """
    # no extra copies of the week's data are built here; the caller can slice df
    # directly if a comparison against the uncounted plays is needed.
    cnt, det = count_qb_pass_attempts_pfr(df, season, week, qb_name, team, return_details=True)
    return cnt, det

In [4]:
# Spot check 1 -- expected result: 42
print(
    "Brees 2016 Wk1:",
    count_qb_pass_attempts_pfr(pbp_py, season=2016, week=1, qb_name="D.Brees", team="NO"),
)
# Spot check 2 -- expected result: 31
print(
    "Allen 2022 Wk1:",
    count_qb_pass_attempts_pfr(pbp_py, season=2022, week=1, qb_name="J.Allen", team="BUF"),
)

Brees 2016 Wk1: 42
Allen 2022 Wk1: 31
