In [1]:
# =========================================
# FULL NFL 4TH DOWN MODEL + BROWNS WEEK 1 (2025) ANALYSIS
# =========================================

import os
import numpy as np
import pandas as pd
import nfl_data_py as nfl
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import brier_score_loss, roc_auc_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

# -----------------------------
# CONFIG / CACHING
# -----------------------------
CACHE_DIR = "nfl_cache"
os.makedirs(CACHE_DIR, exist_ok=True)
TRAIN_YEARS = list(range(2018, 2025))  # 2018–2024 inclusive
PBP_CACHE = os.path.join(CACHE_DIR, "pbp_2018_2024.pkl")
PBP25_CACHE = os.path.join(CACHE_DIR, "pbp_2025.pkl")

# -----------------------------
# LOAD TRAINING DATA (2018–2024)
# -----------------------------
if os.path.exists(PBP_CACHE):
    pbp = pd.read_pickle(PBP_CACHE)
    print(f"Loaded {len(pbp):,} plays (2018–2024) from cache.")
else:
    print("Downloading 2018–2024 PBP…")
    pbp = nfl.import_pbp_data(years=TRAIN_YEARS, cache=False)
    pbp.to_pickle(PBP_CACHE)
    print(f"Saved 2018–2024 cache -> {PBP_CACHE} ({len(pbp):,} rows)")

# -----------------------------
# FEATURE SETS (DEF, TENDENCIES, KICKERS)
# -----------------------------
print("\nBuilding feature sets…")

# Defensive metrics
def_epa = pbp[pbp['defteam'].notna()].groupby('defteam')['epa'].mean()

fd = pbp[(pbp['down'] == 4) & pbp['defteam'].notna()].copy()
fd['stopped'] = (fd['fourth_down_failed'] == 1).astype(int)
def_4th_stop = fd.groupby('defteam')['stopped'].mean()

rz = pbp[(pbp['yardline_100'] <= 20) & pbp['defteam'].notna()].copy()
def_rz_td = rz.groupby('defteam')['touchdown'].mean()

td = pbp[(pbp['down'] == 3) & pbp['defteam'].notna()].copy()
td['stopped'] = (td['third_down_failed'] == 1).astype(int)
def_3rd_stop = td.groupby('defteam')['stopped'].mean()

# Team tendencies
t4 = pbp[
    (pbp['down'] == 4) &
    (pbp['ydstogo'] <= 5) &
    (pbp['yardline_100'] <= 40) &
    (pbp['play_type'].notna())
].copy()
t4['decision'] = 'other'
t4.loc[t4['play_type'].isin(['run', 'pass']), 'decision'] = 'go'
t4.loc[t4['play_type'] == 'field_goal', 'decision'] = 'fg'
t4.loc[t4['play_type'] == 'punt', 'decision'] = 'punt'

team_rates = t4.groupby(['posteam', 'decision']).size().unstack(fill_value=0)
team_rates['total'] = team_rates.sum(axis=1)
team_rates['go_rate'] = team_rates.get('go', 0) / team_rates['total']
team_rates['fg_preference'] = team_rates.get('fg', 0) / (team_rates.get('fg', 0) + team_rates.get('go', 0) + 0.001)
team_tendencies = team_rates[['go_rate', 'fg_preference']]

# Offense strength (EPA by offense)
team_off = pbp.groupby('posteam')['epa'].mean()

# Kicker profiles
fg_data = pbp[pbp['field_goal_attempt'] == 1].copy()
fg_data['made'] = (fg_data['field_goal_result'] == 'made').astype(int)
fg_data['dist_bin'] = pd.cut(
    fg_data['kick_distance'],
    bins=[0, 30, 40, 50, 60, 100],
    labels=['<30', '30-39', '40-49', '50-59', '60+']
)

kicker_by_dist = fg_data.groupby(['kicker_player_name', 'dist_bin'])['made'].agg(['mean', 'count'])
kicker_overall = fg_data.groupby('kicker_player_name')['made'].mean()

# 'game_date' exists in nflfastR; if missing, .rolling() will yield NaN and we fallback later.
fg_data = fg_data.sort_values(['kicker_player_name', 'game_date'])
fg_data['recent_form'] = fg_data.groupby('kicker_player_name')['made'].transform(
    lambda x: x.rolling(20, min_periods=5).mean().shift(1)
)
kicker_recent = fg_data.groupby('kicker_player_name')['recent_form'].last()

def get_kicker_features(name, dist):
    """Return (distance_bin_accuracy, overall_acc, recent_form) with sensible fallbacks."""
    bin_lbl = ('<30' if dist < 30 else
               '30-39' if dist < 40 else
               '40-49' if dist < 50 else
               '50-59' if dist < 60 else '60+')
    dist_acc = 0.85
    if (name, bin_lbl) in kicker_by_dist.index:
        stats = kicker_by_dist.loc[(name, bin_lbl)]
        if stats['count'] >= 3:
            dist_acc = float(stats['mean'])
    return dist_acc, float(kicker_overall.get(name, 0.85)), float(kicker_recent.get(name, 0.85))

# -----------------------------
# GAME CONTEXT HELPERS
# -----------------------------
def categorize_game_state(row):
    sd = (row.get('score_differential', 0) or 0)
    q = row.get('qtr', 1)
    t = row.get('game_seconds_remaining', 3600)
    if q == 4 and t < 120:
        if abs(sd) <= 3: return 'clutch_close'
        if sd < -8:     return 'desperation'
        if sd > 8:      return 'clock_kill'
    if abs(sd) <= 7:    return 'competitive'
    if sd > 14:         return 'comfortable_lead'
    if sd < -14:        return 'big_deficit'
    return 'moderate_diff'

def add_context(df):
    df = df.copy()
    df['game_state']  = df.apply(categorize_game_state, axis=1)
    df['two_minute']  = ((df['qtr'].isin([2, 4])) & (df['quarter_seconds_remaining'] < 120)).astype(int)
    df['overtime']    = (df['qtr'] > 4).astype(int)
    return df

def _score_bucket(sd):
    if pd.isna(sd): return "close"
    if sd < -7: return "down_td"
    if sd < -3: return "down_fg"
    if sd <= 3: return "close"
    if sd <= 7: return "up_fg"
    return "up_td"

# -----------------------------
# TRAIN ENHANCED FG MODEL
# -----------------------------
print("\nTraining FG model…")
fg = pbp[pbp.get('field_goal_attempt', 0) == 1].copy()
fg['made'] = (fg['field_goal_result'] == 'made').astype(int)

surf = fg['surface'].fillna('').str.lower()
fg['surface_type'] = np.where(surf.str.contains('turf', case=False, na=False), 'turf', 'grass')
fg['indoor'] = fg['roof'].fillna('').str.lower().isin(['dome', 'closed']).astype(int)
fg['home_game'] = (fg['posteam'] == fg['home_team']).astype(int)
fg['time_pressure'] = ((fg['qtr'] >= 4) & (fg['quarter_seconds_remaining'] < 300)).astype(int)
fg['desperation_time'] = ((fg['qtr'] >= 4) & (fg['quarter_seconds_remaining'] < 120)).astype(int)

fg['distance_cat'] = pd.cut(
    fg['kick_distance'].fillna(40),
    bins=[0, 35, 45, 55, 100],
    labels=['short', 'medium', 'long', 'very_long']
)
fg['score_situation'] = pd.cut(
    fg['score_differential'].fillna(0),
    bins=[-100, -7, -3, 3, 7, 100],
    labels=['down_td', 'down_fg', 'close', 'up_fg', 'up_td']
)

fg = add_context(fg)
fg['opp_def_epa']       = fg['defteam'].map(def_epa).fillna(0)
fg['opp_4th_stop_rate'] = fg['defteam'].map(def_4th_stop).fillna(0.75)

kf = fg.apply(lambda x: get_kicker_features(x.get('kicker_player_name', ''), x.get('kick_distance', 40)), axis=1, result_type='expand')
fg[['kicker_dist_acc', 'kicker_overall_acc', 'kicker_recent_form']] = kf
fg['team_fg_preference'] = fg['posteam'].map(team_tendencies['fg_preference']).fillna(0.70)

FG_FEATURES = [
    "kick_distance", "indoor", "surface_type", "home_game",
    "time_pressure", "desperation_time", "score_situation", "distance_cat",
    "kicker_dist_acc", "kicker_overall_acc", "kicker_recent_form",
    "opp_def_epa", "opp_4th_stop_rate",
    "game_state", "two_minute", "overtime",
    "team_fg_preference"
]
fg_X = fg[FG_FEATURES].copy()
fg_y = fg["made"].astype(int)
fg_num = ["kick_distance","kicker_dist_acc","kicker_overall_acc","kicker_recent_form","opp_def_epa","opp_4th_stop_rate","team_fg_preference"]
fg_cat = [c for c in FG_FEATURES if c not in fg_num]

fg_ct = ColumnTransformer(transformers=[
    ("num", Pipeline([("imp", SimpleImputer(strategy="median")), ("sc", StandardScaler())]), fg_num),
    ("cat", Pipeline([("imp", SimpleImputer(strategy="most_frequent")), ("ohe", OneHotEncoder(handle_unknown="ignore"))]), fg_cat),
])
fg_model = Pipeline(steps=[
    ("prep", fg_ct),
    ("clf", RandomForestClassifier(n_estimators=500, max_depth=15, min_samples_leaf=5, random_state=42))
]).fit(fg_X, fg_y)

fg_brier = brier_score_loss(fg_y, fg_model.predict_proba(fg_X)[:, 1])
fg_auc   = roc_auc_score(fg_y, fg_model.predict_proba(fg_X)[:, 1])
print(f"FG model: Brier={fg_brier:.3f}, AUC={fg_auc:.3f}")

# -----------------------------
# TRAIN ENHANCED GO MODEL
# -----------------------------
print("Training Go-for-it model…")
go = pbp[(pbp["down"] == 4) & pbp["ydstogo"].notna() & pbp["yardline_100"].notna()].copy()
go["converted"] = ((go["first_down"] == 1) | (go["touchdown"] == 1)).astype(int)

go["home_game"] = (go["posteam"] == go["home_team"]).astype(int)
go["time_pressure"] = ((go["qtr"] >= 4) & (go["quarter_seconds_remaining"] < 300)).astype(int)
go["desperation"] = (go["score_differential"].fillna(0) < -7).astype(int)
go["short_yardage"] = (go["ydstogo"] <= 2).astype(int)
go["goal_line"] = (go["yardline_100"] <= 5).astype(int)
go["off_strength"] = go["posteam"].map(team_off).fillna(0)

go = add_context(go)
go["opp_def_epa"]       = go["defteam"].map(def_epa).fillna(0)
go["opp_4th_stop_rate"] = go["defteam"].map(def_4th_stop).fillna(0.75)
go["opp_rz_td_allowed"] = go["defteam"].map(def_rz_td).fillna(0.30)
go["opp_3rd_stop_rate"] = go["defteam"].map(def_3rd_stop).fillna(0.65)
go["team_go_tendency"]  = go["posteam"].map(team_tendencies["go_rate"]).fillna(0.15)

GO_FEATURES = [
    "ydstogo", "yardline_100", "score_differential", "qtr", "quarter_seconds_remaining",
    "short_yardage", "goal_line", "home_game", "time_pressure", "desperation", "off_strength",
    "opp_def_epa", "opp_4th_stop_rate", "opp_rz_td_allowed", "opp_3rd_stop_rate",
    "game_state", "two_minute", "overtime",
    "team_go_tendency"
]
go_X = go[GO_FEATURES].copy()
go_y = go["converted"].astype(int)
go_num = ["ydstogo","yardline_100","score_differential","quarter_seconds_remaining","off_strength",
          "opp_def_epa","opp_4th_stop_rate","opp_rz_td_allowed","opp_3rd_stop_rate","team_go_tendency"]
go_cat = [c for c in GO_FEATURES if c not in go_num]

go_ct = ColumnTransformer(transformers=[
    ("num", Pipeline([("imp", SimpleImputer(strategy="median")), ("sc", StandardScaler())]), go_num),
    ("cat", Pipeline([("imp", SimpleImputer(strategy="most_frequent")), ("ohe", OneHotEncoder(handle_unknown="ignore"))]), go_cat),
])
go_model = Pipeline(steps=[
    ("prep", go_ct),
    ("clf", RandomForestClassifier(n_estimators=500, max_depth=12, min_samples_leaf=5, random_state=42))
]).fit(go_X, go_y)

go_brier = brier_score_loss(go_y, go_model.predict_proba(go_X)[:, 1])
go_auc   = roc_auc_score(go_y, go_model.predict_proba(go_X)[:, 1])
print(f"Go model: Brier={go_brier:.3f}, AUC={go_auc:.3f}")

# -----------------------------
# RECOMMENDER
# -----------------------------
TD_POINTS = 6.94  # expected points for a successful go (drive ends in TD-ish)

def recommend_enhanced(play_row):
    """Return (choice, p_fg, p_go, ep_fg, ep_go) for a single PBP row."""
    yl  = float(play_row.get("yardline_100", np.nan))
    ytg = float(play_row.get("ydstogo", np.nan))
    if np.isnan(yl) or np.isnan(ytg):
        raise ValueError("yardline_100 and ydstogo are required")

    kd = yl + 17
    kname = play_row.get("kicker_player_name", "")
    if not kname and play_row.get("posteam") == "CLE":
        kname = "A.Szmyt"  # fallback
    dist_acc, overall_acc, recent_form = get_kicker_features(kname, kd)

    posteam = play_row.get("posteam", "CLE")
    defteam = play_row.get("defteam", "")

    fg_dict = {
        "kick_distance": kd,
        "indoor": int(str(play_row.get("roof", "")).lower() in ["dome", "closed"]),
        "surface_type": "turf" if "turf" in str(play_row.get("surface", "")).lower() else "grass",
        "home_game": int(play_row.get("posteam") == play_row.get("home_team")),
        "time_pressure": int((play_row.get("qtr", 1) >= 4) and (play_row.get("quarter_seconds_remaining", 900) < 300)),
        "desperation_time": int((play_row.get("qtr", 1) >= 4) and (play_row.get("quarter_seconds_remaining", 900) < 120)),
        "score_situation": _score_bucket(play_row.get("score_differential", 0)),
        "distance_cat": ("short" if kd <= 35 else "medium" if kd <= 45 else "long" if kd <= 55 else "very_long"),
        "kicker_dist_acc": dist_acc,
        "kicker_overall_acc": overall_acc,
        "kicker_recent_form": recent_form,
        "opp_def_epa": def_epa.get(defteam, 0),
        "opp_4th_stop_rate": def_4th_stop.get(defteam, 0.75),
        "game_state": categorize_game_state(play_row),
        "two_minute": int((play_row.get("qtr", 1) in [2, 4]) and (play_row.get("quarter_seconds_remaining", 900) < 120)),
        "overtime": int(play_row.get("qtr", 1) > 4),
        "team_fg_preference": team_tendencies.loc[posteam, 'fg_preference'] if posteam in team_tendencies.index else 0.70
    }

    go_dict = {
        "ydstogo": ytg,
        "yardline_100": yl,
        "score_differential": float(play_row.get("score_differential", 0) or 0),
        "qtr": float(play_row.get("qtr", 1) or 1),
        "quarter_seconds_remaining": float(play_row.get("quarter_seconds_remaining", 900) or 900),
        "short_yardage": int(ytg <= 2),
        "goal_line": int(yl <= 5),
        "home_game": int(play_row.get("posteam") == play_row.get("home_team")),
        "time_pressure": int((play_row.get("qtr", 1) >= 4) and (play_row.get("quarter_seconds_remaining", 900) < 300)),
        "desperation": int(float(play_row.get("score_differential", 0) or 0) < -7),
        "off_strength": team_off.get(posteam, 0.0),
        "opp_def_epa": def_epa.get(defteam, 0),
        "opp_4th_stop_rate": def_4th_stop.get(defteam, 0.75),
        "opp_rz_td_allowed": def_rz_td.get(defteam, 0.30),
        "opp_3rd_stop_rate": def_3rd_stop.get(defteam, 0.65),
        "game_state": categorize_game_state(play_row),
        "two_minute": int((play_row.get("qtr", 1) in [2, 4]) and (play_row.get("quarter_seconds_remaining", 900) < 120)),
        "overtime": int(play_row.get("qtr", 1) > 4),
        "team_go_tendency": team_tendencies.loc[posteam, 'go_rate'] if posteam in team_tendencies.index else 0.15
    }

    p_fg = float(fg_model.predict_proba(pd.DataFrame([fg_dict]))[0, 1])
    p_go = float(go_model.predict_proba(pd.DataFrame([go_dict]))[0, 1])
    ep_fg, ep_go = 3.0 * p_fg, TD_POINTS * p_go
    choice = "Kick FG" if ep_fg >= ep_go else "Go for it"
    return choice, p_fg, p_go, ep_fg, ep_go

# -----------------------------
# LOAD 2025 & RUN WEEK 1 (Browns)
# -----------------------------
if os.path.exists(PBP25_CACHE):
    pbp_2025 = pd.read_pickle(PBP25_CACHE)
    print("\nLoaded 2025 PBP from cache.")
else:
    print("\nDownloading 2025 PBP…")
    pbp_2025 = nfl.import_pbp_data(years=[2025], cache=False)
    pbp_2025.to_pickle(PBP25_CACHE)
    print(f"Saved 2025 cache -> {PBP25_CACHE} ({len(pbp_2025):,} rows)")

wk1 = pbp_2025[(pbp_2025['posteam'] == 'CLE') & (pbp_2025['week'] == 1)].copy()

# Opponent (for header)
opp = None
if len(wk1) > 0:
    home = wk1['home_team'].iloc[0]
    away = wk1['away_team'].iloc[0]
    opp = away if home == 'CLE' else home

wk1 = wk1[(wk1['down'] == 4) & wk1['yardline_100'].notna()].copy()
wk1['kick_distance'] = wk1['yardline_100'] + 17
wk1 = wk1[(wk1['kick_distance'] <= 60) & (wk1['play_type'] != 'punt')].copy()

rows = []
for _, p in wk1.iterrows():
    choice, p_fg, p_go, ep_fg, ep_go = recommend_enhanced(p)
    actual = "Kick FG" if p.get("field_goal_attempt", 0) == 1 else "Go for it"
    pts = 3 if p.get("field_goal_result") == "made" else (7 if ((p.get("first_down", 0) == 1) or (p.get("touchdown", 0) == 1)) else 0)

    qsr = int(p.get('quarter_seconds_remaining', 0) or 0)
    mm, ss = qsr // 60, qsr % 60

    rows.append({
        "qtr": int(p.get("qtr", 0) or 0),
        "time": f"{mm}:{ss:02d}",
        "score_diff": int(p.get("score_differential", 0) or 0),
        "ytg": int(p.get("ydstogo", 0) or 0),
        "yd_line": int(p.get("yardline_100", 0) or 0),
        "kick_dist": int(p.get("kick_distance", 0) or 0),
        "model_rec": choice,
        "actual": actual,
        "P(FG)": p_fg,
        "P(Go)": p_go,
        "EP_FG": ep_fg,
        "EP_Go": ep_go,
        "actual_pts": int(pts)
    })

results_df = pd.DataFrame(rows)

# -----------------------------
# OUTPUT
# -----------------------------
print("\n" + "-"*72)
print(f"Browns — Week 1, 2025 vs {opp or 'Unknown Opponent'}")
print("-"*72)

if len(results_df) == 0:
    print("No eligible 4th-down plays within the ≤60-yard FG window.")
else:
    # Compact per-decision lines
    for i, r in results_df.iterrows():
        print(
            f"Decision {i+1}: "
            f"Q{int(r['qtr'])} {r['time']} | "
            f"4th & {r['ytg']} at {r['yd_line']} (FG {r['kick_dist']} yds) | "
            f"Model: {r['model_rec']} | Browns: {r['actual']} | "
            f"P(FG) {r['P(FG)']:.1%}→{r['EP_FG']:.2f} EP, "
            f"P(Go) {r['P(Go)']:.1%}→{r['EP_Go']:.2f} EP | "
            f"Pts: {r['actual_pts']}"
        )

    # Summary
    agree = (results_df["model_rec"] == results_df["actual"]).mean()
    optimal_ep = results_df[["EP_FG", "EP_Go"]].max(axis=1).mean()
    actual_avg = results_df["actual_pts"].mean()

    print("\nSummary")
    print("-------")
    print(f"Agreement: {agree:.1%}")
    print(f"Model EP (avg best of FG/Go): {optimal_ep:.2f}")
    print(f"Actual points (avg):         {actual_avg:.2f}")
    print(f"Points left on field:        {optimal_ep - actual_avg:.2f}")

    # Clean table for copy/paste
    table = results_df.copy()
    table["P(FG)"] = (table["P(FG)"] * 100).round(1)
    table["P(Go)"] = (table["P(Go)"] * 100).round(1)
    table["EP_FG"] = table["EP_FG"].round(2)
    table["EP_Go"] = table["EP_Go"].round(2)

    print("\nPlay Summary (copy-friendly)")
    print("----------------------------")
    print(
        table[["qtr","time","ytg","yd_line","kick_dist","model_rec","actual","actual_pts","EP_FG","EP_Go","P(FG)","P(Go)"]]
        .rename(columns={
            "yd_line": "Yardline", "kick_dist": "FG Dist",
            "model_rec": "Model", "actual": "Browns", "actual_pts": "Pts"
        })
        .to_string(index=False)
    )


Loaded 340,587 plays (2018–2024) from cache.

Building feature sets…

Training FG model…
FG model: Brier=0.080, AUC=0.943
Training Go-for-it model…
Go model: Brier=0.055, AUC=0.948

Loaded 2025 PBP from cache.

------------------------------------------------------------------------
Browns — Week 1, 2025 vs CIN
------------------------------------------------------------------------
Decision 1: Q2 12:44 | 4th & 1 at 1 (FG 18 yds) | Model: Go for it | Browns: Go for it | P(FG) 93.2%→2.79 EP, P(Go) 50.7%→3.52 EP | Pts: 7
Decision 2: Q2 0:27 | 4th & 5 at 27 (FG 44 yds) | Model: Kick FG | Browns: Kick FG | P(FG) 88.6%→2.66 EP, P(Go) 6.0%→0.42 EP | Pts: 3
Decision 3: Q4 14:16 | 4th & 5 at 39 (FG 56 yds) | Model: Kick FG | Browns: Go for it | P(FG) 67.1%→2.01 EP, P(Go) 7.6%→0.52 EP | Pts: 0
Decision 4: Q4 2:25 | 4th & 15 at 18 (FG 35 yds) | Model: Kick FG | Browns: Kick FG | P(FG) 87.1%→2.61 EP, P(Go) 4.0%→0.28 EP | Pts: 0

Summary
-------
Agreement: 75.0%
Model EP (avg best of FG/Go): 2.70
