In [1]:
# === Requirements ===
# pip install pandas numpy scikit-learn

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split

# -----------------------------
# Config
# -----------------------------
# Absolute path to the kick-attempt CSV; edit for your machine.
CSV_PATH = "/Users/shouryasoni/Desktop/AI Data Lab/nfl_kick_attempts(in).csv"
USE_ONLY_FIELD_GOALS = True  # exclude extra points from EPA calc

# Minimum rookie-season attempts a kicker needs to stay in the modeling set.
# Defined exactly once: an earlier duplicate (= 1) was silently overridden by
# this value. Lower it to 1 to disable the stability filter.
MIN_ROOKIE_ATTEMPTS = 20

# Distance bin edges in yards. Used with right-closed intervals, so the bins
# are [0, 19], (19, 29], (29, 39], (39, 49], (49, inf].
DISTANCE_BINS = [0, 19, 29, 39, 49, np.inf]

# League-average FG make% by bin (very rough priors; adjust if you have better rates)
BIN_FG_PROB = [0.99, 0.95, 0.87, 0.75, 0.60]
BIN_EP = [p * 3.0 for p in BIN_FG_PROB]  # Expected points = P(make) * 3

# One probability per bin is required; a mismatch would otherwise surface much
# later as an IndexError during the expected-points lookup.
if len(BIN_FG_PROB) != len(DISTANCE_BINS) - 1:
    raise ValueError(
        "BIN_FG_PROB must have exactly one entry per distance bin "
        f"({len(DISTANCE_BINS) - 1} expected, got {len(BIN_FG_PROB)})"
    )

# -----------------------------
# Load
# -----------------------------
# Read the raw kick-attempt table.
df = pd.read_csv(CSV_PATH)

# Basic hygiene
# Ensure lowercase standardized column names if needed (uncomment if your file differs)
# df.columns = [c.strip().lower() for c in df.columns]

# Keep only rows with a numeric distance (needed for EP by distance).
# The column is coerced once (unparseable values become NaN), those rows are
# dropped, and the copy is taken AFTER filtering so that later column
# assignments write to an independent frame rather than a filtered slice.
df["kick_distance"] = pd.to_numeric(df["kick_distance"], errors="coerce")
df = df.dropna(subset=["kick_distance"]).copy()
df["kick_distance"] = df["kick_distance"].astype(float)

# Optionally keep only field-goal attempts (exclude extra points)
# Assumes extra_point column is 1 for XP, 0 for not XP; adjust if needed
if USE_ONLY_FIELD_GOALS:
    if "extra_point" in df.columns:
        df = df[df["extra_point"] == 0].copy()
    else:
        # Without the flag column nothing can be filtered; say so instead of
        # silently scoring any extra points in the file as 3-point attempts.
        print(
            "WARNING: USE_ONLY_FIELD_GOALS is set but the data has no "
            "'extra_point' column; no rows were excluded."
        )

# -----------------------------
# Per-kick EPA
# -----------------------------
# Assign each kick the integer index of its distance bin (right-closed
# intervals, lowest edge included); distances outside every bin get NaN.
df["distance_bin"] = pd.cut(
    df["kick_distance"],
    bins=DISTANCE_BINS,
    labels=False,
    right=True,
    include_lowest=True,
)

# Look up the bin's expected points with one vectorised index operation.
# Kicks without a bin temporarily use index 0 and are then masked to NaN.
bin_ix = df["distance_bin"]
has_bin = bin_ix.notna()
ep_table = np.asarray(BIN_EP, dtype=float)
looked_up = ep_table[bin_ix.where(has_bin, 0).astype(int).to_numpy()]
df["expected_points"] = np.where(has_bin, looked_up, np.nan)

# Points actually scored: 3 for a made kick, otherwise 0.
# (Extra points would be worth 1 if they were ever included.)
df["actual_points"] = df["made"].astype(int).mul(3).astype(float)

# EPA is the gap between what was scored and what the bin prior expected.
df["epa"] = df["actual_points"].sub(df["expected_points"])

# -----------------------------
# Rookie-season identification
# -----------------------------
# Treat the earliest season in which a kicker has a row in `df` as the rookie
# season, attach it to every kick, and keep only the kicks from that season.
# NOTE(review): "earliest season present in the data" is only a proxy. A kicker
# whose career began before the first season covered by the file would be
# labelled a rookie in that first season -- confirm against real debut years.
first_season = (
    df.groupby("kicker_player_id", as_index=False)["season"]
    .min()
    .rename(columns={"season": "rookie_season"})
)

df = df.merge(first_season, how="left", on="kicker_player_id")
in_rookie_year = df["season"].eq(df["rookie_season"])
rookie_df = df.loc[in_rookie_year].copy()

# -----------------------------
# Rookie-year feature engineering
# -----------------------------
# Per-kick flags used for the situational rates below.
rookie_df["is_clutch"] = rookie_df["score_differential"].abs() <= 3  # within 3 points
rookie_df["is_50_plus"] = rookie_df["kick_distance"] >= 50


def _safe_rate(numerator, denominator):
    """Return numerator / denominator, with NaN wherever denominator is not > 0."""
    return numerator.div(denominator).where(denominator > 0)


# Materialise "made AND flag" as ordinary columns so every aggregate is a
# plain built-in reduction. This avoids lambdas that reach back into
# `rookie_df` by index label for every group (slow, and only correct while the
# index is unique). The helper frame leaves `rookie_df` itself unchanged.
_kicks = rookie_df.assign(
    long_made=rookie_df["made"] * rookie_df["is_50_plus"],
    clutch_made=rookie_df["made"] * rookie_df["is_clutch"],
)

# One row per (kicker id, kicker name, rookie season).
agg = _kicks.groupby(["kicker_player_id", "kicker_player_name", "rookie_season"]).agg(
    attempts=("made", "size"),
    makes=("made", "sum"),
    avg_distance=("kick_distance", "mean"),
    long_attempts=("is_50_plus", "sum"),
    long_makes=("long_made", "sum"),
    clutch_attempts=("is_clutch", "sum"),
    clutch_makes=("clutch_made", "sum"),
    total_epa=("epa", "sum"),
)

# Rates (NaN when the kicker had no attempts of that kind)
agg["fg_pct"] = _safe_rate(agg["makes"], agg["attempts"])
agg["epa_per_kick"] = _safe_rate(agg["total_epa"], agg["attempts"])
agg["long_fg_pct"] = _safe_rate(agg["long_makes"], agg["long_attempts"])
agg["clutch_fg_pct"] = _safe_rate(agg["clutch_makes"], agg["clutch_attempts"])

rookies = agg.reset_index()

# Stability filter: keep rookies with at least MIN_ROOKIE_ATTEMPTS attempts
rookies = rookies[rookies["attempts"] >= MIN_ROOKIE_ATTEMPTS].copy()

# -----------------------------
# Define success label
# -----------------------------
# Label a rookie 1 when their EPA per kick is at or above the cohort median,
# else 0 (swap in total_epa for a volume-weighted definition).
# NOTE(review): this label is computed from the same rookie-season kicks as
# the model features (fg_pct, avg_distance, ...), so a classifier trained on
# them largely re-derives the label instead of predicting a later outcome --
# confirm that this is the intended target.
median_epa_per_kick = rookies["epa_per_kick"].median()
rookies["successful"] = rookies["epa_per_kick"].ge(median_epa_per_kick).astype(int)

# -----------------------------
# Modeling dataset
# -----------------------------
feature_cols = ["fg_pct", "avg_distance", "long_fg_pct", "clutch_fg_pct", "attempts"]

# Build the feature matrix with simple imputation: a missing long-range or
# clutch percentage falls back to the kicker's overall fg_pct, and anything
# still missing afterwards becomes 0.
X = (
    rookies[feature_cols]
    .assign(
        long_fg_pct=lambda f: f["long_fg_pct"].fillna(f["fg_pct"]),
        clutch_fg_pct=lambda f: f["clutch_fg_pct"].fillna(f["fg_pct"]),
    )
    .fillna(0.0)
)

y = rookies["successful"].astype(int)

# Hold out a quarter of the rookies, stratified on the label, for a quick
# check of the signal; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

# -----------------------------
# Logistic Regression
# -----------------------------
# Use liblinear for small datasets; increase C if you want less regularization.
# NOTE(review): the features are passed in unscaled, so the regularisation
# penalty acts unevenly on percentage-scale and count/yard-scale columns --
# consider standardising before interpreting the fit.
clf = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000)
clf.fit(X_train, y_train)

# Probability of the positive class on the held-out rows, thresholded at 0.5.
y_prob = clf.predict_proba(X_test)[:, 1]
y_pred = (y_prob >= 0.5).astype(int)

print("=== Classification Report (threshold=0.5) ===")
print(classification_report(y_test, y_pred, digits=3))

# AUC is undefined when the test labels contain a single class; report that
# case explicitly instead of silently swallowing every possible error.
try:
    auc = roc_auc_score(y_test, y_prob)
except ValueError as exc:
    print(f"AUC: unavailable ({exc})")
else:
    print("AUC:", auc)

# -----------------------------
# Coefficients (feature importance in log-odds space)
# -----------------------------
# Pair each feature name with its fitted weight and list the largest first.
# The intercept is not included in this table.
weights = clf.coef_.ravel()
coef_df = pd.DataFrame({"feature": feature_cols, "coef": weights})
coef_df = coef_df.sort_values(by="coef", ascending=False)

print("\n=== Logistic Regression Coefficients (log-odds) ===")
print(coef_df.to_string(index=False))

# -----------------------------
# Attach predicted success probability to each rookie (optional)
# -----------------------------
# Score every rookie in the filtered cohort. X contains the training rows as
# well as the held-out rows, so these probabilities are partly in-sample.
rookies_out = rookies.copy()
rookies_out["p_success"] = clf.predict_proba(X)[:, 1]

# Columns shown in both leaderboards.
display_cols = [
    "kicker_player_name",
    "rookie_season",
    "epa_per_kick",
    "fg_pct",
    "avg_distance",
    "long_fg_pct",
    "clutch_fg_pct",
    "attempts",
    "p_success",
    "successful",
]

# Print the ten highest and then the ten lowest predicted probabilities.
for heading, ascending in (
    ("\n=== Top 10 rookies by predicted success probability ===", False),
    ("\n=== Bottom 10 rookies by predicted success probability ===", True),
):
    print(heading)
    ranked = rookies_out.sort_values("p_success", ascending=ascending)
    print(ranked[display_cols].head(10).to_string(index=False))


=== Classification Report (threshold=0.5) ===
              precision    recall  f1-score   support

           0      0.778     0.538     0.636        13
           1      0.647     0.846     0.733        13

    accuracy                          0.692        26
   macro avg      0.712     0.692     0.685        26
weighted avg      0.712     0.692     0.685        26

AUC: 0.7218934911242604

=== Logistic Regression Coefficients (log-odds) ===
      feature      coef
       fg_pct  0.729196
clutch_fg_pct  0.709451
  long_fg_pct  0.513463
     attempts  0.078067
 avg_distance -0.070252

=== Top 10 rookies by predicted success probability ===
kicker_player_name  rookie_season  epa_per_kick   fg_pct  avg_distance  long_fg_pct  clutch_fg_pct  attempts  p_success  successful
            O.Mare           1999      0.076304 0.847826     36.869565          0.6       0.826087        46   0.808676           1
          H.Butker           2017      0.257857 0.904762     37.214286          0.8  