# %% [markdown]
# # NFL BDB 2026 – LGBM Inference-Only Submission (Using Saved Models)
#
# This notebook:
# - Loads pretrained LightGBM models from a `models.zip` dataset
# - Reuses the same feature engineering as the training script
# - Implements `predict(test, test_input)` for the Kaggle evaluation server
# - Optionally runs the local gateway to create `submission.csv` for debugging
#
# IMPORTANT:
# - Make sure you've attached **two** datasets in the "Data" tab:
#   1. `nfl-big-data-bowl-2026-prediction`  (competition data)
#   2. Your `models.zip` dataset with the saved LGBM models


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
# ======================== IMPORTS & CONFIG ========================
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl
from lightgbm import LGBMRegressor
import xgboost as xgb
import joblib

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Debug aid: print every file visible under /kaggle/input so the attached
# datasets (competition data + saved models) can be checked in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# ================== POST-PROCESSING CONSTANTS ==================
# MAX_STEP_ABS: bound applied by predict() to each predicted offset (dx and dy)
# from the last observed position (x_last, y_last), in yards.
# SMOOTHING_ALPHA: EMA weight of the current frame in predict(); the previous
# (already smoothed) frame gets weight (1 - SMOOTHING_ALPHA).
MAX_STEP_ABS = 20.0     # max absolute dx / dy offset from the last observed position
SMOOTHING_ALPHA = 0.8   # EMA: current frame weight; (1-alpha) for previous frame

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Path to competition data (also the directory that ships `kaggle_evaluation`)
import sys
DATA_DIR = "/kaggle/input/nfl-big-data-bowl-2026-prediction"

# Directory holding the saved model files read by load_lgbm_models()
# (lgbm_dx_*.pkl / lgbm_dy_*.pkl). Adjust the dataset slug / inner folder
# so this matches the models dataset attached to the notebook.
MODELS_DIR = "/kaggle/input/models2/models"

# Make Kaggle evaluation module importable
sys.path.append(DATA_DIR)
try:
    from kaggle_evaluation.nfl_inference_server import NFLInferenceServer
    HAS_EVAL_SERVER = True
except ModuleNotFoundError:
    # Fall back to a sentinel so later cells can test for availability
    # instead of failing on import.
    NFLInferenceServer = None
    HAS_EVAL_SERVER = False
    print(
        "WARNING: kaggle_evaluation not found. "
        "If you're running locally, this is expected."
    )

# Seed numpy and torch global RNGs.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)

# Only reported below; the LGBM inference path in this notebook does not use it.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print("=" * 70)
print("NFL BIG DATA BOWL 2026 - LGBM Inference-Only (Saved Models)")
print("=" * 70)
print("DEVICE:", DEVICE)
print("MODELS_DIR:", MODELS_DIR)


In [None]:
# %% [code]
# ======================== FEATURE LISTS ===========================
# Numeric model inputs. predict() selects `FEATURES + CAT_FEATS` in exactly
# this order, and fills any column missing after feature engineering with 0.0.
FEATURES = [
    "x_last", "y_last",
    "s", "a", "o", "dir",
    "vx", "vy",
    "ax_comp", "ay_comp",
    "dir_sin", "dir_cos",
    "o_sin", "o_cos",
    "frame_offset", "time_offset",
    "num_frames_output",
    "frac_of_flight",
    "frames_left",
    "time_to_land",
    "remaining_flight_frac",
    "dist_to_ball_land",
    "angle_to_ball_land",
    "dist_to_ball_land_per_frame",
    "cos_dir_to_ball",
    "cos_orient_to_ball",
    "x_rel_ball",
    "y_rel_ball",
    "v_toward_ball",
    "v_across_ball",
    "x_std",
    "ball_land_x_std",
    "dx_to_land_std",
    "dy_to_land",
    "dist_to_sideline",
    "dist_to_center",
    "yardline_100",
    "yardline_norm",
    "dist_to_endzone",
    "dist_to_target_last",
    "dx_to_target_last",
    "dy_to_target_last",
    "angle_to_target",
    "cos_dir_to_target",
    "cos_orient_to_target",
    "v_toward_target",
    "v_across_target",
    "is_target",
    "absolute_yardline_number",
    "player_height", "player_weight",
    "bmi",
    "min_dist_teammate",
    "mean_dist_teammate",
    "min_dist_opponent",
    "mean_dist_opponent",
]

# Columns cast to pandas "category" dtype in predict(); a missing one is
# filled with the string "unknown" before the cast.
CAT_FEATS = ["player_role", "player_side", "play_direction"]

# Columns of the per-player last observation that prepare_inference_batch()
# carries over onto the test rows (only those actually present are kept).
BASE_COLS = [
    "game_id", "play_id", "nfl_id",
    "x_last", "y_last",
    "s", "a", "o", "dir",
    "player_role", "player_side",
    "num_frames_output",
    "ball_land_x", "ball_land_y",
    "target_last_x", "target_last_y", "target_nfl_id",
    "play_direction",
    "absolute_yardline_number",
    "player_height", "player_weight",
    "player_to_predict",
    "min_dist_teammate",
    "mean_dist_teammate",
    "min_dist_opponent",
    "mean_dist_opponent",
]

# Global LGBM model ensembles; empty until load_lgbm_models() rebinds them.
# predict() asserts that both are non-empty.
LGBM_MODELS_DX = []
LGBM_MODELS_DY = []


# ======================== FEATURE HELPERS =========================
def height_to_inches(ht):
    """
    Parse a 'feet-inches' height string (e.g. '6-2') into total inches.

    Returns NaN when `ht` is not a string, contains no dash, or cannot be
    split into exactly two integer parts.
    """
    # Guard clause: anything that is not a dashed string is unparseable.
    if not isinstance(ht, str) or "-" not in ht:
        return np.nan

    try:
        ft_part, in_part = ht.split("-")
        total_inches = int(ft_part) * 12 + int(in_part)
    except Exception:
        # Deliberate best-effort: malformed values become missing, not errors.
        return np.nan
    return total_inches


def add_team_distance_features(df_last: pd.DataFrame) -> pd.DataFrame:
    """
    Add pairwise-distance features computed within each (game_id, play_id).

    For every row (one player's last observation) this adds:
    - min_dist_teammate / mean_dist_teammate: distance to the other rows of
      the play with the same `player_side`
    - min_dist_opponent / mean_dist_opponent: distance to rows with a
      different `player_side`

    A player with no teammates (or no opponents) in the play gets 0.0 for the
    corresponding min/mean.

    Parameters
    ----------
    df_last : DataFrame with game_id, play_id, x_last, y_last and, normally,
        player_side.

    Returns
    -------
    A new DataFrame with the four feature columns added. In the normal path
    rows are regrouped by play and the index is reset (as before). The input
    frame is never modified.

    NOTE(review): previously `mean_dist_teammate` came out as +inf for every
    player that had a teammate, because the self-distance (set to inf on the
    diagonal) was included in the teammate sum. That is fixed here; make sure
    the training pipeline computes the feature the same way.
    """
    feature_cols = [
        "min_dist_teammate",
        "mean_dist_teammate",
        "min_dist_opponent",
        "mean_dist_opponent",
    ]

    def _zero_filled(frame: pd.DataFrame) -> pd.DataFrame:
        # Work on a copy so the caller's frame is not mutated.
        filled = frame.copy()
        for col in feature_cols:
            filled[col] = 0.0
        return filled

    if "player_side" not in df_last.columns:
        return _zero_filled(df_last)

    groups = []
    for _, g in df_last.groupby(["game_id", "play_id"], as_index=False):
        g = g.copy()
        xs = g["x_last"].to_numpy()
        ys = g["y_last"].to_numpy()
        sides = g["player_side"].astype("category").cat.codes.to_numpy()

        dx = xs[:, None] - xs[None, :]
        dy = ys[:, None] - ys[None, :]
        dist = np.sqrt(dx * dx + dy * dy)

        same = sides[:, None] == sides[None, :]
        opp = ~same                      # diagonal is False: a player is never its own opponent
        np.fill_diagonal(same, False)    # ...and must not count as its own teammate either

        # Teammates: mask out everything else with inf for the min, 0 for the sum.
        min_dist_tm = np.where(same, dist, np.inf).min(axis=1)
        min_dist_tm[np.isinf(min_dist_tm)] = 0.0     # no teammate in this play

        sum_tm = np.where(same, dist, 0.0).sum(axis=1)
        cnt_tm = same.sum(axis=1)
        mean_tm = np.divide(
            sum_tm,
            np.maximum(cnt_tm, 1),
            out=np.zeros_like(sum_tm),
            where=cnt_tm > 0,
        )

        # Opponents: same scheme.
        min_dist_op = np.where(opp, dist, np.inf).min(axis=1)
        min_dist_op[np.isinf(min_dist_op)] = 0.0     # no opponent in this play

        sum_op = np.where(opp, dist, 0.0).sum(axis=1)
        cnt_op = opp.sum(axis=1)
        mean_op = np.divide(
            sum_op,
            np.maximum(cnt_op, 1),
            out=np.zeros_like(sum_op),
            where=cnt_op > 0,
        )

        g["min_dist_teammate"] = min_dist_tm
        g["mean_dist_teammate"] = mean_tm
        g["min_dist_opponent"] = min_dist_op
        g["mean_dist_opponent"] = mean_op

        groups.append(g)

    if not groups:
        # Empty input (or no usable group keys): pd.concat([]) would raise.
        return _zero_filled(df_last)

    return pd.concat(groups, ignore_index=True)


def prepare_last_obs(df: pd.DataFrame) -> pd.DataFrame:
    """
    Collapse tracking rows to one row per (game_id, play_id, nfl_id).

    Steps:
    - order rows by frame_id within each player and take the group-wise
      `last()` (pandas returns the last non-null value of each column)
    - rename x, y -> x_last, y_last
    - convert player_height via height_to_inches (NaN column if absent)
    - add the team distance features
    """
    player_keys = ["game_id", "play_id", "nfl_id"]

    ordered = df.sort_values(player_keys + ["frame_id"])
    last_rows = ordered.groupby(player_keys, as_index=False).last()
    last_rows = last_rows.rename(columns={"x": "x_last", "y": "y_last"})

    has_height = "player_height" in last_rows.columns
    last_rows["player_height"] = (
        last_rows["player_height"].apply(height_to_inches) if has_height else np.nan
    )

    return add_team_distance_features(last_rows)


def add_target_info(df_last: pd.DataFrame) -> pd.DataFrame:
    """
    Attach the targeted receiver's id and last position to every row of its play.

    Rows whose `player_role` equals "Targeted Receiver" are taken as targets;
    their nfl_id / x_last / y_last are merged back onto all rows sharing the
    same (game_id, play_id) as `target_nfl_id`, `target_last_x`,
    `target_last_y`.

    Parameters
    ----------
    df_last : DataFrame with game_id, play_id, nfl_id, x_last, y_last and,
        normally, player_role.

    Returns
    -------
    A new DataFrame with the same rows as `df_last` plus the three target
    columns. The target columns are NaN for plays without a targeted receiver
    or when `player_role` is absent (this previously raised, because the
    fallback mask was a plain bool).

    If a play has several targeted-receiver rows, only the first is used so
    the left merge stays many-to-one and cannot duplicate player rows.
    """
    play_keys = ["game_id", "play_id"]
    target_cols = ["target_last_x", "target_last_y", "target_nfl_id"]

    if "player_role" in df_last.columns:
        mask_target = df_last["player_role"] == "Targeted Receiver"
    else:
        # No role information: nobody is a target; the merge below yields NaNs.
        mask_target = pd.Series(False, index=df_last.index, dtype=bool)

    targets = df_last.loc[
        mask_target,
        play_keys + ["nfl_id", "x_last", "y_last"],
    ].rename(
        columns={
            "nfl_id": "target_nfl_id",
            "x_last": "target_last_x",
            "y_last": "target_last_y",
        }
    )

    # At most one target row per play, otherwise the merge fans out.
    targets = targets.drop_duplicates(subset=play_keys, keep="first")

    return df_last.merge(
        targets[play_keys + target_cols],
        on=play_keys,
        how="left",
    )


def mirror_raw(df_raw: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of `df_raw` reflected across the field width.

    Every y-like column that is present (y_last, y, ball_land_y,
    target_last_y) becomes 53.3 - value, and the angle columns `dir` and `o`
    are negated modulo 360. The input frame is left untouched.
    (Not needed in inference; used in training for augmentation.)
    """
    mirrored = df_raw.copy()

    for y_col in ("y_last", "y", "ball_land_y", "target_last_y"):
        if y_col not in mirrored.columns:
            continue
        mirrored[y_col] = 53.3 - mirrored[y_col]

    for angle_col in ("dir", "o"):
        if angle_col not in mirrored.columns:
            continue
        negated = -mirrored[angle_col]
        mirrored[angle_col] = negated % 360.0

    return mirrored


def create_features(df: pd.DataFrame, is_train: bool = False) -> pd.DataFrame:
    """
    Full feature engineering – meant to mirror the training script.

    Works on a copy of `df` and adds kinematic, timing, ball-landing,
    field-position, target-receiver and body-size feature columns.

    Parameters
    ----------
    df : DataFrame with one row per (player, output frame). Required columns:
        s, a, o, dir (angles in degrees), x_last, y_last, ball_land_x,
        ball_land_y, target_last_x, target_last_y, nfl_id, target_nfl_id,
        absolute_yardline_number, player_height, player_weight.
        Optional columns: frame_id, num_frames_output, play_direction
        (treated as "right" when the column or a value is missing).
    is_train : when True, also adds the regression targets dx = x - x_last
        and dy = y - y_last (requires columns x and y).

    Returns
    -------
    The copied DataFrame with the feature columns added.
    """
    df = df.copy()

    # Missing kinematics are treated as zero speed / acceleration / angle.
    s = df["s"].fillna(0.0)
    a = df["a"].fillna(0.0)
    dir_rad = np.deg2rad(df["dir"].fillna(0.0))
    o_rad = np.deg2rad(df["o"].fillna(0.0))

    df["vx"] = s * np.cos(dir_rad)
    df["vy"] = s * np.sin(dir_rad)
    df["ax_comp"] = a * np.cos(dir_rad)
    df["ay_comp"] = a * np.sin(dir_rad)

    df["dir_sin"] = np.sin(dir_rad)
    df["dir_cos"] = np.cos(dir_rad)
    df["o_sin"] = np.sin(o_rad)
    df["o_cos"] = np.cos(o_rad)

    # frame / time
    if "frame_id" in df.columns:
        df["frame_offset"] = df["frame_id"]
    else:
        df["frame_offset"] = 0

    df["time_offset"] = df["frame_offset"] / 10.0

    if "num_frames_output" in df.columns:
        # 0 -> NaN so the divisions below yield NaN (then 0.0) instead of inf.
        nfo = df["num_frames_output"].replace(0, np.nan)
        df["frac_of_flight"] = (df["frame_offset"] / nfo).clip(lower=0, upper=1)
        df["frac_of_flight"] = df["frac_of_flight"].fillna(0.0)
        df["frames_left"] = (nfo - df["frame_offset"]).clip(lower=0).fillna(0.0)
    else:
        df["frac_of_flight"] = 0.0
        df["frames_left"] = 0.0

    df["time_to_land"] = df["frames_left"] / 10.0
    df["remaining_flight_frac"] = (1.0 - df["frac_of_flight"]).clip(lower=0.0, upper=1.0)

    # ball landing geometry
    df["dist_to_ball_land"] = np.sqrt(
        (df["ball_land_x"] - df["x_last"]) ** 2 +
        (df["ball_land_y"] - df["y_last"]) ** 2
    )
    df["angle_to_ball_land"] = np.arctan2(
        df["ball_land_y"] - df["y_last"],
        df["ball_land_x"] - df["x_last"],
    )

    frames_left_safe = df["frames_left"].replace(0, np.nan)
    df["dist_to_ball_land_per_frame"] = df["dist_to_ball_land"] / frames_left_safe
    df["dist_to_ball_land_per_frame"] = (
        df["dist_to_ball_land_per_frame"]
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0.0)
    )

    df["cos_dir_to_ball"] = np.cos(df["angle_to_ball_land"] - dir_rad)
    df["cos_orient_to_ball"] = np.cos(df["angle_to_ball_land"] - o_rad)

    # direction-standardized coords
    # (`df.get(col, "right").fillna(...)` would fail on the str default when
    # the column is absent, so build an explicit all-"right" Series instead.)
    if "play_direction" in df.columns:
        play_dir = df["play_direction"].fillna("right")
    else:
        play_dir = pd.Series("right", index=df.index)
    is_left = (play_dir == "left").astype(int)

    df["x_std"] = np.where(is_left == 1, 120.0 - df["x_last"], df["x_last"])
    df["ball_land_x_std"] = np.where(
        is_left == 1, 120.0 - df["ball_land_x"], df["ball_land_x"]
    )

    df["dx_to_land_std"] = df["ball_land_x_std"] - df["x_std"]
    df["dy_to_land"] = df["ball_land_y"] - df["y_last"]

    # field position
    df["dist_to_sideline"] = np.minimum(df["y_last"], 53.3 - df["y_last"])
    df["dist_to_center"] = np.abs(df["y_last"] - 53.3 / 2.0)

    yard = df["absolute_yardline_number"].fillna(50.0)
    yard_100 = yard.clip(lower=0.0, upper=100.0)
    df["yardline_100"] = yard_100
    df["yardline_norm"] = yard_100 / 100.0
    df["dist_to_endzone"] = 100.0 - yard_100

    # target receiver geometry
    df["dist_to_target_last"] = np.sqrt(
        (df["target_last_x"] - df["x_last"]) ** 2 +
        (df["target_last_y"] - df["y_last"]) ** 2
    )
    df["dx_to_target_last"] = df["target_last_x"] - df["x_last"]
    df["dy_to_target_last"] = df["target_last_y"] - df["y_last"]
    df["angle_to_target"] = np.arctan2(
        df["target_last_y"] - df["y_last"],
        df["target_last_x"] - df["x_last"],
    )

    df["cos_dir_to_target"] = np.cos(df["angle_to_target"] - dir_rad)
    df["cos_orient_to_target"] = np.cos(df["angle_to_target"] - o_rad)

    df["is_target"] = (df["nfl_id"] == df["target_nfl_id"]).astype(int)

    # relative coords & velocity projections
    df["x_rel_ball"] = df["x_last"] - df["ball_land_x"]
    df["y_rel_ball"] = df["y_last"] - df["ball_land_y"]

    vx = df["vx"]
    vy = df["vy"]

    # Project velocity onto the unit vector toward the ball landing spot
    # ("toward") and onto its perpendicular ("across").
    ball_cos = np.cos(df["angle_to_ball_land"])
    ball_sin = np.sin(df["angle_to_ball_land"])
    df["v_toward_ball"] = vx * ball_cos + vy * ball_sin
    df["v_across_ball"] = vx * (-ball_sin) + vy * ball_cos

    # Same projection relative to the targeted receiver's last position.
    tgt_cos = np.cos(df["angle_to_target"])
    tgt_sin = np.sin(df["angle_to_target"])
    df["v_toward_target"] = vx * tgt_cos + vy * tgt_sin
    df["v_across_target"] = vx * (-tgt_sin) + vy * tgt_cos

    df[["v_toward_ball", "v_across_ball", "v_toward_target", "v_across_target"]] = (
        df[["v_toward_ball", "v_across_ball", "v_toward_target", "v_across_target"]]
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0.0)
    )

    # player physics (0 height/weight is treated as missing -> bmi 0.0)
    h = df["player_height"].replace(0, np.nan)
    w = df["player_weight"].replace(0, np.nan)
    df["bmi"] = 703.0 * (w / (h ** 2))
    df["bmi"] = df["bmi"].replace([np.inf, -np.inf], np.nan).fillna(0.0)

    # targets only in train (for compatibility; in inference we don't use dx/dy)
    if is_train:
        df["dx"] = df["x"] - df["x_last"]
        df["dy"] = df["y"] - df["y_last"]

    return df


In [None]:
# %% [code] cell 4
def prepare_inference_batch(test_pd: pd.DataFrame, test_input_pd: pd.DataFrame) -> pd.DataFrame:
    """
    Assemble the feature rows to score for one inference batch.

    - reduce `test_input_pd` to the last observation per
      (game_id, play_id, nfl_id), including pairwise distance features
    - attach the targeted receiver's info to every player of the play
    - left-merge the available BASE_COLS onto `test_pd`, keyed on
      (game_id, play_id, nfl_id)
    - run create_features in inference mode (no dx/dy targets)
    """
    player_state = add_target_info(prepare_last_obs(test_input_pd))

    # Keep only the base columns this batch actually provides.
    available = set(player_state.columns)
    carried_cols = [col for col in BASE_COLS if col in available]

    merged_rows = test_pd.merge(
        player_state[carried_cols],
        how="left",
        on=["game_id", "play_id", "nfl_id"],
    )

    return create_features(merged_rows, is_train=False)


In [None]:
# %% [code]
# ======================== LOAD SAVED LGBM MODELS =========================
import glob
import joblib
from pathlib import Path

def load_lgbm_models():
    """
    Populate the global LGBM ensembles from the files in MODELS_DIR.

    Files matching `lgbm_dx_*.pkl` and `lgbm_dy_*.pkl` (sorted by path) are
    loaded with joblib into LGBM_MODELS_DX and LGBM_MODELS_DY respectively.
    Any other files in the directory are ignored.

    Raises
    ------
    FileNotFoundError
        If MODELS_DIR does not exist, or if either the dx or the dy pattern
        matches no file.

    Note: joblib.load unpickles the files, so MODELS_DIR must only contain
    artifacts you produced or trust.
    """
    global LGBM_MODELS_DX, LGBM_MODELS_DY

    models_path = Path(MODELS_DIR)
    if not models_path.exists():
        raise FileNotFoundError(
            f"{models_path} does not exist. "
            "Check that the 'models' dataset is attached and the path is correct."
        )

    # Discover the model files for each target axis.
    found = {
        axis: sorted(glob.glob(str(models_path / f"lgbm_{axis}_*.pkl")))
        for axis in ("dx", "dy")
    }
    dx_files = found["dx"]
    dy_files = found["dy"]

    if not (dx_files and dy_files):
        raise FileNotFoundError(
            f"No LGBM dx/dy model files found in {models_path}.\n"
            f"Found: {[p.name for p in models_path.iterdir()]}"
        )

    print("Found LGBM dx model files:", dx_files)
    print("Found LGBM dy model files:", dy_files)

    # Load every file; predict() averages the models of each axis.
    dx_models = []
    for model_file in dx_files:
        dx_models.append(joblib.load(model_file))
    dy_models = []
    for model_file in dy_files:
        dy_models.append(joblib.load(model_file))

    LGBM_MODELS_DX = dx_models
    LGBM_MODELS_DY = dy_models

    print(
        f"Loaded {len(LGBM_MODELS_DX)} dx models and "
        f"{len(LGBM_MODELS_DY)} dy models."
    )


In [None]:
def _ema_smooth_by_group(values, group_indices, alpha):
    """Return a copy of `values` with a forward EMA applied inside each index group.

    `group_indices` is an iterable of integer position arrays; within a group
    each element (after the first) becomes
    alpha * current + (1 - alpha) * previous_smoothed.
    """
    smoothed = values.copy()
    for idxs in group_indices:
        for prev, cur in zip(idxs[:-1], idxs[1:]):
            smoothed[cur] = alpha * smoothed[cur] + (1.0 - alpha) * smoothed[prev]
    return smoothed


def predict(test: pl.DataFrame, test_input: pl.DataFrame) -> pl.DataFrame:
    """
    Main predict() used by the Kaggle evaluation server.

    - Uses saved LGBM models (dx, dy) to predict offsets from x_last, y_last
      (each ensemble is averaged)
    - Applies simple post-processing:
        * clip dx, dy to +/- MAX_STEP_ABS and the positions to the field box
        * EMA-smooth x, y over frame_id within each (game_id, play_id, nfl_id)
    - Returns Polars DataFrame with columns ["x", "y"] in the same row order as `test`.

    Parameters
    ----------
    test : rows to predict; needs game_id, play_id, nfl_id (frame_id is used
        for ordering when present, otherwise 0).
    test_input : tracking rows for the same plays, from which the last
        observation of every player is taken.

    Raises
    ------
    AssertionError
        If load_lgbm_models() has not populated the global ensembles.
    """
    assert LGBM_MODELS_DX and LGBM_MODELS_DY, "LGBM models are not loaded"

    # Convert to pandas for feature pipeline
    test_pd = test.to_pandas()
    test_input_pd = test_input.to_pandas()

    # Build feature rows (same as in training)
    test_rows = prepare_inference_batch(test_pd, test_input_pd)

    # Ensure all features exist
    for col in FEATURES:
        if col not in test_rows.columns:
            test_rows[col] = 0.0
    for c in CAT_FEATS:
        if c not in test_rows.columns:
            test_rows[c] = "unknown"

    # Categorical handling – same as training
    X_tree = test_rows[FEATURES + CAT_FEATS].copy()
    for c in CAT_FEATS:
        X_tree[c] = X_tree[c].astype("category")

    # ---- LGBM ensemble predictions for dx, dy ----
    dx_pred = np.mean([m.predict(X_tree) for m in LGBM_MODELS_DX], axis=0)
    dy_pred = np.mean([m.predict(X_tree) for m in LGBM_MODELS_DY], axis=0)

    # ---- 1) Clip dx, dy to avoid extreme offsets ----
    dx_pred = np.clip(dx_pred, -MAX_STEP_ABS, MAX_STEP_ABS)
    dy_pred = np.clip(dy_pred, -MAX_STEP_ABS, MAX_STEP_ABS)

    # ---- 2) Convert to raw x, y (before smoothing), clipped to field bounds ----
    x_raw = np.clip(test_rows["x_last"].to_numpy() + dx_pred, 0.0, 120.0)
    y_raw = np.clip(test_rows["y_last"].to_numpy() + dy_pred, 0.0, 53.3)

    # Put into a DataFrame with keys & frame_id for smoothing
    out_df = pd.DataFrame({
        "game_id": test_rows["game_id"].values,
        "play_id": test_rows["play_id"].values,
        "nfl_id":  test_rows["nfl_id"].values,
        "frame_id": test_rows.get("frame_id", pd.Series(0, index=test_rows.index)).values,
        "x_raw": x_raw,
        "y_raw": y_raw,
    })
    out_df["row_idx"] = np.arange(len(out_df))  # to restore original order later

    # ---- 3) Smooth trajectories over time per (game, play, player) ----
    # Sort by grouping keys + frame_id; row_idx breaks ties deterministically.
    out_df_sorted = out_df.sort_values(
        ["game_id", "play_id", "nfl_id", "frame_id", "row_idx"]
    ).reset_index(drop=True)

    # Positional indices of each player's rows; after the sort + index reset
    # they are in frame order. Computed once and shared by x and y.
    group_keys = ["game_id", "play_id", "nfl_id"]
    group_indices = list(
        out_df_sorted.groupby(group_keys, sort=False).indices.values()
    )

    out_df_sorted["x_smooth"] = _ema_smooth_by_group(
        out_df_sorted["x_raw"].to_numpy(), group_indices, SMOOTHING_ALPHA
    )
    out_df_sorted["y_smooth"] = _ema_smooth_by_group(
        out_df_sorted["y_raw"].to_numpy(), group_indices, SMOOTHING_ALPHA
    )

    # Bring back to original row order using row_idx
    out_final = (
        out_df_sorted
        .sort_values("row_idx")
        .reset_index(drop=True)
    )

    # Final x, y (smoothed, within bounds)
    x_final = np.clip(out_final["x_smooth"].to_numpy(), 0.0, 120.0)
    y_final = np.clip(out_final["y_smooth"].to_numpy(), 0.0, 53.3)

    # Return as Polars DataFrame in original order
    return pl.DataFrame({"x": x_final, "y": y_final})


In [None]:
# %% [code]
# ======================== MAIN (INFERENCE ONLY) ==========================
if __name__ == "__main__":
    # Step 1: the pretrained LGBM ensembles must be in memory before the
    # inference server can call predict().
    load_lgbm_models()

    # Step 2: hand predict() to the Kaggle evaluation server when it exists.
    if not (HAS_EVAL_SERVER and NFLInferenceServer is not None):
        print(
            "\nNFLInferenceServer is not available (kaggle_evaluation not found). "
            "Training & debugging is fine, but to generate a real submission.csv "
            "you must run this notebook on Kaggle with the competition data attached."
        )
    else:
        inference_server = NFLInferenceServer(predict)

        if os.getenv("KAGGLE_IS_COMPETITION_RERUN"):
            # Actual scoring run on Kaggle: serve requests from the gateway.
            inference_server.serve()
        else:
            # Interactive / local run: replay the public mock test set.
            print("\n[LOCAL] Running local gateway to generate submission.csv...")
            inference_server.run_local_gateway((DATA_DIR,))
            print("✓ submission.csv should now be in the working directory")
