In [None]:
def process_week_data(
    week_number: int,
    plays: pd.DataFrame,
    players: pd.DataFrame | None = None,
    data_root: str = "/content/drive/MyDrive/bdb25-blitz/data/raw",
    out_dir: str  = "/content/drive/MyDrive/bdb25-blitz/artifacts",
    even_frames_only: bool = True,
    label_blitz: bool = True,
    add_disguise: bool = True,
    return_df: bool = False,
):
    """
    Streams week CSV in chunks, keeps all cols, applies cleaning,
    adds LOS/depth/angles, (optional) disguise features, (optional) blitz labels,
    windows to [-0.8s, +0.5s] around snap, writes single Parquet.
    """
    os.makedirs(out_dir, exist_ok=True)
    file_path = os.path.join(data_root, f"tracking_week_{week_number}.csv")
    out_path  = os.path.join(out_dir,   f"week_{week_number:02d}_clean_blitz.parquet")

    # restrict to dropbacks early
    dropbacks = plays.loc[plays["isDropback"] == True, ["gameId","playId","defensiveTeam","possessionTeam"]].drop_duplicates()
    key_set = set(map(tuple, dropbacks[["gameId","playId"]].to_numpy()))
    print(f"[Week {week_number}] scanning for snap frames…")

    # PASS A — collect snap frames
    snap_map = {}
    for chunk in pd.read_csv(file_path, chunksize=1_000_000):
        mask = [tuple(x) in key_set for x in chunk[["gameId","playId"]].to_numpy()]
        if not any(mask): continue
        small = chunk.loc[mask, ["gameId","playId","frameId","event"]]
        snaps = small.loc[small["event"].isin(["ball_snap","ball_snap_penalty"])]
        if not snaps.empty:
            grp = snaps.groupby(["gameId","playId"])["frameId"].min()
            for (g,p), f in grp.items():
                k = (int(g), int(p))
                snap_map[k] = min(f, snap_map.get(k, f))
        del chunk, small, snaps; gc.collect()

    if not snap_map:
        print(f"[Week {week_number}] no dropback snaps found; skipping.")
        return {"week": week_number, "saved": False, "path": None}

    print(f"[Week {week_number}] found {len(snap_map)} snaps. Building cleaned window…")

    # (optional) merge positions for later sim/creeper detection
    players_small = None
    if players is not None and {'nflId','position'}.issubset(players.columns):
        players_small = players[['nflId','position']].drop_duplicates()

    first_write, part_files = True, []

    # PASS B — stream, clean, window, enrich, write
    for chunk in pd.read_csv(file_path, chunksize=600_000):
        mask = [tuple(x) in snap_map for x in chunk[["gameId","playId"]].to_numpy()]
        if not any(mask): continue
        chunk = chunk.loc[mask].copy()

        # ensure dis
        if "x" in chunk.columns and "y" in chunk.columns:
            _ensure_distance_column_inplace(chunk)

        # cleaning
        chunk = rotate_direction_and_orientation(chunk)
        chunk = make_plays_left_to_right(chunk)
        chunk = calculate_velocity_components(chunk)
        chunk = pass_attempt_merging(chunk, plays)

        # add defense flag (like your labelers do)
        chunk = chunk.merge(dropbacks, on=['gameId','playId'], how='left')
        chunk['defense'] = ((chunk['club'] == chunk['defensiveTeam']) & (chunk['club'] != 'football')).astype(int)

        # IDs
        chunk["week"] = week_number
        chunk["uniqueId"] = chunk["gameId"].astype("string") + "_" + chunk["playId"].astype("string")
        chunk["frameUniqueId"] = chunk["uniqueId"] + "_" + chunk["frameId"].astype("string")

        # snap/window
        sf = np.array([snap_map[(g,p)] for g,p in chunk[["gameId","playId"]].to_numpy()], dtype=np.int32)
        chunk["snap_frame"] = sf
        chunk["frames_from_snap"] = chunk["frameId"].astype("int32") - chunk["snap_frame"]
        if even_frames_only:
            chunk = chunk[(chunk["frameId"] % 2) == 0]
        chunk = chunk[(chunk["frames_from_snap"] >= -T_PRE_FRAMES) & (chunk["frames_from_snap"] <= T_POST_FRAMES)]
        if chunk.empty:
            del chunk; gc.collect(); continue

        # LOS/depth/angles
        chunk = add_los_depth_and_angles(chunk)

        # disguise features (pre-snap creep deltas), optional
        if add_disguise:
            chunk = add_disguise_features(chunk, pre_window=(-8, 0))

        # players positions (for sim/creeper). Join once per chunk if provided.
        if players_small is not None and 'position' not in chunk.columns:
            chunk = chunk.merge(players_small, on='nflId', how='left')

        # blitz labels (play-level), optional
        if label_blitz:
            # Make sure we only compute on plays present in this chunk (faster)
            chunk = make_blitz_labels(chunk, players_small)

        # write
        if first_write:
            chunk.to_parquet(out_path, index=False)
            first_write = False
        else:
            tmp = out_path.replace(".parquet", f".part_{np.random.randint(1e9)}.parquet")
            chunk.to_parquet(tmp, index=False)
            part_files.append(tmp)

        del chunk; gc.collect()

    # consolidate parts
    if part_files:
        base = pd.read_parquet(out_path) if os.path.exists(out_path) else None
        dfs = ([base] if base is not None else []) + [pd.read_parquet(p) for p in part_files]
        pd.concat(dfs, ignore_index=True).to_parquet(out_path, index=False)
        for p in part_files:
            try: os.remove(p)
            except: pass

    print(f"[Week {week_number}] saved → {out_path}")
    if return_df:
        return pd.read_parquet(out_path)
    return {"week": week_number, "saved": True, "path": out_path}

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from google.colab import drive
import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
drive.mount('/content/drive')

# Static (non-tracking) tables live on Google Drive, in the same folder that
# process_week_data reads the weekly tracking CSVs from.
DATA_ROOT = "/content/drive/MyDrive/bdb25-blitz/data/raw"
games = pd.read_csv(f"{DATA_ROOT}/games.csv")
player_play = pd.read_csv(f"{DATA_ROOT}/player_play.csv")
players = pd.read_csv(f"{DATA_ROOT}/players.csv")
plays = pd.read_csv(f"{DATA_ROOT}/plays.csv")

# Build (and cache as Parquet) the cleaned tracking window for weeks 1-9.
all_weeks = []
for week_number in range(1, 10):
    week_df = process_week_data(
        week_number,
        plays,
        data_root=DATA_ROOT,
        return_df=True,          # <- make it return a DataFrame
        even_frames_only=True
    )
    # a skipped week comes back as a status dict instead of a DataFrame
    if isinstance(week_df, dict):
        continue
    all_weeks.append(week_df)

# pd.concat([]) fails with an unhelpful "No objects to concatenate"; say what happened.
if not all_weeks:
    raise ValueError("process_week_data produced no data for any of weeks 1-9.")

all_tracking = pd.concat(all_weeks, ignore_index=True)
# keep player rows (drop the ball) on plays flagged as pass attempts
is_player_row = all_tracking['club'] != 'football'
is_pass_attempt = all_tracking['passAttempt'] == 1
all_tracking = all_tracking[is_player_row & is_pass_attempt]


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[Week 1] scanning for snap frames…
[Week 1] found 1218 snaps. Building cleaned window…
[Week 1] saved → /content/drive/MyDrive/bdb25-blitz/artifacts/week_01_clean_blitz.parquet
[Week 2] scanning for snap frames…
[Week 2] found 1111 snaps. Building cleaned window…
[Week 2] saved → /content/drive/MyDrive/bdb25-blitz/artifacts/week_02_clean_blitz.parquet
[Week 3] scanning for snap frames…
[Week 3] found 1223 snaps. Building cleaned window…
[Week 3] saved → /content/drive/MyDrive/bdb25-blitz/artifacts/week_03_clean_blitz.parquet
[Week 4] scanning for snap frames…
[Week 4] found 1051 snaps. Building cleaned window…
[Week 4] saved → /content/drive/MyDrive/bdb25-blitz/artifacts/week_04_clean_blitz.parquet
[Week 5] scanning for snap frames…
[Week 5] found 1142 snaps. Building cleaned window…
[Week 5] saved → /content/drive/MyDrive/bdb25-blitz/artifacts/week_05_clean_

In [None]:
# --- takes ~10mins to run

# Label column handed to prepare_frame_data_blitz as the target.
target_column = "blitz"

# Per-row input columns handed to prepare_frame_data_blitz, in this order.
features = [
    # cleaned position and velocity components
    "x_clean",
    "y_clean",
    "v_x",
    "v_y",
    # relation to the line of scrimmage
    "depth_to_los",
    "o_to_los_cos",
    # pre-snap disguise / creep summaries
    "creep_depth_mean",
    "creep_lat_mean",
    "pre_speed_mean",
]

# Columns retained by _ensure_basics: identifiers, team info, the target,
# a few helper columns some packers may touch, then every feature.
cols_common = [
    "frameUniqueId",
    "displayName",
    "frameId",
    "frameType",
    "club",
    "defensiveTeam",
    "defense",
    target_column,
    "s_clean",
    "s",
    "dir_clean",
    "o_clean",
    "frames_from_snap",
    *features,
]

def _ensure_basics(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` holding the columns the tensor packer relies on.

    Missing columns are synthesised where possible (``defense``, ``s_clean``,
    ``dir_clean``, ``o_clean``, ``frameUniqueId``); any column named in the
    module-level ``features`` list that is absent is filled with ``0.0``.
    The result is restricted to the columns of the module-level ``cols_common``
    list that exist, in ``cols_common`` order. The input frame is not modified.
    """
    out = df.copy()

    # defense flag: row's club is the defensive team and the row is not the ball
    if "defense" not in out.columns:
        can_derive = "club" in out.columns and "defensiveTeam" in out.columns
        if can_derive:
            on_defense = (out["club"] == out["defensiveTeam"]) & (out["club"] != "football")
            out["defense"] = on_defense.astype(int)
        else:
            out["defense"] = 0  # nothing to derive it from

    # speed column some helpers read: fall back to raw speed, then to zero
    if "s_clean" not in out.columns:
        out["s_clean"] = out["s"] if "s" in out.columns else 0.0

    # cleaned direction / orientation, derived from the raw angle when only that exists
    for raw_col, clean_col in (("dir", "dir_clean"), ("o", "o_clean")):
        if clean_col not in out.columns and raw_col in out.columns:
            out[clean_col] = (-(out[raw_col] - 90)) % 360

    # every requested feature must exist; absent ones become a constant 0.0
    for feature_name in features:
        if feature_name not in out.columns:
            out[feature_name] = 0.0

    # per-frame id expected by some packers: "<gameId>_<playId>_<frameId>"
    if "frameUniqueId" not in out.columns:
        game_part = out["gameId"].astype(str)
        play_part = out["playId"].astype(str)
        frame_part = out["frameId"].astype(str)
        out["frameUniqueId"] = game_part + "_" + play_part + "_" + frame_part

    # drop everything else; columns listed in cols_common but absent are ignored
    wanted = [col for col in cols_common if col in out.columns]
    return out[wanted]

# Where the packed tensors are written (same Drive folder as the weekly Parquet files).
ARTIFACT_DIR = "/content/drive/MyDrive/bdb25-blitz/artifacts"
# Row used for the printed spot check of one packed frame.
SAMPLE_FRAME_IDX = 63

# Leave-one-week-out: each week in turn is the validation set, the rest is training.
for week_eval in range(1, 10):
    is_val_week = all_tracking["week"] == week_eval
    if not is_val_week.any():
        # the loader may have skipped this week entirely; there is nothing to validate on
        print(f"Week {week_eval}: no rows in all_tracking; skipping fold.")
        continue

    train_df = _ensure_basics(all_tracking[~is_val_week])
    val_df   = _ensure_basics(all_tracking[is_val_week])

    # pack tensors (KxF per frame) + targets
    train_features, train_targets = prepare_frame_data_blitz(train_df, features, target_column)
    val_features,   val_targets   = prepare_frame_data_blitz(val_df,   features, target_column)

    if train_features is None or val_features is None:
        raise RuntimeError(
            "prepare_frame_data_blitz returned None (likely inconsistent per-frame shapes). "
            "Ensure the function pads/truncates to a fixed K defenders per frame."
        )

    print(f"Week {week_eval} Tensor: {train_features.shape}")
    # spot-check one packed row, but only when the tensor is long enough to index
    if len(train_features) > SAMPLE_FRAME_IDX:
        print(f"Week {week_eval} Indiv Check: {train_features[SAMPLE_FRAME_IDX][0]}")

    # file stem -> tensor; saved as <stem>_week<N>preds.pt
    fold_outputs = {
        "features_training": train_features,
        "targets_training": train_targets,
        "features_val": val_features,
        "targets_val": val_targets,
    }
    for stem, tensor in fold_outputs.items():
        torch.save(tensor, f"{ARTIFACT_DIR}/{stem}_week{week_eval}preds.pt")


Week 1 Tensor: torch.Size([55738, 8, 9])
Week 1 Indiv Check: tensor([ 6.0450e+01,  2.9160e+01, -5.0208e-01, -3.8055e-01,  6.2000e-01,
        -7.9695e-01, -2.7500e-02, -2.5000e-03,  2.2000e-01])
Week 2 Tensor: torch.Size([56385, 8, 9])
Week 2 Indiv Check: tensor([ 1.0586e+02,  2.2150e+01, -7.8419e-02,  1.2787e-01,  8.0000e-01,
        -5.2280e-01, -7.5000e-03,  4.5000e-02,  2.7750e-01])
Week 3 Tensor: torch.Size([55694, 8, 9])
Week 3 Indiv Check: tensor([ 1.0586e+02,  2.2150e+01, -7.8419e-02,  1.2787e-01,  8.0000e-01,
        -5.2280e-01, -7.5000e-03,  4.5000e-02,  2.7750e-01])
Week 4 Tensor: torch.Size([56775, 8, 9])
Week 4 Indiv Check: tensor([ 1.0586e+02,  2.2150e+01, -7.8419e-02,  1.2787e-01,  8.0000e-01,
        -5.2280e-01, -7.5000e-03,  4.5000e-02,  2.7750e-01])
Week 5 Tensor: torch.Size([56197, 8, 9])
Week 5 Indiv Check: tensor([ 1.0586e+02,  2.2150e+01, -7.8419e-02,  1.2787e-01,  8.0000e-01,
        -5.2280e-01, -7.5000e-03,  4.5000e-02,  2.7750e-01])
Week 6 Tensor: torch.Size