In [26]:
def rotate_direction_and_orientation(df):

  """
  Rotate the direction and orientation angles so that 0° points from left to right on the field, and increasing angle goes counterclockwise
  This should be done BEFORE the call to make_plays_left_to_right, because that function with compensate for the flipped angles.

  :param df: the aggregate dataframe created using the aggregate_data() method

  :return df: the aggregate dataframe with orientation and direction angles rotated 90° clockwise
  """

  df["o_clean"] = (-(df["o"] - 90)) % 360
  df["dir_clean"] = (-(df["dir"] - 90)) % 360

  return df


def make_plays_left_to_right(df):

  """
  Flip tracking data so that all plays run from left to right. The new x, y, s, a, dis, o, and dir data
  will be stored in new columns with the suffix "_clean" even if the variables do not change from their original value.

  :param df: the aggregate dataframe created using the aggregate_data() method

  :return df: the aggregate dataframe with the new columns such that all plays run left to right
  """

  df["x_clean"] = np.where(
      df["playDirection"] == "left",
      120 - df["x"],
      df[
          "x"
      ],  # 120 because the endzones (10 yds each) are included in the ["x"] values
  )

  df["y_clean"] = df["y"]
  df["s_clean"] = df["s"]
  df["a_clean"] = df["a"]
  df["dis_clean"] = df["dis"]

  df["o_clean"] = np.where(
      df["playDirection"] == "left", 180 - df["o_clean"], df["o_clean"]
  )

  df["o_clean"] = (df["o_clean"] + 360) % 360  # remove negative angles

  df["dir_clean"] = np.where(
      df["playDirection"] == "left", 180 - df["dir_clean"], df["dir_clean"]
  )

  df["dir_clean"] = (df["dir_clean"] + 360) % 360  # remove negative angles

  return df


import numpy as np

def calculate_velocity_components(df):
    """
    Calculate the velocity components (v_x and v_y) for each row in the dataframe.

    :param df: the aggregate dataframe with "_clean" columns created using make_plays_left_to_right()

    :return df: the dataframe with additional columns 'v_x' and 'v_y' representing the velocity components
    """

    df["dir_radians"] = np.radians(df["dir_clean"])

    df["v_x"] = df["s_clean"] * np.cos(df["dir_radians"])
    df["v_y"] = df["s_clean"] * np.sin(df["dir_radians"])


    return df


def label_offense_defense_coverage(presnap_df, plays_df):

  coverage_replacements = {
    'Cover-3 Cloud Right': 'Cover-3',
    'Cover-3 Cloud Left': 'Cover-3',
    'Cover-3 Seam': 'Cover-3',
    'Cover-3 Double Cloud': 'Cover-3',
    'Cover-6 Right': 'Cover-6',
    'Cover 6-Left': 'Cover-6',
    'Cover-1 Double': 'Cover-1'}

  values_to_drop = ["Miscellaneous", "Bracket", "Prevent", "Red Zone", "Goal Line"]

  plays_df['pff_passCoverage'] = plays_df['pff_passCoverage'].replace(coverage_replacements)

  plays_df = plays_df.dropna(subset=['pff_passCoverage'])
  plays_df = plays_df[~plays_df['pff_passCoverage'].isin(values_to_drop)]

  coverage_mapping = {
      'Cover-0': 0,
      'Cover-1': 1,
      'Cover-2': 2,
      'Cover-3': 3,
      'Quarters': 4,
      '2-Man': 5,
      'Cover-6': 6
  }

  merged_df = presnap_df.merge(
      plays_df[['gameId', 'playId', 'possessionTeam', 'defensiveTeam', 'pff_passCoverage']],
      on=['gameId', 'playId'],
      how='left'
  )

  merged_df['defense'] = ((merged_df['club'] == merged_df['defensiveTeam']) & (merged_df['club'] != 'football')).astype(int)

  merged_df['pff_passCoverage'] = merged_df['pff_passCoverage'].map(coverage_mapping)
  merged_df.dropna(subset=['pff_passCoverage'], inplace=True)

  return merged_df


def label_offense_defense_manzone(presnap_df, plays_df):

  plays_df = plays_df.dropna(subset=['pff_manZone'])

  coverage_mapping = {
      'Zone': 0,
      'Man': 1}

  merged_df = presnap_df.merge(
      plays_df[['gameId', 'playId', 'possessionTeam', 'defensiveTeam', 'pff_manZone']],
      on=['gameId', 'playId'],
      how='left'
  )

  merged_df['defense'] = ((merged_df['club'] == merged_df['defensiveTeam']) & (merged_df['club'] != 'football')).astype(int)

  merged_df['pff_manZone'] = merged_df['pff_manZone'].map(coverage_mapping)
  merged_df.dropna(subset=['pff_manZone'], inplace=True)

  return merged_df


def label_offense_defense_formation(presnap_df, plays_df):

  """
  Adds 'offense' and 'defense' columns to presnap_df, marking players as offense (1) or defense (0)
  based on possession team and defensive team from plays_df. Enumerates offensive formations
  and removes rows with missing formations.

  Parameters:
  presnap_df (pd.DataFrame): DataFrame containing tracking data with 'gameId', 'playId', and 'club'.
  plays_df (pd.DataFrame): DataFrame containing 'gameId', 'playId', 'possessionTeam', 'defensiveTeam', 'offenseFormation'.

  Returns:
  pd.DataFrame: Updated presnap_df with added 'offense', 'defense', and enumerated 'offenseFormation' columns, with NaN formations dropped.
  """

  formation_mapping = {
      'EMPTY': 0,
      'I_FORM': 1,
      'JUMBO': 2,
      'PISTOL': 3,
      'SHOTGUN': 4,
      'SINGLEBACK': 5,
      'WILDCAT': 6
  }

  merged_df = presnap_df.merge(
      plays_df[['gameId', 'playId', 'possessionTeam', 'defensiveTeam', 'offenseFormation']],
      on=['gameId', 'playId'],
      how='left'
  )

  merged_df['defense'] = ((merged_df['club'] == merged_df['defensiveTeam']) & (merged_df['club'] != 'football')).astype(int)

  merged_df['offenseFormation'] = merged_df['offenseFormation'].map(formation_mapping)
  merged_df.dropna(subset=['offenseFormation'], inplace=True)

  return merged_df


import pandas as pd
import numpy as np

def split_data_by_uniqueId(df, train_ratio=0.7, test_ratio=0.15, val_ratio=0.15, unique_id_column="uniqueId"):

  """
  Split the dataframe into training, testing, and validation sets based on a given ratio while
  ensuring all rows with the same uniqueId are in the same set.

  :param df: the aggregate dataframe containing all frames for each play
  :param train_ratio: proportion of the data to allocate to training (default 0.7)
  :param test_ratio: proportion of the data to allocate to testing (default 0.15)
  :param val_ratio: proportion of the data to allocate to validation (default 0.15)
  :param unique_id_column: the name of the column containing the unique identifiers for each play

  :return: three dataframes (train_df, test_df, val_df) for training, testing, and validation
  """

  unique_ids = df[unique_id_column].unique()
  np.random.shuffle(unique_ids)

  num_ids = len(unique_ids)
  train_end = int(train_ratio * num_ids)
  test_end = train_end + int(test_ratio * num_ids)

  train_ids = unique_ids[:train_end]
  test_ids = unique_ids[train_end:test_end]
  val_ids = unique_ids[test_end:]

  train_df = df[df[unique_id_column].isin(train_ids)]
  test_df = df[df[unique_id_column].isin(test_ids)]
  val_df = df[df[unique_id_column].isin(val_ids)]

  print(f"Train Dataframe Frames: {train_df.shape[0]}")
  print(f"Test Dataframe Frames: {test_df.shape[0]}")
  print(f"Val Dataframe Frames: {val_df.shape[0]}")

  return train_df, test_df, val_df


def pass_attempt_merging(tracking, plays):

  plays['passAttempt'] = np.where(plays['passResult'].isin([np.nan, 'S']), 0, 1)

  plays_for_merge = plays[['gameId', 'playId', 'passAttempt']]

  merged_df = tracking.merge(
      plays_for_merge,
      on=['gameId', 'playId'],
      how='left')

  return merged_df


#def prepare_frame_data(df, features, target_column):

 # features_array = df.groupby("frameUniqueId")[features].apply(
  #    lambda x: x.to_numpy(dtype=np.float32)).to_numpy()

#  try:
#      features_tensor = torch.tensor(np.stack(features_array))
#  except ValueError as e:
 #     print("Skipping batch due to inconsistent shapes in features_array:", e)
 #     return None, None  # or return some placeholder values if needed

#  targets_array = df.groupby("frameUniqueId")[target_column].first().to_numpy()
 # targets_tensor = torch.tensor(targets_array, dtype=torch.long)

 # return features_tensor, targets_tensor


def select_augmented_frames(df, num_samples, sigma=5):

    df_frames = df[['frameUniqueId', 'frames_from_snap']].drop_duplicates()
    weights = np.exp(-((df_frames['frames_from_snap'] + 10) ** 2) / (2 * sigma ** 2))

    weights /= weights.sum()

    selected_frames = np.random.choice(
        df_frames['frameUniqueId'], size=num_samples, replace=False, p=weights
    )

    return selected_frames


def data_augmentation(df, augmented_frames):

  df_sample = df.loc[df['frameUniqueId'].isin(augmented_frames)].copy()

  df_sample['y_clean'] = (160 / 3) - df_sample['y_clean']
  df_sample['dir_radians'] = (2 * np.pi) - df_sample['dir_radians']
  df_sample['dir_clean'] = np.degrees(df_sample['dir_radians'])

  df_sample['frameUniqueId'] = df_sample['frameUniqueId'].astype(str) + '_aug'

  return df_sample

In [27]:
def add_los_depth_and_angles(df: pd.DataFrame) -> pd.DataFrame:
    """
    Robustly compute LOS (los_x) per (gameId, playId) using the frame CLOSEST to the snap.
    Priority:
      1) ball row (club == 'football') at |frames_from_snap| minimum
      2) offense rows at that frame: min x_clean (offense moves L->R)
      3) fallback: per-play min x_clean
    Then add:
      - depth_to_los = x_clean - los_x
      - o_to_los_cos = cos(dir_clean in radians)
    """
    # ensure we have frames_from_snap and x_clean/y_clean/dir_clean
    if "frames_from_snap" not in df.columns:
        raise KeyError("frames_from_snap not found. Compute snap_frame and frames_from_snap before LOS.")
    if "x_clean" not in df.columns or "dir_clean" not in df.columns:
        raise KeyError("x_clean/dir_clean not found. Run make_plays_left_to_right and rotate_direction_and_orientation first.")

    # find the frameId closest to snap per play
    closest_idx = (
        df.loc[:, ["gameId","playId","frameId","frames_from_snap"]]
          .assign(abs_fs=lambda x: x["frames_from_snap"].abs())
          .sort_values(["gameId","playId","abs_fs","frameId"])
          .groupby(["gameId","playId"], as_index=False)
          .first()[["gameId","playId","frameId"]]
          .rename(columns={"frameId": "closest_frame"})
    )
    df = df.merge(closest_idx, on=["gameId","playId"], how="left")

    # 1) Try ball at closest frame
    ball = (
        df[(df["club"] == "football") & (df["frameId"] == df["closest_frame"])]
        .loc[:, ["gameId","playId","x_clean"]]
        .drop_duplicates()
        .rename(columns={"x_clean": "los_x"})
    )

    # 2) Offense at closest frame (defense==0 and not football), take min x_clean
    off_at_closest = (
        df[(df["frameId"] == df["closest_frame"]) & (df["club"] != "football")]
    )
    if "defense" in off_at_closest.columns:
        off_at_closest = off_at_closest[off_at_closest["defense"] == 0]

    off_min = (
        off_at_closest.groupby(["gameId","playId"], as_index=False)["x_clean"]
        .min()
        .rename(columns={"x_clean": "los_x_off"})
    )

    # merge LOS candidates
    df = df.merge(ball, on=["gameId","playId"], how="left")
    df = df.merge(off_min, on=["gameId","playId"], how="left")

    # 3) final los_x: prefer ball, else offense min at closest, else per-play min x_clean
    per_play_min = (
        df.groupby(["gameId","playId"], as_index=False)["x_clean"].min()
          .rename(columns={"x_clean": "los_x_fallback"})
    )
    df = df.merge(per_play_min, on=["gameId","playId"], how="left")

    df["los_x"] = df["los_x"].fillna(df["los_x_off"])
    df["los_x"] = df["los_x"].fillna(df["los_x_fallback"])
    df.drop(columns=["los_x_off","los_x_fallback","closest_frame"], inplace=True, errors="ignore")

    # add depth/angle features
    df["depth_to_los"] = df["x_clean"] - df["los_x"]
    df["o_to_los_cos"] = np.cos(np.radians(df["dir_clean"]))
    return df


In [28]:
import os, gc, numpy as np, pandas as pd

# Window around snap (tracking is ~10 Hz)
HZ = 10
T_PRE_S, T_POST_S = 0.8, 0.5
T_PRE_FRAMES  = int(T_PRE_S  * HZ)   # 8 frames pre
T_POST_FRAMES = int(T_POST_S * HZ)   # 5 frames post

def _ensure_distance_column_inplace(df: pd.DataFrame) -> None:
    """If 'dis' is missing, compute distance moved since previous frame per (gameId, playId, nflId)."""
    if "dis" in df.columns:
        return
    df.sort_values(["gameId","playId","nflId","frameId"], inplace=True)
    same_entity = (
        df["gameId"].diff().eq(0) &
        df["playId"].diff().eq(0) &
        df["nflId"].diff().eq(0)
    )
    dx = df["x"].diff()
    dy = df["y"].diff()
    df["dis"] = np.where(same_entity, np.sqrt(dx*dx + dy*dy), 0.0).astype("float64")

In [29]:
CROSS_TMAX_FRAMES = 13   # ~1.3s after snap at 10 Hz
DL_POSITIONS = {'DT','NT','DE','EDGE','DI'}  # adjust to your position codes

def _first_cross_flags(gdf):
    """Return set of nflIds that cross LOS within CROSS_TMAX_FRAMES after snap."""
    snapF = int(gdf['snap_frame'].iloc[0])
    los_x = float(gdf['los_x'].iloc[0])
    def_team = gdf['defensiveTeam'].iloc[0]
    d = gdf[(gdf['club']==def_team) & (gdf['nflId'].notna())].copy()
    window = d[(d['frameId'] >= snapF) & (d['frameId'] <= snapF + CROSS_TMAX_FRAMES)]
    # Offense is moving L->R; a defender "rusher" is someone whose x_clean <= los_x at any time in window
    crossed = (window.groupby('nflId')['x_clean'].apply(lambda x: (x <= los_x).any()))
    return set(crossed[crossed].index.astype(np.int64))

def make_blitz_labels(play_df, players_df=None):
    """
    Returns a play-level labels DataFrame with columns:
      blitz (0/1), num_rushers, sim_blitz (0/1)
    and merges back to play_df.
    """
    pos_map = {}
    if players_df is not None and {'nflId','position'}.issubset(players_df.columns):
        pos_map = players_df[['nflId','position']].drop_duplicates().set_index('nflId')['position'].to_dict()

    recs = []
    for (g,p), gdf in play_df.groupby(['gameId','playId']):
        rushers = _first_cross_flags(gdf)
        num_rush = len(rushers)
        blitz = int(num_rush >= 5)

        # creeper/simulated: 4 rushers AND any non-DL rushed AND any DL dropped ≥3y behind LOS in first 0.8s
        sim_blitz = 0
        if num_rush == 4 and pos_map:
            non_dl_rushed = any(pos_map.get(nid, '') not in DL_POSITIONS for nid in rushers)
            snapF = int(gdf['snap_frame'].iloc[0])
            los_x = float(gdf['los_x'].iloc[0])
            dl = gdf[(gdf['club']==gdf['defensiveTeam'].iloc[0]) &
                     (gdf['position'].isin(DL_POSITIONS)) &
                     (gdf['frameId'] >= snapF) & (gdf['frameId'] <= snapF+8)].copy()
            dl['depth'] = dl['x_clean'] - los_x
            dl_drop = (dl.groupby('nflId')['depth'].max() >= 3.0).any() if not dl.empty else False
            sim_blitz = int(non_dl_rushed and dl_drop)

        recs.append((g,p,blitz,num_rush,sim_blitz))

    lab = pd.DataFrame(recs, columns=['gameId','playId','blitz','num_rushers','sim_blitz'])
    return play_df.merge(lab, on=['gameId','playId'], how='left')


In [30]:
def add_disguise_features(df: pd.DataFrame, pre_window=(-8, 0)) -> pd.DataFrame:
    lo, hi = pre_window
    pre = df[(df['frames_from_snap'] >= lo) & (df['frames_from_snap'] <= hi)].copy()
    if pre.empty:
        for c in ['creep_depth_mean','creep_depth_max','creep_lat_mean','creep_lat_max',
                  'pre_speed_mean','pre_speed_max','pre_face_cos','pre_depth_mean']:
            df[c] = 0.0
        return df

    pre.sort_values(['gameId','playId','nflId','frameId'], inplace=True)
    pre['delta_depth'] = pre.groupby(['gameId','playId','nflId'])['depth_to_los'].diff().fillna(0.0)
    pre['delta_lat']   = pre.groupby(['gameId','playId','nflId'])['y_clean'].diff().fillna(0.0)

    agg = pre.groupby(['gameId','playId','nflId']).agg(
        creep_depth_mean=('delta_depth','mean'),
        creep_depth_max =('delta_depth','max'),
        creep_lat_mean  =('delta_lat','mean'),
        creep_lat_max   =('delta_lat','max'),
        pre_speed_mean  =('s_clean','mean') if 's_clean' in pre.columns else ('s','mean'),
        pre_speed_max   =('s_clean','max') if 's_clean' in pre.columns else ('s','max'),
        pre_face_cos    =('o_to_los_cos','mean'),
        pre_depth_mean  =('depth_to_los','mean'),
    ).reset_index()

    return df.merge(agg, on=['gameId','playId','nflId'], how='left')

In [31]:
import torch

def _threat_score(sub):
    # nearer LOS (smaller |depth|), moving toward LOS (+cos), faster
    return -np.abs(sub['depth_to_los']) + 0.4*sub['s_clean'] + 0.2*sub['o_to_los_cos']

def prepare_frame_data_blitz(df, features, target_column, K=8):
    """
    Builds tensors:
      X: [N_frames, K, F]  (defense-only, top-K by threat, zero-padded)
      y: [N_frames]        (play-level blitz label broadcast to frames)
    """
    # Defense only and not the ball
    df_def = df[(df['defense']==1) & (df['club']!='football')].copy()

    # play-level labels once per frameUniqueId
    y_map = df_def.groupby('frameUniqueId')[target_column].first().astype(int).to_dict()

    X_list, y_list = [], []
    for fuid, sub in df_def.groupby('frameUniqueId'):
        sub = sub.copy()
        # robust: if any feature missing, fill 0
        for c in features:
            if c not in sub.columns:
                sub[c] = 0.0

        # rank by threat within this frame
        sub['threat'] = _threat_score(sub)
        sub.sort_values('threat', ascending=False, inplace=True)
        mat = sub[features].to_numpy(dtype=np.float32)

        # pad/truncate to K
        if mat.shape[0] >= K:
            matK = mat[:K]
        else:
            pad = np.zeros((K - mat.shape[0], mat.shape[1]), dtype=np.float32)
            matK = np.vstack([mat, pad])

        X_list.append(matK)
        y_list.append(y_map.get(fuid, 0))

    if not X_list:
        return None, None

    X = torch.tensor(np.stack(X_list))   # [N, K, F]
    y = torch.tensor(np.array(y_list), dtype=torch.long)
    return X, y

In [32]:
def process_week_data(
    week_number: int,
    plays: pd.DataFrame,
    players: pd.DataFrame | None = None,
    data_root: str = "/content/drive/MyDrive/bdb25-blitz/data/raw",
    out_dir: str  = "/content/drive/MyDrive/bdb25-blitz/artifacts",
    even_frames_only: bool = True,
    label_blitz: bool = True,
    add_disguise: bool = True,
    return_df: bool = False,
):
    """
    Streams week CSV in chunks, keeps all cols, applies cleaning,
    adds LOS/depth/angles, (optional) disguise features, (optional) blitz labels,
    windows to [-0.8s, +0.5s] around snap, writes single Parquet.
    """
    os.makedirs(out_dir, exist_ok=True)
    file_path = os.path.join(data_root, f"tracking_week_{week_number}.csv")
    out_path  = os.path.join(out_dir,   f"week_{week_number:02d}_clean_blitz.parquet")

    # restrict to dropbacks early
    dropbacks = plays.loc[plays["isDropback"] == True, ["gameId","playId","defensiveTeam","possessionTeam"]].drop_duplicates()
    key_set = set(map(tuple, dropbacks[["gameId","playId"]].to_numpy()))
    print(f"[Week {week_number}] scanning for snap frames…")

    # PASS A — collect snap frames
    snap_map = {}
    for chunk in pd.read_csv(file_path, chunksize=1_000_000):
        mask = [tuple(x) in key_set for x in chunk[["gameId","playId"]].to_numpy()]
        if not any(mask): continue
        small = chunk.loc[mask, ["gameId","playId","frameId","event"]]
        snaps = small.loc[small["event"].isin(["ball_snap","ball_snap_penalty"])]
        if not snaps.empty:
            grp = snaps.groupby(["gameId","playId"])["frameId"].min()
            for (g,p), f in grp.items():
                k = (int(g), int(p))
                snap_map[k] = min(f, snap_map.get(k, f))
        del chunk, small, snaps; gc.collect()

    if not snap_map:
        print(f"[Week {week_number}] no dropback snaps found; skipping.")
        return {"week": week_number, "saved": False, "path": None}

    print(f"[Week {week_number}] found {len(snap_map)} snaps. Building cleaned window…")

    # (optional) merge positions for later sim/creeper detection
    players_small = None
    if players is not None and {'nflId','position'}.issubset(players.columns):
        players_small = players[['nflId','position']].drop_duplicates()

    first_write, part_files = True, []

    # PASS B — stream, clean, window, enrich, write
    for chunk in pd.read_csv(file_path, chunksize=600_000):
        mask = [tuple(x) in snap_map for x in chunk[["gameId","playId"]].to_numpy()]
        if not any(mask): continue
        chunk = chunk.loc[mask].copy()

        # ensure dis
        if "x" in chunk.columns and "y" in chunk.columns:
            _ensure_distance_column_inplace(chunk)

        # cleaning
        chunk = rotate_direction_and_orientation(chunk)
        chunk = make_plays_left_to_right(chunk)
        chunk = calculate_velocity_components(chunk)
        chunk = pass_attempt_merging(chunk, plays)

        # add defense flag (like your labelers do)
        chunk = chunk.merge(dropbacks, on=['gameId','playId'], how='left')
        chunk['defense'] = ((chunk['club'] == chunk['defensiveTeam']) & (chunk['club'] != 'football')).astype(int)

        # IDs
        chunk["week"] = week_number
        chunk["uniqueId"] = chunk["gameId"].astype("string") + "_" + chunk["playId"].astype("string")
        chunk["frameUniqueId"] = chunk["uniqueId"] + "_" + chunk["frameId"].astype("string")

        # snap/window
        sf = np.array([snap_map[(g,p)] for g,p in chunk[["gameId","playId"]].to_numpy()], dtype=np.int32)
        chunk["snap_frame"] = sf
        chunk["frames_from_snap"] = chunk["frameId"].astype("int32") - chunk["snap_frame"]
        if even_frames_only:
            chunk = chunk[(chunk["frameId"] % 2) == 0]
        chunk = chunk[(chunk["frames_from_snap"] >= -T_PRE_FRAMES) & (chunk["frames_from_snap"] <= T_POST_FRAMES)]
        if chunk.empty:
            del chunk; gc.collect(); continue

        # LOS/depth/angles
        chunk = add_los_depth_and_angles(chunk)

        # disguise features (pre-snap creep deltas), optional
        if add_disguise:
            chunk = add_disguise_features(chunk, pre_window=(-8, 0))

        # players positions (for sim/creeper). Join once per chunk if provided.
        if players_small is not None and 'position' not in chunk.columns:
            chunk = chunk.merge(players_small, on='nflId', how='left')

        # blitz labels (play-level), optional
        if label_blitz:
            # Make sure we only compute on plays present in this chunk (faster)
            chunk = make_blitz_labels(chunk, players_small)

        # write
        if first_write:
            chunk.to_parquet(out_path, index=False)
            first_write = False
        else:
            tmp = out_path.replace(".parquet", f".part_{np.random.randint(1e9)}.parquet")
            chunk.to_parquet(tmp, index=False)
            part_files.append(tmp)

        del chunk; gc.collect()

    # consolidate parts
    if part_files:
        base = pd.read_parquet(out_path) if os.path.exists(out_path) else None
        dfs = ([base] if base is not None else []) + [pd.read_parquet(p) for p in part_files]
        pd.concat(dfs, ignore_index=True).to_parquet(out_path, index=False)
        for p in part_files:
            try: os.remove(p)
            except: pass

    print(f"[Week {week_number}] saved → {out_path}")
    if return_df:
        return pd.read_parquet(out_path)
    return {"week": week_number, "saved": True, "path": out_path}

In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from google.colab import drive
import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
drive.mount('/content/drive')

# reading static CSV files (currently in GDrive)
games = pd.read_csv("/content/drive/MyDrive/bdb25-blitz/data/raw/games.csv")
player_play = pd.read_csv("/content/drive/MyDrive/bdb25-blitz/data/raw/player_play.csv")
players = pd.read_csv("/content/drive/MyDrive/bdb25-blitz/data/raw/players.csv")
plays = pd.read_csv("/content/drive/MyDrive/bdb25-blitz/data/raw/plays.csv")

all_weeks = []
for week_number in range(1, 10):
    week_df = process_week_data(
        week_number,
        plays,
        return_df=True,          # <- make it return a DataFrame
        even_frames_only=True
    )
    if isinstance(week_df, dict):  # in case a week was skipped
        continue
    all_weeks.append(week_df)

all_tracking = pd.concat(all_weeks, ignore_index=True)
# your filters
all_tracking = all_tracking[
    (all_tracking['club'] != 'football') & (all_tracking['passAttempt'] == 1)
]


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[Week 1] scanning for snap frames…
[Week 1] found 1218 snaps. Building cleaned window…
[Week 1] saved → /content/drive/MyDrive/bdb25-blitz/artifacts/week_01_clean_blitz.parquet
[Week 2] scanning for snap frames…
[Week 2] found 1111 snaps. Building cleaned window…
[Week 2] saved → /content/drive/MyDrive/bdb25-blitz/artifacts/week_02_clean_blitz.parquet
[Week 3] scanning for snap frames…
[Week 3] found 1223 snaps. Building cleaned window…
[Week 3] saved → /content/drive/MyDrive/bdb25-blitz/artifacts/week_03_clean_blitz.parquet
[Week 4] scanning for snap frames…
[Week 4] found 1051 snaps. Building cleaned window…
[Week 4] saved → /content/drive/MyDrive/bdb25-blitz/artifacts/week_04_clean_blitz.parquet
[Week 5] scanning for snap frames…
[Week 5] found 1142 snaps. Building cleaned window…
[Week 5] saved → /content/drive/MyDrive/bdb25-blitz/artifacts/week_05_clean_

In [38]:
# --- takes ~10mins to run

features = [
    "x_clean","y_clean","v_x","v_y",
    "depth_to_los","o_to_los_cos",
    "creep_depth_mean","creep_lat_mean","pre_speed_mean"
]
target_column = "blitz"

cols_common = [
    "frameUniqueId","displayName","frameId","frameType",
    "club","defensiveTeam","defense", target_column,
    # helpers some packers may touch
    "s_clean","s","dir_clean","o_clean","frames_from_snap"
] + features

def _ensure_basics(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    # create 'defense' if absent (defense team & not ball)
    if "defense" not in df.columns:
        if "club" in df.columns and "defensiveTeam" in df.columns:
            df["defense"] = ((df["club"] == df["defensiveTeam"]) & (df["club"] != "football")).astype(int)
        else:
            df["defense"] = 0  # fallback

    # make sure s_clean exists (some helpers use it for speed-derived stats)
    if "s_clean" not in df.columns:
        if "s" in df.columns:
            df["s_clean"] = df["s"]
        else:
            df["s_clean"] = 0.0

    # ensure orientation/dir cleaned if packer glances at them
    if "dir_clean" not in df.columns and "dir" in df.columns:
        df["dir_clean"] = (-(df["dir"] - 90)) % 360
    if "o_clean" not in df.columns and "o" in df.columns:
        df["o_clean"] = (-(df["o"] - 90)) % 360

    # ensure all requested features exist (fill with 0 if missing)
    for c in features:
        if c not in df.columns:
            df[c] = 0.0

    # some packers expect this id
    if "frameUniqueId" not in df.columns:
        df["frameUniqueId"] = (
            df["gameId"].astype(str) + "_" + df["playId"].astype(str) + "_" + df["frameId"].astype(str)
        )

    # keep just what we need (ignore missing safely)
    keep = [c for c in cols_common if c in df.columns]
    return df[keep]

for week_eval in range(1, 10):
    train_df = all_tracking[all_tracking["week"] != week_eval]
    val_df   = all_tracking[all_tracking["week"] == week_eval]

    train_df = _ensure_basics(train_df)
    val_df   = _ensure_basics(val_df)

    # pack tensors (KxF per frame) + targets
    train_features, train_targets = prepare_frame_data_blitz(train_df, features, target_column)
    val_features,   val_targets   = prepare_frame_data_blitz(val_df,   features, target_column)

    if train_features is None or val_features is None:
        raise RuntimeError(
            "prepare_frame_data_blitz returned None (likely inconsistent per-frame shapes). "
            "Ensure the function pads/truncates to a fixed K defenders per frame."
        )

    print(f"Week {week_eval} Tensor: {train_features.shape}")
    print(f"Week {week_eval} Indiv Check: {train_features[63][0]}")

    torch.save(train_features, f"/content/drive/MyDrive/bdb25-blitz/artifacts/features_training_week{week_eval}preds.pt")
    torch.save(train_targets,  f"/content/drive/MyDrive/bdb25-blitz/artifacts/targets_training_week{week_eval}preds.pt")

    torch.save(val_features,   f"/content/drive/MyDrive/bdb25-blitz/artifacts/features_val_week{week_eval}preds.pt")
    torch.save(val_targets,    f"/content/drive/MyDrive/bdb25-blitz/artifacts/targets_val_week{week_eval}preds.pt")


Week 1 Tensor: torch.Size([55738, 8, 9])
Week 1 Indiv Check: tensor([ 6.0450e+01,  2.9160e+01, -5.0208e-01, -3.8055e-01,  6.2000e-01,
        -7.9695e-01, -2.7500e-02, -2.5000e-03,  2.2000e-01])
Week 2 Tensor: torch.Size([56385, 8, 9])
Week 2 Indiv Check: tensor([ 1.0586e+02,  2.2150e+01, -7.8419e-02,  1.2787e-01,  8.0000e-01,
        -5.2280e-01, -7.5000e-03,  4.5000e-02,  2.7750e-01])
Week 3 Tensor: torch.Size([55694, 8, 9])
Week 3 Indiv Check: tensor([ 1.0586e+02,  2.2150e+01, -7.8419e-02,  1.2787e-01,  8.0000e-01,
        -5.2280e-01, -7.5000e-03,  4.5000e-02,  2.7750e-01])
Week 4 Tensor: torch.Size([56775, 8, 9])
Week 4 Indiv Check: tensor([ 1.0586e+02,  2.2150e+01, -7.8419e-02,  1.2787e-01,  8.0000e-01,
        -5.2280e-01, -7.5000e-03,  4.5000e-02,  2.7750e-01])
Week 5 Tensor: torch.Size([56197, 8, 9])
Week 5 Indiv Check: tensor([ 1.0586e+02,  2.2150e+01, -7.8419e-02,  1.2787e-01,  8.0000e-01,
        -5.2280e-01, -7.5000e-03,  4.5000e-02,  2.7750e-01])
Week 6 Tensor: torch.Size

In [39]:
import os, random, pandas as pd, torch
from torch.utils.data import TensorDataset

# ---- config ----
PROJ = "/content/drive/MyDrive/bdb25-blitz"
ART  = f"{PROJ}/artifacts"   # where week_XX_clean_blitz.parquet lives
os.makedirs(ART, exist_ok=True)

# your new feature set + target
features = ["x_clean","y_clean","v_x","v_y","depth_to_los","o_to_los_cos",
            "creep_depth_mean","creep_lat_mean","pre_speed_mean"]
target_column = "blitz"

# K defenders kept per frame inside prepare_frame_data_blitz (must match your helper default)
K_DEF = 8

# minimal columns we’ll require from parquet (don’t drop anything else while writing parquet)
NEEDED = ["frameUniqueId","frameId","frameType","defensiveTeam","defense", target_column] + features

def load_week_df(week: int) -> pd.DataFrame:
    path = f"{ART}/week_{week:02d}_clean_blitz.parquet"
    df = pd.read_parquet(path)
    missing = [c for c in NEEDED if c not in df.columns]
    if missing:
        raise KeyError(f"Week {week:02d} missing columns: {missing}. "
                       "Ensure process_week_data added LOS/disguise/labels first.")
    # keep only rows that have a target (some plays might not have labels if filtered)
    df = df.dropna(subset=[target_column]).copy()
    return df

# NOTE: this calls the new defense-only packer you wrote earlier
# def prepare_frame_data_blitz(df, features, target_column, K=8) -> (torch.Tensor[N,K,F], torch.Tensor[N])

for week_eval in range(1, 10):
    print(f"\n=== Eval week {week_eval:02d} ===")

    # ------- Validation (single week) -------
    val_df = load_week_df(week_eval)
    val_features, val_targets = prepare_frame_data_blitz(val_df, features, target_column, K=K_DEF)
    if val_features is None:
        print(f"[val] week {week_eval:02d}: no frames after packing; skipping.")
        continue

    # quick random sample check
    ridx = random.randrange(len(val_features))
    print(f"[val] shape={val_features.shape}  sample[{ridx}][0]={val_features[ridx][0]}")

    torch.save(val_features, f"{ART}/features_val_week{week_eval:02d}.pt")
    torch.save(val_targets,  f"{ART}/targets_val_week{week_eval:02d}.pt")

    # ------- Training (all other weeks) saved as shards -------
    shard_feats, shard_tgts = [], []
    for wk in range(1, 10):
        if wk == week_eval:
            continue
        tr_df = load_week_df(wk)
        trX, trY = prepare_frame_data_blitz(tr_df, features, target_column, K=K_DEF)
        if trX is None:
            print(f"[train shard] week {wk:02d}: empty after packing; skipping.")
            continue

        fpath = f"{ART}/features_train_week{week_eval:02d}_shard_w{wk:02d}.pt"
        tpath = f"{ART}/targets_train_week{week_eval:02d}_shard_w{wk:02d}.pt"
        torch.save(trX, fpath)
        torch.save(trY, tpath)
        shard_feats.append(fpath); shard_tgts.append(tpath)

        rtr = random.randrange(len(trX))
        print(f"[train shard w{wk:02d}] shape={trX.shape}  sample[{rtr}][0]={trX[rtr][0]}")

    print(f"Saved {len(shard_feats)} train shards for eval week {week_eval:02d} in {ART}")

    # OPTIONAL: if you truly need one big train tensor (can be large), concat on-disk cautiously:
    # catX, catY = [], []
    # for f,t in zip(shard_feats, shard_tgts):
    #     catX.append(torch.load(f, map_location='cpu'))
    #     catY.append(torch.load(t, map_location='cpu'))
    # train_features = torch.cat(catX, dim=0)
    # train_targets  = torch.cat(catY, dim=0)
    # torch.save(train_features, f"{ART}/features_training_week{week_eval:02d}.pt")
    # torch.save(train_targets,  f"{ART}/targets_training_week{week_eval:02d}.pt")
    # print(f"[train concat] {train_features.shape}")



=== Eval week 01 ===
[val] shape=torch.Size([8501, 8, 9])  sample[3381][0]=tensor([ 3.6000e+01,  2.2100e+01, -1.1547e+00, -4.7876e-01,  4.2000e-01,
        -9.2375e-01, -2.5000e-03, -2.5000e-03,  0.0000e+00])
[train shard w02] shape=torch.Size([7759, 8, 9])  sample[6899][0]=tensor([107.6100,  31.8100,   1.5057,  -2.0083,   1.6500,   0.5999,   0.2375,
         -0.3100,   2.5275])
[train shard w03] shape=torch.Size([8541, 8, 9])  sample[2730][0]=tensor([ 6.0400e+01,  3.7890e+01, -1.5985e-02,  1.2020e-02,  5.4000e-01,
        -7.9927e-01,  0.0000e+00,  2.5000e-03,  2.5000e-02])
[train shard w04] shape=torch.Size([7348, 8, 9])  sample[2332][0]=tensor([ 4.4950e+01,  2.4640e+01, -1.0345e+00, -1.0690e-01,  3.1000e-01,
        -9.9470e-01, -1.0000e-02, -2.5000e-03,  7.2500e-02])
[train shard w05] shape=torch.Size([7982, 8, 9])  sample[5018][0]=tensor([ 6.9840e+01,  2.8200e+01, -5.4999e-01,  2.3998e-03,  8.8000e-01,
        -9.9999e-01, -2.0000e-03,  0.0000e+00,  1.2000e-02])
[train shard w06]

In [40]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class BlitzTransformer(nn.Module):
    """
    Input:  x [B, K, F*]  (K defenders, F* can vary; we fix to feature_len internally)
    Output: logits [B]
    """
    def __init__(self,
                 feature_len=9,
                 model_dim=128,
                 num_heads=4,
                 num_layers=2,
                 dim_feedforward=512,
                 dropout=0.1):
        super().__init__()
        self.feature_len = feature_len

        self.bn = nn.BatchNorm1d(feature_len)

        self.embed = nn.Sequential(
            nn.Linear(feature_len, model_dim),
            nn.ReLU(),
            nn.LayerNorm(model_dim),
            nn.Dropout(dropout),
        )

        enc_layer = nn.TransformerEncoderLayer(
            d_model=model_dim,
            nhead=num_heads,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )
        self.enc = nn.TransformerEncoder(enc_layer, num_layers=num_layers)

        self.reduce = nn.Linear(model_dim * 2, model_dim)
        self.head = nn.Sequential(
            nn.Linear(model_dim, model_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(model_dim, model_dim // 4),
            nn.ReLU(),
            nn.LayerNorm(model_dim // 4),
            nn.Linear(model_dim // 4, 1),
        )

    @staticmethod
    def _fixF(x: torch.Tensor, F_target: int) -> torch.Tensor:
        """Pad with zeros or truncate to match F_target on the last dimension."""
        F_cur = x.shape[-1]
        if F_cur == F_target:
            return x
        if F_cur < F_target:
            pad = x.new_zeros(*x.shape[:-1], F_target - F_cur)
            return torch.cat([x, pad], dim=-1)
        return x[..., :F_target]

    def forward(self, x, mask=None):
        x = self._fixF(x, self.feature_len)               # [B,K,F*] → [B,K,F]

        # ⬇️ correct permutation round-trip
        x = x.permute(0, 2, 1)                            # [B,F,K]
        x = self.bn(x)                                    # BN over feature dim (F)
        x = x.permute(0, 2, 1)                            # ✅ back to [B,K,F]

        x = self.embed(x)                                 # [B,K,D]
        h = self.enc(x)                                   # [B,K,D]

        if mask is None:
            mask = (x.abs().sum(dim=-1) > 0).float()      # [B,K]
        m = mask.unsqueeze(-1)                            # [B,K,1]
        safe = m.sum(dim=1).clamp_min(1e-6)
        mean = (h * m).sum(dim=1) / safe                  # [B,D]
        mx   = (h + (1.0 - m) * (-1e9)).amax(dim=1)       # [B,D]
        pooled = torch.cat([mean, mx], dim=-1)            # [B,2D]
        pooled = self.reduce(pooled)                      # [B,D]
        logit  = self.head(pooled).squeeze(-1)            # [B]
        return logit


In [41]:

from torch.utils.data import TensorDataset, DataLoader
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from torch.optim import AdamW
pd.options.mode.chained_assignment = None

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [42]:
from torch.utils.data import TensorDataset, DataLoader

def ensure_feature_dim(x, F_expected):
    """Pad with zeros or truncate to match F_expected."""
    F = x.shape[-1]
    if F == F_expected:
        return x
    if F < F_expected:
        pad = torch.zeros(*x.shape[:-1], F_expected - F, device=x.device, dtype=x.dtype)
        return torch.cat([x, pad], dim=-1)
    # F > F_expected: truncate (not ideal, better to regenerate consistently)
    return x[..., :F_expected]

In [43]:
import os, glob, torch
import torch.nn as nn
from torch.utils.data import IterableDataset, DataLoader, TensorDataset

# ---------- streaming dataset over shards ----------
class ShardStream(IterableDataset):
    """Streams samples from shard files (one sample at a time)."""
    def __init__(self, feat_paths, tgt_paths, device=None):
        assert len(feat_paths) == len(tgt_paths), "Mismatched shard counts"
        self.feat_paths = feat_paths
        self.tgt_paths  = tgt_paths
        self.device     = device
    def __iter__(self):
        for fp, tp in zip(self.feat_paths, self.tgt_paths):
            X = torch.load(fp, map_location="cpu").to(dtype=torch.float32)   # [N,K,F*]
            y = torch.load(tp, map_location="cpu").to(dtype=torch.float32)   # [N]
            for i in range(X.shape[0]):
                xi, yi = X[i], y[i]
                if self.device:
                    xi = xi.to(self.device, non_blocking=True)
                    yi = yi.to(self.device, non_blocking=True)
                yield xi, yi

# ------------ training loop ------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

batch_size = 64
learning_rate = 2e-4
num_epochs = 10
early_stopping_patience = 5

ART = "/content/drive/MyDrive/bdb25-blitz/artifacts"
weeks_train = [1,2,3,4,5,6,7,8,9]

for week_eval in weeks_train:
    print(f"\n######## WEEK {week_eval:02d} ########")

    # collect train shards
    feat_paths = sorted(glob.glob(os.path.join(ART, f"features_train_week{week_eval:02d}_shard_w*.pt")))
    tgt_paths  = sorted(glob.glob(os.path.join(ART, f"targets_train_week{week_eval:02d}_shard_w*.pt")))
    if not feat_paths:
        raise FileNotFoundError(f"No train shards for eval week {week_eval:02d} in {ART}")

    # load val
    vaX = torch.load(os.path.join(ART, f"features_val_week{week_eval:02d}.pt"), map_location="cpu").to(torch.float32)
    vaY = torch.load(os.path.join(ART, f"targets_val_week{week_eval:02d}.pt"),  map_location="cpu").to(torch.float32)

    # decide a single feature_len (BN & first linear need a fixed size).
    # pick the max feature width across val + all shards
    F_candidates = [vaX.shape[-1]]
    for fp in feat_paths:
        with torch.no_grad():
            F_candidates.append(torch.load(fp, map_location="cpu").shape[-1])
    F_model = max(F_candidates)
    print(f"[week {week_eval:02d}] inferred feature_len (F_model) = {F_model}")

    # class imbalance across all shards
    pos = neg = 0
    for tp in tgt_paths:
        y = torch.load(tp, map_location="cpu").to(torch.float32)
        pos += int((y == 1).sum().item())
        neg += int((y == 0).sum().item())
    pos_weight = torch.tensor([max(1.0, neg / max(1, pos))], device=device)
    print(f"[week {week_eval:02d}] pos={pos} neg={neg} pos_weight={pos_weight.item():.2f}")

    # model / opt / loss
    model = BlitzTransformer(
        feature_len=F_model,   # <-- key: BN & first Linear expect this width
        model_dim=128, num_heads=4, num_layers=2,
        dim_feedforward=512, dropout=0.1
    ).to(device)
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-2)

    # loaders
    train_stream = ShardStream(feat_paths, tgt_paths, device=device)
    train_loader = DataLoader(train_stream, batch_size=batch_size, shuffle=False, num_workers=0)
    val_loader   = DataLoader(TensorDataset(vaX.to(device), vaY.to(device)),
                              batch_size=batch_size, shuffle=False, num_workers=0)

    # train/eval
    best_val, no_improve = float("inf"), 0
    for epoch in range(num_epochs):
        # at top of the week loop
        ckpt_best = os.path.join(ART, f"best_model_week{week_eval:02d}.pth")
        ckpt_last = os.path.join(ART, f"last_model_week{week_eval:02d}.pth")
        best_val, no_improve = float("inf"), 0
        saved_any = False
        # train
        model.train(); run = 0.0; n = 0
        first_batch_logged = False
        for xb, yb in train_loader:
            if not first_batch_logged:
                print(f"[train] xb shape {tuple(xb.shape)}  (B,K,F*), model.feature_len={model.feature_len}")
                first_batch_logged = True
            optimizer.zero_grad()
            logits = model(xb)                  # model pads/truncates internally to feature_len
            loss = criterion(logits, yb)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            run += loss.item()*xb.size(0); n += xb.size(0)
        train_loss = run / max(1, n)

        # val
        model.eval(); vrun = 0.0; vn = 0; correct = 0
        first_val_logged = False
        with torch.no_grad():
            for xb, yb in val_loader:
                if not first_val_logged:
                    print(f"[val]   xb shape {tuple(xb.shape)}  (B,K,F*), model.feature_len={model.feature_len}")
                    first_val_logged = True
                logits = model(xb)
                vloss = criterion(logits, yb)
                vrun += vloss.item()*xb.size(0); vn += xb.size(0)
                preds = (torch.sigmoid(logits) >= 0.5).long()
                correct += (preds == yb.long()).sum().item()
        val_loss = vrun / max(1, vn)
        val_acc  = correct / max(1, vn)
        print(f"Epoch {epoch+1:02d}  train {train_loss:.4f}  val {val_loss:.4f}  acc {val_acc:.3f}")

        if val_loss < best_val:
          best_val, no_improve = val_loss, 0
          torch.save(model.state_dict(), ckpt_best)
          saved_any = True
        else:
            no_improve += 1
            if no_improve >= early_stopping_patience:
                print("Early stopping.")
                break

        # after the epoch loop finishes (always save a fallback)
        torch.save(model.state_dict(), ckpt_last)
        print(f"Saved fallback checkpoint: {ckpt_last}")
        if not saved_any:
            # also mirror as "best" to simplify downstream code
            torch.save(model.state_dict(), ckpt_best)
            print(f"No improvement checkpoint found; mirrored last -> {ckpt_best}")



######## WEEK 01 ########
[week 01] inferred feature_len (F_model) = 9
[week 01] pos=182 neg=59264 pos_weight=325.63
[train] xb shape (64, 8, 9)  (B,K,F*), model.feature_len=9
[val]   xb shape (64, 8, 9)  (B,K,F*), model.feature_len=9
Epoch 01  train nan  val nan  acc 0.996
Saved fallback checkpoint: /content/drive/MyDrive/bdb25-blitz/artifacts/last_model_week01.pth
No improvement checkpoint found; mirrored last -> /content/drive/MyDrive/bdb25-blitz/artifacts/best_model_week01.pth
[train] xb shape (64, 8, 9)  (B,K,F*), model.feature_len=9
[val]   xb shape (64, 8, 9)  (B,K,F*), model.feature_len=9
Epoch 02  train nan  val nan  acc 0.996
Saved fallback checkpoint: /content/drive/MyDrive/bdb25-blitz/artifacts/last_model_week01.pth
No improvement checkpoint found; mirrored last -> /content/drive/MyDrive/bdb25-blitz/artifacts/best_model_week01.pth
[train] xb shape (64, 8, 9)  (B,K,F*), model.feature_len=9
[val]   xb shape (64, 8, 9)  (B,K,F*), model.feature_len=9
Epoch 03  train nan  val n

In [44]:
import glob, os

for root in [
    "/content/drive/MyDrive/nfl-big-data-bowl-2025/artifacts",
    "/content/drive/MyDrive/bdb25-blitz/artifacts",
]:
    if os.path.isdir(root):
        files = glob.glob(os.path.join(root, "*.pth"))
        print(root, "->", len(files), "files")
        for f in files[:5]:
            print("  ", os.path.basename(f))


/content/drive/MyDrive/bdb25-blitz/artifacts -> 19 files
   best_model_week9.pth
   last_model_week01.pth
   best_model_week01.pth
   last_model_week02.pth
   best_model_week02.pth


In [45]:
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from torch.optim import AdamW
pd.options.mode.chained_assignment = None
import warnings
import random

from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [46]:
import os
import pandas as pd

def process_week_data_preds(
    week_number: int,
    plays: pd.DataFrame,
    data_root: str = "/content/drive/MyDrive/bdb25-blitz/data/raw",
    artifacts_dir: str = "/content/drive/MyDrive/bdb25-blitz/artifacts",
):
    """
    Returns a DataFrame for the given week that’s ready for inference.
    Prefers the cleaned/windowed parquet produced during preprocessing:
        {artifacts_dir}/week_{week:02d}_clean_blitz.parquet

    If that parquet doesn’t exist and a function `process_week_data(...)`
    is available in the notebook, it will call it to create the parquet.
    Otherwise, raises a helpful error.
    """
    os.makedirs(artifacts_dir, exist_ok=True)
    parquet_path = os.path.join(artifacts_dir, f"week_{week_number:02d}_clean_blitz.parquet")

    # 1) Fast path: load the cleaned parquet if present
    if os.path.exists(parquet_path):
        df = pd.read_parquet(parquet_path)
        # keep everything; you can filter later as needed
        return df

    # 2) Fallback: try to build it via your existing preprocess
    if "process_week_data" in globals():
        print(f"[week {week_number:02d}] cleaned parquet not found; building via process_week_data(...)")
        out = process_week_data(
            week_number=week_number,
            plays=plays,
            data_root=os.path.join(data_root),             # uses tracking_week_{week}.csv inside
            out_dir=artifacts_dir,
            even_frames_only=True,
            label_blitz=True,
            add_disguise=True,
            return_df=True
        )
        return out

    # 3) Otherwise, tell the user what’s missing
    raise FileNotFoundError(
        f"Missing cleaned file and no builder available:\n"
        f" - expected parquet: {parquet_path}\n"
        f" - to build it, define process_week_data(...) earlier or run your preprocessing cell."
    )


In [47]:

# reading static CSV files (currently in GDrive)
games = pd.read_csv("/content/drive/MyDrive/bdb25-blitz/data/raw/games.csv")
player_play = pd.read_csv("/content/drive/MyDrive/bdb25-blitz/data/raw/player_play.csv")
players = pd.read_csv("/content/drive/MyDrive/bdb25-blitz/data/raw/players.csv")
plays = pd.read_csv("/content/drive/MyDrive/bdb25-blitz/data/raw/plays.csv")

all_weeks = []

for week_number in range(1, 10):
  week_data = process_week_data_preds(week_number, plays)
  all_weeks.append(week_data)

all_tracking = pd.concat(all_weeks, ignore_index=True)
all_tracking = all_tracking[(all_tracking['club'] != 'football') & (all_tracking['passAttempt'] == 1)]

In [57]:
# ~20 mins per week (still unbatched frame loop)
# Uses BlitzTransformer (single logit -> sigmoid) and blitz labels

import warnings, os
import numpy as np
import pandas as pd
import polars as pl
import torch
import torch.nn as nn

# -------- config --------
ART = "/content/drive/MyDrive/bdb25-blitz/artifacts"   # where best_model_weekXX.pth + tensors live
OUT = "/content/drive/MyDrive/bdb25-blitz"             # where weekX_preds.csv will be written
os.makedirs(OUT, exist_ok=True)

FEATURES = [
    "x_clean","y_clean","v_x","v_y",
    "depth_to_los","o_to_los_cos",
    "creep_depth_mean","creep_lat_mean","pre_speed_mean"
]
K_DEF = 8  # pack top-K defenders per frame

# -------- model (self-healing on F) --------
class BlitzTransformer(nn.Module):
    def __init__(self, feature_len=9, model_dim=128, num_heads=4, num_layers=2, dim_feedforward=512, dropout=0.1):
        super().__init__()
        self.feature_len = feature_len
        self.bn = nn.BatchNorm1d(feature_len)
        self.embed = nn.Sequential(
            nn.Linear(feature_len, model_dim),
            nn.ReLU(),
            nn.LayerNorm(model_dim),
            nn.Dropout(dropout),
        )
        enc = nn.TransformerEncoderLayer(
            d_model=model_dim, nhead=num_heads, dim_feedforward=dim_feedforward,
            dropout=dropout, batch_first=True
        )
        self.enc = nn.TransformerEncoder(enc, num_layers=num_layers)
        self.reduce = nn.Linear(model_dim*2, model_dim)
        self.head = nn.Sequential(
            nn.Linear(model_dim, model_dim), nn.ReLU(), nn.Dropout(dropout),
            nn.Linear(model_dim, model_dim//4), nn.ReLU(), nn.LayerNorm(model_dim//4),
            nn.Linear(model_dim//4, 1),
        )

    @staticmethod
    def _fixF(x: torch.Tensor, F_target: int) -> torch.Tensor:
        F_cur = x.shape[-1]
        if F_cur == F_target: return x
        if F_cur < F_target:
            pad = x.new_zeros(*x.shape[:-1], F_target - F_cur)
            return torch.cat([x, pad], dim=-1)
        return x[..., :F_target]

    def forward(self, x, mask=None):
        # enforce [B,K,F]
        x = self._fixF(x, self.feature_len)
        x = x.permute(0,2,1)               # [B,F,K]
        x = self.bn(x)
        x = x.permute(0,2,1)               # [B,K,F]
        x = self.embed(x)                  # [B,K,D]
        h = self.enc(x)                    # [B,K,D]
        if mask is None:
            mask = (x.abs().sum(-1) > 0).float()
        m = mask.unsqueeze(-1)
        safe = m.sum(1).clamp_min(1e-6)
        mean = (h*m).sum(1) / safe
        mx   = (h + (1.0-m)*(-1e9)).amax(1)
        pooled = torch.cat([mean, mx], -1)
        pooled = self.reduce(pooled)
        return self.head(pooled).squeeze(-1)   # logits [B]

# -------- pack one frame -> [1, K_DEF, F] --------
def prepare_tensor_blitz(frame_pd, features, K=8, device="cpu"):
    # defenders only (exclude ball)
    df = frame_pd[(frame_pd.get("defense", 0) == 1) & (frame_pd["club"] != "football")].copy()

    # ensure all feature cols exist
    for c in features:
        if c not in df.columns:
            df[c] = 0.0

    # ⬅️ fill any NaNs to 0 to avoid NaN logits
    df[features] = df[features].fillna(0.0)

    # rank defenders (closest to LOS first; missing → large)
    d2l = df["depth_to_los"].fillna(9999)
    df = df.assign(_rank_key=d2l).sort_values("_rank_key", ascending=True)

    X = df[features].to_numpy(dtype=np.float32)

    # pad/truncate to K
    if X.shape[0] < K:
        X = np.vstack([X, np.zeros((K - X.shape[0], X.shape[1]), dtype=np.float32)])
    else:
        X = X[:K]

    return torch.from_numpy(X).unsqueeze(0).to(device)


# ------------ inference loop (frame-by-frame) ------------
import warnings
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for week_eval in range(1, 10):
    tracking_df = all_tracking[all_tracking['week'] == week_eval]  # <-- if you switched to parquet, read that instead
    tracking_df_polars = pl.DataFrame(tracking_df)

    list_ids = list(set(tracking_df['frameUniqueId']))

    best_model_path = f"{ART}/best_model_week{week_eval:02d}.pth"
    model = BlitzTransformer(feature_len=len(FEATURES),
                             model_dim=128, num_heads=4, num_layers=2,
                             dim_feedforward=512, dropout=0.1).to(device)
    state = torch.load(best_model_path, map_location=device)
    model.load_state_dict(state)
    model.eval()

    results = []

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=DeprecationWarning)
        print(f"Starting loop for week {week_eval}... (frames: {len(list_ids)})")

        for idx, frame_id in enumerate(list_ids, start=1):
            if idx % 20000 == 0:
                print(f"Processed {idx} frame_ids for week {week_eval}... ({idx/len(list_ids):.2%})")

            play_id = "_".join(frame_id.split("_")[:2])
            frame_num = int(frame_id.split("_")[-1])

            # fetch frame rows (all players in that frame)
            frame = tracking_df_polars.filter(pl.col("frameUniqueId") == frame_id).to_pandas()

            # build input
            x = prepare_tensor_blitz(frame, FEATURES, K_DEF, device=device)

            with torch.no_grad():
                logit = model(x)                       # [1]
                prob  = torch.sigmoid(logit).item()    # blitz probability

                pred = 1 if prob >= 0.5 else 0
                actual = frame["blitz"].iloc[0] if "blitz" in frame.columns else np.nan

                results.append({
                    "frameUniqueId": frame_id,
                    "uniqueId": play_id,
                    "frameId": frame_num,
                    "blitz_prob": prob,
                    "pred": pred,
                    "actual": actual
                })

        week_results = pd.DataFrame(results)
        out_csv = f"{OUT}/week{week_eval}_preds.csv"
        week_results.to_csv(out_csv, index=False)
        print(f"Finished week {week_eval}... saved {out_csv}\n")


Starting loop for week 1... (frames: 7965)
Finished week 1... saved /content/drive/MyDrive/bdb25-blitz/week1_preds.csv

Starting loop for week 2... (frames: 7318)
Finished week 2... saved /content/drive/MyDrive/bdb25-blitz/week2_preds.csv

Starting loop for week 3... (frames: 8009)
Finished week 3... saved /content/drive/MyDrive/bdb25-blitz/week3_preds.csv

Starting loop for week 4... (frames: 6928)
Finished week 4... saved /content/drive/MyDrive/bdb25-blitz/week4_preds.csv

Starting loop for week 5... (frames: 7506)
Finished week 5... saved /content/drive/MyDrive/bdb25-blitz/week5_preds.csv

Starting loop for week 6... (frames: 6690)
Finished week 6... saved /content/drive/MyDrive/bdb25-blitz/week6_preds.csv

Starting loop for week 7... (frames: 6545)
Finished week 7... saved /content/drive/MyDrive/bdb25-blitz/week7_preds.csv

Starting loop for week 8... (frames: 6774)
Finished week 8... saved /content/drive/MyDrive/bdb25-blitz/week8_preds.csv

Starting loop for week 9... (frames: 596

In [58]:
for week_eval in range(1, 10):
    week_df = all_tracking[all_tracking['week'] == week_eval]   # or read week parquet if that’s your source
    preds_week = pd.read_csv(f"/content/drive/MyDrive/bdb25-blitz/week{week_eval}_preds.csv")

    preds_week = preds_week[["frameUniqueId","blitz_prob","pred","actual"]]
    tracking_preds = week_df.merge(preds_week, on="frameUniqueId", how="left")

    out_csv = f"/content/drive/MyDrive/bdb25-blitz/tracking_week_{week_eval}_preds.csv"
    tracking_preds.to_csv(out_csv, index=False)
    print(f"Saved {out_csv}")


Saved /content/drive/MyDrive/bdb25-blitz/tracking_week_1_preds.csv
Saved /content/drive/MyDrive/bdb25-blitz/tracking_week_2_preds.csv
Saved /content/drive/MyDrive/bdb25-blitz/tracking_week_3_preds.csv
Saved /content/drive/MyDrive/bdb25-blitz/tracking_week_4_preds.csv
Saved /content/drive/MyDrive/bdb25-blitz/tracking_week_5_preds.csv
Saved /content/drive/MyDrive/bdb25-blitz/tracking_week_6_preds.csv
Saved /content/drive/MyDrive/bdb25-blitz/tracking_week_7_preds.csv
Saved /content/drive/MyDrive/bdb25-blitz/tracking_week_8_preds.csv
Saved /content/drive/MyDrive/bdb25-blitz/tracking_week_9_preds.csv


In [59]:
import numpy as np
import pandas as pd
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support,
    roc_auc_score, average_precision_score, confusion_matrix
)

def _safe_binary_arrays(df, prob_col="blitz_prob", pred_col="pred", label_col="actual"):
    """Return clean numpy arrays y_true, y_prob, y_pred with NaNs removed & types fixed."""
    # keep rows with proper labels and probabilities
    m = df[label_col].isin([0,1]) & df[prob_col].notna()
    d = df.loc[m, [label_col, prob_col, pred_col]].copy()

    # coerce types
    d[label_col] = d[label_col].astype(int)
    d[prob_col]  = d[prob_col].astype(float).clip(0,1)
    # if pred isn't present or has NaNs, derive it from prob>=0.5
    if pred_col not in d or d[pred_col].isna().any():
        d[pred_col] = (d[prob_col] >= 0.5).astype(int)
    else:
        d[pred_col] = d[pred_col].fillna(0).astype(int)

    return d[label_col].to_numpy(), d[prob_col].to_numpy(), d[pred_col].to_numpy()

def _safe_auc(y_true, y_prob):
    """Guard AUROC/AP for single-class slices."""
    try:
        auroc = roc_auc_score(y_true, y_prob)
    except ValueError:
        auroc = np.nan
    try:
        ap = average_precision_score(y_true, y_prob)
    except ValueError:
        ap = np.nan
    return auroc, ap

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, average_precision_score

def eval_slice(df, name="overall"):
    if df.empty:
        return pd.Series({"slice": name, "n": 0, "pos": 0, "acc": np.nan, "prec": 0., "rec": 0., "f1": 0., "auroc": np.nan, "pr_auc": np.nan})

    y_true = df["actual"].values
    y_prob = df["blitz_prob"].values
    y_pred = df["pred"].values

    acc = accuracy_score(y_true, y_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary", zero_division=0)
    try:    auroc = roc_auc_score(y_true, y_prob)
    except: auroc = np.nan
    try:    ap = average_precision_score(y_true, y_prob)
    except: ap = np.nan

    return pd.Series({"slice": name, "n": len(y_true), "pos": int((y_true==1).sum()),
                      "acc": acc, "prec": prec, "rec": rec, "f1": f1, "auroc": auroc, "pr_auc": ap})




In [60]:
all_preds = []
for wk in range(1,10):
    df = pd.read_csv(f"/content/drive/MyDrive/bdb25-blitz/tracking_week_{wk}_preds.csv")
    df["week"] = wk
    all_preds.append(df)
df_all = pd.concat(all_preds, ignore_index=True)



overall = eval_slice(df_all, "frames: overall")
by_week = df_all.groupby("week").apply(lambda g: eval_slice(g, f"frames: week{int(g.name):02d}")).reset_index(drop=True)

display(overall.to_frame().T)
display(by_week)


  by_week = df_all.groupby("week").apply(lambda g: eval_slice(g, f"frames: week{int(g.name):02d}")).reset_index(drop=True)


Unnamed: 0,slice,n,pos,acc,prec,rec,f1,auroc,pr_auc
0,frames: overall,1401466,4466,0.996813,0.0,0.0,0.0,,


Unnamed: 0,slice,n,pos,acc,prec,rec,f1,auroc,pr_auc
0,frames: week01,175230,616,0.996485,0.0,0.0,0.0,,
1,frames: week02,160996,616,0.996174,0.0,0.0,0.0,,
2,frames: week03,176198,924,0.994756,0.0,0.0,0.0,,
3,frames: week04,152416,154,0.99899,0.0,0.0,0.0,,
4,frames: week05,165132,308,0.998135,0.0,0.0,0.0,,
5,frames: week06,147180,770,0.994768,0.0,0.0,0.0,,
6,frames: week07,143990,308,0.997861,0.0,0.0,0.0,,
7,frames: week08,149028,616,0.995867,0.0,0.0,0.0,,
8,frames: week09,131296,154,0.998827,0.0,0.0,0.0,,


In [None]:
df_win = df_all.copy()
if "frames_from_snap" in df_win.columns:
    df_win = df_win[df_win["frames_from_snap"].between(-8, 5)]

play_agg = (df_win.groupby("uniqueId")
            .agg({"blitz_prob":"max", "actual":"max", "defensiveTeam":"first", "week":"first"})
            .reset_index())
play_agg["pred"] = (play_agg["blitz_prob"] >= 0.5).astype(int)

overall_play = eval_slice(play_agg, "plays: overall")
by_week_play = play_agg.groupby("week").apply(lambda g: eval_slice(g, f"plays: week{int(g.name):02d}")).reset_index(drop=True)

display(overall_play.to_frame().T)
display(by_week_play)


  by_week_play = play_agg.groupby("week").apply(lambda g: eval_slice(g, f"plays: week{int(g.name):02d}")).reset_index(drop=True)


Unnamed: 0,slice,n,pos,acc,prec,rec,f1,auroc,pr_auc
0,plays: overall,9120,29,0.99682,0.0,0.0,0.0,,


Unnamed: 0,slice,n,pos,acc,prec,rec,f1,auroc,pr_auc
0,plays: week01,1141,4,0.996494,0.0,0.0,0.0,,
1,plays: week02,1048,4,0.996183,0.0,0.0,0.0,,
2,plays: week03,1147,6,0.994769,0.0,0.0,0.0,,
3,plays: week04,991,1,0.998991,0.0,0.0,0.0,,
4,plays: week05,1074,2,0.998138,0.0,0.0,0.0,,
5,plays: week06,959,5,0.994786,0.0,0.0,0.0,,
6,plays: week07,937,2,0.997866,0.0,0.0,0.0,,
7,plays: week08,969,4,0.995872,0.0,0.0,0.0,,
8,plays: week09,854,1,0.998829,0.0,0.0,0.0,,


In [62]:
hi_conf = play_agg[(play_agg["actual"]==1) & (play_agg["blitz_prob"]>=0.9)] \
            .sort_values("blitz_prob", ascending=False).head(25)
missed  = play_agg[(play_agg["actual"]==1) & (play_agg["blitz_prob"]<=0.2)] \
            .sort_values("blitz_prob", ascending=True).head(25)
false_a = play_agg[(play_agg["actual"]==0) & (play_agg["blitz_prob"]>=0.9)] \
            .sort_values("blitz_prob", ascending=False).head(25)

display(hi_conf[["uniqueId","defensiveTeam","down","yardsToGo","blitz_prob"]])
display(missed[["uniqueId","defensiveTeam","down","yardsToGo","blitz_prob"]])
display(false_a[["uniqueId","defensiveTeam","down","yardsToGo","blitz_prob"]])


NameError: name 'play_agg' is not defined

In [56]:
df_all

Unnamed: 0,gameId,playId,nflId,displayName,frameId,frameType,time,jerseyNumber,club,playDirection,...,pre_speed_mean,pre_speed_max,pre_face_cos,pre_depth_mean,blitz,num_rushers,sim_blitz,blitz_prob,pred,actual
0,2022091200,85,35459.0,Kareem Jackson,110,BEFORE_SNAP,2022-09-13 00:16:51.4,22.0,DEN,right,...,0.775,0.87,0.956425,14.699999,0,1,0,,0,0
1,2022091200,85,35459.0,Kareem Jackson,112,BEFORE_SNAP,2022-09-13 00:16:51.6,22.0,DEN,right,...,0.775,0.87,0.956425,14.699999,0,1,0,,0,0
2,2022091200,85,35459.0,Kareem Jackson,114,BEFORE_SNAP,2022-09-13 00:16:51.8,22.0,DEN,right,...,0.775,0.87,0.956425,14.699999,0,1,0,,0,0
3,2022091200,85,35459.0,Kareem Jackson,116,BEFORE_SNAP,2022-09-13 00:16:52,22.0,DEN,right,...,0.775,0.87,0.956425,14.699999,0,1,0,,0,0
4,2022091200,85,35459.0,Kareem Jackson,118,AFTER_SNAP,2022-09-13 00:16:52.2,22.0,DEN,right,...,0.775,0.87,0.956425,14.699999,0,1,0,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1401461,2022110300,3435,55045.0,Kurt Hinish,62,BEFORE_SNAP,2022-11-04 02:59:02,93.0,HOU,left,...,0.022,0.11,-0.804744,0.460003,0,0,0,,0,0
1401462,2022110300,3435,55045.0,Kurt Hinish,64,BEFORE_SNAP,2022-11-04 02:59:02.2,93.0,HOU,left,...,0.022,0.11,-0.804744,0.460003,0,0,0,,0,0
1401463,2022110300,3435,55045.0,Kurt Hinish,66,SNAP,2022-11-04 02:59:02.4,93.0,HOU,left,...,0.022,0.11,-0.804744,0.460003,0,0,0,,0,0
1401464,2022110300,3435,55045.0,Kurt Hinish,68,AFTER_SNAP,2022-11-04 02:59:02.6,93.0,HOU,left,...,0.022,0.11,-0.804744,0.460003,0,0,0,,0,0
