In [None]:
def rotate_direction_and_orientation(df):

  """
  Re-express the raw orientation ("o") and direction ("dir") angles so that 0° points from left to right
  on the field and increasing angle goes counterclockwise.
  This should be done BEFORE the call to make_plays_left_to_right, because that function reads the
  "o_clean" / "dir_clean" columns written here and mirrors them for plays heading left.

  :param df: the aggregate dataframe (per the original notes, built by aggregate_data()); modified in place

  :return df: the same dataframe with "o_clean" and "dir_clean" added, each equal to (90 - angle) mod 360
  """

  for raw_col, clean_col in (("o", "o_clean"), ("dir", "dir_clean")):
    # subtracting from 90 reverses the sense of rotation and moves the zero direction in one step
    df[clean_col] = (90 - df[raw_col]) % 360

  return df


def make_plays_left_to_right(df):

  """
  Mirror the tracking data of plays whose playDirection is "left" so that every play runs left to right.
  Cleaned values live in columns with the suffix "_clean"; y, s, a and dis are copied unchanged so that
  every variable has a "_clean" counterpart.

  The "o_clean" and "dir_clean" columns must already exist (they are read and overwritten here).

  :param df: the aggregate dataframe (per the original notes, built by aggregate_data()); modified in place

  :return df: the same dataframe with x_clean, y_clean, s_clean, a_clean, dis_clean added and
              o_clean / dir_clean mirrored for left-moving plays
  """

  heading_left = df["playDirection"] == "left"

  # 120 because the endzones (10 yds each) are included in the ["x"] values
  df["x_clean"] = np.where(heading_left, 120 - df["x"], df["x"])

  for untouched in ("y", "s", "a", "dis"):
    df[untouched + "_clean"] = df[untouched]

  for angle_col in ("o_clean", "dir_clean"):
    # mirroring x turns an angle θ into 180 - θ
    mirrored = np.where(heading_left, 180 - df[angle_col], df[angle_col])
    df[angle_col] = (mirrored + 360) % 360  # remove negative angles

  return df


import numpy as np

def calculate_velocity_components(df):
    """
    Split each row's speed into x and y velocity components using its cleaned direction of motion.

    :param df: dataframe holding the "dir_clean" (degrees) and "s_clean" columns; modified in place

    :return df: the same dataframe with 'dir_radians', 'v_x' (= s_clean * cos) and 'v_y' (= s_clean * sin) added
    """

    heading = np.radians(df["dir_clean"])
    speed = df["s_clean"]

    df["dir_radians"] = heading
    df["v_x"] = speed * np.cos(heading)
    df["v_y"] = speed * np.sin(heading)

    return df


def label_offense_defense_coverage(presnap_df, plays_df):

  """
  Merge play-level team and pass-coverage info onto tracking rows, flag defenders, and encode the coverage.

  Coverage variants are first collapsed onto their base scheme (e.g. 'Cover-3 Seam' -> 'Cover-3'),
  plays with a missing or excluded coverage are discarded, and the remaining labels are mapped to
  integer codes. Tracking rows whose play ends up without a code are dropped.

  The caller's plays_df is left untouched: all label clean-up happens on a copy.

  :param presnap_df: tracking dataframe with at least 'gameId', 'playId' and 'club'
  :param plays_df: plays dataframe with 'gameId', 'playId', 'possessionTeam', 'defensiveTeam', 'pff_passCoverage'

  :return: merged dataframe with 'possessionTeam', 'defensiveTeam', a 0/1 'defense' column
           (1 when club == defensiveTeam; 'football' rows are 0) and 'pff_passCoverage' holding the numeric code
  """

  coverage_replacements = {
    'Cover-3 Cloud Right': 'Cover-3',
    'Cover-3 Cloud Left': 'Cover-3',
    'Cover-3 Seam': 'Cover-3',
    'Cover-3 Double Cloud': 'Cover-3',
    'Cover-6 Right': 'Cover-6',
    'Cover 6-Left': 'Cover-6',
    'Cover-1 Double': 'Cover-1'}

  values_to_drop = ["Miscellaneous", "Bracket", "Prevent", "Red Zone", "Goal Line"]

  coverage_mapping = {
      'Cover-0': 0,
      'Cover-1': 1,
      'Cover-2': 2,
      'Cover-3': 3,
      'Quarters': 4,
      '2-Man': 5,
      'Cover-6': 6
  }

  # Copy only the columns needed for the merge so the caller's plays_df is never modified.
  play_labels = plays_df[['gameId', 'playId', 'possessionTeam', 'defensiveTeam', 'pff_passCoverage']].copy()
  play_labels['pff_passCoverage'] = play_labels['pff_passCoverage'].replace(coverage_replacements)

  play_labels = play_labels.dropna(subset=['pff_passCoverage'])
  play_labels = play_labels[~play_labels['pff_passCoverage'].isin(values_to_drop)]

  merged_df = presnap_df.merge(
      play_labels,
      on=['gameId', 'playId'],
      how='left'
  )

  merged_df['defense'] = ((merged_df['club'] == merged_df['defensiveTeam']) & (merged_df['club'] != 'football')).astype(int)

  # Labels outside coverage_mapping (and plays filtered out above) become NaN here and are dropped.
  merged_df['pff_passCoverage'] = merged_df['pff_passCoverage'].map(coverage_mapping)
  merged_df.dropna(subset=['pff_passCoverage'], inplace=True)

  return merged_df


def label_offense_defense_manzone(presnap_df, plays_df):

  """
  Merge play-level team and man/zone info onto tracking rows, flag defenders, and encode the label.

  :param presnap_df: tracking dataframe with at least 'gameId', 'playId' and 'club'
  :param plays_df: plays dataframe with 'gameId', 'playId', 'possessionTeam', 'defensiveTeam', 'pff_manZone'

  :return: merged dataframe with a 0/1 'defense' column (1 when club == defensiveTeam; 'football' rows are 0)
           and 'pff_manZone' encoded as Zone -> 0, Man -> 1; rows without an encodable label are dropped
  """

  join_keys = ['gameId', 'playId']
  man_zone_codes = {'Zone': 0, 'Man': 1}

  labelled_plays = plays_df.dropna(subset=['pff_manZone'])
  labelled_plays = labelled_plays[join_keys + ['possessionTeam', 'defensiveTeam', 'pff_manZone']]

  merged_df = presnap_df.merge(labelled_plays, how='left', on=join_keys)

  on_defending_team = merged_df['club'].eq(merged_df['defensiveTeam'])
  is_player = merged_df['club'].ne('football')
  merged_df['defense'] = (on_defending_team & is_player).astype(int)

  # anything other than 'Zone' / 'Man' maps to NaN and is removed together with unmatched plays
  merged_df['pff_manZone'] = merged_df['pff_manZone'].map(man_zone_codes)

  return merged_df.dropna(subset=['pff_manZone'])


def label_offense_defense_formation(presnap_df, plays_df):

  """
  Merge play-level team and formation info onto tracking rows, flag defenders, and encode the formation.

  Adds a 'defense' column (1 when the row's club equals the play's defensiveTeam, otherwise 0; 'football'
  rows are always 0), replaces 'offenseFormation' by its integer code, and drops rows whose formation is
  missing or not one of the known formations. No separate 'offense' column is created.

  Parameters:
  presnap_df (pd.DataFrame): DataFrame containing tracking data with 'gameId', 'playId', and 'club'.
  plays_df (pd.DataFrame): DataFrame containing 'gameId', 'playId', 'possessionTeam', 'defensiveTeam', 'offenseFormation'.

  Returns:
  pd.DataFrame: presnap_df left-merged with the play columns, plus 'defense' and the enumerated 'offenseFormation'.
  """

  join_keys = ['gameId', 'playId']
  play_columns = join_keys + ['possessionTeam', 'defensiveTeam', 'offenseFormation']

  # codes follow the alphabetical order of the formation names: EMPTY -> 0 ... WILDCAT -> 6
  known_formations = ('EMPTY', 'I_FORM', 'JUMBO', 'PISTOL', 'SHOTGUN', 'SINGLEBACK', 'WILDCAT')
  formation_codes = {formation: code for code, formation in enumerate(known_formations)}

  merged_df = presnap_df.merge(plays_df[play_columns], how='left', on=join_keys)

  on_defending_team = merged_df['club'] == merged_df['defensiveTeam']
  not_the_ball = merged_df['club'] != 'football'
  merged_df['defense'] = (on_defending_team & not_the_ball).astype(int)

  merged_df['offenseFormation'] = merged_df['offenseFormation'].map(formation_codes)

  return merged_df.dropna(subset=['offenseFormation'])


import pandas as pd
import numpy as np

def split_data_by_uniqueId(df, train_ratio=0.7, test_ratio=0.15, val_ratio=0.15, unique_id_column="uniqueId", random_state=None):

  """
  Split the dataframe into training, testing, and validation sets based on a given ratio while
  ensuring all rows with the same uniqueId are in the same set.

  The ratios apply to the number of unique ids, not to the number of rows. Train and test sizes are
  int(ratio * number_of_ids); the validation set always receives every id that is left over, so
  val_ratio only documents the intent. A warning is issued when the three ratios do not sum to 1.

  :param df: the aggregate dataframe containing all frames for each play
  :param train_ratio: proportion of the ids to allocate to training (default 0.7)
  :param test_ratio: proportion of the ids to allocate to testing (default 0.15)
  :param val_ratio: nominal proportion for validation (default 0.15); the actual size is the remainder
  :param unique_id_column: the name of the column containing the unique identifiers for each play
  :param random_state: optional seed (or numpy Generator) for a reproducible shuffle; when None the
                       global numpy random state is used, exactly as before

  :return: three dataframes (train_df, test_df, val_df) for training, testing, and validation
  """

  import warnings

  ratio_total = train_ratio + test_ratio + val_ratio
  if abs(ratio_total - 1.0) > 1e-9:
    warnings.warn(
        f"train/test/val ratios sum to {ratio_total}, not 1; "
        "the validation set receives whatever ids remain after train and test",
        stacklevel=2,
    )

  unique_ids = df[unique_id_column].unique()
  if random_state is None:
    np.random.shuffle(unique_ids)
  else:
    np.random.default_rng(random_state).shuffle(unique_ids)

  num_ids = len(unique_ids)
  train_end = int(train_ratio * num_ids)
  test_end = train_end + int(test_ratio * num_ids)

  train_ids = unique_ids[:train_end]
  test_ids = unique_ids[train_end:test_end]
  val_ids = unique_ids[test_end:]

  id_values = df[unique_id_column]
  train_df = df[id_values.isin(train_ids)]
  test_df = df[id_values.isin(test_ids)]
  val_df = df[id_values.isin(val_ids)]

  print(f"Train Dataframe Frames: {train_df.shape[0]}")
  print(f"Test Dataframe Frames: {test_df.shape[0]}")
  print(f"Val Dataframe Frames: {val_df.shape[0]}")

  return train_df, test_df, val_df


def pass_attempt_merging(tracking, plays):

  """
  Derive a 0/1 'passAttempt' flag per play and left-merge it onto the tracking rows.

  A play is flagged 0 when its 'passResult' is missing or equal to 'S', and 1 otherwise.
  Side effect: the 'passAttempt' column is also written onto the caller's plays dataframe.

  :param tracking: tracking dataframe with 'gameId' and 'playId'
  :param plays: plays dataframe with 'gameId', 'playId' and 'passResult'; gains a 'passAttempt' column

  :return: tracking left-merged with 'passAttempt' (NaN for plays absent from plays)
  """

  merge_keys = ['gameId', 'playId']

  no_attempt = plays['passResult'].isin([np.nan, 'S'])
  plays['passAttempt'] = np.where(no_attempt, 0, 1)

  return tracking.merge(plays[merge_keys + ['passAttempt']], how='left', on=merge_keys)


#def prepare_frame_data(df, features, target_column):

 # features_array = df.groupby("frameUniqueId")[features].apply(
  #    lambda x: x.to_numpy(dtype=np.float32)).to_numpy()

#  try:
#      features_tensor = torch.tensor(np.stack(features_array))
#  except ValueError as e:
 #     print("Skipping batch due to inconsistent shapes in features_array:", e)
 #     return None, None  # or return some placeholder values if needed

#  targets_array = df.groupby("frameUniqueId")[target_column].first().to_numpy()
 # targets_tensor = torch.tensor(targets_array, dtype=torch.long)

 # return features_tensor, targets_tensor


def select_augmented_frames(df, num_samples, sigma=5):
    """
    Randomly pick frame ids for augmentation, favouring frames whose frames_from_snap is near -10.

    Each distinct (frameUniqueId, frames_from_snap) pair is weighted by a Gaussian in
    (frames_from_snap + 10) with standard deviation sigma; ids are then drawn without replacement
    through numpy's global random state.

    :param df: dataframe with 'frameUniqueId' and 'frames_from_snap'
    :param num_samples: number of frame ids to draw
    :param sigma: width of the Gaussian weighting (default 5)

    :return: numpy array holding the selected frameUniqueId values
    """

    frame_table = df[['frameUniqueId', 'frames_from_snap']].drop_duplicates()

    offset = frame_table['frames_from_snap'] + 10
    gaussian = np.exp(-(offset ** 2) / (2 * sigma ** 2))
    probabilities = gaussian / gaussian.sum()

    return np.random.choice(
        frame_table['frameUniqueId'], size=num_samples, replace=False, p=probabilities
    )


def data_augmentation(df, augmented_frames):

  """
  Build mirrored copies of the selected frames by reflecting them across the field's long axis.

  The reflection maps y to (160/3 - y) and every counterclockwise angle θ to -θ. Accordingly
  'dir_radians' / 'dir_clean' are negated (kept inside [0, 2π) / [0, 360)), and, when present,
  'o_clean' is negated the same way and 'v_y' changes sign so the mirrored rows stay self-consistent.
  Quantities that do not depend on y (x, speed, v_x, cos of the direction) are left as they are.

  :param df: dataframe with 'frameUniqueId', 'y_clean', 'dir_radians' (and optionally 'o_clean', 'v_y')
  :param augmented_frames: collection of frameUniqueId values to mirror

  :return: a new dataframe holding only the mirrored rows, with '_aug' appended to their frameUniqueId;
           the input dataframe is not modified
  """

  df_sample = df.loc[df['frameUniqueId'].isin(augmented_frames)].copy()

  # 160/3 yards is the field width used for the reflection
  df_sample['y_clean'] = (160 / 3) - df_sample['y_clean']

  # Taking the modulo keeps a direction of exactly 0 at 0 instead of turning it into a full turn.
  full_turn = 2 * np.pi
  df_sample['dir_radians'] = (full_turn - df_sample['dir_radians']) % full_turn
  df_sample['dir_clean'] = np.degrees(df_sample['dir_radians'])

  # Orientation and the y velocity component flip with the field as well.
  if 'o_clean' in df_sample.columns:
    df_sample['o_clean'] = (360 - df_sample['o_clean']) % 360
  if 'v_y' in df_sample.columns:
    df_sample['v_y'] = -df_sample['v_y']

  df_sample['frameUniqueId'] = df_sample['frameUniqueId'].astype(str) + '_aug'

  return df_sample

In [None]:
def add_los_depth_and_angles(df: pd.DataFrame) -> pd.DataFrame:
    """
    Robustly compute LOS (los_x) per (gameId, playId) using the frame CLOSEST to the snap.
    Priority:
      1) ball row (club == 'football') at |frames_from_snap| minimum
      2) offense rows at that frame: min x_clean (offense moves L->R)
      3) fallback: per-play min x_clean
    Then add:
      - depth_to_los = x_clean - los_x
      - o_to_los_cos = cos(dir_clean in radians)
    """
    # ensure we have frames_from_snap and x_clean/y_clean/dir_clean
    if "frames_from_snap" not in df.columns:
        raise KeyError("frames_from_snap not found. Compute snap_frame and frames_from_snap before LOS.")
    if "x_clean" not in df.columns or "dir_clean" not in df.columns:
        raise KeyError("x_clean/dir_clean not found. Run make_plays_left_to_right and rotate_direction_and_orientation first.")

    # find the frameId closest to snap per play
    closest_idx = (
        df.loc[:, ["gameId","playId","frameId","frames_from_snap"]]
          .assign(abs_fs=lambda x: x["frames_from_snap"].abs())
          .sort_values(["gameId","playId","abs_fs","frameId"])
          .groupby(["gameId","playId"], as_index=False)
          .first()[["gameId","playId","frameId"]]
          .rename(columns={"frameId": "closest_frame"})
    )
    df = df.merge(closest_idx, on=["gameId","playId"], how="left")

    # 1) Try ball at closest frame
    ball = (
        df[(df["club"] == "football") & (df["frameId"] == df["closest_frame"])]
        .loc[:, ["gameId","playId","x_clean"]]
        .drop_duplicates()
        .rename(columns={"x_clean": "los_x"})
    )

    # 2) Offense at closest frame (defense==0 and not football), take min x_clean
    off_at_closest = (
        df[(df["frameId"] == df["closest_frame"]) & (df["club"] != "football")]
    )
    if "defense" in off_at_closest.columns:
        off_at_closest = off_at_closest[off_at_closest["defense"] == 0]

    off_min = (
        off_at_closest.groupby(["gameId","playId"], as_index=False)["x_clean"]
        .min()
        .rename(columns={"x_clean": "los_x_off"})
    )

    # merge LOS candidates
    df = df.merge(ball, on=["gameId","playId"], how="left")
    df = df.merge(off_min, on=["gameId","playId"], how="left")

    # 3) final los_x: prefer ball, else offense min at closest, else per-play min x_clean
    per_play_min = (
        df.groupby(["gameId","playId"], as_index=False)["x_clean"].min()
          .rename(columns={"x_clean": "los_x_fallback"})
    )
    df = df.merge(per_play_min, on=["gameId","playId"], how="left")

    df["los_x"] = df["los_x"].fillna(df["los_x_off"])
    df["los_x"] = df["los_x"].fillna(df["los_x_fallback"])
    df.drop(columns=["los_x_off","los_x_fallback","closest_frame"], inplace=True, errors="ignore")

    # add depth/angle features
    df["depth_to_los"] = df["x_clean"] - df["los_x"]
    df["o_to_los_cos"] = np.cos(np.radians(df["dir_clean"]))
    return df


In [None]:
import os, gc, numpy as np, pandas as pd

# Window around snap (tracking is ~10 Hz)
# The window is specified in seconds and converted to whole frames with int(), which truncates
# any fractional frame.
HZ = 10  # frames per second assumed for the conversion below
T_PRE_S, T_POST_S = 0.8, 0.5  # seconds before / after the snap
T_PRE_FRAMES  = int(T_PRE_S  * HZ)   # 8 frames pre
T_POST_FRAMES = int(T_POST_S * HZ)   # 5 frames post

def _ensure_distance_column_inplace(df: pd.DataFrame) -> None:
    """
    If 'dis' is missing, compute distance moved since previous frame per (gameId, playId, nflId).

    The dataframe is modified in place: it is re-sorted by (gameId, playId, nflId, frameId) and a
    float64 'dis' column is added. The first frame of every entity gets 0.0. Rows whose nflId is
    missing (e.g. the football) are treated as one entity within their play, so they receive real
    distances instead of a constant 0. Nothing happens when 'dis' already exists.

    :param df: tracking dataframe with gameId, playId, nflId, frameId, x and y
    """
    if "dis" in df.columns:
        return

    entity_keys = ["gameId", "playId", "nflId"]
    df.sort_values(entity_keys + ["frameId"], inplace=True)

    keys = df[entity_keys]
    prev_keys = keys.shift()
    # NaN never equals NaN, so "both missing" has to be accepted explicitly as a match.
    same_entity = ((keys == prev_keys) | (keys.isna() & prev_keys.isna())).all(axis=1)

    dx = df["x"].diff()
    dy = df["y"].diff()
    df["dis"] = np.where(same_entity, np.sqrt(dx * dx + dy * dy), 0.0).astype("float64")

In [None]:
# Number of frames after the snap that _first_cross_flags scans for defenders reaching the LOS.
CROSS_TMAX_FRAMES = 13   # ~1.3s after snap at 10 Hz
# Position codes that make_blitz_labels treats as defensive line.
DL_POSITIONS = {'DT','NT','DE','EDGE','DI'}  # adjust to your position codes

def _first_cross_flags(gdf, tmax_frames=None):
    """
    Return set of nflIds that cross LOS within the first frames after the snap.

    A defender counts as having crossed when, at any frame in [snap_frame, snap_frame + tmax_frames],
    their x_clean is <= los_x (offense is moving L->R). snap_frame, los_x and defensiveTeam are read
    from the first row of gdf, so gdf is expected to hold a single play.

    :param gdf: tracking rows of one play with snap_frame, los_x, defensiveTeam, club, nflId, frameId, x_clean
    :param tmax_frames: length of the window in frames; defaults to CROSS_TMAX_FRAMES when None
    :return: set of integer nflIds; empty when no defender qualifies
    """
    if tmax_frames is None:
        tmax_frames = CROSS_TMAX_FRAMES

    snapF = int(gdf['snap_frame'].iloc[0])
    los_x = float(gdf['los_x'].iloc[0])
    def_team = gdf['defensiveTeam'].iloc[0]

    # rows without an nflId cannot be attributed to a player and are ignored
    is_defender = (gdf['club'] == def_team) & gdf['nflId'].notna()
    in_window = gdf['frameId'].between(snapF, snapF + tmax_frames)
    at_or_past_los = gdf['x_clean'] <= los_x

    crossed_ids = gdf.loc[is_defender & in_window & at_or_past_los, 'nflId']
    return set(crossed_ids.astype(np.int64))

def make_blitz_labels(play_df, players_df=None):
    """
    Compute play-level blitz labels and merge them back onto play_df.

    Added columns:
      num_rushers: number of defenders returned by _first_cross_flags for the play
      blitz (0/1): 1 when num_rushers >= 5
      sim_blitz (0/1): creeper/simulated pressure — exactly 4 rushers AND at least one rusher who is
                       not a defensive lineman AND at least one defensive lineman found >= 3 yards
                       beyond los_x (x_clean - los_x >= 3) within 8 frames after the snap

    Positions come from players_df only, so play_df does not need a 'position' column. Without a
    usable players_df (needs 'nflId' and 'position'), sim_blitz is always 0.

    :param play_df: tracking dataframe with gameId, playId, nflId, club, defensiveTeam, frameId,
                    snap_frame, los_x and x_clean
    :param players_df: optional players dataframe with 'nflId' and 'position'
    :return: play_df left-merged with blitz, num_rushers and sim_blitz
    """
    pos_map = {}
    if players_df is not None and {'nflId','position'}.issubset(players_df.columns):
        pos_map = players_df[['nflId','position']].drop_duplicates().set_index('nflId')['position'].to_dict()

    # nflIds listed at a defensive-line position; built once and reused for every play
    dl_ids = {nid for nid, pos in pos_map.items() if pos in DL_POSITIONS}

    recs = []
    for (g,p), gdf in play_df.groupby(['gameId','playId']):
        rushers = _first_cross_flags(gdf)
        num_rush = len(rushers)
        blitz = int(num_rush >= 5)

        # creeper/simulated: 4 rushers AND any non-DL rushed AND any DL dropped ≥3y behind LOS in first 0.8s
        sim_blitz = 0
        if num_rush == 4 and pos_map:
            # a rusher missing from pos_map counts as non-DL
            non_dl_rushed = any(nid not in dl_ids for nid in rushers)
            snapF = int(gdf['snap_frame'].iloc[0])
            los_x = float(gdf['los_x'].iloc[0])
            def_team = gdf['defensiveTeam'].iloc[0]
            dl = gdf[(gdf['club'] == def_team) &
                     (gdf['nflId'].isin(dl_ids)) &
                     (gdf['frameId'].between(snapF, snapF + 8))]
            dl_drop = bool(((dl['x_clean'] - los_x) >= 3.0).any())
            sim_blitz = int(non_dl_rushed and dl_drop)

        recs.append((g,p,blitz,num_rush,sim_blitz))

    lab = pd.DataFrame(recs, columns=['gameId','playId','blitz','num_rushers','sim_blitz'])
    return play_df.merge(lab, on=['gameId','playId'], how='left')


In [None]:
def add_disguise_features(df: pd.DataFrame, pre_window=(-8, 0)) -> pd.DataFrame:
    """
    Summarise each player's pre-snap movement and attach the summaries to every row of that player.

    Only rows with frames_from_snap inside pre_window (both ends inclusive) are used. Per
    (gameId, playId, nflId) the frame-to-frame change of depth_to_los and y_clean is taken (the first
    frame counts as 0) and aggregated into:
      creep_depth_mean / creep_depth_max, creep_lat_mean / creep_lat_max,
      pre_speed_mean / pre_speed_max (from 's_clean', or 's' when 's_clean' is absent),
      pre_face_cos (mean o_to_los_cos) and pre_depth_mean (mean depth_to_los).

    When no row falls inside the window, the eight columns are written as 0.0 onto df itself and df
    is returned; otherwise a new left-merged dataframe is returned.
    """
    first_offset, last_offset = pre_window
    in_window = df['frames_from_snap'].between(first_offset, last_offset)
    pre = df[in_window].copy()

    if pre.empty:
        for c in ['creep_depth_mean','creep_depth_max','creep_lat_mean','creep_lat_max',
                  'pre_speed_mean','pre_speed_max','pre_face_cos','pre_depth_mean']:
            df[c] = 0.0
        return df

    player_keys = ['gameId','playId','nflId']
    pre = pre.sort_values(player_keys + ['frameId'])

    # frame-to-frame steps; the first frame of each player has no predecessor and counts as 0
    per_player = pre.groupby(player_keys)
    depth_step = per_player['depth_to_los'].diff().fillna(0.0)
    lateral_step = per_player['y_clean'].diff().fillna(0.0)
    pre['delta_depth'] = depth_step
    pre['delta_lat'] = lateral_step

    speed_col = 's_clean' if 's_clean' in pre.columns else 's'
    summaries = {
        'creep_depth_mean': ('delta_depth', 'mean'),
        'creep_depth_max': ('delta_depth', 'max'),
        'creep_lat_mean': ('delta_lat', 'mean'),
        'creep_lat_max': ('delta_lat', 'max'),
        'pre_speed_mean': (speed_col, 'mean'),
        'pre_speed_max': (speed_col, 'max'),
        'pre_face_cos': ('o_to_los_cos', 'mean'),
        'pre_depth_mean': ('depth_to_los', 'mean'),
    }
    agg = pre.groupby(player_keys).agg(**summaries).reset_index()

    return df.merge(agg, on=player_keys, how='left')

In [None]:
import torch

def _threat_score(sub):
    """
    Heuristic per-row "threat" used to rank defenders: higher means more likely to matter as a rusher.

    score = -|depth_to_los| + 0.4 * s_clean - 0.2 * o_to_los_cos

    nearer LOS (smaller |depth|), faster, and moving toward LOS all raise the score. o_to_los_cos is
    cos(dir_clean) with 0° pointing in the offense's direction of play, so a defender closing on the
    line from the defensive side moves at ~180° and has a NEGATIVE cosine; the term is therefore
    subtracted so that closing defenders are rewarded rather than penalised.

    :param sub: dataframe (or row subset) with depth_to_los, s_clean and o_to_los_cos
    :return: Series of scores aligned with sub
    """
    proximity = -np.abs(sub['depth_to_los'])
    return proximity + 0.4*sub['s_clean'] - 0.2*sub['o_to_los_cos']

def prepare_frame_data_blitz(df, features, target_column, K=8):
    """
    Builds tensors:
      X: [N_frames, K, F]  (defense-only, top-K by threat, zero-padded)
      y: [N_frames]        (play-level blitz label broadcast to frames)

    Only rows with defense == 1 and club != 'football' are used. Within each frameUniqueId the
    defenders are ordered by _threat_score (highest first); the first K rows of the requested
    features form that frame's matrix, padded with zero rows when fewer than K defenders exist.
    A feature missing from df is filled with 0. The label of a frame is the first non-null
    target_column value of its rows, cast to int.

    :param df: tracking dataframe with defense, club, frameUniqueId, target_column, the columns read
               by _threat_score, and (ideally) every column named in features
    :param features: list of F column names to place in X
    :param target_column: name of the label column
    :param K: number of defender slots per frame (default 8)
    :return: (X, y) as a float32 tensor and a long tensor, or (None, None) when no defensive rows exist
    """
    # Defense only and not the ball
    df_def = df[(df['defense']==1) & (df['club']!='football')].copy()

    # play-level labels once per frameUniqueId
    y_map = df_def.groupby('frameUniqueId')[target_column].first().astype(int).to_dict()

    # The steps below do not depend on the frame, so they run once here rather than per group.
    # robust: if any feature missing, fill 0
    for c in features:
        if c not in df_def.columns:
            df_def[c] = 0.0
    df_def['threat'] = _threat_score(df_def)

    X_list, y_list = [], []
    for fuid, sub in df_def.groupby('frameUniqueId'):
        # rank by threat within this frame
        ranked = sub.sort_values('threat', ascending=False)
        mat = ranked[features].to_numpy(dtype=np.float32)[:K]

        # pad/truncate to K: rows beyond the available defenders stay zero
        matK = np.zeros((K, mat.shape[1]), dtype=np.float32)
        matK[:mat.shape[0]] = mat

        X_list.append(matK)
        y_list.append(y_map.get(fuid, 0))

    if not X_list:
        return None, None

    X = torch.tensor(np.stack(X_list))   # [N, K, F]
    y = torch.tensor(np.array(y_list), dtype=torch.long)
    return X, y