# Route Dominance Training DataFrame Creation

This notebook creates a comprehensive training DataFrame for route dominance prediction models.

## Overview

**Workflow:**
1. Configure which weeks to process
2. Load input, output, and supplementary data
3. Initialize Route Dominance Scorer
4. Process plays to calculate frame-by-frame metrics
5. Create training-ready DataFrame with all features
6. Analyze and visualize results

**Key Features:**
- Continuous frame numbering (varies by play, starts at 1)
- Throw status (pre-throw vs after-throw)
- Nearest defender coordinates (X, Y positions)
- Completion label (1 = Complete, 0 = Incomplete)
- All dominance metrics (separation, speed, acceleration, leverage angle, time advantage, etc.)
- Route-level aggregated scores
- Route break detection (break frame, frames until/since break, is_break_frame flag)


In [1]:
# Import required libraries
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.signal import savgol_filter

try:
    from tqdm import tqdm
except ImportError:
    # Fallback if tqdm not available
    def tqdm(iterable, **kwargs):
        return iterable

# Set plotting style
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (12, 6)

print("✓ Libraries imported successfully")


✓ Libraries imported successfully


In [2]:
# create_dominance_gif.py

"""
Create Animated GIF of Receiver Control Score (like CPP GIFs)

This creates an animated GIF showing the receiver's control score evolving frame-by-frame
through a play, just like the CPP pressure GIFs.

Features used from your dataframe:
- receiver_x, receiver_y: Receiver position
- nearest_defender_x, nearest_defender_y: Defender position  
- sep_nearest: Separation distance
- receiver_speed, receiver_accel: Receiver motion
- leverage_angle: Leverage angle
- dist_to_ball: Distance to ball
- ball_land_x_std, ball_land_y_std: Ball landing position
- field_control: Control score (preferred, from RouteDominanceScorer)
- receiver_pressure: Legacy PDF-based pressure score (optional)
- receiver_dominance: Older dominance score (optional)
- continuous_frame or frame_id: Frame number
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from scipy.stats import multivariate_normal
import imageio
import os
import math
from typing import Optional


def create_dominance_gif_from_dataframe(
    training_df: pd.DataFrame,
    game_id: int,
    play_id: int,
    receiver_nfl_id: Optional[int] = None,
    output_filename: str = None,
    fps: int = 5,
    scorer: Optional[object] = None
) -> str:
    """
    Create animated GIF of receiver dominance for a play
    
    Args:
        training_df: Your training dataframe
        game_id: Game ID
        play_id: Play ID
        receiver_nfl_id: Receiver NFL ID (if None, uses first receiver in play)
        output_filename: Output GIF filename (if None, auto-generates)
        fps: Frames per second for GIF
    
    Returns:
        Path to created GIF file
    
    Features used from dataframe:
    - receiver_x, receiver_y: Receiver position
    - nearest_defender_x, nearest_defender_y: Defender position
    - sep_nearest: Separation
    - receiver_speed: Speed
    - leverage_angle: Leverage
    - ball_land_x_std, ball_land_y_std: Ball landing
    - field_control: Control score (preferred)
    - receiver_pressure / receiver_dominance: Legacy scores (if present)
    - continuous_frame: Frame number
    """
    print("="*80)
    print(f"CREATING DOMINANCE GIF FOR GAME {game_id}, PLAY {play_id}")
    print("="*80)
    
    # Get play data from training dataframe (for receiver metrics)
    play_data = training_df[
        (training_df['game_id'] == game_id) &
        (training_df['play_id'] == play_id)
    ].copy()
    
    if len(play_data) == 0:
        raise ValueError(f"Play {game_id}-{play_id} not found in dataframe")
    
    # Sort by frame
    if 'continuous_frame' in play_data.columns:
        play_data = play_data.sort_values('continuous_frame')
        frame_col = 'continuous_frame'
    elif 'frame_id' in play_data.columns:
        play_data = play_data.sort_values('frame_id')
        frame_col = 'frame_id'
    else:
        play_data = play_data.sort_index()
        frame_col = None
    
    # Get receiver NFL ID
    if receiver_nfl_id is None:
        receiver_nfl_id = play_data['nfl_id'].iloc[0]
    
    # Get ALL players on field from scorer (if available)
    all_players_data = None
    if scorer is not None and hasattr(scorer, 'all_frames_df'):
        all_players_data = scorer.all_frames_df[
            (scorer.all_frames_df['game_id'] == game_id) &
            (scorer.all_frames_df['play_id'] == play_id)
        ].copy()
        print(f"\nUsing ALL players from scorer.all_frames_df")
        if len(all_players_data) > 0:
            players_per_frame = all_players_data.groupby('frame_id').size()
            if len(players_per_frame) > 0:
                print(f"  - Total players per frame: ~{players_per_frame.iloc[0]}")
            else:
                print(f"  - Total players: {len(all_players_data)}")
        else:
            print(f"  - No player data found")
    else:
        print(f"\nNote: Scorer not provided - will use limited player data")
        print(f"  To show all players, pass scorer object to create_gif_for_play()")
    
    print(f"\nFeatures used from dataframe:")
    print(f"  - Receiver: nfl_id={receiver_nfl_id}")
    print(f"  - Frames: {len(play_data)} frames")
    
    # Get ball landing (should be same for all frames) - use standardized coordinates
    first_frame = play_data.iloc[0]
    # ball_land_x_std and ball_land_y_std are already standardized in training_df
    ball_land_x = first_frame.get('ball_land_x_std', first_frame.get('ball_land_x', 0))
    ball_land_y = first_frame.get('ball_land_y_std', first_frame.get('ball_land_y', 0))
    print(f"  - Ball landing: ({ball_land_x:.1f}, {ball_land_y:.1f})")
    
    # Verify coordinates are standardized (check first frame receiver position)
    sample_receiver_x = first_frame.get('receiver_x', first_frame.get('x_std', np.nan))
    sample_receiver_y = first_frame.get('receiver_y', first_frame.get('y_std', np.nan))
    print(f"  - Sample receiver position: x={sample_receiver_x:.1f}, y={sample_receiver_y:.1f} (should use x_std/y_std)")
    
    # Derive line of scrimmage (LOS) from scorer.supp_df if available
    los_y = None
    if scorer is not None and hasattr(scorer, "supp_df"):
        supp = getattr(scorer, "supp_df", None)
        if supp is not None and "yardline_100" in supp.columns:
            ctx = supp[
                (supp["game_id"] == game_id) & (supp["play_id"] == play_id)
            ]
            if not ctx.empty:
                yl = ctx.iloc[0]["yardline_100"]
                if not pd.isna(yl):
                    # Standardized so offense always moves toward +y (up); playable field is 10–110
                    los_y = 10.0 + float(yl)

    # Create output directory
    output_dir = "outputs/dominance_gifs"
    os.makedirs(output_dir, exist_ok=True)
    
    if output_filename is None:
        output_filename = f"dominance_game{game_id}_play{play_id}.gif"
    
    output_path = os.path.join(output_dir, output_filename)
    
    # Create images for each frame
    print(f"\nCreating {len(play_data)} frame visualizations...")
    image_files = []
    
    for idx, (frame_idx, frame_row) in enumerate(play_data.iterrows()):
        frame_num = frame_row.get(frame_col, idx + 1) if frame_col else idx + 1
        
        # Get control/pressure score for this frame (for debugging)
        frame_dom = frame_row.get('field_control',
                                  frame_row.get('receiver_pressure',
                                                frame_row.get('receiver_dominance', None)))
        if frame_dom is not None and not pd.isna(frame_dom):
            print(f"  Frame {idx+1}/{len(play_data)} (frame {frame_num}) - Control Score: {frame_dom:.3f}...", end='\r')
        else:
            print(f"  Frame {idx+1}/{len(play_data)} (frame {frame_num})...", end='\r')
        
        # Get all players for this frame (if available)
        frame_players = None
        if all_players_data is not None:
            if frame_col == 'continuous_frame':
                # Map continuous_frame to frame_id and frame_type
                frame_id = frame_row.get('frame_id', frame_num)
                frame_type = frame_row.get('frame_type', 'input')
                frame_players = all_players_data[
                    (all_players_data['frame_id'] == frame_id) &
                    (all_players_data['frame_type'] == frame_type)
                ]
            elif 'frame_id' in frame_row:
                frame_id = frame_row['frame_id']
                frame_type = frame_row.get('frame_type', 'input')
                frame_players = all_players_data[
                    (all_players_data['frame_id'] == frame_id) &
                    (all_players_data['frame_type'] == frame_type)
                ]
        
        # Create visualization for this frame
        fig = create_single_frame_visualization(
            frame_row,
            receiver_nfl_id,
            ball_land_x,
            ball_land_y,
            frame_num=frame_num,
            total_frames=len(play_data),
            all_players_frame=frame_players,  # Pass all players if available
            los_y=los_y,
        )
        
        # Save frame
        frame_filename = os.path.join(output_dir, f"frame_{frame_num:03d}.png")
        plt.savefig(frame_filename, dpi=100, bbox_inches='tight')
        image_files.append(frame_filename)
        plt.close(fig)
    
    print(f"\n  Created {len(image_files)} frame images")
    
    # Create GIF
    print(f"\nCreating GIF from {len(image_files)} frames...")
    images = []
    for filename in image_files:
        images.append(imageio.imread(filename))
    
    imageio.mimsave(output_path, images, fps=fps)
    print(f"  Saved GIF: {output_path}")
    
    # Clean up individual frame images
    for filename in image_files:
        try:
            os.remove(filename)
        except:
            pass
    
    print(f"\nGIF creation complete!")
    print(f"  Output: {output_path}")
    print(f"  Frames: {len(images)}")
    print(f"  FPS: {fps}")
    
    return output_path


def create_single_frame_visualization(
    frame_row: pd.Series,
    receiver_nfl_id: int,
    ball_land_x: float,
    ball_land_y: float,
    frame_num: int = 1,
    total_frames: int = 1,
    all_players_frame: Optional[pd.DataFrame] = None,
    los_y: Optional[float] = None,
) -> plt.Figure:
    """
    Create single frame visualization (one frame of the GIF)
    
    Uses these features from frame_row:
    - receiver_x, receiver_y
    - nearest_defender_x, nearest_defender_y
    - sep_nearest
    - receiver_speed
    - leverage_angle
    - receiver_pressure (preferred) or receiver_dominance (legacy)
    """
    # Extract features from dataframe (use standardized coordinates)
    # IMPORTANT: In the notebook, receiver_x and receiver_y are set from x_std and y_std
    # So receiver_x = x_std (length, 0-120) and receiver_y = y_std (width, 0-53.3)
    # When plotting: x-axis = width (y_std), y-axis = length (x_std)
    receiver_x = frame_row.get('receiver_x', frame_row.get('x_std', frame_row.get('x', 0)))
    receiver_y = frame_row.get('receiver_y', frame_row.get('y_std', frame_row.get('y', 0)))
    def_x = frame_row.get('nearest_defender_x', frame_row.get('x_std', receiver_x + 3))
    def_y = frame_row.get('nearest_defender_y', frame_row.get('y_std', receiver_y))
    sep_nearest = frame_row.get('sep_nearest', 3.0)
    receiver_speed = frame_row.get('receiver_speed', 0)
    leverage_angle = frame_row.get('leverage_angle', np.nan)
    
    # Get control / pressure score from THIS FRAME (this changes frame-by-frame!)
    # Prefer field_control from RouteDominanceScorer; fall back to legacy receiver_pressure / receiver_dominance
    dominance_score = frame_row.get('field_control',
                                    frame_row.get('receiver_pressure',
                                                  frame_row.get('receiver_dominance', None)))
    if dominance_score is None or pd.isna(dominance_score):
        # Fallback: estimate from separation (closer = lower dominance)
        if sep_nearest < np.inf:
            # Simple heuristic: more separation = higher dominance
            dominance_score = min(0.95, max(0.05, 0.5 + (sep_nearest - 3.0) / 20.0))
        else:
            dominance_score = 0.5
    
    # Create figure
    fig, ax = plt.subplots(1, figsize=(10.66, 24))
    
    # Draw field (EXACT CPP STYLE)
    rect = patches.Rectangle((0, 0), 53.3, 120, linewidth=0.1,
                             edgecolor='r', facecolor='darkgreen', zorder=0)
    ax.add_patch(rect)
    
    # Field lines
    plt.plot([0, 0, 53.3, 53.3, 0, 0, 53.3, 53.3, 0, 0, 53.3, 53.3, 0, 0, 53.3,
              53.3, 0, 0, 53.3, 53.3, 0, 0, 53.3, 53.3, 53.3, 0, 0, 53.3],
             [10, 10, 10, 20, 20, 30, 30, 40, 40, 50, 50, 60, 60, 70, 70, 80,
              80, 90, 90, 100, 100, 110, 110, 120, 0, 0, 120, 120],
             color='white', linewidth=1.5)
    
    # End zones
    home_endzone = patches.Rectangle((0, 0), 53.3, 10,
                                    linewidth=0.1,
                                    edgecolor='r',
                                    facecolor='purple',
                                    alpha=0.2,
                                    zorder=10)
    away_endzone = patches.Rectangle((0, 110), 53.3, 10,
                                    linewidth=0.1,
                                    edgecolor='r',
                                    facecolor='purple',
                                    alpha=0.2,
                                    zorder=10)
    ax.add_patch(home_endzone)
    ax.add_patch(away_endzone)
    
    # Yard markers and hash marks
    for y in range(20, 110, 10):
        # Standard NFL-style numbers: 10,20,30,40,50,40,30,20,10
        numb = y
        if y > 50:
            numb = 120 - y
        yard_number = int(numb - 10)

        # Thicker line at midfield (50)
        lw = 2.5 if yard_number == 50 else 1.5
        ax.plot([0, 53.3], [y, y], color='white', linewidth=lw)

        plt.text(5, y-1.5, str(yard_number),
                 horizontalalignment='center',
                 fontsize=18,
                 color='white', rotation=270, fontweight='bold')
        plt.text(53.3 - 5, y - 0.95, str(yard_number),
                 horizontalalignment='center',
                 fontsize=18,
                 color='white', rotation=90, fontweight='bold')
    
    # Hash lines
    for y in range(11, 110):
        ax.plot([0.7, 0.4], [y, y], color='white', linewidth=0.5)
        ax.plot([53.0, 52.5], [y, y], color='white', linewidth=0.5)
        ax.plot([22.91, 23.57], [y, y], color='white', linewidth=0.5)
        ax.plot([29.73, 30.39], [y, y], color='white', linewidth=0.5)

    # Line of scrimmage (if available)
    if los_y is not None:
        ax.axhline(y=los_y, color='yellow', linewidth=3, linestyle='-', alpha=0.9, zorder=9)
        ax.text(26.65, los_y + 1.0, "LOS",
                horizontalalignment='center',
                fontsize=14,
                color='yellow', fontweight='bold',
                bbox=dict(boxstyle='round', facecolor='black', alpha=0.8),
                zorder=10)
    
    # Create contour plot (dominance regions)
    # Step 1: Receiver Influence PDF (6-yard radius)
    # Center is 2 yards in front of receiver, towards ball landing position
    x, y = np.mgrid[0:53.3:0.5, 0:120:0.5]
    field_grid = np.dstack((x, y))
    
    # Calculate direction vector from receiver to ball landing
    to_ball_x = ball_land_x - receiver_x
    to_ball_y = ball_land_y - receiver_y
    dist_to_ball = np.sqrt(to_ball_x**2 + to_ball_y**2)
    
    # Normalize direction vector and move center 2 yards forward
    if dist_to_ball > 0:
        unit_x = to_ball_x / dist_to_ball
        unit_y = to_ball_y / dist_to_ball
    else:
        # If ball is at receiver position, use default direction
        unit_x = 1.0
        unit_y = 0.0
    
    # Center of influence zone: 2 yards in front of receiver (towards ball)
    center_x = receiver_x + 2.0 * unit_x
    center_y = receiver_y + 2.0 * unit_y
    
    # Receiver PDF: 6-yard radius circular distribution, centered 2 yards in front
    receiver_pdf = multivariate_normal([center_y, center_x], [[6, 0], [0, 6]]).pdf(field_grid)
    
    # Step 2: Defender Pressure PDF - account for up to the 3 nearest defenders within 6 yards
    # Each of the 1st, 2nd, 3rd nearest defenders (if present) gets its own PDF, weighted by separation distance
    defender_pdf = np.zeros_like(receiver_pdf)
    defenders_counted = 0
    
    if all_players_frame is not None and len(all_players_frame) > 0:
        # Get all defenders - check multiple possible column names
        if 'player_side' in all_players_frame.columns:
            defenders = all_players_frame[all_players_frame['player_side'] == 'Defense'].copy()
        elif 'team' in all_players_frame.columns and 'possessionTeam' in all_players_frame.columns:
            # Alternative: defenders are those not on the possession team
            defenders = all_players_frame[all_players_frame['team'] != all_players_frame['possessionTeam']].copy()
        else:
            # Fallback: exclude receiver and other offensive players
            receiver_nfl_id = frame_row.get('nfl_id', frame_row.get('receiver_nfl_id', -1))
            defenders = all_players_frame[
                (all_players_frame['nfl_id'] != receiver_nfl_id) & 
                (all_players_frame.get('player_role', '') != 'Targeted Receiver')
            ].copy()
        
        if len(defenders) > 0:
            # Vectorised: compute separation to receiver for all defenders
            defenders = defenders.copy()
            defenders["def_x"] = defenders.get("x_std", defenders.get("x", np.nan))
            defenders["def_y"] = defenders.get("y_std", defenders.get("y", np.nan))
            defenders = defenders[
                defenders["def_x"].notna() &
                defenders["def_y"].notna()
            ]
            
            if len(defenders) > 0:
                defenders["separation"] = np.sqrt(
                    (defenders["def_x"] - receiver_x) ** 2 +
                    (defenders["def_y"] - receiver_y) ** 2
                )
                
                # Take up to the 3 nearest defenders
                defenders = defenders.sort_values("separation").head(3)
                
                for _, defender in defenders.iterrows():
                    separation = float(defender["separation"])
                    
                    # Only include defenders within 6 yards
                    if separation <= 6.0:
                        def_x = float(defender["def_x"])
                        def_y = float(defender["def_y"])
                        
                        # Weight function: w(s) = 1 / (1 + s/5) where s = separation in yards
                        # Closer defenders have more influence
                        def_weight = 1.0 / (1.0 + separation / 5.0)
                        
                        # Defender PDF: 4-yard radius, weighted by separation
                        # Each defender contributes its own pressure PDF
                        def_pdf = multivariate_normal(
                            [def_y, def_x],
                            [[4, 0], [0, 4]]
                        ).pdf(field_grid) * def_weight
                        
                        # Add to total defender pressure (sum of all defender PDFs)
                        defender_pdf = defender_pdf + def_pdf
                        defenders_counted += 1
    else:
        # Fallback: use nearest defender if all_players_frame not available
        if sep_nearest < np.inf and sep_nearest <= 6.0:
            def_weight = 1.0 / (1.0 + sep_nearest / 5.0)
            defender_pdf = multivariate_normal([def_y, def_x], [[4, 0], [0, 4]]).pdf(field_grid) * def_weight
    
    # Step 3: Dominance Ratio
    # Combine both PDFs
    total_pdf = receiver_pdf + defender_pdf + 1e-10  # epsilon to prevent division by zero
    
    # Dominance ratio: receiver influence / total influence
    # Values range from 0 to 1:
    # 1.0 = pure receiver influence (no defender pressure)
    # 0.0 = pure defender pressure (no receiver influence)
    dominance_pdf = receiver_pdf / total_pdf
    
    # Create mask for 6-yard radius around center (2 yards in front of receiver)
    # Calculate distance from center for each point on the grid
    distance_from_center = np.sqrt((x - center_y)**2 + (y - center_x)**2)
    mask_6_yards = distance_from_center <= 6.0
    
    # Apply mask to dominance PDF (set values outside 6 yards to NaN so they don't show)
    dominance_pdf_masked = np.where(mask_6_yards, dominance_pdf, np.nan)
    
    # Draw contour (Purple colormap like CPP) - only within 6 yards
    ax.contourf(x, y, dominance_pdf_masked, cmap='Purples', alpha=0.7, levels=15, zorder=1)
    
    # Plot ALL players if available (like CPP shows all players)
    if all_players_frame is not None and len(all_players_frame) > 0:
        # Plot all players on field
        for idx, player in all_players_frame.iterrows():
            player_x = player.get('x_std', player.get('x', 0))
            player_y = player.get('y_std', player.get('y', 0))
            player_nfl_id = player.get('nfl_id', 0)
            player_side = player.get('player_side', 'Unknown')
            player_pos = player.get('player_position', player.get('officialPosition', ''))
            jersey_num = player.get('jerseyNumber', player_nfl_id % 100)
            
            is_receiver = player_nfl_id == receiver_nfl_id
            
            if is_receiver:
                # Highlight receiver (green star - like QB in CPP)
                ax.scatter(player_y, player_x, color='limegreen', s=500,
                          marker='*', edgecolors='black', linewidths=3, zorder=10)
                ax.annotate(str(int(jersey_num)) if pd.notna(jersey_num) else 'WR', 
                           (player_y, player_x),
                           xytext=(player_y-0.5, player_x-0.5),
                           color='white', fontweight='bold', fontsize=12)
            elif player_side == 'Defense':
                # Defenders (blue circles - like away team in CPP)
                ax.scatter(player_y, player_x, color='blue', s=300,
                          edgecolors='white', linewidths=2, zorder=9)
                if pd.notna(jersey_num):
                    ax.annotate(str(int(jersey_num)), 
                               (player_y, player_x),
                               xytext=(player_y-0.5, player_x-0.5),
                               color='white', fontsize=10)
            else:
                # Other offensive players (red circles - like home team in CPP)
                ax.scatter(player_y, player_x, color='red', s=300,
                          edgecolors='white', linewidths=2, zorder=9)
                if pd.notna(jersey_num):
                    ax.annotate(str(int(jersey_num)), 
                               (player_y, player_x),
                               xytext=(player_y-0.5, player_x-0.5),
                               color='white', fontsize=10)
    else:
        # Fallback: Just plot receiver and nearest defender
        # Plot receiver (green star - like QB in CPP)
        ax.scatter(receiver_y, receiver_x, color='limegreen', s=500,
                  marker='*', edgecolors='black', linewidths=3, zorder=10)
        ax.annotate('WR', (receiver_y, receiver_x),
                   xytext=(receiver_y-1, receiver_x-1),
                   color='white', fontweight='bold', fontsize=14)
        
        # Plot defender (blue circle)
        ax.scatter(def_y, def_x, color='blue', s=400,
                  edgecolors='white', linewidths=2, zorder=9)
        ax.annotate('CB', (def_y, def_x),
                   xytext=(def_y-1, def_x-1),
                   color='white', fontsize=12)
    
    # Plot ball landing (yellow X)
    ax.scatter(ball_land_y, ball_land_x, color='yellow', s=600,
              marker='X', edgecolors='black', linewidths=3, zorder=8)
    
    # Draw line from receiver to ball
    ax.plot([receiver_y, ball_land_y], [receiver_x, ball_land_x],
           color='yellow', linewidth=2, linestyle='--', alpha=0.5, zorder=2)
    
    # Draw separation circle
    if sep_nearest < np.inf:
        circle = patches.Circle((receiver_y, receiver_x), sep_nearest,
                               fill=False, edgecolor='cyan', linewidth=2,
                               linestyle=':', alpha=0.7, zorder=3)
        ax.add_patch(circle)
    
    # Control score indicator
    dominance_percent = int(dominance_score * 100)
    
    if dominance_score >= 0.8:
        indicator_color = '#00FF00'  # Green
    elif dominance_score >= 0.65:
        indicator_color = '#FFFF00'  # Yellow
    elif dominance_score >= 0.5:
        indicator_color = '#FFA500'  # Orange
    else:
        indicator_color = '#FF0000'  # Red

    # Completion outcome (per play)
    outcome_raw = frame_row.get('is_complete', None)
    if outcome_raw is not None and not pd.isna(outcome_raw):
        outcome_label = "Complete" if int(outcome_raw) == 1 else "Incomplete"
    else:
        outcome_label = str(frame_row.get('pass_result', 'Unknown'))
    
    # Frame info
    frame_text = f"Frame {frame_num}/{total_frames}"
    ax.text(2, 115, frame_text,
           fontsize=16, fontweight='bold', color='white',
           bbox=dict(boxstyle='round', facecolor='black', alpha=0.7))
    
    # Control score (field control / ownership) + completion label
    ax.text(2, 112, f"Control Score: {dominance_percent}%   |   {outcome_label}",
           fontsize=22, fontweight='bold', color='white',
           bbox=dict(boxstyle='round,pad=1', facecolor=indicator_color,
                    alpha=0.8, edgecolor='black', linewidth=2))
    
    # Stats box
    stats_text = f"Sep: {sep_nearest:.1f}yd\nSpeed: {receiver_speed:.1f}yd/s"
    if not pd.isna(leverage_angle):
        stats_text += f"\nLeverage: {leverage_angle:.0f}°"
    
    ax.text(2, 5, stats_text,
           fontsize=12, fontweight='bold', color='white',
           bbox=dict(boxstyle='round', facecolor='black', alpha=0.7),
           verticalalignment='bottom')
    
    # Set axis
    ax.set_xlim(0, 53.3)
    ax.set_ylim(0, 120)
    ax.set_aspect('equal')
    plt.axis('off')
    
    return fig


# Quick usage function
def create_gif_for_play(
    training_df: pd.DataFrame,
    game_id: int,
    play_id: int,
    fps: int = 5,
    scorer: Optional[object] = None
) -> str:
    """
    Quick function to create GIF for a play
    
    Args:
        training_df: Your training dataframe
        game_id: Game ID
        play_id: Play ID
        fps: Frames per second
        scorer: RouteDominanceScorer object (optional - if provided, shows ALL players on field)
    
    Example:
        # With all players (recommended):
        from route_dominance_scoring import RouteDominanceScorer
        scorer = RouteDominanceScorer(input_df, output_df, supp_df)
        gif_path = create_gif_for_play(training_df, 2023090700, 101, scorer=scorer)
        
        # Without scorer (limited players):
        gif_path = create_gif_for_play(training_df, 2023090700, 101)
    """
    return create_dominance_gif_from_dataframe(
        training_df, game_id, play_id, fps=fps, scorer=scorer
    )


if __name__ == "__main__":
    print("""
    To create a GIF from your dataframe:
    
    from create_dominance_gif import create_gif_for_play
    
    # Load your dataframe
    training_df = pd.read_csv('route_dominance_training_data.csv')
    
    # Create GIF for a play
    gif_path = create_gif_for_play(training_df, 2023090700, 101)
    
    # The GIF will be saved in 'outputs/dominance_gifs/' folder
    """)





    To create a GIF from your dataframe:
    
    from create_dominance_gif import create_gif_for_play
    
    # Load your dataframe
    training_df = pd.read_csv('route_dominance_training_data.csv')
    
    # Create GIF for a play
    gif_path = create_gif_for_play(training_df, 2023090700, 101)
    
    # The GIF will be saved in 'outputs/dominance_gifs/' folder
    


In [3]:
#Field Control Visualization.py
"""
Advanced Field Control Visualization
------------------------------------

This module implements a more physics-based "field control" / influence model
around the targeted receiver, inspired by pitch-control style models.

Key ideas:
- Project players 0.5s into the future using their speed and direction
- Give each player an *elliptical* influence region aligned with movement
- Compute ownership at each grid point:
      ownership = I_receiver / (I_receiver + sum(I_defenders))

It is designed to work alongside `RouteDominanceScorer` by using
`scorer.all_frames_df` for full-frame player positions.
"""

from __future__ import annotations

import os
from typing import Optional, Tuple

import imageio
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.patches import FancyArrowPatch, Rectangle
from scipy.stats import multivariate_normal



class FieldControlVisualizer:
    """
    Create advanced "field control" visualizations for a targeted receiver.

    This does NOT replace your existing GIF; it is an additional, more
    tactical view you can call from the notebook, e.g.:

        from field_control_visualizer import FieldControlVisualizer
        viz = FieldControlVisualizer(scorer)
        viz.create_advanced_gif(training_df, game_id=2023090700, play_id=101)
    """

    def __init__(self, scorer: RouteDominanceScorer):
        self.scorer = scorer

        # Red (defender control) -> Yellow (contested) -> Green (receiver control)
        colors = [(0.80, 0.20, 0.20), (0.95, 0.95, 0.30), (0.20, 0.80, 0.20)]
        self.cmap = LinearSegmentedColormap.from_list("FieldControlDominance", colors, N=100)

    @staticmethod
    def _safe_speed_dir(speed: float, direction: float) -> Tuple[float, float]:
        """Guard against NaNs in speed / dir."""
        if np.isnan(speed):
            speed = 0.0
        if np.isnan(direction):
            direction = 0.0
        return float(speed), float(direction)

    def _get_covariance_matrix(self, speed: float, dir_deg: float) -> np.ndarray:
        """
        Builds an oriented covariance matrix for an influence ellipse.

        Assumes the tracking `dir` angle is in degrees where:
        - 0°  = along +x_std
        - 90° = along +y_std
        (same convention used in RouteDominanceScorer for vx / vy).
        """
        speed, dir_deg = self._safe_speed_dir(speed, dir_deg)

        # Orientation angle in radians (measured from +x axis)
        theta = np.deg2rad(dir_deg)

        # Influence radius expands with speed:
        # base length 2 yards, grows up to ~4–5 yards at higher speeds
        long_axis = 2.0 + (speed / 3.0)
        long_axis = float(np.clip(long_axis, 1.5, 5.0))
        short_axis = 1.5  # width stays relatively tight

        # Rotation matrix
        c, s = np.cos(theta), np.sin(theta)
        R = np.array([[c, -s], [s, c]])

        # Scale matrix (ellipse axes^2)
        S = np.array([[long_axis**2, 0.0], [0.0, short_axis**2]])

        # Rotate covariance: R * S * R.T
        cov = R @ S @ R.T
        return cov

    def _draw_field(self, ax, los_x: Optional[float] = None) -> None:
        """
        Draw a *vertical* football field (like the CPP-style dominance GIF).

        Coordinate system for drawing:
            - x-axis: field width  (0 → 53.3 yards, sideline to sideline)
            - y-axis: field length (0 → 120 yards, end zone to end zone)

        Note: los_x is treated as the yardline along the *length* (y-axis).
        """
        ax.set_facecolor("#0d5f20")

        # Main field rectangle (width x length)
        field_rect = Rectangle(
            (0, 0),
            FIELD_WIDTH,
            FIELD_LENGTH,
            linewidth=1.0,
            edgecolor="white",
            facecolor="#0d5f20",
            zorder=0,
        )
        ax.add_patch(field_rect)

        # End zones (bottom and top)
        bottom_endzone = Rectangle(
            (0, 0),
            FIELD_WIDTH,
            10,
            linewidth=0.5,
            edgecolor="white",
            facecolor="#001f3f",
            alpha=0.4,
            zorder=0,
        )
        top_endzone = Rectangle(
            (0, FIELD_LENGTH - 10),
            FIELD_WIDTH,
            10,
            linewidth=0.5,
            edgecolor="white",
            facecolor="#001f3f",
            alpha=0.4,
            zorder=0,
        )
        ax.add_patch(bottom_endzone)
        ax.add_patch(top_endzone)

        # Yard lines every 5 yards, numbers every 10 (along the length/y-axis)
        for yard in range(10, int(FIELD_LENGTH - 10) + 1, 5):
            # Only label every 10 yards
            if yard % 10 == 0:
                # Standard NFL-style numbering: 10,20,30,40,50,40,30,20,10
                numb = yard
                if yard > 50:
                    numb = FIELD_LENGTH - yard  # mirror around midfield
                yard_number = int(numb - 10)

                # Thicker line at midfield (50)
                lw = 2.5 if yard_number == 50 else 1.5
            else:
                lw = 0.5

            ax.axhline(y=yard, color="white", linewidth=lw, alpha=0.3, zorder=0)

            if yard % 10 == 0:
                ax.text(
                    FIELD_WIDTH / 2,
                    yard,
                    str(yard_number),
                    ha="center",
                    va="center",
                    color="white",
                    fontsize=8,
                    fontweight="bold",
                    bbox=dict(boxstyle="round", facecolor="black", alpha=0.5),
                    zorder=1,
                )

        # Optional line of scrimmage (yellow horizontal line across the field)
        if los_x is not None:
            ax.axhline(
                y=los_x,
                color="yellow",
                linewidth=2.5,
                linestyle="-",
                alpha=0.9,
                zorder=1,
            )
            ax.text(
                FIELD_WIDTH / 2,
                los_x,
                "LOS",
                color="yellow",
                fontsize=8,
                fontweight="bold",
                ha="center",
                va="bottom",
                bbox=dict(boxstyle="round", facecolor="black", alpha=0.6),
                zorder=2,
            )

    def _calculate_influence_surface(
        self,
        frame_data,
        target_id: int,
        grid_size: int = 40,
        project_seconds: float = 0.5,
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Tuple[float, float]]:
        """
        Calculate the receiver-ownership surface for a single frame.

        Returns:
            X, Y: meshgrid arrays (field coordinates in x_std / y_std space)
            ownership_surface: 2D array of ownership probabilities [0, 1]
            rec_pos: (rec_x, rec_y) current receiver position (for labels)
        """
        if frame_data.empty:
            raise ValueError("Empty frame_data passed to _calculate_influence_surface")

        # Use the FULL field for the control surface so you see ownership
        # over the whole play, not just a moving square window.
        X_vals = np.linspace(0.0, FIELD_LENGTH, grid_size)
        Y_vals = np.linspace(0.0, FIELD_WIDTH, grid_size)
        X, Y = np.meshgrid(X_vals, Y_vals)
        pos = np.dstack((X, Y))

        # --- Receiver influence (numerator) ---
        rec_rows = frame_data[frame_data["nfl_id"] == target_id]
        if rec_rows.empty:
            raise ValueError(f"Target receiver {target_id} not found in frame_data")
        rec = rec_rows.iloc[0]

        rec_speed, rec_dir = self._safe_speed_dir(rec.get("s", 0.0), rec.get("dir", 0.0))
        rec_x = float(rec["x_std"])
        rec_y = float(rec["y_std"])

        # Project receiver 0.5s into the future using same convention as vx/vy
        dir_rad = np.deg2rad(rec_dir)
        vx = rec_speed * np.cos(dir_rad)
        vy = rec_speed * np.sin(dir_rad)
        rec_mu = np.array([rec_x + vx * project_seconds, rec_y + vy * project_seconds])

        rec_cov = self._get_covariance_matrix(rec_speed, rec_dir)
        rec_pdf = multivariate_normal(rec_mu, rec_cov).pdf(pos)

        # --- Defender influence (denominator sum) ---
        total_def_pdf = np.zeros_like(rec_pdf)

        if "player_side" in frame_data.columns:
            defenders = frame_data[frame_data["player_side"] == "Defense"]
        else:
            defenders = frame_data.iloc[0:0]  # empty fallback

        for _, defense in defenders.iterrows():
            def_speed, def_dir = self._safe_speed_dir(defense.get("s", 0.0), defense.get("dir", 0.0))
            def_x = float(defense["x_std"])
            def_y = float(defense["y_std"])

            dir_rad = np.deg2rad(def_dir)
            dvx = def_speed * np.cos(dir_rad)
            dvy = def_speed * np.sin(dir_rad)
            def_mu = np.array([def_x + dvx * project_seconds, def_y + dvy * project_seconds])

            def_cov = self._get_covariance_matrix(def_speed, def_dir)
            def_pdf = multivariate_normal(def_mu, def_cov).pdf(pos)

            total_def_pdf += def_pdf

        # --- Ownership ratio ---
        ownership_surface = rec_pdf / (rec_pdf + total_def_pdf + 1e-6)

        return X, Y, ownership_surface, (rec_x, rec_y)

    @staticmethod
    def _ownership_at_point(
        X: np.ndarray, Y: np.ndarray, ownership: np.ndarray, x0: float, y0: float
    ) -> float:
        """Get ownership value at the grid point closest to (x0, y0)."""
        # X, Y shape: (ny, nx)
        x_line = X[0, :]
        y_line = Y[:, 0]

        ix = int(np.argmin(np.abs(x_line - x0)))
        iy = int(np.argmin(np.abs(y_line - y0)))

        val = float(ownership[iy, ix])
        return val

    def create_advanced_gif(
        self,
        training_df,
        game_id: int,
        play_id: int,
        output_path: str = "outputs/advanced_field_control.gif",
        grid_size: int = 40,
        fps: int = 5,
        dominance_col: str = "field_control",
    ) -> str:
        """
        Create an "advanced" GIF showing field control for a single play.

        Args:
            training_df: Your training dataframe (used only to infer target receiver
                         and, optionally, show an external dominance score).
            game_id: Game ID
            play_id: Play ID
            output_path: Where to save the GIF
            grid_size: Resolution of the influence grid (higher = smoother, slower)
            fps: Frames per second for the GIF
            dominance_col: Column in training_df with your scalar dominance metric
                           (e.g. 'field_control' from RouteDominanceScorer; will
                           fall back to legacy columns if not present).
        """
        print(f"Generating advanced field-control GIF for Game {game_id}, Play {play_id}...")

        # 1. Determine targeted receiver ID from training_df
        play_rows = training_df[
            (training_df["game_id"] == game_id) & (training_df["play_id"] == play_id)
        ]
        if play_rows.empty:
            raise ValueError(f"Play {game_id}-{play_id} not found in training_df")

        target_id = int(play_rows["nfl_id"].iloc[0])

        # 2. Try to get line of scrimmage from supplementary data (yardline_100)
        los_x: Optional[float] = None
        supp = getattr(self.scorer, "supp_df", None)
        if supp is not None and "yardline_100" in supp.columns:
            ctx = supp[(supp["game_id"] == game_id) & (supp["play_id"] == play_id)]
            if not ctx.empty:
                yardline_100 = ctx.iloc[0]["yardline_100"]
                if not np.isnan(yardline_100):
                    # Standardized so offense always moves right:
                    # playable field is 10–110, so add 10 to yardline_100
                    los_x = 10.0 + float(yardline_100)

        # 3. Get all frames for this play from the scorer (all players)
        play_frames = self.scorer.all_frames_df[
            (self.scorer.all_frames_df["game_id"] == game_id)
            & (self.scorer.all_frames_df["play_id"] == play_id)
        ].copy()

        if play_frames.empty:
            raise ValueError(f"Play {game_id}-{play_id} not found in scorer.all_frames_df")

        # 3b. Use training_df's frames (e.g., ~47 frames) to drive the animation
        #     so this matches the main dominance GIF's timing.
        frame_rows = play_rows.copy()
        if "continuous_frame" in frame_rows.columns:
            frame_rows = frame_rows.sort_values("continuous_frame")
        elif "frame_id" in frame_rows.columns:
            frame_rows = frame_rows.sort_values("frame_id")

        frames_list = [
            (int(r["frame_id"]), str(r.get("frame_type", "input")))
            for _, r in frame_rows.iterrows()
        ]

        total_frames = len(frames_list)
        if total_frames == 0:
            raise ValueError(f"No frames available for Game {game_id}, Play {play_id}")

        # 4. Prepare output directory
        out_dir = os.path.dirname(output_path) or "."
        os.makedirs(out_dir, exist_ok=True)

        temp_files = []

        # 5. Loop through frames and render
        for idx, (frame_id, frame_type) in enumerate(frames_list, start=1):
            frame_data = play_frames[
                (play_frames["frame_id"] == frame_id)
                & (play_frames["frame_type"] == frame_type)
            ]
            if frame_data.empty:
                continue

            # Match the tall portrait style of the CPP-style dominance GIF
            fig, ax = plt.subplots(figsize=(10.66, 24))

            # --- Layer 0: Draw full football field background ---
            self._draw_field(ax, los_x=los_x)

            # --- Layer 1: Control surface ---
            X, Y, ownership, (rec_x, rec_y) = self._calculate_influence_surface(
                frame_data, target_id, grid_size=grid_size
            )

            # NOTE: X, Y are in (length, width) coordinates from _calculate_influence_surface.
            # For a vertical field (width on x-axis, length on y-axis), swap them.
            cf = ax.contourf(
                Y,   # width  (x-axis)
                X,   # length (y-axis)
                ownership,
                levels=20,
                cmap=self.cmap,
                alpha=0.65,
                vmin=0.0,
                vmax=1.0,
            )
            cbar = plt.colorbar(cf, ax=ax, fraction=0.025, pad=0.01)
            cbar.set_label("Receiver control (0 = defense, 1 = receiver)", fontsize=8)
            cbar.ax.tick_params(labelsize=7)

            # --- Layer 2: Players (ALL players on field) ---
            # We draw:
            #   - Targeted receiver: blue circle + arrow
            #   - Other offense: orange circles
            #   - Defense: red X marks
            rec_row = frame_data[frame_data["nfl_id"] == target_id].iloc[0]

            for _, player in frame_data.iterrows():
                # x_std = field length, y_std = field width
                length_coord = float(player["x_std"])
                width_coord = float(player["y_std"])
                px = width_coord   # horizontal
                py = length_coord  # vertical
                pid = int(player["nfl_id"])
                side = player.get("player_side", np.nan)

                is_receiver = pid == target_id
                if is_receiver:
                    ax.scatter(
                        px,
                        py,
                        c="blue",
                        edgecolors="white",
                        s=140,
                        marker="o",
                        label="Receiver",
                        zorder=6,
                    )
                else:
                    if side == "Defense":
                        ax.scatter(
                            px,
                            py,
                            c="red",
                            marker="x",
                            s=80,
                            label=None,
                            zorder=5,
                        )
                    else:
                        # Offense (non-target) – teammates
                        ax.scatter(
                            px,
                            py,
                            c="orange",
                            edgecolors="white",
                            s=90,
                            marker="o",
                            label=None,
                            zorder=5,
                        )

            # Velocity arrow for receiver
            rec_speed, rec_dir = self._safe_speed_dir(rec_row.get("s", 0.0), rec_row.get("dir", 0.0))
            dir_rad = np.deg2rad(rec_dir)
            vx = rec_speed * np.cos(dir_rad)
            vy = rec_speed * np.sin(dir_rad)
            arrow_scale = 0.3
            if rec_speed > 0.05:
                # Arrow in display (width, length) coordinates
                start_x = float(rec_row["y_std"])
                start_y = float(rec_row["x_std"])
                end_x = start_x + vy * arrow_scale  # vy maps to y-direction (length)
                end_y = start_y + vx * arrow_scale  # vx maps to x-direction (width)

                arrow = FancyArrowPatch(
                    (start_x, start_y),
                    (end_x, end_y),
                    arrowstyle="->",
                    mutation_scale=15,
                    linewidth=2.0,
                    color="white",
                    zorder=7,
                )
                ax.add_patch(arrow)

            # Axes cover the full vertical field; control surface sits on top
            ax.set_xlim(0, FIELD_WIDTH)
            ax.set_ylim(0, FIELD_LENGTH)
            ax.set_aspect("equal")

            # --- Layer 4: Dashboard / title ---
            # Prefer external dominance metric from training_df; otherwise use local field-control
            external_score = None

            # Allow fallback to legacy 'receiver_dominance' if requested column not present
            score_col = dominance_col
            if score_col not in play_rows.columns and "receiver_pressure" in play_rows.columns:
                score_col = "receiver_pressure"
            if score_col not in play_rows.columns and "receiver_dominance" in play_rows.columns:
                score_col = "receiver_dominance"

            if score_col in play_rows.columns:
                # Try to match by continuous_frame or frame_id if present
                frame_match_col = "continuous_frame" if "continuous_frame" in play_rows.columns else "frame_id"
                if frame_match_col in play_rows.columns and len(play_rows) >= idx:
                    external_score = float(play_rows[score_col].iloc[idx - 1])

            local_control = self._ownership_at_point(X, Y, ownership, rec_x, rec_y)

            # Derive completion label (per play)
            outcome_raw = play_rows.iloc[0].get("is_complete", None)
            if outcome_raw is not None and not np.isnan(outcome_raw):
                outcome_label = "Complete" if int(outcome_raw) == 1 else "Incomplete"
            else:
                outcome_label = str(play_rows.iloc[0].get("pass_result", "Unknown"))

            title = f"Game {game_id}, Play {play_id} | Frame {idx}/{total_frames} ({frame_type})"
            if external_score is not None:
                subtitle = f"Control Score (training_df): {external_score:.2f}   |   {outcome_label}"
            else:
                subtitle = f"Receiver Field Control: {local_control:.2f}   |   {outcome_label}"
            ax.set_title(f"{title}\n{subtitle}", fontsize=12, fontweight="bold")

            ax.legend(loc="upper right", fontsize=8)
            ax.set_xlabel("Field X (yards)")
            ax.set_ylabel("Field Y (yards)")

            fig.tight_layout()

            frame_file = os.path.join(out_dir, f"_fc_frame_{idx:03d}.png")
            fig.savefig(frame_file, dpi=100)
            plt.close(fig)
            temp_files.append(frame_file)

        # 6. Assemble GIF
        print(f"Combining {len(temp_files)} frames into GIF at {output_path} ...")
        with imageio.get_writer(output_path, mode="I", duration=1.0 / max(fps, 1)) as writer:
            for fname in temp_files:
                img = imageio.imread(fname)
                writer.append_data(img)

        # 7. Cleanup temporary PNGs
        for fname in temp_files:
            try:
                os.remove(fname)
            except OSError:
                pass

        print(f"✓ Advanced field-control GIF saved: {output_path}")
        return output_path




In [4]:
#route_dominance_scoring.py
"""
Route Dominance Scoring System for NFL Plays

This module provides:
1. Frame-by-frame dominance scoring
2. Route-level aggregation
3. Interactive/animated visualizations
4. LSTM-based prediction approach
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from matplotlib.patches import Circle, Rectangle
from scipy.stats import multivariate_normal
try:
    import seaborn as sns
except ImportError:
    sns = None  # Optional dependency
from pathlib import Path
from typing import Dict, List, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

# Field constants
FIELD_LENGTH = 120.0
FIELD_WIDTH = 53.3
FRAME_RATE = 10  # frames per second (NFL tracking data)


class RouteDominanceScorer:
    """Calculate frame-by-frame and route-level dominance scores"""
    
    def __init__(self, input_df: pd.DataFrame, output_df: pd.DataFrame, 
                 supp_df: pd.DataFrame):
        """
        Initialize with dataframes
        
        Args:
            input_df: Pre-throw tracking data (all frames before throw)
            output_df: Post-throw tracking data (ball in flight)
            supp_df: Supplementary play context data
        """
        self.input_df = input_df.copy()
        self.output_df = output_df.copy()
        self.supp_df = supp_df.copy()
        
        # Standardize coordinates
        self._standardize_coordinates()
        self._standardize_reciever_side()
        
        # Combine all frames
        self.all_frames_df = self._combine_all_frames()
        
    def _standardize_coordinates(self):
        """Standardize coordinates so offense always drives right"""
        # Standardize input data
        right_mask = self.input_df["play_direction"].str.lower() == "right"
        self.input_df["x_std"] = self.input_df["x"]
        self.input_df["y_std"] = self.input_df["y"]
        self.input_df.loc[~right_mask, "x_std"] = FIELD_LENGTH - self.input_df.loc[~right_mask, "x"]
        self.input_df.loc[~right_mask, "y_std"] = FIELD_WIDTH - self.input_df.loc[~right_mask, "y"]
        
        # Standardize ball landing
        self.input_df["ball_land_x_std"] = self.input_df["ball_land_x"]
        self.input_df["ball_land_y_std"] = self.input_df["ball_land_y"]
        self.input_df.loc[~right_mask, "ball_land_x_std"] = FIELD_LENGTH - self.input_df.loc[~right_mask, "ball_land_x"]
        self.input_df.loc[~right_mask, "ball_land_y_std"] = FIELD_WIDTH - self.input_df.loc[~right_mask, "ball_land_y"]
        
        # Get play direction mapping for output
        play_dir_map = self.input_df[["game_id", "play_id", "play_direction"]].drop_duplicates()
        self.output_df = self.output_df.merge(play_dir_map, on=["game_id", "play_id"], how="left")
        
        # Standardize output data
        right_mask_out = self.output_df["play_direction"].str.lower() == "right"
        self.output_df["x_std"] = self.output_df["x"]
        self.output_df["y_std"] = self.output_df["y"]
        self.output_df.loc[~right_mask_out, "x_std"] = FIELD_LENGTH - self.output_df.loc[~right_mask_out, "x"]
        self.output_df.loc[~right_mask_out, "y_std"] = FIELD_WIDTH - self.output_df.loc[~right_mask_out, "y"]
        
        # Add velocity components
        self.input_df["vx"] = self.input_df["s"] * np.cos(np.deg2rad(self.input_df["dir"].fillna(0)))
        self.input_df["vy"] = self.input_df["s"] * np.sin(np.deg2rad(self.input_df["dir"].fillna(0)))

    def _standardize_reciever_side(self):
        """Flip coordinates of players so reciever always above qb (on the left side from qb's perspective)"""
        #Find what side the reciever is on
        qb_pos = self.input_df[(self.input_df["player_role"] == "Passer") & (self.input_df['frame_id'] == 1)][['y_std', 'play_id', 'game_id']]
        wr_pos = self.input_df[(self.input_df["player_role"] == "Targeted Receiver") & (self.input_df['frame_id'] == 1)][['y_std', 'play_id', 'game_id']]
        merged_pos = pd.merge(qb_pos, wr_pos, on=['play_id', 'game_id'], how='left')
        merged_pos['receiver_side'] = np.where(merged_pos['y_std_x'] < merged_pos['y_std_y'], 'left', 'right')
        
        # Merge receiver_side into input_df, output_df, and supp_df based on game_id and play_id
        receiver_side_df = merged_pos[['game_id', 'play_id', 'receiver_side']]
        self.supp_df = self.supp_df.merge(receiver_side_df, on=['game_id', 'play_id'], how='left')
        self.input_df = self.input_df.merge(receiver_side_df, on=['game_id', 'play_id'], how='left')
        self.output_df = self.output_df.merge(receiver_side_df, on=['game_id', 'play_id'], how='left')

        #Flip the field of inputs, outputs and ball landing when reciever aligns to right of qb
        self.input_df.loc[self.input_df['receiver_side'] == 'right', 'y_std'] = FIELD_WIDTH - self.input_df.loc[self.input_df['receiver_side'] == 'right', 'y_std']
        self.output_df.loc[self.output_df['receiver_side'] == 'right', 'y_std'] = FIELD_WIDTH - self.output_df.loc[self.output_df['receiver_side'] == 'right', 'y_std']
        self.input_df.loc[self.input_df['receiver_side'] == 'right', 'ball_land_y_std'] = FIELD_WIDTH - self.input_df.loc[self.input_df['receiver_side'] == 'right', 'ball_land_y_std']
    
    def _combine_all_frames(self) -> pd.DataFrame:
        """
        Combine input and output frames to get complete play sequence
        
        Returns:
            DataFrame with all frames for each play
        """
        # Get metadata from input (player info, ball landing, etc.)
        input_meta = self.input_df[[
            "game_id", "play_id", "nfl_id", "player_name", "player_position",
            "player_side", "player_role", "ball_land_x_std", "ball_land_y_std",
            "num_frames_output"
        ]].drop_duplicates()
        
        # Prepare input frames (pre-throw)
        input_frames = self.input_df[[
            "game_id", "play_id", "nfl_id", "frame_id", "x_std", "y_std",
            "s", "a", "dir", "vx", "vy"
        ]].copy()
        input_frames["frame_type"] = "input"
        
        # Prepare output frames (ball in flight)
        # Need to map frame_id: output frames start at 1, but they correspond to
        # frames after the throw. We'll offset them
        output_frames = self.output_df[[
            "game_id", "play_id", "nfl_id", "frame_id", "x_std", "y_std"
        ]].copy()
        
        # Merge with input to get speed/acceleration if available
        # For output frames, we'll need to estimate or use previous values
        output_frames = output_frames.merge(
            input_meta, on=["game_id", "play_id", "nfl_id"], how="left"
        )
        
        # Get the last input frame for each player to estimate velocity
        last_input = input_frames.sort_values("frame_id").groupby(
            ["game_id", "play_id", "nfl_id"]
        ).last().reset_index()
        
        # For output frames, estimate speed from position changes
        output_frames = output_frames.sort_values(["game_id", "play_id", "nfl_id", "frame_id"])
        
        # Initialize speed/acceleration columns
        output_frames["s"] = np.nan
        output_frames["a"] = np.nan
        output_frames["dir"] = np.nan
        output_frames["vx"] = np.nan
        output_frames["vy"] = np.nan
        
        # Calculate speed from position changes
        for (gid, pid, nid), group in output_frames.groupby(["game_id", "play_id", "nfl_id"]):
            group = group.sort_values("frame_id")
            
            # Get last input frame for this player to use as starting point
            last_input_row = last_input[
                (last_input["game_id"] == gid) &
                (last_input["play_id"] == pid) &
                (last_input["nfl_id"] == nid)
            ]
            
            # Calculate position changes
            group["prev_x"] = group["x_std"].shift(1)
            group["prev_y"] = group["y_std"].shift(1)
            
            # For first output frame, use last input position
            if not last_input_row.empty:
                first_idx = group.index[0]
                output_frames.loc[first_idx, "prev_x"] = last_input_row.iloc[0]["x_std"]
                output_frames.loc[first_idx, "prev_y"] = last_input_row.iloc[0]["y_std"]
            
            # Calculate displacement
            dx = group["x_std"] - group["prev_x"]
            dy = group["y_std"] - group["prev_y"]
            
            # Calculate speed (yards per second)
            # Frame rate is 10 fps, so multiply by 10 to get yards/second
            speed = np.sqrt(dx**2 + dy**2) * FRAME_RATE
            direction = np.rad2deg(np.arctan2(dy, dx))
            
            # Calculate acceleration (change in speed)
            prev_speed = speed.shift(1)
            if not last_input_row.empty:
                prev_speed.iloc[0] = last_input_row.iloc[0]["s"]
            acceleration = (speed - prev_speed) * FRAME_RATE
            
            # Calculate velocity components
            vx = speed * np.cos(np.deg2rad(direction))
            vy = speed * np.sin(np.deg2rad(direction))
            
            # Fill NaN values with last known values
            speed = speed.ffill().fillna(0)
            direction = direction.ffill().fillna(0)
            acceleration = acceleration.fillna(0)
            vx = vx.fillna(0)
            vy = vy.fillna(0)
            
            # Special handling for first output frame: use average of last input and second output
            if len(group) > 1 and not last_input_row.empty:
                first_idx = group.index[0]  # First output frame
                second_idx = group.index[1]  # Second output frame
                
                last_input_speed = last_input_row.iloc[0]["s"]
                second_output_speed = speed.loc[second_idx]
                
                # Average the speeds for the first output frame
                if not np.isnan(second_output_speed) and not np.isnan(last_input_speed):
                    avg_speed = (last_input_speed + second_output_speed) / 2.0
                    speed.loc[first_idx] = avg_speed
                    
                    # Recalculate direction and velocity for first frame using averaged speed
                    first_dx = group.loc[first_idx, "x_std"] - group.loc[first_idx, "prev_x"]
                    first_dy = group.loc[first_idx, "y_std"] - group.loc[first_idx, "prev_y"]
                    first_dir = np.rad2deg(np.arctan2(first_dy, first_dx))
                    direction.loc[first_idx] = first_dir
                    vx.loc[first_idx] = avg_speed * np.cos(np.deg2rad(first_dir))
                    vy.loc[first_idx] = avg_speed * np.sin(np.deg2rad(first_dir))
                    
                    # Recalculate acceleration for first frame
                    acceleration.loc[first_idx] = (avg_speed - last_input_speed) * FRAME_RATE
            
            # Update output_frames
            for idx in group.index:
                output_frames.loc[idx, "s"] = speed.loc[idx]
                output_frames.loc[idx, "dir"] = direction.loc[idx]
                output_frames.loc[idx, "a"] = acceleration.loc[idx]
                output_frames.loc[idx, "vx"] = vx.loc[idx]
                output_frames.loc[idx, "vy"] = vy.loc[idx]
        
        # Clean up temporary columns
        if "prev_x" in output_frames.columns:
            output_frames = output_frames.drop(columns=["prev_x", "prev_y"])
        output_frames["frame_type"] = "output"
        
        # Combine input and output
        input_cols = ["game_id", "play_id", "nfl_id", "frame_id", "x_std", "y_std", 
                     "s", "a", "dir", "vx", "vy", "frame_type"]
        output_cols = ["game_id", "play_id", "nfl_id", "frame_id", "x_std", "y_std",
                      "s", "a", "dir", "vx", "vy", "frame_type"]
        
        all_frames = pd.concat([
            input_frames[input_cols],
            output_frames[output_cols]
        ], ignore_index=True)
        
        # Add metadata
        all_frames = all_frames.merge(input_meta, on=["game_id", "play_id", "nfl_id"], how="left")
        
        # Sort by frame_id
        all_frames = all_frames.sort_values(["game_id", "play_id", "frame_id", "nfl_id"])
        
        return all_frames
    
    def _detect_route_break(self, play_frames: pd.DataFrame) -> int:
        """
        Identify the frame_id where the route 'break' (hard cut) occurs for a receiver.
        Returns 0 if no distinct break is found (e.g., Go / Seam route).
        """
        # Consider only input frames (route running phase), ordered by frame_id
        route_frames = play_frames[play_frames["frame_type"] == "input"].sort_values("frame_id")
        if len(route_frames) < 5:
            return 0

        # 1. Get directions and fill NaNs forward, then with 0
        dirs = route_frames["dir"].fillna(method="ffill").fillna(0).values

        # 2. Direction change between consecutive frames (delta angle)
        diffs = np.abs(dirs[1:] - dirs[:-1])
        # Handle wrap-around at 360° (e.g. 350 -> 10 should be 20°, not 340°)
        diffs = np.where(diffs > 180.0, 360.0 - diffs, diffs)

        # 3. Smooth with a running average over 3 frames
        smooth_diffs = np.convolve(diffs, np.ones(3) / 3.0, mode="valid")
        if len(smooth_diffs) == 0:
            return 0

        # 4. Find the sharpest turn
        max_idx = int(np.argmax(smooth_diffs))
        max_val = float(smooth_diffs[max_idx])

        # Threshold: treat <15° as "no break" (Go / Seam)
        if max_val < 15.0:
            return 0

        # 5. Map back to frame index (account for diff + convolution offset)
        break_frame_index = max_idx + 1
        if break_frame_index >= len(route_frames):
            return 0

        return int(route_frames.iloc[break_frame_index]["frame_id"])
    
    def calculate_frame_dominance(self, game_id: int, play_id: int, 
                                   target_nfl_id: int) -> pd.DataFrame:
        """
        Calculate dominance score for each frame of a play
        
        Args:
            game_id: Game identifier
            play_id: Play identifier
            target_nfl_id: NFL ID of targeted receiver
            
        Returns:
            DataFrame with frame-by-frame dominance metrics
        """
        # Get all frames for this play
        play_frames = self.all_frames_df[
            (self.all_frames_df["game_id"] == game_id) &
            (self.all_frames_df["play_id"] == play_id)
        ].copy()
        
        if play_frames.empty:
            raise ValueError(f"Play {game_id}-{play_id} not found")
        
        # Get targeted receiver frames
        target_frames = play_frames[play_frames["nfl_id"] == target_nfl_id].copy()
        
        if target_frames.empty:
            raise ValueError(f"Targeted receiver {target_nfl_id} not found in play")
        
        # NEW: Detect route break (once per play, for this receiver)
        break_frame_id = self._detect_route_break(target_frames)
        
        # Get max input frame_id to build a continuous frame index across input/output
        input_subset = play_frames[play_frames["frame_type"] == "input"]
        max_input_id = int(input_subset["frame_id"].max()) if not input_subset.empty else 0

        # =================================================================
        # NEW: CALCULATE CONTEXT AT SNAP (Frame 1) - MATCHING ECP LOGIC
        # =================================================================
        # 1. Get positions at Frame 1 (the snap) using input frames
        frame_1_data = play_frames[
            (play_frames["frame_id"] == 1) &
            (play_frames["frame_type"] == "input")
        ]
        target_f1 = frame_1_data[frame_1_data["nfl_id"] == target_nfl_id]
        
        # Default values (if missing data)
        is_press = 0
        shade_encoded = 0 
        
        if not target_f1.empty and not frame_1_data.empty:
            # Receiver position at snap
            rec_y = target_f1.iloc[0]["y_std"]
            rec_x = target_f1.iloc[0]["x_std"]
            
            # Find the nearest defender at snap
            defenders_f1 = frame_1_data[frame_1_data["player_side"] == "Defense"]
            
            if not defenders_f1.empty:
                # Calculate distances to receiver
                dists = np.sqrt(
                    (defenders_f1["x_std"] - rec_x) ** 2 +
                    (defenders_f1["y_std"] - rec_y) ** 2
                )
                
                # Get nearest defender details
                nearest_idx = dists.idxmin()
                nearest_def_f1 = defenders_f1.loc[nearest_idx]
                min_dist = float(dists.min())
                def_y = nearest_def_f1["y_std"]
                
                # --- A. PRESS LOGIC (From ECP Script) ---
                # Threshold: < 5.0 yards
                is_press = 1 if min_dist < 5.0 else 0
                    
                # --- B. SHADE LOGIC (From ECP Script) ---
                # Formula: abs(RecY - Center) - abs(DefY - Center)
                # If Rec is further from center than Def -> Def is Inside -> Positive Score
                shade_score = abs(rec_y - 26.65) - abs(def_y - 26.65)
                
                if shade_score > 0.5:
                    shade_encoded = 1   # Inside Shade
                elif shade_score < -0.5:
                    shade_encoded = -1  # Outside Shade
                else:
                    shade_encoded = 0   # Head-Up
        # =================================================================
        
        # Get ball landing coordinates
        ball_land_x = target_frames["ball_land_x_std"].iloc[0]
        ball_land_y = target_frames["ball_land_y_std"].iloc[0]
        num_frames_output = target_frames["num_frames_output"].iloc[0]
        
        # Get play context
        supp_row = self.supp_df[
            (self.supp_df["game_id"] == game_id) &
            (self.supp_df["play_id"] == play_id)
        ]
        
        if supp_row.empty:
            route = "UNKNOWN"
            pass_result = "UNKNOWN"
        else:
            route = supp_row.iloc[0].get("route_of_targeted_receiver", "UNKNOWN")
            pass_result = supp_row.iloc[0].get("pass_result", "UNKNOWN")
        
        # Calculate metrics for each frame
        # Need to iterate through both input and output frames separately
        # because they have overlapping frame_ids (both start at 1)
        frame_metrics = []
        
        # Get input and output frames separately
        input_target_frames = target_frames[target_frames["frame_type"] == "input"].sort_values("frame_id")
        output_target_frames = target_frames[target_frames["frame_type"] == "output"].sort_values("frame_id")
        
        # Helper function to process a single frame
        def process_frame(target_frame_row, frame_type):
            frame_id = target_frame_row["frame_id"]
            target_frame = target_frame_row
            
            # Get all players at this frame (must match both frame_id AND frame_type)
            # This ensures we get all players from the output file for output frames
            frame_players = play_frames[
                (play_frames["frame_id"] == frame_id) & 
                (play_frames["frame_type"] == frame_type)
            ]
            
            # Get defenders
            defenders = frame_players[frame_players["player_side"] == "Defense"]
            
            # 1. Separation from nearest defenders (1st, 2nd, 3rd)
            def_dists = None
            nearest_def_indices = None
            
            if not defenders.empty:
                def_dists = np.sqrt(
                    (defenders["x_std"] - target_frame["x_std"])**2 +
                    (defenders["y_std"] - target_frame["y_std"])**2
                )
                
                # Sort defenders by separation to get 1st, 2nd, 3rd nearest
                sorted_dists = def_dists.sort_values()
                nearest_def_indices = sorted_dists.index[:3]
                
                sep_nearest = sorted_dists.iloc[0] if len(sorted_dists) >= 1 else np.inf
                sep_second = sorted_dists.iloc[1] if len(sorted_dists) >= 2 else np.nan
                sep_third = sorted_dists.iloc[2] if len(sorted_dists) >= 3 else np.nan
                
                num_def_within_2 = (def_dists <= 2.0).sum()
                num_def_within_3 = (def_dists <= 3.0).sum()
                num_def_within_5 = (def_dists <= 5.0).sum()
            else:
                sep_nearest = np.inf
                sep_second = np.nan
                sep_third = np.nan
                num_def_within_2 = 0
                num_def_within_3 = 0
                num_def_within_5 = 0
            
            # 2. Receiver speed and acceleration
            receiver_speed = target_frame["s"]
            receiver_accel = target_frame["a"]
            
            # 3. Distance to ball landing spot
            dist_to_ball = np.sqrt(
                (target_frame["x_std"] - ball_land_x)**2 +
                (target_frame["y_std"] - ball_land_y)**2
            )
            
            # 4. Leverage Angle - angle between defender-to-receiver and receiver-to-ball vectors
            leverage_angle = np.nan
            nearest_def = None
            if not defenders.empty and sep_nearest < np.inf:
                nearest_def = defenders.loc[def_dists.idxmin()]
                
                # Vector from nearest defender to receiver
                def_to_rec_x = target_frame["x_std"] - nearest_def["x_std"]
                def_to_rec_y = target_frame["y_std"] - nearest_def["y_std"]
                
                # Vector from receiver to ball landing
                rec_to_ball_x = ball_land_x - target_frame["x_std"]
                rec_to_ball_y = ball_land_y - target_frame["y_std"]
                
                # Calculate angle between vectors using dot product
                # angle = arccos((v1 · v2) / (||v1|| * ||v2||))
                dot_product = def_to_rec_x * rec_to_ball_x + def_to_rec_y * rec_to_ball_y
                mag_def_to_rec = np.sqrt(def_to_rec_x**2 + def_to_rec_y**2)
                mag_rec_to_ball = np.sqrt(rec_to_ball_x**2 + rec_to_ball_y**2)
                
                if mag_def_to_rec > 0 and mag_rec_to_ball > 0:
                    cos_angle = dot_product / (mag_def_to_rec * mag_rec_to_ball)
                    # Clamp to [-1, 1] to avoid numerical errors
                    cos_angle = np.clip(cos_angle, -1.0, 1.0)
                    angle_rad = np.arccos(cos_angle)
                    angle_deg = np.rad2deg(angle_rad)
                    
                    # Normalize to always be the smaller angle (≤ 180 degrees)
                    # arccos gives [0, π] which is [0, 180°], so we take the smaller of angle and 180-angle
                    leverage_angle = min(angle_deg, 180.0 - angle_deg)
                else:
                    leverage_angle = np.nan

            # 4b. Relative velocity angle (cosine similarity of movement directions)
            #     between the receiver and the nearest defender.
            relative_velocity_angle = np.nan
            rec_dir_val = target_frame.get("dir", np.nan)
            if nearest_def is not None and not np.isnan(rec_dir_val):
                def_dir_val = nearest_def.get("dir", np.nan)
                if not np.isnan(def_dir_val):
                    rec_rad = np.deg2rad(rec_dir_val)
                    def_rad = np.deg2rad(def_dir_val)
                    relative_velocity_angle = float(np.cos(rec_rad - def_rad))
            
            # 5. Time-to-ball metrics and closing-time-based pressure score
            # Receiver time to ball
            to_ball_x = ball_land_x - target_frame["x_std"]
            to_ball_y = ball_land_y - target_frame["y_std"]
            to_ball_dist = np.sqrt(to_ball_x**2 + to_ball_y**2)
            if to_ball_dist > 0 and not np.isnan(target_frame.get("vx", np.nan)):
                rec_vx = float(target_frame["vx"])
                rec_vy = float(target_frame["vy"])
                to_ball_unit_x = to_ball_x / to_ball_dist
                to_ball_unit_y = to_ball_y / to_ball_dist
                speed_toward_ball = rec_vx * to_ball_unit_x + rec_vy * to_ball_unit_y
                if speed_toward_ball > 0:
                    time_to_ball = to_ball_dist / max(speed_toward_ball, 1e-6)
                else:
                    time_to_ball = np.inf
            else:
                time_to_ball = np.inf

            # Defender time to ball for nearest defender (if any)
            if nearest_def is not None:
                def_to_ball_x = float(ball_land_x - nearest_def["x_std"])
                def_to_ball_y = float(ball_land_y - nearest_def["y_std"])
                def_to_ball_dist = np.sqrt(def_to_ball_x**2 + def_to_ball_y**2)
                def_speed = float(nearest_def.get("s", 0.0))
                def_dir = float(nearest_def.get("dir", 0.0)) if not np.isnan(
                    nearest_def.get("dir", np.nan)
                ) else 0.0

                if def_to_ball_dist > 0 and def_speed > 0:
                    def_vx = float(nearest_def.get("vx", def_speed * np.cos(np.deg2rad(def_dir))))
                    def_vy = float(nearest_def.get("vy", def_speed * np.sin(np.deg2rad(def_dir))))
                    def_to_ball_unit_x = def_to_ball_x / def_to_ball_dist
                    def_to_ball_unit_y = def_to_ball_y / def_to_ball_dist
                    def_speed_toward_ball = def_vx * def_to_ball_unit_x + def_vy * def_to_ball_unit_y
                    if def_speed_toward_ball > 0:
                        def_time_to_ball = def_to_ball_dist / max(def_speed_toward_ball, 1e-6)
                    else:
                        def_time_to_ball = np.inf
                else:
                    def_time_to_ball = np.inf
            else:
                def_time_to_ball = np.inf

            # Time advantage: defender time minus receiver time (positive = defender later)
            if np.isfinite(time_to_ball) and np.isfinite(def_time_to_ball):
                time_advantage = def_time_to_ball - time_to_ball
            else:
                time_advantage = np.nan

            # Closing time for pressure: how long until defender arrives after receiver (0 = already closed)
            if np.isfinite(time_to_ball) and np.isfinite(def_time_to_ball):
                closing_time = max(0.0, def_time_to_ball - time_to_ball)
            else:
                closing_time = np.inf

            # Map closing_time to a 0–100 pressure score (higher = more defensive pressure)
            if not np.isfinite(closing_time):
                pressure_score = 0
            else:
                # Exponential decay: 0s -> ~100, 0.7s -> ~50, 3s -> ~5
                pressure_score = int(np.clip(100.0 * np.exp(-1.0 * closing_time), 0.0, 100.0))
            
            # 6. Initial Leverage (angle advantage) - calculated at start of route
            # Leverage: is the defender between the receiver and the ball?
            if frame_id == 1 and not defenders.empty:
                # Get initial positions
                target_start_x = target_frame["x_std"]
                target_start_y = target_frame["y_std"]
                
                # Vector from target to ball
                target_to_ball_x = ball_land_x - target_start_x
                target_to_ball_y = ball_land_y - target_start_y
                
                leverage_scores = []
                for _, def_row in defenders.iterrows():
                    # Vector from target to defender
                    target_to_def_x = def_row["x_std"] - target_start_x
                    target_to_def_y = def_row["y_std"] - target_start_y
                    
                    # Check if defender is "between" target and ball
                    # Using dot product to check alignment
                    dot_product = (target_to_def_x * target_to_ball_x + 
                                 target_to_def_y * target_to_ball_y)
                    
                    if dot_product > 0:  # Defender is in front (bad leverage)
                        leverage = -1.0
                    else:  # Defender is behind (good leverage)
                        leverage = 1.0
                    
                    leverage_scores.append(leverage)
                
                initial_leverage = np.mean(leverage_scores) if leverage_scores else 0.0
            else:
                initial_leverage = np.nan if frame_id == 1 else None
            
            # 6. Calculate Receiver Pressure using multivariate normal PDFs
            # Use up to the 3 nearest defenders within 6 yards
            
            # Step 1: Receiver Influence PDF (6-yard radius)
            # Center is 2 yards in front of receiver, towards ball landing position
            receiver_x = target_frame["x_std"]
            receiver_y = target_frame["y_std"]
            
            # Calculate direction vector from receiver to ball landing
            to_ball_x = ball_land_x - receiver_x
            to_ball_y = ball_land_y - receiver_y
            dist_to_ball = np.sqrt(to_ball_x**2 + to_ball_y**2)
            
            # Normalize direction vector and move center 2 yards forward
            if dist_to_ball > 0:
                unit_x = to_ball_x / dist_to_ball
                unit_y = to_ball_y / dist_to_ball
            else:
                # If ball is at receiver position, use default direction
                unit_x = 1.0
                unit_y = 0.0
            
            # Center of influence zone: 2 yards in front of receiver (towards ball)
            center_x = receiver_x + 2.0 * unit_x
            center_y = receiver_y + 2.0 * unit_y
            
            # Create field grid for PDF calculation
            x_grid, y_grid = np.mgrid[0:53.3:0.5, 0:120:0.5]
            locations = np.dstack((x_grid, y_grid))
            
            # Receiver PDF: 6-yard radius circular distribution, centered 2 yards in front
            receiver_pdf = multivariate_normal([center_y, center_x], [[6, 0], [0, 6]]).pdf(locations)
            
            # Step 2: Defender Pressure PDF - sum up to the 3 nearest defenders within 6 yards
            # Each of the 1st, 2nd, 3rd nearest defenders (if present) gets its own PDF,
            # weighted by separation distance
            defender_pdf = np.zeros_like(receiver_pdf)
            
            if def_dists is not None and nearest_def_indices is not None:
                for def_idx in nearest_def_indices:
                    def_row = defenders.loc[def_idx]
                    def_x = def_row["x_std"]
                    def_y = def_row["y_std"]
                    
                    # Separation for this specific defender
                    separation = float(def_dists.loc[def_idx])
                    
                    # Only include defenders within 6 yards
                    if separation <= 6.0:
                        # Weight function: w(s) = 1 / (1 + s/5) where s = separation in yards
                        # Closer defenders have more influence
                        def_weight = 1.0 / (1.0 + separation / 5.0)
                        
                        # Defender PDF: 4-yard radius, weighted by separation
                        # Each defender contributes its own pressure PDF
                        def_pdf = multivariate_normal([def_y, def_x], [[4, 0], [0, 4]]).pdf(locations) * def_weight
                        
                        # Add to total defender pressure (sum of all defender PDFs)
                        defender_pdf = defender_pdf + def_pdf
            
            # Step 3: Dominance Ratio
            # Combine both PDFs
            total_pdf = receiver_pdf + defender_pdf + 1e-10  # epsilon to prevent division by zero
            
            # Dominance ratio: receiver influence / total influence
            # Values range from 0 to 1:
            # 1.0 = pure receiver influence (no defender pressure)
            # 0.0 = pure defender pressure (no receiver influence)
            dominance_pdf = receiver_pdf / total_pdf
            
            # Step 4: Final Receiver Pressure Score
            # Focus on the receiver's immediate area (6-yard radius, centered 2 yards in front)
            receiver_area_pdf = multivariate_normal([center_y, center_x], [[6, 0], [0, 6]]).pdf(locations)
            
            # Weight dominance by receiver area to focus on receiver's immediate area
            weighted_dominance = receiver_area_pdf * dominance_pdf
            
            # Calculate average dominance in receiver's area
            receiver_pressure = np.sum(weighted_dominance) / np.sum(receiver_area_pdf)
            
            # Step 5: Normalization
            # Normalize: (val - 0.5) / (0.8 - 0.5)
            receiver_pressure = (receiver_pressure - 0.50) / (0.80 - 0.50)
            
            # Clamp to [0, 1]
            receiver_pressure = max(0.0, min(1.0, receiver_pressure))

            # 7. Spatial ownership / field control at projected receiver spot
            #    (Physics-based control score, 0-1)
            rec_pos = np.array([receiver_x, receiver_y])
            rec_speed_val = float(receiver_speed) if not pd.isna(receiver_speed) else 0.0
            rec_dir_val = target_frame.get("dir", 0.0)
            if pd.isna(rec_dir_val):
                rec_dir_val = 0.0
            rec_angle = np.radians(90.0 - rec_dir_val)

            # Project receiver 0.5s into the future
            rec_mu = rec_pos + np.array(
                [rec_speed_val * np.sin(rec_angle), rec_speed_val * np.cos(rec_angle)]
            ) * 0.5

            # Receiver influence ellipse
            long_axis = 2.0 + (rec_speed_val / 4.0)
            short_axis = 1.5
            c_rc, s_rc = np.cos(rec_angle), np.sin(rec_angle)
            R_rc = np.array([[c_rc, -s_rc], [s_rc, c_rc]])
            S_rc = np.array([[long_axis ** 2, 0.0], [0.0, short_axis ** 2]])
            rec_cov = R_rc.dot(S_rc).dot(R_rc.T)

            target_spot = rec_mu
            rec_influence = multivariate_normal(rec_mu, rec_cov).pdf(target_spot)

            # Defender influence at the same target spot
            total_def_influence = 0.0
            if not defenders.empty:
                for _, def_row in defenders.iterrows():
                    def_speed_val = def_row.get("s", 0.0)
                    def_dir_val = def_row.get("dir", 0.0)
                    if pd.isna(def_dir_val):
                        def_dir_val = 0.0
                    def_angle = np.radians(90.0 - def_dir_val)

                    def_pos = np.array([def_row["x_std"], def_row["y_std"]])
                    def_mu = def_pos + np.array(
                        [def_speed_val * np.sin(def_angle), def_speed_val * np.cos(def_angle)]
                    ) * 0.5

                    d_long = 2.0 + (def_speed_val / 4.0)
                    d_short = 1.5
                    c_df, s_df = np.cos(def_angle), np.sin(def_angle)
                    R_df = np.array([[c_df, -s_df], [s_df, c_df]])
                    S_df = np.array([[d_long ** 2, 0.0], [0.0, d_short ** 2]])
                    def_cov = R_df.dot(S_df).dot(R_df.T)

                    total_def_influence += multivariate_normal(def_mu, def_cov).pdf(target_spot)

            field_control = rec_influence / (rec_influence + total_def_influence + 1e-6)

            # 8. Route clock features relative to the detected break frame
            if break_frame_id == 0:
                # No distinct break detected (e.g., Go route) -> constant stem phase
                is_break_frame = 0
                frames_since_break = -30
            else:
                # Build a continuous frame index across input and output streams
                if frame_type == "input":
                    continuous_id = frame_id
                else:
                    # Output frames start immediately after the last input frame
                    continuous_id = max_input_id + frame_id

                raw_diff = int(continuous_id - break_frame_id)
                # Flag frames within ±1 of the break frame
                is_break_frame = 1 if abs(raw_diff) <= 1 else 0
                # Clip the clock to [-30, 30]
                frames_since_break = max(-30, min(30, raw_diff))
            
            return {
                "game_id": game_id,
                "play_id": play_id,
                "nfl_id": target_nfl_id,
                "frame_id": frame_id,
                "frame_type": frame_type,
                "x": target_frame["x_std"],
                "y": target_frame["y_std"],
                "sep_nearest": sep_nearest,
                "sep_second": sep_second,
                "num_def_within_2": num_def_within_2,
                "num_def_within_3": num_def_within_3,
                "num_def_within_5": num_def_within_5,
                "receiver_speed": receiver_speed,
                "receiver_accel": receiver_accel,
                "dist_to_ball": dist_to_ball,
                "leverage_angle": leverage_angle,
                "relative_velocity_angle": relative_velocity_angle,
                "time_to_ball": time_to_ball,
                "def_time_to_ball": def_time_to_ball,
                "time_advantage": time_advantage,
                "closing_time": closing_time,
                "pressure_score": pressure_score,
                "initial_leverage": initial_leverage if frame_id == 1 and frame_type == "input" else None,
                # NEW CONTEXT METRICS (pre-snap, constant across frames for this play)
                "is_press": is_press,
                "shade_encoded": shade_encoded,
                "receiver_pressure": receiver_pressure,  # Calculated using PDFs, accounts for nearest defenders within 6 yards
                "field_control": field_control,  # Spatial ownership / control score (0-1)
                "is_break_frame": is_break_frame,
                "frames_since_break": frames_since_break,
                "route": route,
                "pass_result": pass_result
            }
        
        # Process all input frames
        for _, target_frame_row in input_target_frames.iterrows():
            metrics = process_frame(target_frame_row, "input")
            frame_metrics.append(metrics)
        
        # Process all output frames
        for _, target_frame_row in output_target_frames.iterrows():
            metrics = process_frame(target_frame_row, "output")
            frame_metrics.append(metrics)
        
        return pd.DataFrame(frame_metrics)
    


class RouteDominanceVisualizer:
    """Create visualizations for route dominance"""
    
    def __init__(self, scorer: RouteDominanceScorer):
        self.scorer = scorer
    
    def visualize_play_dominance(self, game_id: int, play_id: int,
                                  target_nfl_id: int, save_path: Optional[str] = None,
                                  show_animation: bool = True):
        """
        Create animated visualization showing dominance evolution
        
        Args:
            game_id: Game identifier
            play_id: Play identifier
            target_nfl_id: NFL ID of targeted receiver
            save_path: Path to save animation (optional)
            show_animation: Whether to display animation
        """
        # Calculate frame metrics
        frame_metrics = self.scorer.calculate_frame_dominance(
            game_id, play_id, target_nfl_id
        )
        
        # Get all frames for visualization
        play_frames = self.scorer.all_frames_df[
            (self.scorer.all_frames_df["game_id"] == game_id) &
            (self.scorer.all_frames_df["play_id"] == play_id)
        ]
        
        # Get play context
        supp_row = self.scorer.supp_df[
            (self.scorer.supp_df["game_id"] == game_id) &
            (self.scorer.supp_df["play_id"] == play_id)
        ]
        
        route = frame_metrics["route"].iloc[0]
        pass_result = frame_metrics["pass_result"].iloc[0]
        
        # Get ball landing
        ball_land_x = play_frames["ball_land_x_std"].iloc[0]
        ball_land_y = play_frames["ball_land_y_std"].iloc[0]
        
        # Create figure
        fig = plt.figure(figsize=(20, 12))
        
        # Field view
        ax_field = plt.subplot2grid((3, 4), (0, 0), colspan=3, rowspan=3)
        
        # Dominance score over time
        ax_score = plt.subplot2grid((3, 4), (0, 3))
        
        # Running average
        ax_running = plt.subplot2grid((3, 4), (1, 3))
        
        # Info panel
        ax_info = plt.subplot2grid((3, 4), (2, 3))
        ax_info.axis('off')
        
        # Draw field
        def draw_field():
            ax_field.clear()
            ax_field.set_xlim(-5, FIELD_LENGTH + 5)
            ax_field.set_ylim(-5, FIELD_WIDTH + 5)
            ax_field.set_aspect('equal')
            ax_field.set_facecolor('#0d5f20')
            
            # End zones
            endzone = Rectangle((0, 0), 10, FIELD_WIDTH, facecolor='navy', alpha=0.5)
            ax_field.add_patch(endzone)
            endzone2 = Rectangle((FIELD_LENGTH - 10, 0), 10, FIELD_WIDTH, facecolor='navy', alpha=0.5)
            ax_field.add_patch(endzone2)
            
            # Yard lines
            for yard in range(10, int(FIELD_LENGTH - 10) + 1, 5):
                ax_field.axvline(x=yard, color='white', linewidth=0.5, alpha=0.3)
                if yard % 10 == 0:
                    ax_field.text(yard, FIELD_WIDTH/2, str(yard), ha='center', va='center',
                                 color='white', fontsize=8, fontweight='bold',
                                 bbox=dict(boxstyle='round', facecolor='black', alpha=0.5))
        
        # Animation function
        # Create continuous frame sequence: input frames first, then output frames
        # Since frame_id overlaps (both start at 1), we need to handle them separately
        input_frames_vis = play_frames[play_frames["frame_type"] == "input"].sort_values("frame_id")
        output_frames_vis = play_frames[play_frames["frame_type"] == "output"].sort_values("frame_id")
        
        # Create list of (frame_id, frame_type) tuples for proper sequencing
        frames_list = []
        for _, row in input_frames_vis.iterrows():
            frames_list.append((row["frame_id"], "input", row.name))
        for _, row in output_frames_vis.iterrows():
            frames_list.append((row["frame_id"], "output", row.name))
        
        total_frames = len(frames_list)
        
        def animate(frame_idx):
            frame_id, frame_type, row_idx = frames_list[frame_idx]
            
            # Draw field
            draw_field()
            
            # Get frame data - need to match both frame_id AND frame_type
            frame_data = play_frames[
                (play_frames["frame_id"] == frame_id) & 
                (play_frames["frame_type"] == frame_type)
            ]
            frame_metric = frame_metrics[
                (frame_metrics["frame_id"] == frame_id) & 
                (frame_metrics["frame_type"] == frame_type)
            ]
            
            if not frame_metric.empty:
                current_sep = frame_metric.iloc[0]["sep_nearest"]
                # Calculate running average up to current continuous frame
                running_avg = frame_metrics.iloc[:frame_idx+1]["sep_nearest"].mean()
            else:
                current_sep = 0.0
                running_avg = 0.0
            
            # Plot all players with color-coding based on separation (for targeted receiver)
            for _, player in frame_data.iterrows():
                is_target = player["nfl_id"] == target_nfl_id
                
                if is_target:
                    # Color-code targeted receiver by separation
                    if not frame_metric.empty:
                        sep = frame_metric.iloc[0]["sep_nearest"]
                        # Color scale: red (low separation) -> yellow (medium) -> green (high separation)
                        if sep < 2.0:
                            target_color = 'red'
                        elif sep < 5.0:
                            target_color = 'yellow'
                        else:
                            target_color = 'lime'
                    else:
                        target_color = 'yellow'
                        sep = 0.0
                    
                    # Highlight targeted receiver with separation-based color
                    ax_field.scatter(player["x_std"], player["y_std"], 
                                   c=target_color, s=400, marker='*',
                                   edgecolors='black', linewidths=3, zorder=10)
                    
                    # Add separation text near receiver
                    ax_field.annotate(f'{sep:.1f}', 
                                    (player["x_std"], player["y_std"]),
                                    xytext=(0, 15), textcoords='offset points',
                                    fontsize=12, fontweight='bold',
                                    bbox=dict(boxstyle='round,pad=0.5', 
                                            facecolor='white', alpha=0.9,
                                            edgecolor='black', linewidth=2),
                                    ha='center', zorder=11)
                else:
                    # Regular players
                    color = 'orange' if player["player_side"] == "Offense" else 'blue'
                    ax_field.scatter(player["x_std"], player["y_std"],
                                   c=color, s=150, alpha=0.7, zorder=5)
            
            # Plot ball landing
            ax_field.scatter(ball_land_x, ball_land_y, c='yellow', s=400,
                           marker='X', edgecolors='black', linewidths=2, zorder=9)
            
            # Draw separation circle for targeted receiver with separation-based color
            if not frame_metric.empty:
                target_row = frame_data[frame_data["nfl_id"] == target_nfl_id]
                if not target_row.empty:
                    target_x = target_row.iloc[0]["x_std"]
                    target_y = target_row.iloc[0]["y_std"]
                    sep = frame_metric.iloc[0]["sep_nearest"]
                    
                    if sep < np.inf:
                        # Color circle based on separation
                        if sep < 2.0:
                            circle_color = 'red'
                        elif sep < 5.0:
                            circle_color = 'orange'
                        else:
                            circle_color = 'green'
                        
                        circle = Circle((target_x, target_y), sep, fill=False,
                                      edgecolor=circle_color, linewidth=2.5, 
                                      linestyle='--', alpha=0.7)
                        ax_field.add_patch(circle)
                        
                        # Draw line to nearest defender
                        defenders = frame_data[frame_data["player_side"] == "Defense"]
                        if not defenders.empty:
                            def_dists = np.sqrt(
                                (defenders["x_std"] - target_x)**2 +
                                (defenders["y_std"] - target_y)**2
                            )
                            nearest_idx = def_dists.idxmin()
                            nearest_def = defenders.loc[nearest_idx]
                            ax_field.plot([target_x, nearest_def["x_std"]], 
                                         [target_y, nearest_def["y_std"]],
                                         color=circle_color, linewidth=2, 
                                         linestyle=':', alpha=0.6, zorder=1)
            
            # Title - show continuous frame number and frame type
            continuous_frame_num = frame_idx + 1
            ax_field.set_title(
                f"Game {game_id}, Play {play_id} | Frame {continuous_frame_num}/{total_frames} ({frame_type}) | "
                f"Route: {route} | Result: {pass_result}",
                fontsize=12, fontweight='bold', color='white', pad=10
            )
            
            # Update score plot - use continuous frame numbers
            ax_score.clear()
            # Create continuous frame numbers for plotting
            frame_metrics_plot = frame_metrics.copy()
            frame_metrics_plot['continuous_frame'] = range(1, len(frame_metrics_plot) + 1)
            
            ax_score.plot(frame_metrics_plot["continuous_frame"], frame_metrics_plot["sep_nearest"],
                         'b-', linewidth=2, label='Separation')
            ax_score.axvline(x=continuous_frame_num, color='r', linestyle='--', linewidth=2)
            ax_score.set_xlabel('Frame Number (Continuous)')
            ax_score.set_ylabel('Separation (yards)')
            ax_score.set_title('Frame-by-Frame Separation')
            ax_score.grid(True, alpha=0.3)
            ax_score.legend()
            
            # Update running average
            ax_running.clear()
            running_avgs = []
            # Calculate running average up to current continuous frame
            for i in range(continuous_frame_num):
                if i < len(frame_metrics_plot):
                    avg = frame_metrics_plot.iloc[:i+1]["sep_nearest"].mean()
                    running_avgs.append(avg)
            
            if running_avgs:
                ax_running.plot(range(1, len(running_avgs)+1), running_avgs,
                              'g-', linewidth=2, marker='o', markersize=4, label='Running Average')
                ax_running.axvline(x=continuous_frame_num, color='r', linestyle='--', linewidth=2)
                ax_running.axhline(y=running_avgs[-1], color='r', linestyle='--', linewidth=2, alpha=0.5)
            ax_running.set_xlabel('Frame Number (Continuous)')
            ax_running.set_ylabel('Cumulative Avg Separation')
            ax_running.set_title(f'Running Average: {running_avg:.2f} yds')
            ax_running.grid(True, alpha=0.3)
            ax_running.legend()
            
            # Update info panel
            ax_info.clear()
            ax_info.axis('off')
            
            if not frame_metric.empty:
                info_text = f"""
FRAME {continuous_frame_num}/{total_frames} ({frame_type.upper()})
{'='*40}
Current Separation: {current_sep:.2f} yds
Running Average: {running_avg:.2f} yds

SEPARATION
Nearest Defender: {frame_metric.iloc[0]['sep_nearest']:.2f} yds
Defenders within 3 yds: {frame_metric.iloc[0]['num_def_within_3']}

MOTION
Speed: {frame_metric.iloc[0]['receiver_speed']:.2f} yds/s
Acceleration: {frame_metric.iloc[0]['receiver_accel']:.2f} yds/s²

BALL PROXIMITY
Distance to Ball: {frame_metric.iloc[0]['dist_to_ball']:.2f} yds
"""
            else:
                info_text = f"Frame {continuous_frame_num}/{total_frames} ({frame_type})\nNo metrics available"
            
            # Add separation indicator box (large, prominent)
            sep_box_text = f"SEPARATION: {current_sep:.2f} yds"
            if current_sep < 2.0:
                sep_box_color = 'red'
            elif current_sep < 5.0:
                sep_box_color = 'yellow'
            else:
                sep_box_color = 'lime'
            
            ax_info.text(0.5, 0.98, sep_box_text, transform=ax_info.transAxes,
                        fontsize=16, fontweight='bold', ha='center',
                        verticalalignment='top', family='monospace',
                        bbox=dict(boxstyle='round,pad=1', facecolor=sep_box_color, 
                                alpha=0.8, edgecolor='black', linewidth=3))
            
            ax_info.text(0.05, 0.85, info_text, transform=ax_info.transAxes,
                        fontsize=10, verticalalignment='top', family='monospace',
                        bbox=dict(boxstyle='round', facecolor='lightgray', alpha=0.9))
        
        # Create animation
        # Use continuous frame numbering for display
        total_frames = len(frames_list)
        print(f"Creating animation with {total_frames} frames ({len(input_frames_vis)} input + {len(output_frames_vis)} output)")
        
        anim = animation.FuncAnimation(fig, animate, frames=total_frames,
                                      interval=200, repeat=True)
        
        if save_path:
            print(f"Saving animation to {save_path}...")
            try:
                anim.save(save_path, writer='pillow', fps=5)
                print(f"✓ Animation saved successfully!")
            except Exception as e:
                print(f"Error saving animation: {e}")
                print("Trying with imagemagick writer...")
                try:
                    anim.save(save_path.replace('.gif', '.mp4'), writer='ffmpeg', fps=5)
                    print(f"✓ Saved as MP4 instead")
                except:
                    print("Could not save animation. Displaying instead...")
        
        if show_animation:
            plt.tight_layout()
            plt.show()
        
        return anim, frame_metrics


def prepare_lstm_features(scorer: RouteDominanceScorer, 
                          game_ids: List[int], play_ids: List[int]) -> Tuple[np.ndarray, np.ndarray]:
    """
    Prepare features for LSTM model
    
    Args:
        scorer: RouteDominanceScorer instance
        game_ids: List of game IDs
        play_ids: List of play IDs
        
    Returns:
        X: Feature sequences (n_samples, n_frames, n_features)
        y: Route dominance scores (n_samples,)
    """
    sequences = []
    targets = []
    
    for game_id, play_id in zip(game_ids, play_ids):
        # Get targeted receiver
        play_input = scorer.input_df[
            (scorer.input_df["game_id"] == game_id) &
            (scorer.input_df["play_id"] == play_id) &
            (scorer.input_df["player_role"] == "Targeted Receiver")
        ]
        
        if play_input.empty:
            continue
        
        target_nfl_id = play_input["nfl_id"].iloc[0]
        
        # Calculate frame metrics
        try:
            frame_metrics = scorer.calculate_frame_dominance(game_id, play_id, target_nfl_id)
        except:
            continue
        
        # Extract features for each frame
        feature_cols = [
            'sep_nearest', 'sep_second', 'num_def_within_2', 'num_def_within_3',
            'num_def_within_5', 'receiver_speed', 'receiver_accel', 'dist_to_ball'
        ]
        
        # Fill NaN values
        frame_features = frame_metrics[feature_cols].fillna(0).values
        
        # Pad or truncate to fixed length (e.g., 30 frames)
        max_frames = 30
        if len(frame_features) < max_frames:
            padding = np.zeros((max_frames - len(frame_features), len(feature_cols)))
            frame_features = np.vstack([frame_features, padding])
        else:
            frame_features = frame_features[:max_frames]
        
        sequences.append(frame_features)
        
        # Use average separation as route-level metric
        route_dom = frame_metrics["sep_nearest"].mean()
        targets.append(route_dom)
    
    X = np.array(sequences)
    y = np.array(targets)
    
    return X, y




In [5]:
"""
Route Dominance Scoring System for NFL Plays

This module provides:
1. Frame-by-frame dominance scoring
2. Route-level aggregation
3. Interactive/animated visualizations
4. LSTM-based prediction approach
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from matplotlib.patches import Circle, Rectangle
from scipy.stats import multivariate_normal
try:
    import seaborn as sns
except ImportError:
    sns = None  # Optional dependency
from pathlib import Path
from typing import Dict, List, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

# Field constants
FIELD_LENGTH = 120.0
FIELD_WIDTH = 53.3
FRAME_RATE = 10  # frames per second (NFL tracking data)


class RouteDominanceScorer:
    """Calculate frame-by-frame and route-level dominance scores"""
    
    def __init__(self, input_df: pd.DataFrame, output_df: pd.DataFrame, 
                 supp_df: pd.DataFrame):
        """
        Initialize with dataframes
        
        Args:
            input_df: Pre-throw tracking data (all frames before throw)
            output_df: Post-throw tracking data (ball in flight)
            supp_df: Supplementary play context data
        """
        self.input_df = input_df.copy()
        self.output_df = output_df.copy()
        self.supp_df = supp_df.copy()
        
        # Standardize coordinates
        self._standardize_coordinates()
        self._standardize_reciever_side()
        
        # Combine all frames
        self.all_frames_df = self._combine_all_frames()
        
    def _standardize_coordinates(self):
        """Standardize coordinates so offense always drives right"""
        # Standardize input data
        right_mask = self.input_df["play_direction"].str.lower() == "right"
        self.input_df["x_std"] = self.input_df["x"]
        self.input_df["y_std"] = self.input_df["y"]
        self.input_df.loc[~right_mask, "x_std"] = FIELD_LENGTH - self.input_df.loc[~right_mask, "x"]
        self.input_df.loc[~right_mask, "y_std"] = FIELD_WIDTH - self.input_df.loc[~right_mask, "y"]
        
        # Standardize ball landing
        self.input_df["ball_land_x_std"] = self.input_df["ball_land_x"]
        self.input_df["ball_land_y_std"] = self.input_df["ball_land_y"]
        self.input_df.loc[~right_mask, "ball_land_x_std"] = FIELD_LENGTH - self.input_df.loc[~right_mask, "ball_land_x"]
        self.input_df.loc[~right_mask, "ball_land_y_std"] = FIELD_WIDTH - self.input_df.loc[~right_mask, "ball_land_y"]
        
        # Get play direction mapping for output
        play_dir_map = self.input_df[["game_id", "play_id", "play_direction"]].drop_duplicates()
        self.output_df = self.output_df.merge(play_dir_map, on=["game_id", "play_id"], how="left")
        
        # Standardize output data
        right_mask_out = self.output_df["play_direction"].str.lower() == "right"
        self.output_df["x_std"] = self.output_df["x"]
        self.output_df["y_std"] = self.output_df["y"]
        self.output_df.loc[~right_mask_out, "x_std"] = FIELD_LENGTH - self.output_df.loc[~right_mask_out, "x"]
        self.output_df.loc[~right_mask_out, "y_std"] = FIELD_WIDTH - self.output_df.loc[~right_mask_out, "y"]
        
        # Add velocity components
        self.input_df["vx"] = self.input_df["s"] * np.cos(np.deg2rad(self.input_df["dir"].fillna(0)))
        self.input_df["vy"] = self.input_df["s"] * np.sin(np.deg2rad(self.input_df["dir"].fillna(0)))

    def _standardize_reciever_side(self):
        """Flip coordinates of players so reciever always above qb (on the left side from qb's perspective)"""
        #Find what side the reciever is on
        qb_pos = self.input_df[(self.input_df["player_role"] == "Passer") & (self.input_df['frame_id'] == 1)][['y_std', 'play_id', 'game_id']]
        wr_pos = self.input_df[(self.input_df["player_role"] == "Targeted Receiver") & (self.input_df['frame_id'] == 1)][['y_std', 'play_id', 'game_id']]
        merged_pos = pd.merge(qb_pos, wr_pos, on=['play_id', 'game_id'], how='left')
        merged_pos['receiver_side'] = np.where(merged_pos['y_std_x'] < merged_pos['y_std_y'], 'left', 'right')
        
        # Merge receiver_side into input_df, output_df, and supp_df based on game_id and play_id
        receiver_side_df = merged_pos[['game_id', 'play_id', 'receiver_side']]
        self.supp_df = self.supp_df.merge(receiver_side_df, on=['game_id', 'play_id'], how='left')
        self.input_df = self.input_df.merge(receiver_side_df, on=['game_id', 'play_id'], how='left')
        self.output_df = self.output_df.merge(receiver_side_df, on=['game_id', 'play_id'], how='left')

        #Flip the field of inputs, outputs and ball landing when reciever aligns to right of qb
        self.input_df.loc[self.input_df['receiver_side'] == 'right', 'y_std'] = FIELD_WIDTH - self.input_df.loc[self.input_df['receiver_side'] == 'right', 'y_std']
        self.output_df.loc[self.output_df['receiver_side'] == 'right', 'y_std'] = FIELD_WIDTH - self.output_df.loc[self.output_df['receiver_side'] == 'right', 'y_std']
        self.input_df.loc[self.input_df['receiver_side'] == 'right', 'ball_land_y_std'] = FIELD_WIDTH - self.input_df.loc[self.input_df['receiver_side'] == 'right', 'ball_land_y_std']
    
    def _combine_all_frames(self) -> pd.DataFrame:
        """
        Combine input and output frames to get complete play sequence
        
        Returns:
            DataFrame with all frames for each play
        """
        # Get metadata from input (player info, ball landing, etc.)
        input_meta = self.input_df[[
            "game_id", "play_id", "nfl_id", "player_name", "player_position",
            "player_side", "player_role", "ball_land_x_std", "ball_land_y_std",
            "num_frames_output"
        ]].drop_duplicates()
        
        # Prepare input frames (pre-throw)
        input_frames = self.input_df[[
            "game_id", "play_id", "nfl_id", "frame_id", "x_std", "y_std",
            "s", "a", "dir", "vx", "vy"
        ]].copy()
        input_frames["frame_type"] = "input"
        
        # Prepare output frames (ball in flight)
        # Need to map frame_id: output frames start at 1, but they correspond to
        # frames after the throw. We'll offset them
        output_frames = self.output_df[[
            "game_id", "play_id", "nfl_id", "frame_id", "x_std", "y_std"
        ]].copy()
        
        # Merge with input to get speed/acceleration if available
        # For output frames, we'll need to estimate or use previous values
        output_frames = output_frames.merge(
            input_meta, on=["game_id", "play_id", "nfl_id"], how="left"
        )
        
        # Get the last input frame for each player to estimate velocity
        last_input = input_frames.sort_values("frame_id").groupby(
            ["game_id", "play_id", "nfl_id"]
        ).last().reset_index()
        
        # For output frames, estimate speed from position changes
        output_frames = output_frames.sort_values(["game_id", "play_id", "nfl_id", "frame_id"])
        
        # Initialize speed/acceleration columns
        output_frames["s"] = np.nan
        output_frames["a"] = np.nan
        output_frames["dir"] = np.nan
        output_frames["vx"] = np.nan
        output_frames["vy"] = np.nan
        
        # Calculate speed from position changes
        for (gid, pid, nid), group in output_frames.groupby(["game_id", "play_id", "nfl_id"]):
            group = group.sort_values("frame_id")
            
            # Get last input frame for this player to use as starting point
            last_input_row = last_input[
                (last_input["game_id"] == gid) &
                (last_input["play_id"] == pid) &
                (last_input["nfl_id"] == nid)
            ]
            
            # Calculate position changes
            group["prev_x"] = group["x_std"].shift(1)
            group["prev_y"] = group["y_std"].shift(1)
            
            # For first output frame, use last input position
            if not last_input_row.empty:
                first_idx = group.index[0]
                output_frames.loc[first_idx, "prev_x"] = last_input_row.iloc[0]["x_std"]
                output_frames.loc[first_idx, "prev_y"] = last_input_row.iloc[0]["y_std"]
            
            # Calculate displacement
            dx = group["x_std"] - group["prev_x"]
            dy = group["y_std"] - group["prev_y"]
            
            # Calculate speed (yards per second)
            # Frame rate is 10 fps, so multiply by 10 to get yards/second
            speed = np.sqrt(dx**2 + dy**2) * FRAME_RATE
            direction = np.rad2deg(np.arctan2(dy, dx))
            
            # Calculate acceleration (change in speed)
            prev_speed = speed.shift(1)
            if not last_input_row.empty:
                prev_speed.iloc[0] = last_input_row.iloc[0]["s"]
            acceleration = (speed - prev_speed) * FRAME_RATE
            
            # Calculate velocity components
            vx = speed * np.cos(np.deg2rad(direction))
            vy = speed * np.sin(np.deg2rad(direction))
            
            # Fill NaN values with last known values
            speed = speed.ffill().fillna(0)
            direction = direction.ffill().fillna(0)
            acceleration = acceleration.fillna(0)
            vx = vx.fillna(0)
            vy = vy.fillna(0)
            
            # Special handling for first output frame: use average of last input and second output
            if len(group) > 1 and not last_input_row.empty:
                first_idx = group.index[0]  # First output frame
                second_idx = group.index[1]  # Second output frame
                
                last_input_speed = last_input_row.iloc[0]["s"]
                second_output_speed = speed.loc[second_idx]
                
                # Average the speeds for the first output frame
                if not np.isnan(second_output_speed) and not np.isnan(last_input_speed):
                    avg_speed = (last_input_speed + second_output_speed) / 2.0
                    speed.loc[first_idx] = avg_speed
                    
                    # Recalculate direction and velocity for first frame using averaged speed
                    first_dx = group.loc[first_idx, "x_std"] - group.loc[first_idx, "prev_x"]
                    first_dy = group.loc[first_idx, "y_std"] - group.loc[first_idx, "prev_y"]
                    first_dir = np.rad2deg(np.arctan2(first_dy, first_dx))
                    direction.loc[first_idx] = first_dir
                    vx.loc[first_idx] = avg_speed * np.cos(np.deg2rad(first_dir))
                    vy.loc[first_idx] = avg_speed * np.sin(np.deg2rad(first_dir))
                    
                    # Recalculate acceleration for first frame
                    acceleration.loc[first_idx] = (avg_speed - last_input_speed) * FRAME_RATE
            
            # Update output_frames
            for idx in group.index:
                output_frames.loc[idx, "s"] = speed.loc[idx]
                output_frames.loc[idx, "dir"] = direction.loc[idx]
                output_frames.loc[idx, "a"] = acceleration.loc[idx]
                output_frames.loc[idx, "vx"] = vx.loc[idx]
                output_frames.loc[idx, "vy"] = vy.loc[idx]
        
        # Clean up temporary columns
        if "prev_x" in output_frames.columns:
            output_frames = output_frames.drop(columns=["prev_x", "prev_y"])
        output_frames["frame_type"] = "output"
        
        # Combine input and output
        input_cols = ["game_id", "play_id", "nfl_id", "frame_id", "x_std", "y_std", 
                     "s", "a", "dir", "vx", "vy", "frame_type"]
        output_cols = ["game_id", "play_id", "nfl_id", "frame_id", "x_std", "y_std",
                      "s", "a", "dir", "vx", "vy", "frame_type"]
        
        all_frames = pd.concat([
            input_frames[input_cols],
            output_frames[output_cols]
        ], ignore_index=True)
        
        # Add metadata
        all_frames = all_frames.merge(input_meta, on=["game_id", "play_id", "nfl_id"], how="left")
        
        # Sort by frame_id
        all_frames = all_frames.sort_values(["game_id", "play_id", "frame_id", "nfl_id"])
        
        return all_frames
    
    def _detect_route_break(self, play_frames: pd.DataFrame) -> int:
        """
        Identify the frame_id where the route 'break' (hard cut) occurs for a receiver.
        Returns 0 if no distinct break is found (e.g., Go / Seam route).
        """
        # Consider only input frames (route running phase), ordered by frame_id
        route_frames = play_frames[play_frames["frame_type"] == "input"].sort_values("frame_id")
        if len(route_frames) < 5:
            return 0

        # 1. Get directions and fill NaNs forward, then with 0
        dirs = route_frames["dir"].fillna(method="ffill").fillna(0).values

        # 2. Direction change between consecutive frames (delta angle)
        diffs = np.abs(dirs[1:] - dirs[:-1])
        # Handle wrap-around at 360° (e.g. 350 -> 10 should be 20°, not 340°)
        diffs = np.where(diffs > 180.0, 360.0 - diffs, diffs)

        # 3. Smooth with a running average over 3 frames
        smooth_diffs = np.convolve(diffs, np.ones(3) / 3.0, mode="valid")
        if len(smooth_diffs) == 0:
            return 0

        # 4. Find the sharpest turn
        max_idx = int(np.argmax(smooth_diffs))
        max_val = float(smooth_diffs[max_idx])

        # Threshold: treat <15° as "no break" (Go / Seam)
        if max_val < 15.0:
            return 0

        # 5. Map back to frame index (account for diff + convolution offset)
        break_frame_index = max_idx + 1
        if break_frame_index >= len(route_frames):
            return 0

        return int(route_frames.iloc[break_frame_index]["frame_id"])
    
    def calculate_frame_dominance(self, game_id: int, play_id: int, 
                                   target_nfl_id: int) -> pd.DataFrame:
        """
        Calculate dominance score for each frame of a play
        
        Args:
            game_id: Game identifier
            play_id: Play identifier
            target_nfl_id: NFL ID of targeted receiver
            
        Returns:
            DataFrame with frame-by-frame dominance metrics
        """
        # Get all frames for this play
        play_frames = self.all_frames_df[
            (self.all_frames_df["game_id"] == game_id) &
            (self.all_frames_df["play_id"] == play_id)
        ].copy()
        
        if play_frames.empty:
            raise ValueError(f"Play {game_id}-{play_id} not found")
        
        # Get targeted receiver frames
        target_frames = play_frames[play_frames["nfl_id"] == target_nfl_id].copy()
        
        if target_frames.empty:
            raise ValueError(f"Targeted receiver {target_nfl_id} not found in play")
        
        # NEW: Detect route break (once per play, for this receiver)
        break_frame_id = self._detect_route_break(target_frames)
        
        # Get max input frame_id to build a continuous frame index across input/output
        input_subset = play_frames[play_frames["frame_type"] == "input"]
        max_input_id = int(input_subset["frame_id"].max()) if not input_subset.empty else 0

        # =================================================================
        # NEW: CALCULATE CONTEXT AT SNAP (Frame 1) - MATCHING ECP LOGIC
        # =================================================================
        # 1. Get positions at Frame 1 (the snap) using input frames
        frame_1_data = play_frames[
            (play_frames["frame_id"] == 1) &
            (play_frames["frame_type"] == "input")
        ]
        target_f1 = frame_1_data[frame_1_data["nfl_id"] == target_nfl_id]
        
        # Default values (if missing data)
        is_press = 0
        shade_encoded = 0 
        
        if not target_f1.empty and not frame_1_data.empty:
            # Receiver position at snap
            rec_y = target_f1.iloc[0]["y_std"]
            rec_x = target_f1.iloc[0]["x_std"]
            
            # Find the nearest defender at snap
            defenders_f1 = frame_1_data[frame_1_data["player_side"] == "Defense"]
            
            if not defenders_f1.empty:
                # Calculate distances to receiver
                dists = np.sqrt(
                    (defenders_f1["x_std"] - rec_x) ** 2 +
                    (defenders_f1["y_std"] - rec_y) ** 2
                )
                
                # Get nearest defender details
                nearest_idx = dists.idxmin()
                nearest_def_f1 = defenders_f1.loc[nearest_idx]
                min_dist = float(dists.min())
                def_y = nearest_def_f1["y_std"]
                
                # --- A. PRESS LOGIC (From ECP Script) ---
                # Threshold: < 5.0 yards
                is_press = 1 if min_dist < 5.0 else 0
                    
                # --- B. SHADE LOGIC (From ECP Script) ---
                # Formula: abs(RecY - Center) - abs(DefY - Center)
                # If Rec is further from center than Def -> Def is Inside -> Positive Score
                shade_score = abs(rec_y - 26.65) - abs(def_y - 26.65)
                
                if shade_score > 0.5:
                    shade_encoded = 1   # Inside Shade
                elif shade_score < -0.5:
                    shade_encoded = -1  # Outside Shade
                else:
                    shade_encoded = 0   # Head-Up
        # =================================================================
        
        # Get ball landing coordinates
        ball_land_x = target_frames["ball_land_x_std"].iloc[0]
        ball_land_y = target_frames["ball_land_y_std"].iloc[0]
        num_frames_output = target_frames["num_frames_output"].iloc[0]
        
        # Get play context
        supp_row = self.supp_df[
            (self.supp_df["game_id"] == game_id) &
            (self.supp_df["play_id"] == play_id)
        ]
        
        if supp_row.empty:
            route = "UNKNOWN"
            pass_result = "UNKNOWN"
        else:
            route = supp_row.iloc[0].get("route_of_targeted_receiver", "UNKNOWN")
            pass_result = supp_row.iloc[0].get("pass_result", "UNKNOWN")
        
        # Calculate metrics for each frame
        # Need to iterate through both input and output frames separately
        # because they have overlapping frame_ids (both start at 1)
        frame_metrics = []
        
        # Get input and output frames separately
        input_target_frames = target_frames[target_frames["frame_type"] == "input"].sort_values("frame_id")
        output_target_frames = target_frames[target_frames["frame_type"] == "output"].sort_values("frame_id")
        
        # Helper function to process a single frame
        def process_frame(target_frame_row, frame_type):
            frame_id = target_frame_row["frame_id"]
            target_frame = target_frame_row
            
            # Get all players at this frame (must match both frame_id AND frame_type)
            # This ensures we get all players from the output file for output frames
            frame_players = play_frames[
                (play_frames["frame_id"] == frame_id) & 
                (play_frames["frame_type"] == frame_type)
            ]
            
            # Get defenders
            defenders = frame_players[frame_players["player_side"] == "Defense"]
            
            # 1. Separation from nearest defenders (1st, 2nd, 3rd)
            def_dists = None
            nearest_def_indices = None
            
            if not defenders.empty:
                def_dists = np.sqrt(
                    (defenders["x_std"] - target_frame["x_std"])**2 +
                    (defenders["y_std"] - target_frame["y_std"])**2
                )
                
                # Sort defenders by separation to get 1st, 2nd, 3rd nearest
                sorted_dists = def_dists.sort_values()
                nearest_def_indices = sorted_dists.index[:3]
                
                sep_nearest = sorted_dists.iloc[0] if len(sorted_dists) >= 1 else np.inf
                sep_second = sorted_dists.iloc[1] if len(sorted_dists) >= 2 else np.nan
                sep_third = sorted_dists.iloc[2] if len(sorted_dists) >= 3 else np.nan
                
                num_def_within_2 = (def_dists <= 2.0).sum()
                num_def_within_3 = (def_dists <= 3.0).sum()
                num_def_within_5 = (def_dists <= 5.0).sum()
            else:
                sep_nearest = np.inf
                sep_second = np.nan
                sep_third = np.nan
                num_def_within_2 = 0
                num_def_within_3 = 0
                num_def_within_5 = 0
            
            # 2. Receiver speed and acceleration
            receiver_speed = target_frame["s"]
            receiver_accel = target_frame["a"]
            
            # 3. Distance to ball landing spot
            dist_to_ball = np.sqrt(
                (target_frame["x_std"] - ball_land_x)**2 +
                (target_frame["y_std"] - ball_land_y)**2
            )
            
            # 4. Leverage Angle - angle between defender-to-receiver and receiver-to-ball vectors
            leverage_angle = np.nan
            nearest_def = None
            if not defenders.empty and sep_nearest < np.inf:
                nearest_def = defenders.loc[def_dists.idxmin()]
                
                # Vector from nearest defender to receiver
                def_to_rec_x = target_frame["x_std"] - nearest_def["x_std"]
                def_to_rec_y = target_frame["y_std"] - nearest_def["y_std"]
                
                # Vector from receiver to ball landing
                rec_to_ball_x = ball_land_x - target_frame["x_std"]
                rec_to_ball_y = ball_land_y - target_frame["y_std"]
                
                # Calculate angle between vectors using dot product
                # angle = arccos((v1 · v2) / (||v1|| * ||v2||))
                dot_product = def_to_rec_x * rec_to_ball_x + def_to_rec_y * rec_to_ball_y
                mag_def_to_rec = np.sqrt(def_to_rec_x**2 + def_to_rec_y**2)
                mag_rec_to_ball = np.sqrt(rec_to_ball_x**2 + rec_to_ball_y**2)
                
                if mag_def_to_rec > 0 and mag_rec_to_ball > 0:
                    cos_angle = dot_product / (mag_def_to_rec * mag_rec_to_ball)
                    # Clamp to [-1, 1] to avoid numerical errors
                    cos_angle = np.clip(cos_angle, -1.0, 1.0)
                    angle_rad = np.arccos(cos_angle)
                    angle_deg = np.rad2deg(angle_rad)
                    
                    # Normalize to always be the smaller angle (≤ 180 degrees)
                    # arccos gives [0, π] which is [0, 180°], so we take the smaller of angle and 180-angle
                    leverage_angle = min(angle_deg, 180.0 - angle_deg)
                else:
                    leverage_angle = np.nan

            # 4b. Relative velocity angle (cosine similarity of movement directions)
            #     between the receiver and the nearest defender.
            relative_velocity_angle = np.nan
            rec_dir_val = target_frame.get("dir", np.nan)
            if nearest_def is not None and not np.isnan(rec_dir_val):
                def_dir_val = nearest_def.get("dir", np.nan)
                if not np.isnan(def_dir_val):
                    rec_rad = np.deg2rad(rec_dir_val)
                    def_rad = np.deg2rad(def_dir_val)
                    relative_velocity_angle = float(np.cos(rec_rad - def_rad))
            
            # 5. Time-to-ball metrics and closing-time-based pressure score
            # Receiver time to ball
            to_ball_x = ball_land_x - target_frame["x_std"]
            to_ball_y = ball_land_y - target_frame["y_std"]
            to_ball_dist = np.sqrt(to_ball_x**2 + to_ball_y**2)
            if to_ball_dist > 0 and not np.isnan(target_frame.get("vx", np.nan)):
                rec_vx = float(target_frame["vx"])
                rec_vy = float(target_frame["vy"])
                to_ball_unit_x = to_ball_x / to_ball_dist
                to_ball_unit_y = to_ball_y / to_ball_dist
                speed_toward_ball = rec_vx * to_ball_unit_x + rec_vy * to_ball_unit_y
                if speed_toward_ball > 0:
                    time_to_ball = to_ball_dist / max(speed_toward_ball, 1e-6)
                else:
                    time_to_ball = np.inf
            else:
                time_to_ball = np.inf

            # Defender time to ball for nearest defender (if any)
            if nearest_def is not None:
                def_to_ball_x = float(ball_land_x - nearest_def["x_std"])
                def_to_ball_y = float(ball_land_y - nearest_def["y_std"])
                def_to_ball_dist = np.sqrt(def_to_ball_x**2 + def_to_ball_y**2)
                def_speed = float(nearest_def.get("s", 0.0))
                def_dir = float(nearest_def.get("dir", 0.0)) if not np.isnan(
                    nearest_def.get("dir", np.nan)
                ) else 0.0

                if def_to_ball_dist > 0 and def_speed > 0:
                    def_vx = float(nearest_def.get("vx", def_speed * np.cos(np.deg2rad(def_dir))))
                    def_vy = float(nearest_def.get("vy", def_speed * np.sin(np.deg2rad(def_dir))))
                    def_to_ball_unit_x = def_to_ball_x / def_to_ball_dist
                    def_to_ball_unit_y = def_to_ball_y / def_to_ball_dist
                    def_speed_toward_ball = def_vx * def_to_ball_unit_x + def_vy * def_to_ball_unit_y
                    if def_speed_toward_ball > 0:
                        def_time_to_ball = def_to_ball_dist / max(def_speed_toward_ball, 1e-6)
                    else:
                        def_time_to_ball = np.inf
                else:
                    def_time_to_ball = np.inf
            else:
                def_time_to_ball = np.inf

            # Time advantage: defender time minus receiver time (positive = defender later)
            if np.isfinite(time_to_ball) and np.isfinite(def_time_to_ball):
                time_advantage = def_time_to_ball - time_to_ball
            else:
                time_advantage = np.nan

            # Closing time for pressure: how long until defender arrives after receiver (0 = already closed)
            if np.isfinite(time_to_ball) and np.isfinite(def_time_to_ball):
                closing_time = max(0.0, def_time_to_ball - time_to_ball)
            else:
                closing_time = np.inf

            # Map closing_time to a 0–100 pressure score (higher = more defensive pressure)
            if not np.isfinite(closing_time):
                pressure_score = 0
            else:
                # Exponential decay: 0s -> ~100, 0.7s -> ~50, 3s -> ~5
                pressure_score = int(np.clip(100.0 * np.exp(-1.0 * closing_time), 0.0, 100.0))
            
            # 6. Initial Leverage (angle advantage) - calculated at start of route
            # Leverage: is the defender between the receiver and the ball?
            if frame_id == 1 and not defenders.empty:
                # Get initial positions
                target_start_x = target_frame["x_std"]
                target_start_y = target_frame["y_std"]
                
                # Vector from target to ball
                target_to_ball_x = ball_land_x - target_start_x
                target_to_ball_y = ball_land_y - target_start_y
                
                leverage_scores = []
                for _, def_row in defenders.iterrows():
                    # Vector from target to defender
                    target_to_def_x = def_row["x_std"] - target_start_x
                    target_to_def_y = def_row["y_std"] - target_start_y
                    
                    # Check if defender is "between" target and ball
                    # Using dot product to check alignment
                    dot_product = (target_to_def_x * target_to_ball_x + 
                                 target_to_def_y * target_to_ball_y)
                    
                    if dot_product > 0:  # Defender is in front (bad leverage)
                        leverage = -1.0
                    else:  # Defender is behind (good leverage)
                        leverage = 1.0
                    
                    leverage_scores.append(leverage)
                
                initial_leverage = np.mean(leverage_scores) if leverage_scores else 0.0
            else:
                initial_leverage = np.nan if frame_id == 1 else None
            
            # 6. Calculate Receiver Pressure using multivariate normal PDFs
            # Use up to the 3 nearest defenders within 6 yards
            
            # Step 1: Receiver Influence PDF (6-yard radius)
            # Center is 2 yards in front of receiver, towards ball landing position
            receiver_x = target_frame["x_std"]
            receiver_y = target_frame["y_std"]
            
            # Calculate direction vector from receiver to ball landing
            to_ball_x = ball_land_x - receiver_x
            to_ball_y = ball_land_y - receiver_y
            dist_to_ball = np.sqrt(to_ball_x**2 + to_ball_y**2)
            
            # Normalize direction vector and move center 2 yards forward
            if dist_to_ball > 0:
                unit_x = to_ball_x / dist_to_ball
                unit_y = to_ball_y / dist_to_ball
            else:
                # If ball is at receiver position, use default direction
                unit_x = 1.0
                unit_y = 0.0
            
            # Center of influence zone: 2 yards in front of receiver (towards ball)
            center_x = receiver_x + 2.0 * unit_x
            center_y = receiver_y + 2.0 * unit_y
            
            # Create field grid for PDF calculation
            x_grid, y_grid = np.mgrid[0:53.3:0.5, 0:120:0.5]
            locations = np.dstack((x_grid, y_grid))
            
            # Receiver PDF: 6-yard radius circular distribution, centered 2 yards in front
            receiver_pdf = multivariate_normal([center_y, center_x], [[6, 0], [0, 6]]).pdf(locations)
            
            # Step 2: Defender Pressure PDF - sum up to the 3 nearest defenders within 6 yards
            # Each of the 1st, 2nd, 3rd nearest defenders (if present) gets its own PDF,
            # weighted by separation distance
            defender_pdf = np.zeros_like(receiver_pdf)
            
            if def_dists is not None and nearest_def_indices is not None:
                for def_idx in nearest_def_indices:
                    def_row = defenders.loc[def_idx]
                    def_x = def_row["x_std"]
                    def_y = def_row["y_std"]
                    
                    # Separation for this specific defender
                    separation = float(def_dists.loc[def_idx])
                    
                    # Only include defenders within 6 yards
                    if separation <= 6.0:
                        # Weight function: w(s) = 1 / (1 + s/5) where s = separation in yards
                        # Closer defenders have more influence
                        def_weight = 1.0 / (1.0 + separation / 5.0)
                        
                        # Defender PDF: 4-yard radius, weighted by separation
                        # Each defender contributes its own pressure PDF
                        def_pdf = multivariate_normal([def_y, def_x], [[4, 0], [0, 4]]).pdf(locations) * def_weight
                        
                        # Add to total defender pressure (sum of all defender PDFs)
                        defender_pdf = defender_pdf + def_pdf
            
            # Step 3: Dominance Ratio
            # Combine both PDFs
            total_pdf = receiver_pdf + defender_pdf + 1e-10  # epsilon to prevent division by zero
            
            # Dominance ratio: receiver influence / total influence
            # Values range from 0 to 1:
            # 1.0 = pure receiver influence (no defender pressure)
            # 0.0 = pure defender pressure (no receiver influence)
            dominance_pdf = receiver_pdf / total_pdf
            
            # Step 4: Final Receiver Pressure Score
            # Focus on the receiver's immediate area (6-yard radius, centered 2 yards in front)
            receiver_area_pdf = multivariate_normal([center_y, center_x], [[6, 0], [0, 6]]).pdf(locations)
            
            # Weight dominance by receiver area to focus on receiver's immediate area
            weighted_dominance = receiver_area_pdf * dominance_pdf
            
            # Calculate average dominance in receiver's area
            receiver_pressure = np.sum(weighted_dominance) / np.sum(receiver_area_pdf)
            
            # Step 5: Normalization
            # Normalize: (val - 0.5) / (0.8 - 0.5)
            receiver_pressure = (receiver_pressure - 0.50) / (0.80 - 0.50)
            
            # Clamp to [0, 1]
            receiver_pressure = max(0.0, min(1.0, receiver_pressure))

            # 7. Spatial ownership / field control at projected receiver spot
            #    (Physics-based control score, 0-1)
            rec_pos = np.array([receiver_x, receiver_y])
            rec_speed_val = float(receiver_speed) if not pd.isna(receiver_speed) else 0.0
            rec_dir_val = target_frame.get("dir", 0.0)
            if pd.isna(rec_dir_val):
                rec_dir_val = 0.0
            rec_angle = np.radians(90.0 - rec_dir_val)

            # Project receiver 0.5s into the future
            rec_mu = rec_pos + np.array(
                [rec_speed_val * np.sin(rec_angle), rec_speed_val * np.cos(rec_angle)]
            ) * 0.5

            # Receiver influence ellipse
            long_axis = 2.0 + (rec_speed_val / 4.0)
            short_axis = 1.5
            c_rc, s_rc = np.cos(rec_angle), np.sin(rec_angle)
            R_rc = np.array([[c_rc, -s_rc], [s_rc, c_rc]])
            S_rc = np.array([[long_axis ** 2, 0.0], [0.0, short_axis ** 2]])
            rec_cov = R_rc.dot(S_rc).dot(R_rc.T)

            target_spot = rec_mu
            rec_influence = multivariate_normal(rec_mu, rec_cov).pdf(target_spot)

            # Defender influence at the same target spot
            total_def_influence = 0.0
            if not defenders.empty:
                for _, def_row in defenders.iterrows():
                    def_speed_val = def_row.get("s", 0.0)
                    def_dir_val = def_row.get("dir", 0.0)
                    if pd.isna(def_dir_val):
                        def_dir_val = 0.0
                    def_angle = np.radians(90.0 - def_dir_val)

                    def_pos = np.array([def_row["x_std"], def_row["y_std"]])
                    def_mu = def_pos + np.array(
                        [def_speed_val * np.sin(def_angle), def_speed_val * np.cos(def_angle)]
                    ) * 0.5

                    d_long = 2.0 + (def_speed_val / 4.0)
                    d_short = 1.5
                    c_df, s_df = np.cos(def_angle), np.sin(def_angle)
                    R_df = np.array([[c_df, -s_df], [s_df, c_df]])
                    S_df = np.array([[d_long ** 2, 0.0], [0.0, d_short ** 2]])
                    def_cov = R_df.dot(S_df).dot(R_df.T)

                    total_def_influence += multivariate_normal(def_mu, def_cov).pdf(target_spot)

            field_control = rec_influence / (rec_influence + total_def_influence + 1e-6)

            # 8. Route clock features relative to the detected break frame
            if break_frame_id == 0:
                # No distinct break detected (e.g., Go route) -> constant stem phase
                is_break_frame = 0
                frames_since_break = -30
            else:
                # Build a continuous frame index across input and output streams
                if frame_type == "input":
                    continuous_id = frame_id
                else:
                    # Output frames start immediately after the last input frame
                    continuous_id = max_input_id + frame_id

                raw_diff = int(continuous_id - break_frame_id)
                # Flag frames within ±1 of the break frame
                is_break_frame = 1 if abs(raw_diff) <= 1 else 0
                # Clip the clock to [-30, 30]
                frames_since_break = max(-30, min(30, raw_diff))
            
            return {
                "game_id": game_id,
                "play_id": play_id,
                "nfl_id": target_nfl_id,
                "frame_id": frame_id,
                "frame_type": frame_type,
                "x": target_frame["x_std"],
                "y": target_frame["y_std"],
                "sep_nearest": sep_nearest,
                "sep_second": sep_second,
                "num_def_within_2": num_def_within_2,
                "num_def_within_3": num_def_within_3,
                "num_def_within_5": num_def_within_5,
                "receiver_speed": receiver_speed,
                "receiver_accel": receiver_accel,
                "dist_to_ball": dist_to_ball,
                "leverage_angle": leverage_angle,
                "relative_velocity_angle": relative_velocity_angle,
                "time_to_ball": time_to_ball,
                "def_time_to_ball": def_time_to_ball,
                "time_advantage": time_advantage,
                "closing_time": closing_time,
                "pressure_score": pressure_score,
                "initial_leverage": initial_leverage if frame_id == 1 and frame_type == "input" else None,
                # NEW CONTEXT METRICS (pre-snap, constant across frames for this play)
                "is_press": is_press,
                "shade_encoded": shade_encoded,
                "receiver_pressure": receiver_pressure,  # Calculated using PDFs, accounts for nearest defenders within 6 yards
                "field_control": field_control,  # Spatial ownership / control score (0-1)
                "is_break_frame": is_break_frame,
                "frames_since_break": frames_since_break,
                "route": route,
                "pass_result": pass_result
            }
        
        # Process all input frames
        for _, target_frame_row in input_target_frames.iterrows():
            metrics = process_frame(target_frame_row, "input")
            frame_metrics.append(metrics)
        
        # Process all output frames
        for _, target_frame_row in output_target_frames.iterrows():
            metrics = process_frame(target_frame_row, "output")
            frame_metrics.append(metrics)
        
        return pd.DataFrame(frame_metrics)
    

class RouteDominanceVisualizer:
    """Create visualizations for route dominance"""
    
    def __init__(self, scorer: RouteDominanceScorer):
        self.scorer = scorer
    
    def visualize_play_dominance(self, game_id: int, play_id: int,
                                  target_nfl_id: int, save_path: Optional[str] = None,
                                  show_animation: bool = True):
        """
        Create animated visualization showing dominance evolution
        
        Args:
            game_id: Game identifier
            play_id: Play identifier
            target_nfl_id: NFL ID of targeted receiver
            save_path: Path to save animation (optional)
            show_animation: Whether to display animation
        """
        # Calculate frame metrics
        frame_metrics = self.scorer.calculate_frame_dominance(
            game_id, play_id, target_nfl_id
        )
        
        # Get all frames for visualization
        play_frames = self.scorer.all_frames_df[
            (self.scorer.all_frames_df["game_id"] == game_id) &
            (self.scorer.all_frames_df["play_id"] == play_id)
        ]
        
        # Get play context
        supp_row = self.scorer.supp_df[
            (self.scorer.supp_df["game_id"] == game_id) &
            (self.scorer.supp_df["play_id"] == play_id)
        ]
        
        route = frame_metrics["route"].iloc[0]
        pass_result = frame_metrics["pass_result"].iloc[0]
        
        # Get ball landing
        ball_land_x = play_frames["ball_land_x_std"].iloc[0]
        ball_land_y = play_frames["ball_land_y_std"].iloc[0]
        
        # Create figure
        fig = plt.figure(figsize=(20, 12))
        
        # Field view
        ax_field = plt.subplot2grid((3, 4), (0, 0), colspan=3, rowspan=3)
        
        # Dominance score over time
        ax_score = plt.subplot2grid((3, 4), (0, 3))
        
        # Running average
        ax_running = plt.subplot2grid((3, 4), (1, 3))
        
        # Info panel
        ax_info = plt.subplot2grid((3, 4), (2, 3))
        ax_info.axis('off')
        
        # Draw field
        def draw_field():
            ax_field.clear()
            ax_field.set_xlim(-5, FIELD_LENGTH + 5)
            ax_field.set_ylim(-5, FIELD_WIDTH + 5)
            ax_field.set_aspect('equal')
            ax_field.set_facecolor('#0d5f20')
            
            # End zones
            endzone = Rectangle((0, 0), 10, FIELD_WIDTH, facecolor='navy', alpha=0.5)
            ax_field.add_patch(endzone)
            endzone2 = Rectangle((FIELD_LENGTH - 10, 0), 10, FIELD_WIDTH, facecolor='navy', alpha=0.5)
            ax_field.add_patch(endzone2)
            
            # Yard lines
            for yard in range(10, int(FIELD_LENGTH - 10) + 1, 5):
                ax_field.axvline(x=yard, color='white', linewidth=0.5, alpha=0.3)
                if yard % 10 == 0:
                    ax_field.text(yard, FIELD_WIDTH/2, str(yard), ha='center', va='center',
                                 color='white', fontsize=8, fontweight='bold',
                                 bbox=dict(boxstyle='round', facecolor='black', alpha=0.5))
        
        # Animation function
        # Create continuous frame sequence: input frames first, then output frames
        # Since frame_id overlaps (both start at 1), we need to handle them separately
        input_frames_vis = play_frames[play_frames["frame_type"] == "input"].sort_values("frame_id")
        output_frames_vis = play_frames[play_frames["frame_type"] == "output"].sort_values("frame_id")
        
        # Create list of (frame_id, frame_type) tuples for proper sequencing
        frames_list = []
        for _, row in input_frames_vis.iterrows():
            frames_list.append((row["frame_id"], "input", row.name))
        for _, row in output_frames_vis.iterrows():
            frames_list.append((row["frame_id"], "output", row.name))
        
        total_frames = len(frames_list)
        
        def animate(frame_idx):
            frame_id, frame_type, row_idx = frames_list[frame_idx]
            
            # Draw field
            draw_field()
            
            # Get frame data - need to match both frame_id AND frame_type
            frame_data = play_frames[
                (play_frames["frame_id"] == frame_id) & 
                (play_frames["frame_type"] == frame_type)
            ]
            frame_metric = frame_metrics[
                (frame_metrics["frame_id"] == frame_id) & 
                (frame_metrics["frame_type"] == frame_type)
            ]
            
            if not frame_metric.empty:
                current_sep = frame_metric.iloc[0]["sep_nearest"]
                # Calculate running average up to current continuous frame
                running_avg = frame_metrics.iloc[:frame_idx+1]["sep_nearest"].mean()
            else:
                current_sep = 0.0
                running_avg = 0.0
            
            # Plot all players with color-coding based on separation (for targeted receiver)
            for _, player in frame_data.iterrows():
                is_target = player["nfl_id"] == target_nfl_id
                
                if is_target:
                    # Color-code targeted receiver by separation
                    if not frame_metric.empty:
                        sep = frame_metric.iloc[0]["sep_nearest"]
                        # Color scale: red (low separation) -> yellow (medium) -> green (high separation)
                        if sep < 2.0:
                            target_color = 'red'
                        elif sep < 5.0:
                            target_color = 'yellow'
                        else:
                            target_color = 'lime'
                    else:
                        target_color = 'yellow'
                        sep = 0.0
                    
                    # Highlight targeted receiver with separation-based color
                    ax_field.scatter(player["x_std"], player["y_std"], 
                                   c=target_color, s=400, marker='*',
                                   edgecolors='black', linewidths=3, zorder=10)
                    
                    # Add separation text near receiver
                    ax_field.annotate(f'{sep:.1f}', 
                                    (player["x_std"], player["y_std"]),
                                    xytext=(0, 15), textcoords='offset points',
                                    fontsize=12, fontweight='bold',
                                    bbox=dict(boxstyle='round,pad=0.5', 
                                            facecolor='white', alpha=0.9,
                                            edgecolor='black', linewidth=2),
                                    ha='center', zorder=11)
                else:
                    # Regular players
                    color = 'orange' if player["player_side"] == "Offense" else 'blue'
                    ax_field.scatter(player["x_std"], player["y_std"],
                                   c=color, s=150, alpha=0.7, zorder=5)
            
            # Plot ball landing
            ax_field.scatter(ball_land_x, ball_land_y, c='yellow', s=400,
                           marker='X', edgecolors='black', linewidths=2, zorder=9)
            
            # Draw separation circle for targeted receiver with separation-based color
            if not frame_metric.empty:
                target_row = frame_data[frame_data["nfl_id"] == target_nfl_id]
                if not target_row.empty:
                    target_x = target_row.iloc[0]["x_std"]
                    target_y = target_row.iloc[0]["y_std"]
                    sep = frame_metric.iloc[0]["sep_nearest"]
                    
                    if sep < np.inf:
                        # Color circle based on separation
                        if sep < 2.0:
                            circle_color = 'red'
                        elif sep < 5.0:
                            circle_color = 'orange'
                        else:
                            circle_color = 'green'
                        
                        circle = Circle((target_x, target_y), sep, fill=False,
                                      edgecolor=circle_color, linewidth=2.5, 
                                      linestyle='--', alpha=0.7)
                        ax_field.add_patch(circle)
                        
                        # Draw line to nearest defender
                        defenders = frame_data[frame_data["player_side"] == "Defense"]
                        if not defenders.empty:
                            def_dists = np.sqrt(
                                (defenders["x_std"] - target_x)**2 +
                                (defenders["y_std"] - target_y)**2
                            )
                            nearest_idx = def_dists.idxmin()
                            nearest_def = defenders.loc[nearest_idx]
                            ax_field.plot([target_x, nearest_def["x_std"]], 
                                         [target_y, nearest_def["y_std"]],
                                         color=circle_color, linewidth=2, 
                                         linestyle=':', alpha=0.6, zorder=1)
            
            # Title - show continuous frame number and frame type
            continuous_frame_num = frame_idx + 1
            ax_field.set_title(
                f"Game {game_id}, Play {play_id} | Frame {continuous_frame_num}/{total_frames} ({frame_type}) | "
                f"Route: {route} | Result: {pass_result}",
                fontsize=12, fontweight='bold', color='white', pad=10
            )
            
            # Update score plot - use continuous frame numbers
            ax_score.clear()
            # Create continuous frame numbers for plotting
            frame_metrics_plot = frame_metrics.copy()
            frame_metrics_plot['continuous_frame'] = range(1, len(frame_metrics_plot) + 1)
            
            ax_score.plot(frame_metrics_plot["continuous_frame"], frame_metrics_plot["sep_nearest"],
                         'b-', linewidth=2, label='Separation')
            ax_score.axvline(x=continuous_frame_num, color='r', linestyle='--', linewidth=2)
            ax_score.set_xlabel('Frame Number (Continuous)')
            ax_score.set_ylabel('Separation (yards)')
            ax_score.set_title('Frame-by-Frame Separation')
            ax_score.grid(True, alpha=0.3)
            ax_score.legend()
            
            # Update running average
            ax_running.clear()
            running_avgs = []
            # Calculate running average up to current continuous frame
            for i in range(continuous_frame_num):
                if i < len(frame_metrics_plot):
                    avg = frame_metrics_plot.iloc[:i+1]["sep_nearest"].mean()
                    running_avgs.append(avg)
            
            if running_avgs:
                ax_running.plot(range(1, len(running_avgs)+1), running_avgs,
                              'g-', linewidth=2, marker='o', markersize=4, label='Running Average')
                ax_running.axvline(x=continuous_frame_num, color='r', linestyle='--', linewidth=2)
                ax_running.axhline(y=running_avgs[-1], color='r', linestyle='--', linewidth=2, alpha=0.5)
            ax_running.set_xlabel('Frame Number (Continuous)')
            ax_running.set_ylabel('Cumulative Avg Separation')
            ax_running.set_title(f'Running Average: {running_avg:.2f} yds')
            ax_running.grid(True, alpha=0.3)
            ax_running.legend()
            
            # Update info panel
            ax_info.clear()
            ax_info.axis('off')
            
            if not frame_metric.empty:
                info_text = f"""
FRAME {continuous_frame_num}/{total_frames} ({frame_type.upper()})
{'='*40}
Current Separation: {current_sep:.2f} yds
Running Average: {running_avg:.2f} yds

SEPARATION
Nearest Defender: {frame_metric.iloc[0]['sep_nearest']:.2f} yds
Defenders within 3 yds: {frame_metric.iloc[0]['num_def_within_3']}

MOTION
Speed: {frame_metric.iloc[0]['receiver_speed']:.2f} yds/s
Acceleration: {frame_metric.iloc[0]['receiver_accel']:.2f} yds/s²

BALL PROXIMITY
Distance to Ball: {frame_metric.iloc[0]['dist_to_ball']:.2f} yds
"""
            else:
                info_text = f"Frame {continuous_frame_num}/{total_frames} ({frame_type})\nNo metrics available"
            
            # Add separation indicator box (large, prominent)
            sep_box_text = f"SEPARATION: {current_sep:.2f} yds"
            if current_sep < 2.0:
                sep_box_color = 'red'
            elif current_sep < 5.0:
                sep_box_color = 'yellow'
            else:
                sep_box_color = 'lime'
            
            ax_info.text(0.5, 0.98, sep_box_text, transform=ax_info.transAxes,
                        fontsize=16, fontweight='bold', ha='center',
                        verticalalignment='top', family='monospace',
                        bbox=dict(boxstyle='round,pad=1', facecolor=sep_box_color, 
                                alpha=0.8, edgecolor='black', linewidth=3))
            
            ax_info.text(0.05, 0.85, info_text, transform=ax_info.transAxes,
                        fontsize=10, verticalalignment='top', family='monospace',
                        bbox=dict(boxstyle='round', facecolor='lightgray', alpha=0.9))
        
        # Create animation
        # Use continuous frame numbering for display
        total_frames = len(frames_list)
        print(f"Creating animation with {total_frames} frames ({len(input_frames_vis)} input + {len(output_frames_vis)} output)")
        
        anim = animation.FuncAnimation(fig, animate, frames=total_frames,
                                      interval=200, repeat=True)
        
        if save_path:
            print(f"Saving animation to {save_path}...")
            try:
                anim.save(save_path, writer='pillow', fps=5)
                print(f"✓ Animation saved successfully!")
            except Exception as e:
                print(f"Error saving animation: {e}")
                print("Trying with imagemagick writer...")
                try:
                    anim.save(save_path.replace('.gif', '.mp4'), writer='ffmpeg', fps=5)
                    print(f"✓ Saved as MP4 instead")
                except:
                    print("Could not save animation. Displaying instead...")
        
        if show_animation:
            plt.tight_layout()
            plt.show()
        
        return anim, frame_metrics


def prepare_lstm_features(scorer: RouteDominanceScorer, 
                          game_ids: List[int], play_ids: List[int]) -> Tuple[np.ndarray, np.ndarray]:
    """
    Prepare features for LSTM model
    
    Args:
        scorer: RouteDominanceScorer instance
        game_ids: List of game IDs
        play_ids: List of play IDs
        
    Returns:
        X: Feature sequences (n_samples, n_frames, n_features)
        y: Route dominance scores (n_samples,)
    """
    sequences = []
    targets = []
    
    for game_id, play_id in zip(game_ids, play_ids):
        # Get targeted receiver
        play_input = scorer.input_df[
            (scorer.input_df["game_id"] == game_id) &
            (scorer.input_df["play_id"] == play_id) &
            (scorer.input_df["player_role"] == "Targeted Receiver")
        ]
        
        if play_input.empty:
            continue
        
        target_nfl_id = play_input["nfl_id"].iloc[0]
        
        # Calculate frame metrics
        try:
            frame_metrics = scorer.calculate_frame_dominance(game_id, play_id, target_nfl_id)
        except:
            continue
        
        # Extract features for each frame
        feature_cols = [
            'sep_nearest', 'sep_second', 'num_def_within_2', 'num_def_within_3',
            'num_def_within_5', 'receiver_speed', 'receiver_accel', 'dist_to_ball'
        ]
        
        # Fill NaN values
        frame_features = frame_metrics[feature_cols].fillna(0).values
        
        # Pad or truncate to fixed length (e.g., 30 frames)
        max_frames = 30
        if len(frame_features) < max_frames:
            padding = np.zeros((max_frames - len(frame_features), len(feature_cols)))
            frame_features = np.vstack([frame_features, padding])
        else:
            frame_features = frame_features[:max_frames]
        
        sequences.append(frame_features)
        
        # Use average separation as route-level metric
        route_dom = frame_metrics["sep_nearest"].mean()
        targets.append(route_dom)
    
    X = np.array(sequences)
    y = np.array(targets)
    
    return X, y



In [6]:
# interactive_route_control_visualization.py
"""
Interactive Route Dominance Visualizer

Navigate frame-by-frame through a play using arrow keys:
- Left Arrow: Previous frame
- Right Arrow: Next frame
- Up Arrow: Jump to first frame
- Down Arrow: Jump to last frame
- 'q' or Escape: Quit
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Circle, Rectangle, FancyArrowPatch, Arc, Arrow
import sys

class InteractiveRouteDominanceViewer:
    """Interactive frame-by-frame viewer for route dominance"""
    
    def __init__(self, scorer, game_id, play_id, target_nfl_id):
        self.scorer = scorer
        self.game_id = game_id
        self.play_id = play_id
        self.target_nfl_id = target_nfl_id
        
        # Calculate frame metrics
        print("Calculating frame-by-frame dominance...")
        self.frame_metrics = scorer.calculate_frame_dominance(game_id, play_id, target_nfl_id)
        
        # Get all frames for the play (all players for visualization)
        self.play_frames = scorer.all_frames_df[
            (scorer.all_frames_df["game_id"] == game_id) &
            (scorer.all_frames_df["play_id"] == play_id)
        ]
        
        # Get frames for targeted receiver only (for sequencing)
        target_frames = self.play_frames[
            self.play_frames["nfl_id"] == target_nfl_id
        ]
        
        # Separate input and output frames for targeted receiver
        input_frames = target_frames[target_frames["frame_type"] == "input"].sort_values("frame_id")
        output_frames = target_frames[target_frames["frame_type"] == "output"].sort_values("frame_id")
        
        # Create continuous frame list
        self.frames_list = []
        for _, row in input_frames.iterrows():
            self.frames_list.append((row["frame_id"], "input", row.name))
        for _, row in output_frames.iterrows():
            self.frames_list.append((row["frame_id"], "output", row.name))
        
        self.current_frame_idx = 0
        self.total_frames = len(self.frames_list)
        
        # Get play context
        supp_row = scorer.supp_df[
            (scorer.supp_df["game_id"] == game_id) &
            (scorer.supp_df["play_id"] == play_id)
        ]
        if not supp_row.empty:
            self.route = supp_row.iloc[0].get("route_of_targeted_receiver", "UNKNOWN")
            self.pass_result = supp_row.iloc[0].get("pass_result", "UNKNOWN")
            self.offense_formation = supp_row.iloc[0].get("offense_formation", "UNKNOWN")
            self.receiver_alignment = supp_row.iloc[0].get("receiver_alignment", "UNKNOWN")
            self.coverage_type = supp_row.iloc[0].get("team_coverage_type", "UNKNOWN")
            self.pass_length = supp_row.iloc[0].get("pass_length", "UNKNOWN")
            self.down = supp_row.iloc[0].get("down", "UNKNOWN")
            self.yards_to_go = supp_row.iloc[0].get("yards_to_go", "UNKNOWN")
        else:
            self.route = "UNKNOWN"
            self.pass_result = "UNKNOWN"
            self.offense_formation = "UNKNOWN"
            self.receiver_alignment = "UNKNOWN"
            self.coverage_type = "UNKNOWN"
            self.pass_length = "UNKNOWN"
            self.down = "UNKNOWN"
            self.yards_to_go = "UNKNOWN"
        
        # Get ball landing
        self.ball_land_x = self.play_frames["ball_land_x_std"].iloc[0]
        self.ball_land_y = self.play_frames["ball_land_y_std"].iloc[0]
        
        # Get target name
        target_info = self.play_frames[
            (self.play_frames["nfl_id"] == target_nfl_id) &
            (self.play_frames["frame_type"] == "input")
        ]
        if not target_info.empty:
            self.target_name = target_info["player_name"].iloc[0]
        else:
            self.target_name = f"Player {target_nfl_id}"
        
        # Create figure with more space for info panel
        self.fig = plt.figure(figsize=(22, 14))
        self.fig.canvas.mpl_connect('key_press_event', self.on_key_press)
        
        # Create subplots - field on left, metrics on right, info panel below field
        self.ax_field = plt.subplot2grid((4, 4), (0, 0), colspan=3, rowspan=3)
        self.ax_score = plt.subplot2grid((4, 4), (0, 3))
        self.ax_running = plt.subplot2grid((4, 4), (1, 3))
        self.ax_info = plt.subplot2grid((4, 4), (3, 0), colspan=4)  # Full width at bottom
        self.ax_info.axis('off')
        
        # Draw initial frame
        self.update_display()
        
        # Instructions
        print("\n" + "="*80)
        print("INTERACTIVE ROUTE DOMINANCE VIEWER")
        print("="*80)
        print("Controls:")
        print("  Left Arrow  : Previous frame")
        print("  Right Arrow : Next frame")
        print("  Up Arrow    : Jump to first frame")
        print("  Down Arrow  : Jump to last frame")
        print("  'q' or Esc  : Quit")
        print("="*80)
        print(f"\nTotal frames: {self.total_frames}")
        print(f"Current frame: {self.current_frame_idx + 1}/{self.total_frames}")
        print("\nClick on the plot window and use arrow keys to navigate!")
        print("="*80)
        
        plt.tight_layout()
        plt.show()
    
    def on_key_press(self, event):
        """Handle keyboard input"""
        if event.key == 'left' or event.key == 'backspace':
            self.current_frame_idx = max(0, self.current_frame_idx - 1)
            self.update_display()
        elif event.key == 'right' or event.key == ' ':
            self.current_frame_idx = min(self.total_frames - 1, self.current_frame_idx + 1)
            self.update_display()
        elif event.key == 'up':
            self.current_frame_idx = 0
            self.update_display()
        elif event.key == 'down':
            self.current_frame_idx = self.total_frames - 1
            self.update_display()
        elif event.key == 'q' or event.key == 'escape':
            plt.close(self.fig)
            sys.exit(0)
    
    def draw_field(self):
        """Draw the football field"""
        self.ax_field.clear()
        self.ax_field.set_xlim(-5, 120 + 5)
        self.ax_field.set_ylim(-5, 53.3 + 5)
        self.ax_field.set_aspect('equal')
        self.ax_field.set_facecolor('#0d5f20')
        
        # End zones
        endzone1 = Rectangle((0, 0), 10, 53.3, facecolor='navy', alpha=0.5)
        endzone2 = Rectangle((110, 0), 10, 53.3, facecolor='navy', alpha=0.5)
        self.ax_field.add_patch(endzone1)
        self.ax_field.add_patch(endzone2)
        
        # Yard lines
        for yard in range(10, 110 + 1, 5):
            self.ax_field.axvline(x=yard, color='white', linewidth=0.5, alpha=0.3)
            if yard % 10 == 0:
                self.ax_field.text(yard, 53.3/2, str(yard), ha='center', va='center',
                                 color='white', fontsize=8, fontweight='bold',
                                 bbox=dict(boxstyle='round', facecolor='black', alpha=0.5))
    
    def update_display(self):
        """Update the display for current frame"""
        frame_id, frame_type, row_idx = self.frames_list[self.current_frame_idx]
        
        # Draw field
        self.draw_field()
        
        # Get frame data
        frame_data = self.play_frames[
            (self.play_frames["frame_id"] == frame_id) & 
            (self.play_frames["frame_type"] == frame_type)
        ]
        
        frame_metric = self.frame_metrics[
            (self.frame_metrics["frame_id"] == frame_id) & 
            (self.frame_metrics["frame_type"] == frame_type)
        ]
        
        if not frame_metric.empty:
            current_sep = frame_metric.iloc[0]["sep_nearest"]
            running_avg = self.frame_metrics.iloc[:self.current_frame_idx+1]["sep_nearest"].mean()
        else:
            current_sep = 0.0
            running_avg = 0.0
        
        # Plot all players
        # First, identify which players are in this frame
        players_in_frame = frame_data["nfl_id"].unique()
        
        for _, player in frame_data.iterrows():
            is_target = player["nfl_id"] == self.target_nfl_id
            
            # Get player side (may be NaN for output-only players, default to Defense)
            player_side = player.get("player_side", "Defense")
            if pd.isna(player_side):
                # If player_side is missing, try to infer from player_role or default
                player_role = player.get("player_role", "")
                if "Receiver" in str(player_role) or "Offense" in str(player_role):
                    player_side = "Offense"
                else:
                    player_side = "Defense"
            
            if is_target:
                # Color-code targeted receiver by separation
                if not frame_metric.empty:
                    sep = frame_metric.iloc[0]["sep_nearest"]
                    if sep < 2.0:
                        target_color = 'red'
                    elif sep < 5.0:
                        target_color = 'yellow'
                    else:
                        target_color = 'lime'
                else:
                    target_color = 'yellow'
                    sep = 0.0
                
                # Highlight targeted receiver (same size as other players)
                marker_size = 150
                if frame_type == "output":
                    marker_size = 180
                
                self.ax_field.scatter(player["x_std"], player["y_std"], 
                                   c=target_color, s=marker_size, marker='o',
                                   edgecolors='black', linewidths=3, zorder=10)
                
                # Add direction arrow for targeted receiver
                player_vx = player.get("vx", 0)
                player_vy = player.get("vy", 0)
                if pd.notna(player_vx) and pd.notna(player_vy) and (player_vx != 0 or player_vy != 0):
                    # Scale arrow length based on speed (normalize to reasonable size)
                    speed = np.sqrt(player_vx**2 + player_vy**2)
                    arrow_length = min(speed * 0.3, 2.0)  # Max 2 yards
                    if arrow_length > 0.1:  # Only draw if moving
                        arrow_dx = (player_vx / speed) * arrow_length
                        arrow_dy = (player_vy / speed) * arrow_length
                        arrow = FancyArrowPatch(
                            (player["x_std"], player["y_std"]),
                            (player["x_std"] + arrow_dx, player["y_std"] + arrow_dy),
                            arrowstyle='->', mutation_scale=15, linewidth=2.5,
                            color='black', alpha=0.9, zorder=11
                        )
                        self.ax_field.add_patch(arrow)
            else:
                # Regular players - show all players from output file
                color = 'orange' if player_side == "Offense" else 'blue'
                marker_size = 150
                
                # Make output frame players slightly larger/more visible
                if frame_type == "output":
                    marker_size = 180
                
                self.ax_field.scatter(player["x_std"], player["y_std"],
                                   c=color, s=marker_size, alpha=0.8, zorder=5,
                                   edgecolors='white', linewidths=1)
                
                # Add direction arrow for regular players
                player_vx = player.get("vx", 0)
                player_vy = player.get("vy", 0)
                if pd.notna(player_vx) and pd.notna(player_vy) and (player_vx != 0 or player_vy != 0):
                    # Scale arrow length based on speed (normalize to reasonable size)
                    speed = np.sqrt(player_vx**2 + player_vy**2)
                    arrow_length = min(speed * 0.25, 1.5)  # Max 1.5 yards, smaller than target
                    if arrow_length > 0.1:  # Only draw if moving
                        arrow_dx = (player_vx / speed) * arrow_length
                        arrow_dy = (player_vy / speed) * arrow_length
                        arrow = FancyArrowPatch(
                            (player["x_std"], player["y_std"]),
                            (player["x_std"] + arrow_dx, player["y_std"] + arrow_dy),
                            arrowstyle='->', mutation_scale=12, linewidth=2,
                            color='white', alpha=0.8, zorder=6
                        )
                        self.ax_field.add_patch(arrow)
                
                # Optionally add player ID label for output frames
                if frame_type == "output":
                    player_name = player.get("player_name", f"ID:{player['nfl_id']}")
                    if pd.isna(player_name) or player_name == "":
                        player_name = f"ID:{player['nfl_id']}"
                    self.ax_field.annotate(player_name.split()[-1] if len(str(player_name).split()) > 1 else str(player_name)[:4],
                                        (player["x_std"], player["y_std"]),
                                        xytext=(0, -15), textcoords='offset points',
                                        fontsize=8, color='white',
                                        bbox=dict(boxstyle='round,pad=0.3', 
                                                facecolor=color, alpha=0.7,
                                                edgecolor='white', linewidth=1),
                                        ha='center', zorder=6)
        
        # Plot ball landing
        self.ax_field.scatter(self.ball_land_x, self.ball_land_y, c='yellow', s=600,
                           marker='X', edgecolors='black', linewidths=3, zorder=9,
                           label='Ball Landing')
        
        # Draw separation circle, defender line, and leverage angle
        if not frame_metric.empty:
            target_row = frame_data[frame_data["nfl_id"] == self.target_nfl_id]
            if not target_row.empty:
                target_x = target_row.iloc[0]["x_std"]
                target_y = target_row.iloc[0]["y_std"]
                sep = frame_metric.iloc[0]["sep_nearest"]
                leverage_angle = frame_metric.iloc[0].get("leverage_angle", np.nan)
                
                if sep < np.inf:
                    # Color circle based on separation
                    if sep < 2.0:
                        circle_color = 'red'
                    elif sep < 5.0:
                        circle_color = 'orange'
                    else:
                        circle_color = 'green'
                    
                    circle = Circle((target_x, target_y), sep, fill=False,
                                  edgecolor=circle_color, linewidth=3, 
                                  linestyle='--', alpha=0.7)
                    self.ax_field.add_patch(circle)
                    
                    # Draw line to nearest defender
                    defenders = frame_data[frame_data["player_side"] == "Defense"]
                    if not defenders.empty:
                        def_dists = np.sqrt(
                            (defenders["x_std"] - target_x)**2 +
                            (defenders["y_std"] - target_y)**2
                        )
                        nearest_idx = def_dists.idxmin()
                        nearest_def = defenders.loc[nearest_idx]
                        nearest_def_x = nearest_def["x_std"]
                        nearest_def_y = nearest_def["y_std"]
                        
                        # Draw line from defender to receiver
                        self.ax_field.plot([nearest_def_x, target_x], 
                                         [nearest_def_y, target_y],
                                         color=circle_color, linewidth=2.5, 
                                         linestyle=':', alpha=0.7, zorder=1,
                                         label='Defender-to-Receiver')
                        
                        # Draw line from receiver to ball
                        self.ax_field.plot([target_x, self.ball_land_x], 
                                         [target_y, self.ball_land_y],
                                         color='yellow', linewidth=2.5, 
                                         linestyle='-', alpha=0.8, zorder=1,
                                         label='Receiver-to-Ball')
                        
                        # Calculate and display leverage angle
                        if not np.isnan(leverage_angle):
                            # Calculate angles for arc drawing (in degrees, measured from positive x-axis)
                            # Angle from defender-to-receiver vector (from defender to receiver)
                            def_to_rec_angle_deg = np.rad2deg(np.arctan2(target_y - nearest_def_y, 
                                                                        target_x - nearest_def_x))
                            # Angle from receiver-to-ball vector
                            rec_to_ball_angle_deg = np.rad2deg(np.arctan2(self.ball_land_y - target_y,
                                                                          self.ball_land_x - target_x))
                            
                            # Normalize angles to [0, 360) for Arc
                            def_to_rec_angle_deg = def_to_rec_angle_deg % 360
                            rec_to_ball_angle_deg = rec_to_ball_angle_deg % 360
                            
                            # Draw arc to visualize the angle at receiver position
                            arc_radius = min(sep * 0.4, 4.0)  # Scale arc size
                            
                            # Determine which angle is smaller to draw the correct arc
                            if abs(rec_to_ball_angle_deg - def_to_rec_angle_deg) > 180:
                                # Need to wrap around
                                if rec_to_ball_angle_deg < def_to_rec_angle_deg:
                                    theta1 = rec_to_ball_angle_deg
                                    theta2 = def_to_rec_angle_deg
                                else:
                                    theta1 = def_to_rec_angle_deg
                                    theta2 = rec_to_ball_angle_deg
                            else:
                                theta1 = min(def_to_rec_angle_deg, rec_to_ball_angle_deg)
                                theta2 = max(def_to_rec_angle_deg, rec_to_ball_angle_deg)
                            
                            arc = Arc((target_x, target_y), arc_radius*2, arc_radius*2,
                                     angle=0, theta1=theta1, theta2=theta2,
                                     color='cyan', linewidth=3, alpha=0.9, zorder=2)
                            self.ax_field.add_patch(arc)
                            
                            # Leverage angle is shown in info panel, not on field to save space
        
        # Title with formation info
        continuous_frame_num = self.current_frame_idx + 1
        title_text = (
            f"{self.target_name} | Frame {continuous_frame_num}/{self.total_frames} ({frame_type.upper()})\n"
            f"Route: {self.route} | Formation: {self.offense_formation} {self.receiver_alignment} | "
            f"Coverage: {self.coverage_type} | Result: {self.pass_result}"
        )
        self.ax_field.set_title(
            title_text,
            fontsize=12, fontweight='bold', color='white', pad=10
        )
        self.ax_field.set_xlabel('X Position (yards)', fontsize=11, color='white')
        self.ax_field.set_ylabel('Y Position (yards)', fontsize=11, color='white')
        
        # Update score plot
        self.ax_score.clear()
        frame_metrics_plot = self.frame_metrics.copy()
        frame_metrics_plot['continuous_frame'] = range(1, len(frame_metrics_plot) + 1)
        
        self.ax_score.plot(frame_metrics_plot["continuous_frame"], frame_metrics_plot["sep_nearest"],
                         'b-', linewidth=2.5, marker='o', markersize=4, label='Separation')
        self.ax_score.axvline(x=continuous_frame_num, color='r', linestyle='--', linewidth=3)
        self.ax_score.scatter([continuous_frame_num], [current_sep], 
                            c='red', s=200, marker='*', zorder=5, label='Current')
        self.ax_score.set_xlabel('Frame Number', fontsize=11)
        self.ax_score.set_ylabel('Separation (yards)', fontsize=11)
        self.ax_score.set_title('Frame-by-Frame Separation', fontsize=12, fontweight='bold')
        self.ax_score.grid(True, alpha=0.3)
        self.ax_score.legend(fontsize=9)
        
        # Update running average
        self.ax_running.clear()
        running_avgs = []
        for i in range(continuous_frame_num):
            if i < len(frame_metrics_plot):
                avg = frame_metrics_plot.iloc[:i+1]["sep_nearest"].mean()
                running_avgs.append(avg)
        
        if running_avgs:
            self.ax_running.plot(range(1, len(running_avgs)+1), running_avgs,
                              'g-', linewidth=2.5, marker='o', markersize=5, label='Running Average')
            self.ax_running.axvline(x=continuous_frame_num, color='r', linestyle='--', linewidth=2)
            self.ax_running.axhline(y=running_avgs[-1], color='r', linestyle='--', linewidth=2, alpha=0.5)
        self.ax_running.set_xlabel('Frame Number', fontsize=11)
        self.ax_running.set_ylabel('Cumulative Avg', fontsize=11)
        self.ax_running.set_title(f'Running Average: {running_avg:.2f} yds', fontsize=12, fontweight='bold')
        self.ax_running.grid(True, alpha=0.3)
        self.ax_running.legend(fontsize=9)
        
        # Update info panel
        self.ax_info.clear()
        self.ax_info.axis('off')
        
        if not frame_metric.empty:
            # Large separation box
            sep_box_text = f"SEPARATION: {current_sep:.2f} yds"
            if current_sep < 2.0:
                sep_box_color = 'red'
            elif current_sep < 5.0:
                sep_box_color = 'yellow'
            else:
                sep_box_color = 'lime'
            
            self.ax_info.text(0.5, 0.98, sep_box_text, transform=self.ax_info.transAxes,
                            fontsize=18, fontweight='bold', ha='center',
                            verticalalignment='top', family='monospace',
                            bbox=dict(boxstyle='round,pad=1.2', facecolor=sep_box_color, 
                                    alpha=0.9, edgecolor='black', linewidth=3))
            
            # Detailed metrics
            leverage_angle = frame_metric.iloc[0].get('leverage_angle', np.nan)
            leverage_str = f"{leverage_angle:.1f}°" if not np.isnan(leverage_angle) else "N/A"
            
            # Split metrics into two columns for better visibility
            info_text_left = f"""FRAME {continuous_frame_num}/{self.total_frames} ({frame_type.upper()})
{'='*60}
Current Separation: {current_sep:.2f} yds
Running Average: {running_avg:.2f} yds

FORMATION INFO
Offense: {self.offense_formation} {self.receiver_alignment}
Coverage: {self.coverage_type}
Down & Distance: {self.down} & {self.yards_to_go}
Pass Length: {self.pass_length} yds
Route: {self.route}

SEPARATION
Nearest Defender: {frame_metric.iloc[0]['sep_nearest']:.2f} yds
Defenders within 2 yds: {frame_metric.iloc[0]['num_def_within_2']}
Defenders within 3 yds: {frame_metric.iloc[0]['num_def_within_3']}
Defenders within 5 yds: {frame_metric.iloc[0]['num_def_within_5']}
"""
            
            info_text_right = f"""
LEVERAGE ANGLE
Angle: {leverage_str}
(Larger = Better: Defender in front)

MOTION
Speed: {frame_metric.iloc[0]['receiver_speed']:.2f} yds/s
Acceleration: {frame_metric.iloc[0]['receiver_accel']:.2f} yds/s²

BALL PROXIMITY
Distance to Ball: {frame_metric.iloc[0]['dist_to_ball']:.2f} yds

CONTROLS
Left/Right: Navigate frames
Up/Down: Jump to start/end
Q/Esc: Quit
"""
            
            # Display metrics in two columns below the dominance box
            self.ax_info.text(0.02, 0.70, info_text_left, transform=self.ax_info.transAxes,
                        fontsize=9, verticalalignment='top', family='monospace',
                        bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.8))
            
            self.ax_info.text(0.52, 0.70, info_text_right, transform=self.ax_info.transAxes,
                        fontsize=9, verticalalignment='top', family='monospace',
                        bbox=dict(boxstyle='round', facecolor='lightgreen', alpha=0.8))
        else:
            info_text = f"Frame {continuous_frame_num}/{self.total_frames} ({frame_type})\nNo metrics available"
            self.ax_info.text(0.05, 0.70, info_text, transform=self.ax_info.transAxes,
                        fontsize=9, verticalalignment='top', family='monospace',
                        bbox=dict(boxstyle='round', facecolor='lightgray', alpha=0.9))
        
        # Update figure
        self.fig.canvas.draw()
        self.fig.canvas.flush_events()


def main():
    print("=" * 80)
    print("INTERACTIVE ROUTE DOMINANCE VIEWER")
    print("=" * 80)
    
    # Load data
    print("\nLoading data...")
    input_df = pd.read_csv("data/input_2023_w01.csv")
    output_df = pd.read_csv("data/output_2023_w01.csv")
    supp_df = pd.read_csv("data/Supplementary.csv")
    
    # Initialize scorer
    print("Initializing Route Dominance Scorer...")
    scorer = RouteDominanceScorer(input_df, output_df, supp_df)
    
    # Example play
    game_id = 2023090700
    play_id = 101
    
    # Get targeted receiver
    target_info = input_df[
        (input_df["game_id"] == game_id) &
        (input_df["play_id"] == play_id) &
        (input_df["player_role"] == "Targeted Receiver")
    ]
    
    if target_info.empty:
        print("No targeted receiver found. Using first available play...")
        target_info = input_df[
            input_df["player_role"] == "Targeted Receiver"
        ].iloc[0:1]
        game_id = target_info["game_id"].iloc[0]
        play_id = target_info["play_id"].iloc[0]
    
    target_nfl_id = target_info["nfl_id"].iloc[0]
    target_name = target_info["player_name"].iloc[0]
    
    print(f"\nGame: {game_id}, Play: {play_id}")
    print(f"Targeted Receiver: {target_name} (ID: {target_nfl_id})")
    
    # Create interactive viewer
    viewer = InteractiveRouteDominanceViewer(scorer, game_id, play_id, target_nfl_id)




In [7]:
# (Deprecated) Previous attempt to inline src.create_dominance_gif.
# This cell is intentionally left as comments so it does not change behavior.



In [8]:
# (Deprecated) Previous attempt to inline src.field_control_visualizer.
# This cell is intentionally left as comments so it does not change behavior.



In [9]:
# (Deprecated) Previous attempt to inline train_ecp_model.
# This cell is intentionally left as comments so it does not change behavior.



## 1. Configuration

Configure which weeks to process and preprocessing parameters.


In [10]:
# ============================================================================
# Configuration: Weeks to Process
# ============================================================================
# Options:
#   WEEKS_TO_PROCESS = None           → Process ALL weeks (1-18) - Full dataset
#   WEEKS_TO_PROCESS = [1]            → Process only week 1 - Quick testing
#   WEEKS_TO_PROCESS = [1, 2, 3]      → Process specific weeks
#   WEEKS_TO_PROCESS = list(range(1, 5)) → Process weeks 1-4

WEEKS_TO_PROCESS = list(range(6,10))  # Set to None for all weeks

# ============================================================================
# Configuration: Preprocessing Parameters
# ============================================================================
THROWAWAY_DIST = 8.0  # yards; plays with dist_to_ball > this at arrival will be filtered
EXCLUDE_ROUTE_KEYWORDS = ["screen", "angle"]  # case-insensitive route exclusions

# Display configuration
print("="*80)
if WEEKS_TO_PROCESS is None:
    print("CONFIGURATION: Processing ALL WEEKS (1-18)")
else:
    print(f"CONFIGURATION: Processing WEEKS {WEEKS_TO_PROCESS}")
    print(f"  Total weeks: {len(WEEKS_TO_PROCESS)}")
    print(f"  Week range: {min(WEEKS_TO_PROCESS)} to {max(WEEKS_TO_PROCESS)}")
print(f"\nPreprocessing Settings:")
print(f"  THROWAWAY_DIST: {THROWAWAY_DIST} yards")
print(f"  EXCLUDE_ROUTE_KEYWORDS: {EXCLUDE_ROUTE_KEYWORDS}")
print("="*80)


CONFIGURATION: Processing WEEKS [6, 7, 8, 9]
  Total weeks: 4
  Week range: 6 to 9

Preprocessing Settings:
  THROWAWAY_DIST: 8.0 yards
  EXCLUDE_ROUTE_KEYWORDS: ['screen', 'angle']


## 2. Data Loading

Load input (pre-throw), output (post-throw), and supplementary (play context) data.



In [11]:
# Helper functions to load data files
def load_all_input_files(data_dir="/kaggle/input/nfl-data", weeks=None):
    """Load all input CSV files for specified weeks (default: all weeks 1-18)"""
    if weeks is None:
        weeks = list(range(1, 19))
    
    input_frames = []
    for w in weeks:
        fname = f"{data_dir}/input_2023_w{w:02d}.csv"
        try:
            df = pd.read_csv(fname)
            df["week"] = w
            input_frames.append(df)
            print(f"  ✓ Loaded input_2023_w{w:02d}.csv: {len(df):,} rows")
        except FileNotFoundError:
            print(f"  ⚠ Skipping missing file: {fname}")
        except Exception as e:
            print(f"  ✗ Error loading {fname}: {e}")
    
    if not input_frames:
        raise ValueError("No input files were loaded!")
    
    return pd.concat(input_frames, ignore_index=True)

def load_all_output_files(data_dir="/kaggle/input/nfl-data", weeks=None):
    """Load all output CSV files for specified weeks (default: all weeks 1-18)"""
    if weeks is None:
        weeks = list(range(1, 19))
    
    output_frames = []
    for w in weeks:
        fname = f"{data_dir}/output_2023_w{w:02d}.csv"
        try:
            df = pd.read_csv(fname)
            df["week"] = w
            output_frames.append(df)
            print(f"  ✓ Loaded output_2023_w{w:02d}.csv: {len(df):,} rows")
        except FileNotFoundError:
            print(f"  ⚠ Skipping missing file: {fname}")
        except Exception as e:
            print(f"  ✗ Error loading {fname}: {e}")
    
    if not output_frames:
        raise ValueError("No output files were loaded!")
    
    return pd.concat(output_frames, ignore_index=True)

# Data directory configuration
DATA_DIR = "/kaggle/input/nfl-data"

# Load data files
print("Loading data files...")
print("="*80)

# Load input data (pre-throw)
print("\nLoading INPUT data (pre-throw)...")
input_df = load_all_input_files(data_dir=DATA_DIR, weeks=WEEKS_TO_PROCESS)
print(f"✓ Total input data: {len(input_df):,} rows")
print(f"  Unique plays: {input_df[['game_id', 'play_id']].drop_duplicates().shape[0]:,}")
print(f"  Weeks: {sorted(input_df['week'].unique())}")

# Load output data (post-throw)
print("\nLoading OUTPUT data (post-throw)...")
output_df = load_all_output_files(data_dir=DATA_DIR, weeks=WEEKS_TO_PROCESS)
print(f"✓ Total output data: {len(output_df):,} rows")
print(f"  Unique plays: {output_df[['game_id', 'play_id']].drop_duplicates().shape[0]:,}")
print(f"  Weeks: {sorted(output_df['week'].unique())}")

# Load supplementary data (play context)
print("\nLoading SUPPLEMENTARY data...")
supp_df = pd.read_csv(f"{DATA_DIR}/supplementary_data.csv")
print(f"✓ Supplementary data: {len(supp_df):,} rows")
print(f"  Unique plays: {supp_df[['game_id', 'play_id']].drop_duplicates().shape[0]:,}")

# Apply route-based exclusions (screens/angles) before scoring
print("\n" + "="*80)
print("APPLYING ROUTE FILTERING")
print("="*80)

if "route_of_targeted_receiver" in supp_df.columns:
    route_lower = supp_df["route_of_targeted_receiver"].fillna("").str.lower()
    keyword_pattern = "|".join(EXCLUDE_ROUTE_KEYWORDS)
    exclude_mask = route_lower.str.contains(keyword_pattern)
    excluded_plays = supp_df.loc[exclude_mask, ["game_id", "play_id"]].drop_duplicates()
    
    if not excluded_plays.empty:
        print(f"Excluding {len(excluded_plays)} plays matching keywords: {EXCLUDE_ROUTE_KEYWORDS}")
        
        # Filter supplementary data
        supp_df = supp_df.loc[~exclude_mask].reset_index(drop=True)
        
        # Filter input data
        input_df = input_df.merge(excluded_plays.assign(exclude=1), on=["game_id", "play_id"], how="left")
        input_df = input_df[input_df["exclude"].isna()].drop(columns=["exclude"])
        
        # Filter output data
        output_df = output_df.merge(excluded_plays.assign(exclude=1), on=["game_id", "play_id"], how="left")
        output_df = output_df[output_df["exclude"].isna()].drop(columns=["exclude"])
        
        print(f"✓ Filtered data: {len(supp_df):,} plays remaining")
    else:
        print("No plays excluded by route filter")
else:
    print("⚠ route_of_targeted_receiver not found; skipping route filter")

print("\n" + "="*80)
print("DATA LOADING COMPLETE")
print("="*80)


Loading data files...

Loading INPUT data (pre-throw)...
  ✓ Loaded input_2023_w06.csv: 270,676 rows
  ✓ Loaded input_2023_w07.csv: 233,597 rows
  ✓ Loaded input_2023_w08.csv: 281,011 rows
  ✓ Loaded input_2023_w09.csv: 252,796 rows
✓ Total input data: 1,038,080 rows
  Unique plays: 3,024
  Weeks: [np.int64(6), np.int64(7), np.int64(8), np.int64(9)]

Loading OUTPUT data (post-throw)...
  ✓ Loaded output_2023_w06.csv: 31,162 rows
  ✓ Loaded output_2023_w07.csv: 27,443 rows
  ✓ Loaded output_2023_w08.csv: 33,017 rows
  ✓ Loaded output_2023_w09.csv: 28,291 rows
✓ Total output data: 119,913 rows
  Unique plays: 3,024
  Weeks: [np.int64(6), np.int64(7), np.int64(8), np.int64(9)]

Loading SUPPLEMENTARY data...
✓ Supplementary data: 18,009 rows
  Unique plays: 18,009

APPLYING ROUTE FILTERING
Excluding 1073 plays matching keywords: ['screen', 'angle']
✓ Filtered data: 16,936 plays remaining

DATA LOADING COMPLETE


## 3. Route Break Detection

Detect route breaks (sharp turns) for targeted receivers using geometric analysis.


In [12]:
# Route break detection functions

def get_shortest_angle_diff(angle1, angle2):
    """Calculate the smallest difference between two angles in degrees (-180 to 180)"""
    return (angle2 - angle1 + 180) % 360 - 180

def standardize_coordinates(df):
    """Standardize tracking data coordinates (x, y) so all plays move Left-to-Right"""
    df = df.copy()
    df['std_x'] = df['x']
    df['std_y'] = df['y']
    
    # Flip logic for plays moving left
    mask_left = df['play_direction'] == 'left'
    df.loc[mask_left, 'std_x'] = 120 - df.loc[mask_left, 'x']
    df.loc[mask_left, 'std_y'] = 53.3 - df.loc[mask_left, 'y']
    
    return df

def find_break_geometric(group):
    """Detect route break frame using geometric analysis"""
    group = group.sort_values('frame_id').reset_index(drop=True)
    
    # Check for minimum required frames
    if len(group) < 7:
        return np.nan
    
    # Smooth the standardized path
    window = min(7, len(group) if len(group) % 2 != 0 else len(group) - 1)
    if window < 3:
        return np.nan
    
    sx_smooth = savgol_filter(group['std_x'], window, 2)
    sy_smooth = savgol_filter(group['std_y'], window, 2)
    
    # Calculate Velocity and Direction
    vx = np.diff(sx_smooth, prepend=sx_smooth[0])
    vy = np.diff(sy_smooth, prepend=sy_smooth[0])
    
    # Use smoothed speed for the mask
    speed_smooth = np.sqrt(vx**2 + vy**2)
    
    # Calculate angle of motion (heading)
    heading = np.degrees(np.arctan2(vx, vy))
    
    # Calculate Angular Velocity
    heading_diff = np.zeros_like(heading)
    for i in range(1, len(heading)):
        heading_diff[i] = get_shortest_angle_diff(heading[i-1], heading[i])
    
    # Define Curvature Metric
    curvature = np.abs(heading_diff)
    
    # Mask out slow/standing movements (speed < 0.1 yd/sec)
    moving_mask = speed_smooth > 0.1
    curvature[~moving_mask] = 0
    
    # Ignore the first 5 frames (Line-of-Scrimmage Movement / Release)
    search_start_index = 5
    
    # Find the Peak Break
    search_curvature = curvature[search_start_index:]
    if len(search_curvature) == 0:
        return np.nan
    
    # Find the index of max curvature within the search window
    relative_index = np.argmax(search_curvature)
    max_curve_index = relative_index + search_start_index
    max_curve_val = curvature[max_curve_index]
    
    # Threshold: Must be a sharp turn (> 8 degrees/frame)
    if max_curve_val < 8.0:
        return np.nan
    
    return group.loc[max_curve_index, 'frame_id']

# Apply route break detection
print("\n" + "="*80)
print("FEATURE GENERATION: DETECTING ROUTE BREAK FRAME")
print("="*80)

# Define the grouping keys
GROUP_KEYS = ['game_id', 'play_id', 'nfl_id']

# Standardize Coordinates in Input Data
input_df = standardize_coordinates(input_df)

# Apply the function to every unique targeted receiver
print("Starting route break detection using groupby().apply()...")

# Filter for Targeted Receivers and apply the function
route_break_frames = input_df[
    input_df['player_role'] == 'Targeted Receiver'
].groupby(GROUP_KEYS).apply(find_break_geometric).reset_index(name='play_break_frame')

# Merge the results back into the full tracking data
input_df = pd.merge(
    input_df, 
    route_break_frames, 
    on=GROUP_KEYS,
    how='left'
)

# Clean up standardization columns
input_df.drop(columns=['std_x', 'std_y'], inplace=True, errors='ignore')

print(f"✓ Route break detection complete.")
print(f"  {route_break_frames['play_break_frame'].notna().sum():,} breaks identified out of {len(route_break_frames):,} targeted receivers")
print(f"✓ 'play_break_frame' column added to input_df.")
print("="*80)



FEATURE GENERATION: DETECTING ROUTE BREAK FRAME
Starting route break detection using groupby().apply()...
✓ Route break detection complete.
  606 breaks identified out of 2,838 targeted receivers
✓ 'play_break_frame' column added to input_df.


## 3. Data Exploration

Quick exploration of the loaded data structure.


In [13]:
# Find plays with targeted receivers
targeted_plays = input_df[
    input_df["player_role"] == "Targeted Receiver"
][["game_id", "play_id", "nfl_id", "player_name"]].drop_duplicates()

print(f"Total plays with targeted receivers: {len(targeted_plays):,}")

# Frame count analysis
print("\n" + "="*80)
print("FRAME COUNT ANALYSIS")
print("="*80)

input_frame_counts = input_df.groupby(["game_id", "play_id"]).size()
output_frame_counts = output_df.groupby(["game_id", "play_id"]).size()

print(f"Input frames per play:")
print(f"  Average: {input_frame_counts.mean():.1f}")
print(f"  Min: {input_frame_counts.min()}, Max: {input_frame_counts.max()}")

print(f"\nOutput frames per play:")
print(f"  Average: {output_frame_counts.mean():.1f}")
print(f"  Min: {output_frame_counts.min()}, Max: {output_frame_counts.max()}")

# Completion status
print(f"\n" + "="*80)
print("COMPLETION STATUS")
print("="*80)
if "pass_result" in supp_df.columns:
    completion_counts = supp_df["pass_result"].value_counts()
    print(completion_counts)
    print(f"\nCompletion rate: {(completion_counts.get('C', 0) / len(supp_df) * 100):.1f}%")


Total plays with targeted receivers: 2,838

FRAME COUNT ANALYSIS
Input frames per play:
  Average: 341.6
  Min: 110, Max: 1378

Output frames per play:
  Average: 41.3
  Min: 5, Max: 216

COMPLETION STATUS
pass_result
C     11575
I      4941
IN      420
Name: count, dtype: int64

Completion rate: 68.3%


## 4. Initialize Route Dominance Scorer

Initialize the scorer which combines input and output data and calculates motion features.


## Legacy Receiver Pressure Calculation (for reference only)

This cell demonstrates how receiver_pressure is calculated using actual numbers from the dataframe.

**Key Changes:**
- Now uses 1st, 2nd, and 3rd nearest defenders (instead of all defenders within 6 yards)
- Added columns: sep_second and sep_third (blank/NaN if no 2nd/3rd defender exists)
- Receiver influence PDF: 6-yard radius, centered 2 yards in front of receiver
- Each defender PDF: 4-yard radius, weighted by separation distance
Let's show the calculation for a sample frame:


In [14]:
# (Legacy) Display sample data with receiver_pressure
# NOTE: Main metric is now field_control; this is kept only for debugging/experiments.
if 'training_df' in globals() and not training_df.empty:
    print("="*80)
    print("RECEIVER PRESSURE CALCULATION - SAMPLE DATA")
    print("="*80)
    
    # Show columns related to receiver pressure
    pressure_cols = [
        'game_id', 'play_id', 'continuous_frame', 'frame_type',
        'receiver_x', 'receiver_y', 
        'sep_nearest', 'sep_second', 'sep_third',
        'receiver_pressure', 'dist_to_ball'
    ]
    
    # Filter to only existing columns
    pressure_cols = [col for col in pressure_cols if col in training_df.columns]
    
    # Show first few rows
    print("\nSample rows with receiver pressure:")
    display(training_df[pressure_cols].head(10))
    
    # Pick a specific frame to demonstrate calculation
    sample_row = training_df.iloc[0]
    
    print("\n" + "="*80)
    print("DETAILED CALCULATION FOR ONE FRAME")
    print("="*80)
    print(f"\nGame ID: {sample_row['game_id']}")
    print(f"Play ID: {sample_row['play_id']}")
    print(f"Frame: {sample_row['continuous_frame']}")
    print(f"Frame Type: {sample_row['frame_type']}")
    print(f"\nReceiver Position: ({sample_row['receiver_x']:.2f}, {sample_row['receiver_y']:.2f})")
    print(f"Distance to Ball: {sample_row['dist_to_ball']:.2f} yards")
    
    print(f"\nDefender Separations:")
    print(f"  1st Nearest: {sample_row['sep_nearest']:.2f} yards")
    if pd.notna(sample_row.get('sep_second', np.nan)):
        print(f"  2nd Nearest: {sample_row['sep_second']:.2f} yards")
    else:
        print(f"  2nd Nearest: (no 2nd defender)")
    if pd.notna(sample_row.get('sep_third', np.nan)):
        print(f"  3rd Nearest: {sample_row['sep_third']:.2f} yards")
    else:
        print(f"  3rd Nearest: (no 3rd defender)")
    
    print(f"\nFinal Receiver Pressure: {sample_row['receiver_pressure']:.4f}")
    
    print("\n" + "="*80)
    print("CALCULATION STEPS:")
    print("="*80)
    print("""
Step 1: Receiver Influence PDF
  - Center: 2 yards in front of receiver, towards ball landing position
  - Radius: 6 yards
  - Creates a 2D Gaussian distribution centered at this shifted point

Step 2: Defender Pressure PDFs (for 1st, 2nd, 3rd nearest defenders within 6 yards)
  - For each of the 3 nearest defenders (if they exist and are within 6 yards):
    * Calculate separation distance
    * Weight: w(s) = 1 / (1 + s/5) where s = separation in yards
    * Create 4-yard radius PDF centered at defender position
    * Multiply PDF by weight
  - Sum all defender PDFs to get total defender_pdf

Step 3: Dominance Ratio
  - dominance_pdf = receiver_pdf / (receiver_pdf + defender_pdf + ε)
  - Values range from 0 (pure defender pressure) to 1 (pure receiver influence)

Step 4: Weighted Average
  - Create receiver area PDF (6-yard radius, centered 2 yards in front)
  - Weight dominance_pdf by receiver_area_pdf
  - Calculate average: receiver_pressure = Σ(weighted_dominance) / Σ(receiver_area_pdf)

Step 5: Normalization
  - Normalize: (val - 0.5) / (0.8 - 0.5)
  - Clamp to [0, 1] range
    """)
    
    # Show frame-by-frame variation for a sample play
    if len(training_df) > 0:
        sample_play = training_df.groupby(['game_id', 'play_id']).first().reset_index().iloc[0]
        play_frames = training_df[
            (training_df['game_id'] == sample_play['game_id']) &
            (training_df['play_id'] == sample_play['play_id'])
        ].sort_values('continuous_frame')
        
        print("\n" + "="*80)
        print(f"FRAME-BY-FRAME RECEIVER PRESSURE (Game {sample_play['game_id']}, Play {sample_play['play_id']})")
        print("="*80)
        frame_display_cols = ['continuous_frame', 'frame_type', 'sep_nearest', 'sep_second', 'sep_third', 'receiver_pressure']
        frame_display_cols = [col for col in frame_display_cols if col in play_frames.columns]
        print(play_frames[frame_display_cols].to_string(index=False))
        
else:
    print("training_df not found. Please run the cells above to create the training dataframe first.")


training_df not found. Please run the cells above to create the training dataframe first.


In [15]:
# (Legacy) Verify receiver_pressure column exists and varies by frame
# NOTE: Main metric is now field_control; this is kept only for debugging/experiments.
print("="*80)
print("RECEIVER PRESSURE VERIFICATION")
print("="*80)

# Check if training_df exists
if 'training_df' not in globals() or training_df is None:
    print("⚠ WARNING: training_df is not defined")
    print("\nPlease run the cell that creates training_df first:")
    print("  training_df, errors = create_training_dataframe(...)")
    print("\nThe receiver_pressure column will be automatically included")
    print("when you regenerate training_df with the updated route_dominance_scoring.py")
elif 'receiver_pressure' in training_df.columns:
    print("✓ receiver_pressure column found in training_df")
    print(f"\nColumn statistics:")
    print(training_df['receiver_pressure'].describe())
    
    print(f"\n" + "="*80)
    print("FRAME-BY-FRAME VARIATION (Sample Play)")
    print("="*80)
    
    # Show receiver_pressure varies by frame for a sample play
    sample_play = training_df.groupby(['game_id', 'play_id']).first().reset_index().iloc[0]
    sample_frames = training_df[
        (training_df['game_id'] == sample_play['game_id']) &
        (training_df['play_id'] == sample_play['play_id'])
    ].sort_values('continuous_frame')
    
    print(f"\nGame {sample_play['game_id']}, Play {sample_play['play_id']}:")
    print(f"Receiver Pressure by Frame:")
    display_cols = ['continuous_frame', 'frame_type', 'receiver_pressure', 'sep_nearest', 'num_def_within_3']
    print(sample_frames[display_cols].to_string(index=False))
    
    print(f"\n" + "="*80)
    print("RECEIVER PRESSURE DISTRIBUTION")
    print("="*80)
    print(f"Mean receiver pressure: {training_df['receiver_pressure'].mean():.3f}")
    print(f"Std receiver pressure: {training_df['receiver_pressure'].std():.3f}")
    print(f"Min receiver pressure: {training_df['receiver_pressure'].min():.3f}")
    print(f"Max receiver pressure: {training_df['receiver_pressure'].max():.3f}")
    
    # Show how receiver_pressure changes within a play
    print(f"\n" + "="*80)
    print("RECEIVER PRESSURE VARIATION WITHIN PLAYS")
    print("="*80)
    pressure_variation = training_df.groupby(['game_id', 'play_id'])['receiver_pressure'].agg(['min', 'max', 'std'])
    print(f"Average pressure range per play: {(pressure_variation['max'] - pressure_variation['min']).mean():.3f}")
    print(f"Average pressure std per play: {pressure_variation['std'].mean():.3f}")
    
    print(f"\n✓ Receiver pressure is calculated and varies frame-by-frame!")
    print(f"\nNote: receiver_pressure (0-1 scale)")
    print(f"  - 0.0 = High defender pressure (low receiver advantage)")
    print(f"  - 1.0 = High receiver advantage (low defender pressure)")
    print(f"  - Accounts for ALL defenders within 6 yards")
    print(f"  - Uses forward-looking center (2 yards in front of receiver, towards ball)")
    
else:
    print("⚠ WARNING: receiver_pressure column NOT FOUND")
    print("\nThe receiver_pressure column should be automatically included when you call:")
    print("  scorer.calculate_frame_dominance(game_id, play_id, target_nfl_id)")
    print("\nTo add it, you need to:")
    print("1. Make sure you're using the updated route_dominance_scoring.py")
    print("2. Regenerate your training_df by running:")
    print("   training_df, errors = create_training_dataframe(...)")
    print("\nThe receiver_pressure is calculated using:")
    print("- Receiver influence PDF (6-yard radius, centered 2 yards in front)")
    print("- Defender pressure PDFs (4-yard radius, all defenders within 6 yards)")
    print("- Multivariate normal distributions")
    print("- Values range from 0.0 (high defender pressure) to 1.0 (high receiver advantage)")



RECEIVER PRESSURE VERIFICATION

Please run the cell that creates training_df first:
  training_df, errors = create_training_dataframe(...)

The receiver_pressure column will be automatically included
when you regenerate training_df with the updated route_dominance_scoring.py


In [16]:
# Initialize the Route Dominance Scorer
print("Initializing Route Dominance Scorer...")
scorer = RouteDominanceScorer(input_df, output_df, supp_df)

print("✓ Route Dominance Scorer initialized")
print(f"  Combined frames: {len(scorer.all_frames_df):,} rows")
print(f"  Unique plays: {scorer.all_frames_df[['game_id', 'play_id']].drop_duplicates().shape[0]:,}")


Initializing Route Dominance Scorer...
✓ Route Dominance Scorer initialized
  Combined frames: 1,086,600 rows
  Unique plays: 2,838


## 5. Process Plays and Create Training DataFrame

Process all plays to calculate frame-by-frame dominance metrics and create the training DataFrame.


In [17]:
# Function to create training DataFrame
def create_training_dataframe(input_df, output_df, supp_df, scorer, weeks=None, max_plays=None):
    """
    Create a training-ready DataFrame with all route dominance metrics
    
    Args:
        input_df: Input DataFrame (pre-throw data)
        output_df: Output DataFrame (post-throw data)
        supp_df: Supplementary DataFrame (play context)
        scorer: Initialized RouteDominanceScorer
        weeks: List of weeks to process (None = all weeks)
        max_plays: Maximum number of plays to process (None = all plays)
    
    Returns:
        DataFrame with all metrics for model training
    """
    print("="*80)
    print("PROCESSING PLAYS")
    print("="*80)
    
    # Get all unique plays with targeted receivers
    targeted_plays = input_df[
        input_df["player_role"] == "Targeted Receiver"
    ][["game_id", "play_id", "nfl_id", "player_name", "week"]].drop_duplicates()
    
    if weeks is not None:
        targeted_plays = targeted_plays[targeted_plays["week"].isin(weeks)]
    
    if max_plays is not None:
        targeted_plays = targeted_plays.head(max_plays)
    
    targeted_plays = targeted_plays[["game_id", "play_id", "nfl_id", "player_name"]]
    
    print(f"Processing {len(targeted_plays)} plays...")
    if len(targeted_plays) == 0:
        print("⚠ WARNING: No plays found!")
        return pd.DataFrame(), []
    
    all_metrics = []
    errors = []
    
    for idx, row in tqdm(targeted_plays.iterrows(), total=len(targeted_plays), desc="Processing"):
        game_id = row["game_id"]
        play_id = row["play_id"]
        target_nfl_id = row["nfl_id"]
        target_name = row["player_name"]
        
        try:
            # Calculate frame-by-frame dominance
            frame_metrics = scorer.calculate_frame_dominance(game_id, play_id, target_nfl_id)
            
            # Get play context from supplementary data
            supp_row = supp_df[
                (supp_df["game_id"] == game_id) &
                (supp_df["play_id"] == play_id)
            ]
            
            if not supp_row.empty:
                pass_result = supp_row.iloc[0].get("pass_result", "UNKNOWN")
                is_complete = 1 if pass_result == "C" else 0
                offense_formation = supp_row.iloc[0].get("offense_formation", "UNKNOWN")
                receiver_alignment = supp_row.iloc[0].get("receiver_alignment", "UNKNOWN")
                coverage_type = supp_row.iloc[0].get("team_coverage_type", "UNKNOWN")
                down = supp_row.iloc[0].get("down", np.nan)
                yards_to_go = supp_row.iloc[0].get("yards_to_go", np.nan)
                pass_length = supp_row.iloc[0].get("pass_length", np.nan)
                route = supp_row.iloc[0].get("route_of_targeted_receiver", "UNKNOWN")
            else:
                is_complete = np.nan
                offense_formation = "UNKNOWN"
                receiver_alignment = "UNKNOWN"
                coverage_type = "UNKNOWN"
                down = np.nan
                yards_to_go = np.nan
                pass_length = np.nan
                route = "UNKNOWN"
            
            # Add play-level context to each frame
            frame_metrics["target_name"] = target_name
            frame_metrics["is_complete"] = is_complete
            frame_metrics["offense_formation"] = offense_formation
            frame_metrics["receiver_alignment"] = receiver_alignment
            frame_metrics["coverage_type"] = coverage_type
            frame_metrics["down"] = down
            frame_metrics["yards_to_go"] = yards_to_go
            frame_metrics["pass_length"] = pass_length
            
            # Add continuous frame number (starts at 1, varies by play)
            frame_metrics["continuous_frame"] = range(1, len(frame_metrics) + 1)
            
            # Add throw_status column
            frame_metrics["throw_status"] = frame_metrics["frame_type"].map({
                "input": "pre_throw",
                "output": "after_throw"
            })
            
            # Add route break features using scorer's built-in logic
            # (is_break_frame and frames_since_break are already computed frame-by-frame)
            if "is_break_frame" in frame_metrics.columns:
                has_break = frame_metrics["is_break_frame"].max() == 1
                frame_metrics["has_break"] = has_break
                if has_break:
                    # Use the first break frame_id reported by the scorer
                    break_rows = frame_metrics[frame_metrics["is_break_frame"] == 1]
                    play_break_frame = break_rows.iloc[0]["frame_id"]
                else:
                    play_break_frame = 0
                frame_metrics["play_break_frame"] = play_break_frame
                # frames_since_break already exists; keep as provided by scorer
                frame_metrics["frames_until_break"] = np.nan  # optional, not used in new logic
            else:
                frame_metrics["has_break"] = False
                frame_metrics["play_break_frame"] = 0
                frame_metrics["is_break_frame"] = 0
                frame_metrics["frames_since_break"] = np.nan
                frame_metrics["frames_until_break"] = np.nan
            
            # Get nearest defender coordinates for each frame
            nearest_def_x = []
            nearest_def_y = []
            receiver_x = []
            receiver_y = []
            
            for _, frame_row in frame_metrics.iterrows():
                frame_id = frame_row["frame_id"]
                frame_type = frame_row["frame_type"]
                
                play_frames = scorer.all_frames_df[
                    (scorer.all_frames_df["game_id"] == game_id) &
                    (scorer.all_frames_df["play_id"] == play_id) &
                    (scorer.all_frames_df["frame_id"] == frame_id) &
                    (scorer.all_frames_df["frame_type"] == frame_type)
                ]
                
                defenders = play_frames[play_frames["player_side"] == "Defense"]
                
                if not defenders.empty and not np.isnan(frame_row["sep_nearest"]) and frame_row["sep_nearest"] < np.inf:
                    receiver_frame = play_frames[play_frames["nfl_id"] == target_nfl_id]
                    if not receiver_frame.empty:
                        rec_x = receiver_frame.iloc[0]["x_std"]
                        rec_y = receiver_frame.iloc[0]["y_std"]
                        
                        def_dists = np.sqrt(
                            (defenders["x_std"] - rec_x)**2 +
                            (defenders["y_std"] - rec_y)**2
                        )
                        nearest_idx = def_dists.idxmin()
                        nearest_def = defenders.loc[nearest_idx]
                        nearest_def_x.append(nearest_def["x_std"])
                        nearest_def_y.append(nearest_def["y_std"])
                        receiver_x.append(rec_x)
                        receiver_y.append(rec_y)
                    else:
                        nearest_def_x.append(np.nan)
                        nearest_def_y.append(np.nan)
                        receiver_x.append(np.nan)
                        receiver_y.append(np.nan)
                else:
                    nearest_def_x.append(np.nan)
                    nearest_def_y.append(np.nan)
                    receiver_x.append(np.nan)
                    receiver_y.append(np.nan)
            
            frame_metrics["nearest_defender_x"] = nearest_def_x
            frame_metrics["nearest_defender_y"] = nearest_def_y
            frame_metrics["receiver_x"] = receiver_x
            frame_metrics["receiver_y"] = receiver_y
            
            # Calculate absolute horizontal and vertical difference to nearest defender
            frame_metrics["abs_dx_to_nearest_defender"] = (frame_metrics["receiver_x"] - frame_metrics["nearest_defender_x"]).abs()
            frame_metrics["abs_dy_to_nearest_defender"] = (frame_metrics["receiver_y"] - frame_metrics["nearest_defender_y"]).abs()
            
            all_metrics.append(frame_metrics)
            
        except Exception as e:
            error_msg = f"{type(e).__name__}: {str(e)}"
            errors.append((game_id, play_id, error_msg))
            # Print first few errors for debugging
            if len(errors) <= 5:
                print(f"  ⚠ Error processing Game {game_id}, Play {play_id}: {error_msg}")
            continue
    
    if not all_metrics:
        print("\n⚠ WARNING: No metrics generated!")
        print(f"  Total plays processed: {len(targeted_plays)}")
        print(f"  Total errors: {len(errors)}")
        if errors:
            print("\n  First 5 errors:")
            for i, (gid, pid, err) in enumerate(errors[:5], 1):
                print(f"    {i}. Game {gid}, Play {pid}: {err}")
        return pd.DataFrame(), errors
    
    training_df = pd.concat(all_metrics, ignore_index=True)
    
    print(f"\n✓ Processing complete!")
    print(f"  Total rows: {len(training_df):,}")
    print(f"  Total plays: {training_df[['game_id', 'play_id']].drop_duplicates().shape[0]:,}")
    print(f"  Errors: {len(errors)}")
    
    return training_df, errors

print("✓ Function defined")


✓ Function defined


In [18]:
# Create training DataFrame
training_df, errors = create_training_dataframe(
    input_df, output_df, supp_df, scorer, 
    weeks=WEEKS_TO_PROCESS,
    max_plays=None  # Set to a number like 100 for faster testing
)

# Check if training DataFrame was created successfully
if training_df.empty:
    print("\n" + "="*80)
    print("ERROR: Training DataFrame is empty!")
    print("="*80)
    print(f"Number of errors: {len(errors)}")
    if errors:
        print("\nSample errors:")
        for i, (gid, pid, err) in enumerate(errors[:5]):
            print(f"  {i+1}. Game {gid}, Play {pid}: {err}")
    print("\nPlease check:")
    print("  1. Are there targeted receivers in the input data?")
    print("  2. Are the game_id and play_id values matching between datasets?")
    print("  3. Check the error messages above for specific issues")
else:
    # Filter out throwaways (plays where ball was nowhere near the target receiver)
    print("\n" + "="*80)
    print("FILTERING THROWAWAYS")
    print("="*80)
    
    if "game_id" not in training_df.columns or "play_id" not in training_df.columns:
        print("ERROR: training_df missing required columns 'game_id' or 'play_id'")
        print(f"Available columns: {list(training_df.columns)}")
    elif "dist_to_ball" not in training_df.columns:
        print("ERROR: training_df missing 'dist_to_ball' column")
        print(f"Available columns: {list(training_df.columns)}")
    else:
        last_frames = training_df.groupby(["game_id", "play_id"]).last().reset_index()
        throwaway_mask = last_frames["dist_to_ball"] > THROWAWAY_DIST
        throwaway_plays = last_frames[throwaway_mask][["game_id", "play_id"]]
        
        print(f"Found {len(throwaway_plays)} plays with ball > {THROWAWAY_DIST} yards from receiver")
        print(f"Percentage: {100 * len(throwaway_plays) / len(last_frames):.1f}%")
        
        if len(throwaway_plays) > 0:
            training_df = training_df[
                ~training_df.set_index(["game_id", "play_id"]).index.isin(
                    throwaway_plays.set_index(["game_id", "play_id"]).index
                )
            ].reset_index(drop=True)
            print(f"\n✓ Filtered out {len(throwaway_plays)} throwaway plays")
            print(f"  Remaining plays: {training_df[['game_id', 'play_id']].drop_duplicates().shape[0]:,}")
            print(f"  Remaining frames: {len(training_df):,}")
        else:
            print("\n✓ No throwaway plays found")


PROCESSING PLAYS
Processing 2838 plays...


Processing:   2%|▏         | 55/2838 [03:23<2:51:37,  3.70s/it]


KeyboardInterrupt: 

In [None]:

# Show the first 48 rows of training_df in a display format
import pandas as pd

# Make sure all columns are visible
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 0)

display(training_df.head(46))




## 6. Training DataFrame Summary

Examine the structure and contents of the created training DataFrame.


In [None]:
# Display training DataFrame summary
if 'training_df' not in globals() or training_df.empty:
    print("="*80)
    print("ERROR: training_df is not defined or is empty")
    print("="*80)
    print("Please run the cell that creates the training DataFrame first:")
    print("  - Cell that calls create_training_dataframe()")
    print("  - Make sure it completes successfully")
else:
    print("="*80)
    print("TRAINING DATAFRAME SUMMARY")
    print("="*80)
    print(f"Shape: {training_df.shape}")
    print(f"\nColumns ({len(training_df.columns)} total):")
    for i, col in enumerate(training_df.columns, 1):
        print(f"  {i:2d}. {col}")

    print(f"\n" + "="*80)
    print("SAMPLE DATA")
    print("="*80)
    display_cols = [
        "game_id", "play_id", "continuous_frame", "frame_type", "throw_status",
        "is_complete", "sep_nearest", "receiver_speed",
        "leverage_angle", "nearest_defender_x", "nearest_defender_y"
    ]
    # Add receiver_pressure / field_control if they exist
    if 'receiver_pressure' in training_df.columns:
        display_cols.append('receiver_pressure')
    if 'field_control' in training_df.columns:
        display_cols.append('field_control')
    
    print(training_df[display_cols].head(15))

    # Frame count verification
    print("\n" + "="*80)
    print("FRAME COUNT VERIFICATION")
    print("="*80)
    frame_counts = training_df.groupby(["game_id", "play_id"]).size()
    print(f"Average frames per play: {frame_counts.mean():.1f}")
    print(f"Min frames: {frame_counts.min()}, Max frames: {frame_counts.max()}")

    # Throw status distribution
    print(f"\n" + "="*80)
    print("THROW STATUS DISTRIBUTION")
    print("="*80)
    print(training_df["throw_status"].value_counts())

    # Completion statistics
    print(f"\n" + "="*80)
    print("COMPLETION STATISTICS")
    print("="*80)
    print(f"Completion rate: {training_df['is_complete'].mean():.2%}")
    print(f"Complete plays: {training_df['is_complete'].sum():,}")
    print(f"Incomplete plays: {(~training_df['is_complete'].astype(bool)).sum():,}")

    # Metric statistics
    print(f"\n" + "="*80)
    print("METRIC STATISTICS")
    print("="*80)
    metric_cols = [
        "sep_nearest", "receiver_speed", "receiver_accel",
        "leverage_angle", "num_def_within_3",
    ]
    # Add optional metrics only if they exist
    if 'receiver_pressure' in training_df.columns:
        metric_cols.append('receiver_pressure')
    if 'field_control' in training_df.columns:
        metric_cols.append('field_control')
    if 'time_advantage' in training_df.columns:
        metric_cols.append('time_advantage')
    if 'route_dominance_weighted' in training_df.columns:
        metric_cols.append('route_dominance_weighted')

    # Keep only columns that actually exist to avoid KeyError
    metric_cols = [col for col in metric_cols if col in training_df.columns]

    print(training_df[metric_cols].describe())


In [None]:
print(training_df.head())


In [None]:
# Make a new DataFrame called x_train with the specified columns
x_train_cols = [
    "game_id",                  # 1
    "play_id",                  # 2
    "receiver_speed",           # 13
    "receiver_accel",           # 14
    "leverage_angle",           # 16
    "relative_velocity_angle",  # 17
    "pressure_score",           # 22
    "is_press",                 # 24
    "shade_encoded",            # 25
    "field_control",            # 27
    "is_break_frame",           # 28
    "frames_since_break",       # 29
    "continuous_frame",         # 40
    "is_complete",              # catch / no-catch label
]

# Keep only columns that exist in training_df to avoid KeyError
x_train_cols_existing = [col for col in x_train_cols if col in training_df.columns]

x_train = training_df[x_train_cols_existing].copy()



In [None]:
from IPython.display import display
display(x_train.head())


## 7. Exploratory Data Analysis and Visualizations

Visualize key relationships and distributions in the data.


# Plot 1: Distance to ball distribution - caught vs not caught
fig, axes = plt.subplots(1, 2, figsize=(14, 5), sharex=True, sharey=True)

caught_mask = route_level_df["is_complete"] == 1
not_caught_mask = route_level_df["is_complete"] == 0

sns.histplot(
    route_level_df.loc[caught_mask, "dist_to_ball"],
    bins=50,
    stat="density",
    ax=axes[0],
    color="green",
    alpha=0.7
)
axes[0].set_xlim(0, 40)
axes[0].set_title("Caught passes: receiver-to-ball distance")
axes[0].set_xlabel("Distance (yards)")
axes[0].axvline(route_level_df.loc[caught_mask, "dist_to_ball"].mean(), 
                color="darkgreen", linestyle="--", 
                label=f"Mean: {route_level_df.loc[caught_mask, 'dist_to_ball'].mean():.2f} yds")
axes[0].legend()

sns.histplot(
    route_level_df.loc[not_caught_mask, "dist_to_ball"],
    bins=50,
    stat="density",
    ax=axes[1],
    color="red",
    alpha=0.7
)
axes[1].set_xlim(0, 40)
axes[1].set_title("Not caught passes: receiver-to-ball distance")
axes[1].set_xlabel("Distance (yards)")
axes[1].axvline(route_level_df.loc[not_caught_mask, "dist_to_ball"].mean(), 
                color="darkred", linestyle="--", 
                label=f"Mean: {route_level_df.loc[not_caught_mask, 'dist_to_ball'].mean():.2f} yds")
axes[1].legend()

plt.tight_layout()
plt.show()

print(f"Mean distance (caught): {route_level_df.loc[caught_mask, 'dist_to_ball'].mean():.2f} yards")
print(f"Mean distance (not caught): {route_level_df.loc[not_caught_mask, 'dist_to_ball'].mean():.2f} yards")


# Plot 4: Catch rate by route type
min_samples = 10
route_stats = route_level_df.groupby("route").agg({
    "is_complete": ["mean", "count"]
}).reset_index()
route_stats.columns = ["route", "mean", "count"]
route_stats = route_stats[route_stats["count"] >= min_samples].sort_values("mean", ascending=False)

plt.figure(figsize=(12, 8))
sns.barplot(data=route_stats.head(20), x="mean", y="route", palette="viridis")
plt.title(f"Catch rate by route (>= {min_samples} targets)")
plt.xlabel("Catch rate")
plt.ylabel("Route type")
plt.xlim(0, 1)
plt.tight_layout()
plt.show()

print(f"\nTop routes by catch rate:")
print(route_stats.head(10))


In [None]:
# Compare completed vs incomplete plays
print("="*80)
print("COMPLETED VS INCOMPLETE COMPARISON")
print("="*80)

# Build aggregation only for columns that actually exist
agg_dict = {}

# New control metric: field_control (spatial ownership)
if "field_control" in training_df.columns:
    agg_dict["field_control"] = "mean"
if "route_dominance_weighted" in training_df.columns:
    agg_dict["route_dominance_weighted"] = "first"

if "sep_nearest" in training_df.columns:
    agg_dict["sep_nearest"] = "mean"
if "receiver_speed" in training_df.columns:
    agg_dict["receiver_speed"] = "mean"
if "leverage_angle" in training_df.columns:
    agg_dict["leverage_angle"] = "mean"
if "time_advantage" in training_df.columns:
    agg_dict["time_advantage"] = "mean"

if not agg_dict:
    print("No metric columns found to compare.")
else:
    metric_cols = list(agg_dict.keys())

    route_level = training_df.groupby(["game_id", "play_id", "is_complete"]).agg(agg_dict).reset_index()

    complete_stats = route_level[route_level["is_complete"] == 1][metric_cols].mean()
    incomplete_stats = route_level[route_level["is_complete"] == 0][metric_cols].mean()

    comparison = pd.DataFrame({
        "Completed": complete_stats,
        "Incomplete": incomplete_stats,
        "Difference": complete_stats - incomplete_stats
    })

    print(comparison)


## 8. Save Training DataFrame

Save the processed DataFrame to CSV for model training.


In [None]:
import pandas as pd

# --- Configure the play you want to visualize ---
game_id = 2023090700   # change this
play_id = 101     # change this
fps = 5                # frames per second for the GIF

# --- Load data (assumes notebook is in 12-5 NFL databowl working/) ---
DATA_DIR = "/kaggle/input/nfl-data"
input_df = pd.read_csv(f'{DATA_DIR}/input_2023_w01.csv')   # adjust week/file as needed
output_df = pd.read_csv(f'{DATA_DIR}/output_2023_w01.csv')
supp_df = pd.read_csv(f'{DATA_DIR}/supplementary_data.csv')
training_df = pd.read_csv('outputs/route_dominance_training_data.csv')

# --- Initialize scorer (contains all_frames_df with all players) ---
scorer = RouteDominanceScorer(input_df, output_df, supp_df)

# --- Generate GIF for the selected play ---
gif_path = create_gif_for_play(
    training_df,
    game_id=game_id,
    play_id=play_id,
    fps=fps,
    scorer=scorer  # pass scorer to include ALL players
)

print("GIF created:", gif_path)

In [None]:
import importlib

importlib.reload(fcv)


viz = FieldControlVisualizer(scorer)

game_id = 2023090700   # your chosen play
play_id = 1001

advanced_gif_path = viz.create_advanced_gif(
    training_df,
    game_id=game_id,
    play_id=play_id,
    output_path=f"outputs/advanced_field_control_game{game_id}_play{play_id}_v2.gif",
)
advanced_gif_path

In [None]:
from IPython.display import Image, display

display(Image(advanced_gif_path))

In [None]:
import pandas as pd
import numpy as np
import glob
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score

# Try to import XGBoost, fall back to LogisticRegression if not available
try:
    from xgboost import XGBClassifier
    XGBOOST_AVAILABLE = True
except (ImportError, Exception):
    XGBOOST_AVAILABLE = False
    from sklearn.linear_model import LogisticRegression
    print("Warning: XGBoost not available. Will use LogisticRegression instead.")

# ==============================================================================
# 1. CONFIGURATION & HELPERS
# ==============================================================================
THROWAWAY_DIST = 8.0  # Yards (Filter plays where ball lands > 8 yds from target)
EXCLUDE_ROUTE_KEYWORDS = ["screen", "angle"] # Filter schemed touches

def load_all_data(path_to_data_folder):
    """Loads Input, Output, and Supplementary data."""
    print("--- 1. LOADING DATA ---")
    
    # A. Supplementary
    supp_path = os.path.join(path_to_data_folder, "supplementary_data.csv")
    if not os.path.exists(supp_path):
        raise FileNotFoundError(f"Could not find {supp_path}")
    supp_df = pd.read_csv(supp_path)

    # B. Input (Pre-Throw)
    input_files = glob.glob(os.path.join(path_to_data_folder, "input_2023_w*.csv"))
    if not input_files:
        raise FileNotFoundError("No input files found!")
    
    print(f"   -> Loading {len(input_files)} INPUT files...")
    input_list = [pd.read_csv(f) for f in input_files]
    input_df = pd.concat(input_list, ignore_index=True)

    # C. Output (Post-Throw) - Needed for Separation at Catch
    output_files = glob.glob(os.path.join(path_to_data_folder, "output_2023_w*.csv"))
    if not output_files:
        raise FileNotFoundError("No output files found!")
        
    print(f"   -> Loading {len(output_files)} OUTPUT files...")
    output_list = [pd.read_csv(f) for f in output_files]
    output_df = pd.concat(output_list, ignore_index=True)
    
    return input_df, output_df, supp_df

def preprocess_filters(input_df, output_df, supp_df):
    """Filters out Screens and Throwaways."""
    print("--- 2. FILTERING DATA ---")
    
    # A. Route Filter
    if 'route_of_targeted_receiver' in supp_df.columns:
        route_lower = supp_df['route_of_targeted_receiver'].fillna("").str.lower()
        pattern = "|".join(EXCLUDE_ROUTE_KEYWORDS)
        valid_route_mask = ~route_lower.str.contains(pattern)
        valid_route_plays = supp_df.loc[valid_route_mask, ['game_id', 'play_id']]
        print(f"   -> Routes: Removed {(~valid_route_mask).sum()} schemed plays.")
    else:
        valid_route_plays = supp_df[['game_id', 'play_id']]

    # B. Throwaway Filter (> 8 Yards)
    # Get last frame for each play (when ball arrives)
    last_frames_idx = input_df.groupby(['game_id', 'play_id'])['frame_id'].transform(max) == input_df['frame_id']
    last_frames = input_df[last_frames_idx].copy()
    
    # Get targeted receiver position at last frame
    targets_last = last_frames[last_frames['player_role'] == 'Targeted Receiver'][
        ['game_id', 'play_id', 'x', 'y', 'ball_land_x', 'ball_land_y']
    ].copy()
    
    if not targets_last.empty and 'ball_land_x' in targets_last.columns and 'ball_land_y' in targets_last.columns:
        # Calculate distance from ball landing to receiver
        targets_last['dist_to_ball'] = np.sqrt(
            (targets_last['x'] - targets_last['ball_land_x'])**2 + 
            (targets_last['y'] - targets_last['ball_land_y'])**2
        )
        
        # Identify valid throw plays (not throwaways)
        valid_throw_plays = targets_last[targets_last['dist_to_ball'] <= THROWAWAY_DIST][
            ['game_id', 'play_id']
        ].drop_duplicates()
        
        print(f"   -> Throwaways: Removed {len(targets_last) - len(valid_throw_plays)} plays.")
    else:
        print(f"   -> Warning: Cannot calculate throwaway distance (missing columns), skipping filter")
        valid_throw_plays = supp_df[['game_id', 'play_id']].drop_duplicates()

    # C. Combine & Filter
    clean_plays = valid_route_plays.merge(valid_throw_plays, on=['game_id', 'play_id'])
    
    clean_input = input_df.merge(clean_plays, on=['game_id', 'play_id'], how='inner')
    clean_output = output_df.merge(clean_plays, on=['game_id', 'play_id'], how='inner')
    clean_supp = supp_df.merge(clean_plays, on=['game_id', 'play_id'], how='inner')
    
    print(f"   -> Final Training Set: {len(clean_plays)} plays.")
    return clean_input, clean_output, clean_supp

# ==============================================================================
# 2. FEATURE ENGINEERING (THE ECP BUILDER)
# ==============================================================================
def build_ecp_dataset(input_df, output_df, supp_df):
    """Builds ECP features including Press, Shade, and Separation at Catch."""
    print("--- 3. BUILDING ECP FEATURES ---")

    # A. Filter Supplementary to Valid Pass Results (C/I/IN)
    valid_results = ['C', 'I', 'IN']
    supp_df = supp_df[supp_df['pass_result'].isin(valid_results)].copy()
    
    supp_cols = [
        'game_id', 'play_id', 'yards_to_go', 'down', 
        'pre_snap_home_score', 'pre_snap_visitor_score', 'defenders_in_the_box', 
        'team_coverage_man_zone', 'offense_formation', 'receiver_alignment', 
        'dropback_type', 'pass_result'
    ]
    ecp_base = supp_df[[c for c in supp_cols if c in supp_df.columns]]

    # B. Pre-Snap Context (Frame 1) - Press & Shade
    print("   -> Calculating Pre-Snap Leverage...")
    snap_frames = input_df[input_df['frame_id'] == 1].copy()
    
    targets = snap_frames[snap_frames['player_role'] == 'Targeted Receiver']
    defenders = snap_frames[snap_frames['player_role'] == 'Defensive Coverage']
    
    # Find nearest defender
    merged_snap = targets[['game_id', 'play_id', 'x', 'y']].merge(
        defenders[['game_id', 'play_id', 'x', 'y']], 
        on=['game_id', 'play_id'], suffixes=('_rec', '_def')
    )
    merged_snap['dist'] = np.sqrt((merged_snap['x_rec'] - merged_snap['x_def'])**2 + (merged_snap['y_rec'] - merged_snap['y_def'])**2)
    nearest_snap = merged_snap.sort_values('dist').groupby(['game_id', 'play_id']).head(1)
    
    # Calculate Metrics
    nearest_snap['is_press'] = (nearest_snap['dist'] < 5.0).astype(int)
    nearest_snap['shade_score'] = abs(nearest_snap['y_rec'] - 26.65) - abs(nearest_snap['y_def'] - 26.65)
    nearest_snap['shade_encoded'] = np.select(
        [nearest_snap['shade_score'] > 0.5, nearest_snap['shade_score'] < -0.5], 
        [1, -1], default=0
    )
    presnap_feats = nearest_snap[['game_id', 'play_id', 'is_press', 'shade_encoded']]

    # C. Throw Dynamics (Input DF Last Frame) - QB Speed
    print("   -> Calculating Throw Dynamics...")
    last_frames_idx = input_df.groupby(['game_id', 'play_id'])['frame_id'].transform(max) == input_df['frame_id']
    throw_frames = input_df[last_frames_idx].copy()
    
    qb_data = throw_frames[throw_frames['player_role'] == 'Passer'][['game_id', 'play_id', 'x', 'y', 's', 'a', 'ball_land_x', 'ball_land_y']]
    qb_data = qb_data.rename(columns={'x': 'passer_x', 'y': 'passer_y', 's': 'passer_speed', 'a': 'passer_acceleration'})

    # D. Separation AND Pressure at Catch (Output DF Last Frame)
    print("   -> Calculating Separation & Pressure at Catch...")
    
    # 1. Approximate velocities at all output frames using position changes
    out_df = output_df.sort_values(['game_id', 'play_id', 'nfl_id', 'frame_id']).copy()
    out_df['vx'] = out_df.groupby(['game_id', 'play_id', 'nfl_id'])['x'].diff().fillna(0.0)
    out_df['vy'] = out_df.groupby(['game_id', 'play_id', 'nfl_id'])['y'].diff().fillna(0.0)
    
    # 2. Get Catch Moment Data (Frame where ball arrives)
    max_frames = out_df.groupby(['game_id', 'play_id'])['frame_id'].max().reset_index()
    outcome_rows = out_df.merge(max_frames, on=['game_id', 'play_id', 'frame_id'])
    
    # 3. Get TARGET Data (x, y, vx, vy)
    # Note: We need velocity components for Pressure Score
    snap_frames = input_df[input_df['frame_id'] == 1].copy()
    target_info = snap_frames[snap_frames['player_role'] == 'Targeted Receiver'][
        ['game_id', 'play_id', 'nfl_id']
    ].rename(columns={'nfl_id': 'target_nfl_id'})
    
    out_targets = outcome_rows.merge(target_info, on=['game_id', 'play_id'])
    out_targets = out_targets[out_targets['nfl_id'] == out_targets['target_nfl_id']]
    out_targets = out_targets[['game_id', 'play_id', 'x', 'y', 'vx', 'vy']].rename(
        columns={'x': 'rx', 'y': 'ry', 'vx': 'rvx', 'vy': 'rvy'}
    )
    
    # 4. Get DEFENDER Data (x, y, vx, vy)
    def_ids = snap_frames[snap_frames['player_role'] == 'Defensive Coverage'][
        ['game_id', 'play_id', 'nfl_id']
    ]
    
    out_defs = outcome_rows.merge(def_ids, on=['game_id', 'play_id', 'nfl_id'])
    out_defs = out_defs[['game_id', 'play_id', 'x', 'y', 'vx', 'vy']].rename(
        columns={'x': 'dx', 'y': 'dy', 'vx': 'dvx', 'vy': 'dvy'}
    )
    
    # 5. Merge to pair Receiver with Every Defender
    # (This creates a row for every Rec-Def pair at the catch moment)
    pairs = out_targets.merge(out_defs, on=['game_id', 'play_id'], how='inner')
    
    # 6. Calculate Physics Vectors (Vectorized = FAST)
    # A. Distance (Space)
    pairs['dist'] = np.sqrt((pairs['rx'] - pairs['dx'])**2 + (pairs['ry'] - pairs['dy'])**2)
    
    # B. Relative Velocity (How fast are they moving relative to each other?)
    pairs['rel_vx'] = pairs['rvx'] - pairs['dvx']
    pairs['rel_vy'] = pairs['rvy'] - pairs['dvy']
    
    # Unit Vector connecting them (Direction of the gap)
    # Use a safe distance to avoid division by zero
    dist_safe = pairs['dist'].replace(0, 0.01)
    pairs['u_x'] = (pairs['rx'] - pairs['dx']) / dist_safe
    pairs['u_y'] = (pairs['ry'] - pairs['dy']) / dist_safe
    
    # Closing Speed = Relative Velocity projected onto the Gap Vector
    # Positive = Closing In, Negative = Moving Away
    pairs['closing_speed'] = -(pairs['rel_vx'] * pairs['u_x'] + pairs['rel_vy'] * pairs['u_y'])
    
    # C. Calculate Time to Collision
    # If moving away (closing_speed < 0), time is infinite (99.0)
    pairs['time_to_collision'] = np.where(
        pairs['closing_speed'] <= 0.1,
        99.0,  # Not closing / Moving away
        dist_safe / pairs['closing_speed']
    )
    
    # D. Convert to Pressure Score (0-100)
    # 0.0s -> 100, 1.0s -> 36, 3.0s -> 5
    pairs['pressure_score'] = 100 * np.exp(-1.0 * pairs['time_to_collision'])
    
    # 6. Aggregation: Find the "Most Dangerous" Defender for each play
    # We want the Minimum Distance (Sep) and Maximum Pressure (Threat)
    play_metrics = pairs.groupby(['game_id', 'play_id']).agg(
        separation_at_catch=('dist', 'min'),
        pressure_at_catch=('pressure_score', 'max')  # Who is the biggest threat?
    ).reset_index()
    
    # Use this dataframe to merge back into ecp_df
    separation_df = play_metrics

    # E. Merge All
    ecp_df = ecp_base.merge(presnap_feats, on=['game_id', 'play_id'], how='inner')
    ecp_df = ecp_df.merge(qb_data, on=['game_id', 'play_id'], how='inner')
    ecp_df = ecp_df.merge(separation_df, on=['game_id', 'play_id'], how='inner')

    # F. Final Engineer & Encode
    ecp_df['air_distance'] = np.sqrt((ecp_df['ball_land_x'] - ecp_df['passer_x'])**2 + (ecp_df['ball_land_y'] - ecp_df['passer_y'])**2)
    ecp_df['target_dist_to_sideline'] = ecp_df.apply(lambda r: min(r['ball_land_y'], 53.3 - r['ball_land_y']), axis=1)
    if 'pre_snap_home_score' in ecp_df.columns:
        ecp_df['score_diff'] = abs(ecp_df['pre_snap_home_score'] - ecp_df['pre_snap_visitor_score'])
    else:
        ecp_df['score_diff'] = 0

    cat_cols = ['team_coverage_man_zone', 'offense_formation', 'receiver_alignment', 'dropback_type']
    valid_cats = [c for c in cat_cols if c in ecp_df.columns]
    ecp_encoded = pd.get_dummies(ecp_df, columns=valid_cats, drop_first=True)

    numeric_features = [
        'air_distance', 'target_dist_to_sideline', 'passer_speed', 'passer_acceleration', 
        'yards_to_go', 'defenders_in_the_box', 'score_diff', 
        'is_press', 'shade_encoded', 'separation_at_catch', 'pressure_at_catch'
    ]
    dummy_cols = [c for c in ecp_encoded.columns if any(stem in c for stem in valid_cats)]
    final_features = numeric_features + dummy_cols
    
    X = ecp_encoded[final_features].fillna(0)
    y = ecp_encoded['pass_result'].apply(lambda x: 1 if x == 'C' else 0)
    
    # Preserve game_id and play_id for output
    play_ids = ecp_encoded[['game_id', 'play_id']].copy()
    
    print(f"--- READY. Rows: {len(X)} | Features: {len(final_features)} ---")
    return X, y, final_features, play_ids

def build_ecp_predictions_dataframe(path_to_data_folder, test_size=0.2, random_state=42):
    """
    Convenience helper for notebooks:
    
    Trains the ECP model using the full dataset in `path_to_data_folder`
    (after applying route/throwaway filters) and returns a DataFrame:
    
        game_id, play_id, ecp
    
    where `ecp` is the model's completion probability for each play.
    """
    # 1. Load raw tracking + supplementary data
    raw_input, raw_output, raw_supp = load_all_data(path_to_data_folder)
    
    # 2. Apply route + throwaway filters
    cl_input, cl_output, cl_supp = preprocess_filters(raw_input, raw_output, raw_supp)
    
    # 3. Build feature matrix and labels
    X, y, feature_names, play_ids = build_ecp_dataset(cl_input, cl_output, cl_supp)
    
    # 4. Train/validation split (for model training & basic metrics)
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    
    # 5. Configure model
    if XGBOOST_AVAILABLE:
        ecp_model = XGBClassifier(
            n_estimators=100,
            learning_rate=0.05,
            max_depth=4,
            eval_metric="logloss",
        )
        print("Using XGBoost classifier...")
    else:
        ecp_model = LogisticRegression(max_iter=1000, random_state=random_state)
        print("Using LogisticRegression classifier...")
    
    # 6. Train model
    print("--- TRAINING ECP MODEL (for notebook helper) ---")
    ecp_model.fit(X_train, y_train)
    
    # 7. Optional: simple validation metrics (printed only)
    try:
        val_probs = ecp_model.predict_proba(X_val)[:, 1]
        print(f"Validation LogLoss: {log_loss(y_val, val_probs):.4f}")
        print(f"Validation AUC: {roc_auc_score(y_val, val_probs):.4f}")
    except Exception:
        # In case predict_proba is not available or any other issue
        pass
    
    # 8. Generate ECP predictions for ALL plays in X
    all_ecp_probs = ecp_model.predict_proba(X)[:, 1]
    
    # 9. Build final output DataFrame: game_id, play_id, ecp
    output_df = play_ids.copy()
    output_df["ecp"] = all_ecp_probs
    
    return output_df

# ==============================================================================
# 3. MAIN EXECUTION BLOCK
# ==============================================================================
if __name__ == "__main__":
    # --- IMPORTANT: SET THIS PATH TO YOUR DATA FOLDER ---
    # Use the shared ../data folder that holds input_2023_w*.csv and output_2023_w*.csv
    SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
    DATA_PATH = os.path.join(SCRIPT_DIR, "..", "data")
    
    try:
        # 1. Load
        raw_input, raw_output, raw_supp = load_all_data(DATA_PATH)
        
        # 2. Filter (Preprocessing)
        cl_input, cl_output, cl_supp = preprocess_filters(raw_input, raw_output, raw_supp)
        
        # 3. Build Dataset (With Separation)
        X, y, feature_names, play_ids = build_ecp_dataset(cl_input, cl_output, cl_supp)
        
        # 4. Train Model
        print("--- 4. TRAINING ECP MODEL ---")
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
        
        if XGBOOST_AVAILABLE:
            ecp_model = XGBClassifier(n_estimators=100, learning_rate=0.05, max_depth=4, eval_metric='logloss')
            print("Using XGBoost classifier...")
        else:
            ecp_model = LogisticRegression(max_iter=1000, random_state=42)
            print("Using LogisticRegression classifier...")
        
        ecp_model.fit(X_train, y_train)
        
        # 5. Validate
        probs = ecp_model.predict_proba(X_val)[:, 1]
        print(f"Validation LogLoss: {log_loss(y_val, probs):.4f}")
        print(f"Validation AUC: {roc_auc_score(y_val, probs):.4f}")
        
        # 6. Generate ECP predictions for ALL data
        print("\n--- 5. GENERATING ECP PREDICTIONS FOR ALL DATA ---")
        all_ecp_probs = ecp_model.predict_proba(X)[:, 1]
        
        # Create output DataFrame
        output_df = play_ids.copy()
        output_df['ecp'] = all_ecp_probs
        
        # Save to CSV
        output_filename = "ecp_predictions.csv"
        output_df.to_csv(output_filename, index=False)
        print(f"✓ Saved ECP predictions to {output_filename}")
        print(f"  Total plays: {len(output_df):,}")
        print(f"  ECP range: {output_df['ecp'].min():.4f} - {output_df['ecp'].max():.4f}")
        print(f"  Average ECP: {output_df['ecp'].mean():.4f}")
        
        print("\nSUCCESS: ECP model trained and predictions saved!")
        
    except Exception as e:
        print(f"\nERROR: {e}")
        import traceback
        traceback.print_exc()



In [None]:
import os, sys, importlib

# Make sure Python can see train_ecp_model.py (one folder up)
project_root = os.path.dirname(os.getcwd())
if project_root not in sys.path:
    sys.path.append(project_root)

import train_ecp_model
importlib.reload(train_ecp_model)

# Folder that contains 'supplementary_data.csv', 'input_2023_w*.csv', and 'output_2023_w*.csv'
DATA_PATH = "/kaggle/input/nfl-data"  # Kaggle dataset path

# Build full ECP predictions DataFrame (game_id, play_id, ecp)
ecp_df = train_ecp_model.build_ecp_predictions_dataframe(DATA_PATH)

# Look at it
display(ecp_df.head())

In [None]:
# Attach ECP scores to the **play-level training features** (x_train), not the raw frame-level training_df

if 'x_train' not in globals() or x_train.empty:
    print("x_train is not defined or is empty. Build x_train (play-level features) first.")
elif 'ecp_df' not in globals() or ecp_df.empty:
    print("ecp_df is not defined or is empty. Run the ECP model cell first.")
else:
    print("=" * 80)
    print("MERGING ECP SCORES ONTO x_train PLAYS")
    print("=" * 80)

    # If x_train already has an 'ecp' column from a previous run, drop it
    if "ecp" in x_train.columns:
        x_train = x_train.drop(columns=["ecp"])

    # Merge ECP (play-level) onto play-level x_train
    merged = x_train.merge(
        ecp_df[["game_id", "play_id", "ecp"]],
        on=["game_id", "play_id"],
        how="left",
        validate="many_to_one",
    )

    # Check for any plays without an ECP score
    missing_plays = (
        merged[merged["ecp"].isna()][["game_id", "play_id"]]
        .drop_duplicates()
    )

    n_train_plays_before = x_train[["game_id", "play_id"]].drop_duplicates().shape[0]
    print(f"x_train plays before merge: {n_train_plays_before}")

    if not missing_plays.empty:
        print(f"⚠ {len(missing_plays)} x_train plays do not have an ECP score; dropping those plays.")
        # Keep only plays that have a valid ECP score
        merged = merged[merged["ecp"].notna()].copy()
    else:
        print("✓ All x_train plays have an ECP score.")

    # Overwrite x_train so every play in it has an ECP label
    x_train = merged.reset_index(drop=True)

    # Keep exactly the feature columns we care about (plus ecp)
    desired_cols = [
        "game_id",                 # 1
        "play_id",                 # 2
        "receiver_speed",          # 13
        "receiver_accel",          # 14
        "leverage_angle",          # 16
        "relative_velocity_angle", # 17
        "pressure_score",          # 22 (if present)
        "is_press",                # 24 (if present)
        "shade_encoded",           # 25 (if present)
        "field_control",           # 27 (if present)
        "is_break_frame",          # 28
        "frames_since_break",      # 29
        "continuous_frame",        # frame index
        "ecp",                     # ECP label
    ]

    # Only keep columns that actually exist to avoid KeyErrors
    existing_cols = [c for c in desired_cols if c in x_train.columns]
    x_train = x_train[existing_cols].copy()

    n_train_plays_after = x_train[["game_id", "play_id"]].drop_duplicates().shape[0]
    print(f"x_train plays after merge (with desired columns): {n_train_plays_after}")
    print(f"x_train rows after merge: {len(x_train):,}")
    print("Final x_train columns:", list(x_train.columns))


In [None]:
from IPython.display import display

# Start from the play-level features with ECP
combined_df = x_train.copy()

# Attach actual completion outcome per play (1 = complete, 0 = incomplete)
if 'training_df' in globals() and 'is_complete' in training_df.columns:
    play_outcomes = (
        training_df[['game_id', 'play_id', 'is_complete']]
        .drop_duplicates(subset=['game_id', 'play_id'])
    )
    combined_df = combined_df.merge(
        play_outcomes,
        on=['game_id', 'play_id'],
        how='left'
    )
else:
    print("Warning: could not find 'training_df.is_complete'; continuing without it.")

display(combined_df)



In [None]:
import numpy as np
import pandas as pd

# 1. Define the Input Features (The Physics)
# (These match the columns in your snippet)
feature_cols = [
    'receiver_speed', 
    'receiver_accel', 
    'leverage_angle', 
    'relative_velocity_angle', 
    'pressure_score', 
    'is_press', 
    'shade_encoded',
    'field_control',       # You have this in your snippet, so we include it
    'is_break_frame', 
    'frames_since_break'
]

# 2. Define the Target (What we are learning to predict)
target_col = 'ecp'

In [None]:
def create_dataset(df, feature_cols, target_col, seq_length=50):
    print("Processing DataFrame into 3D Tensor...")
    
    # Sort to ensure time order (Vital!)
    df = df.sort_values(['game_id', 'play_id', 'continuous_frame'])
    
    # Group by Play
    grouped = df.groupby(['game_id', 'play_id'])
    
    sequences = []
    targets = []
    
    for (game, play), group in grouped:
        # A. GET FEATURES (X)
        # Convert the feature columns to a numpy array
        seq_data = group[feature_cols].values
        
        # Handle Length (Pad or Truncate to seq_length)
        curr_len = len(seq_data)
        
        if curr_len >= seq_length:
            # If too long, take the first 50 frames (The Route Stem + Break)
            X_play = seq_data[:seq_length]
        else:
            # If too short, pad with Zeros at the end
            padding = np.zeros((seq_length - curr_len, len(feature_cols)))
            X_play = np.vstack([seq_data, padding])
            
        sequences.append(X_play)
        
        # B. NEW TARGET LOGIC (Residual RDS)
        # Residual = Actual Result (Catch/No Catch) - ECP (the baseline expectation)
        if 'is_complete' not in group.columns or 'ecp' not in group.columns:
            raise ValueError("Dataframe must contain 'is_complete' and 'ecp' columns to compute residual RDS.")
        
        # 1. Get the actual outcome (1 or 0)
        actual_outcome = group['is_complete'].iloc[0]
        
        # 2. Get the baseline ECP from your Logistic Model
        baseline_ecp = group['ecp'].iloc[0]
        
        # 3. The residual: how much the player outperformed (or underperformed) expectation
        y_play = actual_outcome - baseline_ecp
        
        targets.append(y_play)
        
    # Convert lists to Numpy Arrays
    X_out = np.array(sequences)
    y_out = np.array(targets)
    
    print(f"Done! X Shape: {X_out.shape}, y Shape: {y_out.shape}")
    return X_out, y_out

# --- RUN IT ---
# Replace 'training_df' with whatever your dataframe variable is named
X_train, y_train = create_dataset(combined_df, feature_cols, target_col)

In [None]:
# ======================================================================
# UPDATED CLEAN & SCALE: Protecting the Masking(mask_value=0.0) layer
# ======================================================================
import numpy as np
from sklearn.preprocessing import StandardScaler

# Start from the merged play-level dataframe with ECP
clean_df = combined_df.copy()

# Attach actual outcome (is_complete) per play so we can compute residuals
if 'is_complete' not in clean_df.columns:
    if 'training_df' in globals():
        play_outcomes = (
            training_df[['game_id', 'play_id', 'is_complete']]
            .drop_duplicates(subset=['game_id', 'play_id'])
        )
        clean_df = clean_df.merge(play_outcomes, on=['game_id', 'play_id'], how='left')
    else:
        raise ValueError("training_df not found in globals; cannot attach 'is_complete' for residual computation.")

# 1. Replace +/- inf with NaN, then fill NaNs with 0 so the scaler sees only finite values.
#    These zeros are TEMPORARY and will be transformed to non-zero values by StandardScaler.
clean_df.replace([np.inf, -np.inf], np.nan, inplace=True)
clean_df = clean_df.fillna(0)

# 2. Drop plays where the target (ecp) is missing
if 'ecp' in clean_df.columns:
    before_drop = len(clean_df[['game_id', 'play_id']].drop_duplicates())
    clean_df = clean_df.dropna(subset=['ecp'])
    after_drop = len(clean_df[['game_id', 'play_id']].drop_duplicates())
    print(f"Dropped {before_drop - after_drop} plays with missing ECP.")
else:
    raise ValueError("Column 'ecp' not found in combined_df; cannot train without target.")

# 3. Scale ONLY the actual physics data (2D dataframe, before any padding)
scale_cols = [
    'receiver_speed',
    'receiver_accel',
    'pressure_score',
    'field_control',
    'is_break_frame',
    'frames_since_break',
]

scale_cols_existing = [c for c in scale_cols if c in clean_df.columns]
if scale_cols_existing:
    scaler = StandardScaler()
    clean_df[scale_cols_existing] = scaler.fit_transform(clean_df[scale_cols_existing])
    print("Scaled columns:", scale_cols_existing)
else:
    print("No scaleable columns found; skipping scaling.")

# 4. Rebuild X_train, y_train AFTER scaling.
#    create_dataset() will add np.zeros padding for short sequences.
#    Those padding rows remain EXACTLY 0.0 across all features, so
#    Masking(mask_value=0.0) will correctly ignore only the padded timesteps.
X_train, y_train = create_dataset(clean_df, feature_cols, target_col, seq_length=50)

# Quick sanity check
print("Checking X_train for bad numbers...")
print(f"Any NaNs? {np.isnan(X_train).any()}")
print(f"Any Infinity? {np.isinf(X_train).any()}")
print("Checking y_train for bad numbers...")
print(f"Any NaNs? {np.isnan(y_train).any()}")
print(f"Any Infinity? {np.isinf(y_train).any()}")



In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Masking, Dropout

# 1. Define Architecture
model = Sequential([
    # Layer 1: Masking (Tells LSTM to ignore the Zeros we added for padding)
    Masking(mask_value=0.0, input_shape=(50, len(feature_cols))),
    
    # Layer 2: The LSTM (The Brain)
    # units=64 means it looks for 64 different patterns in the movement
    LSTM(units=64, return_sequences=False),
    
    # Layer 3: Dropout (Prevents Overfitting)
    Dropout(0.2),
    
    # Layer 4: Output (The Score)
    # We want 1 single number (The predicted ECP/Score)
    Dense(1, activation='linear')
])

# 2. Compile (Setup the math)
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# 3. View Summary
model.summary()

In [None]:
# Train on your reshaped data
# Use a tunable batch size; if you see "Out of Memory" errors, LOWER this number.
batch_size = 64  # Larger batches are faster, smaller batches use less RAM

history = model.fit(
    X_train, y_train, 
    epochs=50, 
    batch_size=batch_size, 
    validation_split=0.2  # Set aside 20% to check accuracy
)

In [None]:
import numpy as np
import pandas as pd

# STEP 5: Predict a score for every play sequence in X_train
rds_scores = model.predict(X_train).flatten()
print("Predictions shape:", rds_scores.shape)

# STEP 6: Build a (game_id, play_id) index in the SAME order as X_train
df_sorted = clean_df.sort_values(['game_id', 'play_id', 'continuous_frame'])
play_index = (
    df_sorted[['game_id', 'play_id']]
    .drop_duplicates()
    .reset_index(drop=True)
)

print("Num plays in index:", len(play_index))

# Sanity check: these should match
assert len(play_index) == len(rds_scores), "Mismatch between plays and predictions!"

# Attach scores to plays
results = play_index.copy()
results['RDS_Score'] = rds_scores

# Attach metadata (player / route / completion, etc.) from training_df
meta_cols = ['game_id', 'play_id', 'target_name', 'is_complete']
meta_cols_existing = [c for c in meta_cols if c in training_df.columns]

play_meta = (
    training_df[meta_cols_existing]
    .drop_duplicates(subset=['game_id', 'play_id'])
)

results = results.merge(play_meta, on=['game_id', 'play_id'], how='left')

display(results.head(10))

In [None]:
highest_rds_row = results.loc[results['RDS_Score'].idxmax()]
print("Player with highest RDS score:")
top5_rds_rows = results.nlargest(5, 'RDS_Score')
display(top5_rds_rows)


In [None]:
# Simple leaderboard: average RDS by receiver
leaderboard = (
    results
    .groupby('target_name', dropna=True)['RDS_Score']
    .mean()
    .reset_index()
    .sort_values('RDS_Score', ascending=False)
)

print("--- TOP 10 RECEIVERS BY RDS ---")
display(leaderboard.head(10))