# EDA and Baseline Model

- I updated the Model part of the notebook based on the two great notebooks below, which improved the score from 2.09 to 0.94:
- [NFL Big Data Bowl 2026 - Prediction](https://www.kaggle.com/code/muhammadqasimshabbir/nfl-big-data-bowl-2026-prediction)
- [NFL Big Data - Baseline](https://www.kaggle.com/code/hiwe0305/nfl-big-data-baseline)

## 1. Setup and Imports

In [None]:
## 1 Setup and Imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
import matplotlib.patches as patches
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import warnings
import os
from glob import glob
from tqdm import tqdm
import gc
from IPython.display import display
from IPython.display import HTML

# Silence library warnings so they do not clutter the cell outputs.
warnings.filterwarnings('ignore')

# Plot styling shared by every figure in the notebook.
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

# Palette keyed by the kind of player a mark represents.
COLORS = dict(
    offense='#FF6B6B',
    defense='#4ECDC4',
    targeted='#FFD93D',
    passer='#95E77E',
    other='#A8DADC',
)


## 2. Data Loading and Initial Exploration

In [None]:
# Locations of the competition data
DATA_PATH = '/kaggle/input/nfl-big-data-bowl-2026-prediction/'
TRAIN_PATH = f'{DATA_PATH}train/'

# Collect the training CSVs of each kind in a stable (sorted) order
input_files, output_files = (
    sorted(glob(f'{TRAIN_PATH}{prefix}_*.csv')) for prefix in ('input', 'output')
)

print(f"Found {len(input_files)} input files and {len(output_files)} output files")
print("\nSample files:")
for kind_label, file_list in (("Input:", input_files), ("Output:", output_files)):
    print(kind_label, file_list[:3])

### 2.1 Load Sample Data for Initial Analysis

In [None]:
# Read the first input/output file pair as the exploration sample
sample_input, sample_output = (
    pd.read_csv(file_list[0]) for file_list in (input_files, output_files)
)

for shape_label, frame in (("Sample Input Data Shape:", sample_input),
                           ("Sample Output Data Shape:", sample_output)):
    print(shape_label, frame.shape)
print("\n" + "="*50 + "\n")

# Column names, dtypes and non-null counts of each frame
print("INPUT DATA STRUCTURE:")
print(sample_input.info())
print("\n" + "="*50 + "\n")
print("OUTPUT DATA STRUCTURE:")
print(sample_output.info())

### 2.2 Data Overview and Statistics

In [None]:
# Peek at the first rows of each frame
print("SAMPLE INPUT DATA:")
print(sample_input.head())
print("\n" + "="*50 + "\n")
print("SAMPLE OUTPUT DATA:")
print(sample_output.head())

# Summary statistics of the numeric input columns
print("INPUT DATA STATISTICS:")
print(sample_input.describe())

# Missing values per input column (count and share of rows), worst first
print("MISSING VALUES IN INPUT DATA:")
missing_input = sample_input.isnull().sum()
missing_input_pct = 100 * missing_input / len(sample_input)
missing_df = pd.concat(
    [missing_input, missing_input_pct],
    axis=1,
    keys=['Missing_Count', 'Percentage'],
)
columns_with_gaps = missing_df['Missing_Count'] > 0
print(missing_df.loc[columns_with_gaps].sort_values('Percentage', ascending=False))

## 3. Feature Analysis

### 3.1 Player Characteristics

In [None]:
# Analyze player positions
position_counts = sample_input['player_position'].value_counts()

fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Position distribution (value_counts is sorted by frequency, so head(15) is the top 15)
top_position_counts = position_counts.head(15)
axes[0].barh(top_position_counts.index, top_position_counts.values)
axes[0].set_xlabel('Count')
axes[0].set_title('Top 15 Player Positions')
axes[0].invert_yaxis()

# Position by side
position_side = pd.crosstab(sample_input['player_position'], sample_input['player_side'])
# crosstab orders its columns alphabetically, so each colour is looked up by side
# name instead of relying on the column position (which would swap the two sides).
side_colors = [COLORS.get(str(side).lower(), COLORS['other']) for side in position_side.columns]
position_side.plot(kind='bar', stacked=True, ax=axes[1], color=side_colors)
axes[1].set_title('Positions by Team Side')
axes[1].set_xlabel('Position')
axes[1].set_ylabel('Count')
axes[1].legend(title='Side')
# Rotate the labels on the axes explicitly rather than on whichever axes is "current"
for tick in axes[1].get_xticklabels():
    tick.set_rotation(45)
    tick.set_ha('right')

plt.tight_layout()
plt.show()

# Player roles distribution
role_counts = sample_input['player_role'].value_counts()

# Explicit role -> colour mapping. Deriving the key from the first word of the role
# name fails for 'Defensive Coverage' ('defensive' is not a COLORS key).
role_palette = {
    'Targeted Receiver': COLORS['targeted'],
    'Defensive Coverage': COLORS['defense'],
    'Other Route Runner': COLORS['other'],
    'Passer': COLORS['passer'],
}

plt.figure(figsize=(10, 6))
colors_role = [role_palette.get(role, COLORS['other']) for role in role_counts.index]
plt.pie(role_counts.values, labels=role_counts.index, autopct='%1.1f%%', colors=colors_role)
plt.title('Distribution of Player Roles')
plt.show()

print("Player Role Counts:")
print(role_counts)

### 3.2 Physical Attributes Analysis

In [None]:
# Convert height to inches for analysis
def height_to_inches(height_str):
    """Convert a 'feet-inches' height string such as '6-2' to total inches.

    Parameters
    ----------
    height_str : str or missing value
        Height written as '<feet>-<inches>'.

    Returns
    -------
    int or float
        Total inches as an int, or ``np.nan`` when the value is missing or
        cannot be parsed.
    """
    if pd.isna(height_str):
        return np.nan
    try:
        feet, inches = height_str.split('-')
        return int(feet) * 12 + int(inches)
    except (AttributeError, TypeError, ValueError):
        # AttributeError: not a string; ValueError: wrong number of parts or
        # non-numeric parts. Anything else (e.g. KeyboardInterrupt) propagates.
        return np.nan

sample_input['height_inches'] = sample_input['player_height'].apply(height_to_inches)

# 2x2 grid: per-position averages on top, player-level views below
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Restrict the per-position averages to the ten most frequent positions
top_positions = sample_input['player_position'].value_counts().head(10).index
top_position_rows = sample_input[sample_input['player_position'].isin(top_positions)]
height_by_pos = top_position_rows.groupby('player_position')['height_inches'].mean().sort_values()
weight_by_pos = top_position_rows.groupby('player_position')['player_weight'].mean().sort_values()

average_panels = (
    (axes[0, 0], height_by_pos, 'Average Height (inches)', 'Average Height by Position (Top 10)'),
    (axes[0, 1], weight_by_pos, 'Average Weight (lbs)', 'Average Weight by Position (Top 10)'),
)
for panel, averages, x_label, panel_title in average_panels:
    panel.barh(averages.index, averages.values)
    panel.set_xlabel(x_label)
    panel.set_title(panel_title)

# Height against weight, one point per input row
scatter_panel = axes[1, 0]
scatter_panel.scatter(sample_input['height_inches'], sample_input['player_weight'], alpha=0.5)
scatter_panel.set_xlabel('Height (inches)')
scatter_panel.set_ylabel('Weight (lbs)')
scatter_panel.set_title('Height vs Weight Correlation')

# BMI-like metric: weight / height^2 * 703
sample_input['bmi_like'] = (
    sample_input['player_weight'].div(sample_input['height_inches'].pow(2)).mul(703)
)
bmi_panel = axes[1, 1]
bmi_panel.hist(sample_input['bmi_like'].dropna(), bins=30, edgecolor='black')
bmi_panel.set_xlabel('BMI')
bmi_panel.set_ylabel('Count')
bmi_panel.set_title('BMI Distribution')

plt.tight_layout()
plt.show()

### 3.3 Movement Dynamics

In [None]:
# Analyze speed and acceleration
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Top row: histograms of speed ('s') and acceleration ('a'), each with its mean marked
histogram_panels = [
    (axes[0, 0], 's', 'Speed (yards/second)', 'Speed Distribution'),
    (axes[0, 1], 'a', 'Acceleration (yards/second²)', 'Acceleration Distribution'),
]
for panel, column, x_label, panel_title in histogram_panels:
    column_mean = sample_input[column].mean()
    panel.hist(sample_input[column].dropna(), bins=50, edgecolor='black', alpha=0.7)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Frequency')
    panel.set_title(panel_title)
    panel.axvline(column_mean, color='red', linestyle='--', label=f'Mean: {column_mean:.2f}')
    panel.legend()

# Speed by player role (bar height = mean, error bar = standard deviation)
speed_by_role = sample_input.groupby('player_role')['s'].agg(['mean', 'std'])
axes[1, 0].bar(speed_by_role.index, speed_by_role['mean'], yerr=speed_by_role['std'], capsize=5)
axes[1, 0].set_xlabel('Player Role')
axes[1, 0].set_ylabel('Speed (yards/second)')
axes[1, 0].set_title('Average Speed by Player Role')
# Rotation and alignment are both set on the labels themselves
for tick in axes[1, 0].get_xticklabels():
    tick.set_rotation(45)
    tick.set_ha('right')

# Speed vs Acceleration
axes[1, 1].hexbin(sample_input['s'], sample_input['a'], gridsize=30, cmap='YlOrRd')
axes[1, 1].set_xlabel('Speed (yards/second)')
axes[1, 1].set_ylabel('Acceleration (yards/second²)')
axes[1, 1].set_title('Speed vs Acceleration Heatmap')
cbar = plt.colorbar(axes[1, 1].collections[0], ax=axes[1, 1])
cbar.set_label('Count')

plt.tight_layout()
# Explicit show, consistent with the other plotting cells in this notebook
plt.show()

## 4. Spatial Analysis

### 4.1 Field Position Analysis

In [None]:
# Where players and ball landing spots fall on the field
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
players_panel, ball_panel = axes

# Density of player (x, y) positions over all input rows
position_density = players_panel.hexbin(sample_input['x'], sample_input['y'], gridsize=30, cmap='Greens')
players_panel.set_xlabel('X Position (yards)')
players_panel.set_ylabel('Y Position (yards)')
players_panel.set_title('Player Starting Positions Heatmap')
plt.colorbar(position_density, ax=players_panel, label='Count')

# Ball landing spots
ball_panel.scatter(sample_input['ball_land_x'], sample_input['ball_land_y'], alpha=0.3, s=1)
ball_panel.set_xlabel('Ball Landing X (yards)')
ball_panel.set_ylabel('Ball Landing Y (yards)')
ball_panel.set_title('Ball Landing Positions')

# Both panels use the same field-sized axis limits
for panel in (players_panel, ball_panel):
    panel.set_xlim(0, 120)
    panel.set_ylim(0, 53.3)

plt.tight_layout()
plt.show()

# Euclidean distance from each player row to the ball landing spot
ball_dx = sample_input['x'] - sample_input['ball_land_x']
ball_dy = sample_input['y'] - sample_input['ball_land_y']
sample_input['dist_to_ball'] = np.sqrt(ball_dx**2 + ball_dy**2)

fig, axes = plt.subplots(1, 2, figsize=(15, 6))
role_panel, hist_panel = axes

# Mean distance per role, shortest first
dist_by_role = sample_input.groupby('player_role')['dist_to_ball'].mean().sort_values()
role_panel.barh(dist_by_role.index, dist_by_role.values)
role_panel.set_xlabel('Average Distance to Ball Landing (yards)')
role_panel.set_title('Initial Distance to Ball Landing by Role')

# Overall distance distribution with the mean marked
mean_dist_to_ball = sample_input['dist_to_ball'].mean()
hist_panel.hist(sample_input['dist_to_ball'].dropna(), bins=50, edgecolor='black')
hist_panel.set_xlabel('Distance to Ball Landing (yards)')
hist_panel.set_ylabel('Frequency')
hist_panel.set_title('Distribution of Initial Distance to Ball')
hist_panel.axvline(mean_dist_to_ball, color='red', linestyle='--',
                   label=f'Mean: {mean_dist_to_ball:.2f} yards')
hist_panel.legend()

plt.tight_layout()
plt.show()

### 4.2 Directional Analysis

In [None]:
# Orientation ('o') and direction of motion ('dir'), both in degrees
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Top row: one 36-bin histogram per angle column
angle_panels = (
    (axes[0, 0], 'o', 'Orientation (degrees)', 'Player Orientation Distribution'),
    (axes[0, 1], 'dir', 'Direction (degrees)', 'Direction of Motion Distribution'),
)
for panel, angle_column, x_label, panel_title in angle_panels:
    panel.hist(sample_input[angle_column].dropna(), bins=36, edgecolor='black')
    panel.set_xlabel(x_label)
    panel.set_ylabel('Frequency')
    panel.set_title(panel_title)

# Orientation plotted against direction, one point per row
scatter_panel = axes[1, 0]
scatter_panel.scatter(sample_input['o'], sample_input['dir'], alpha=0.3, s=1)
scatter_panel.set_xlabel('Orientation (degrees)')
scatter_panel.set_ylabel('Direction (degrees)')
scatter_panel.set_title('Orientation vs Direction')

# Smallest angle between the two headings: fold |o - dir| back into [0, 180]
raw_gap = (sample_input['o'] - sample_input['dir']).abs()
angular_diff = np.minimum(raw_gap, 360 - raw_gap)
diff_panel = axes[1, 1]
diff_panel.hist(angular_diff.dropna(), bins=50, edgecolor='black')
diff_panel.set_xlabel('Angular Difference (degrees)')
diff_panel.set_ylabel('Frequency')
diff_panel.set_title('Difference between Orientation and Direction')

plt.tight_layout()
plt.show()

## 5. Temporal Analysis

### 5.1 Frame Analysis

In [None]:
# How many frames (and seconds) have to be predicted
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Frames converted to seconds (10 frames per second)
sample_input['time_in_air'] = sample_input['num_frames_output'] / 10

# (axes, column, x label, title, mean format, mean unit) for the two histograms
length_panels = (
    (axes[0], 'num_frames_output', 'Number of Frames to Predict',
     'Distribution of Prediction Length', '.1f', 'frames'),
    (axes[1], 'time_in_air', 'Time Ball in Air (seconds)',
     'Distribution of Ball Flight Time', '.2f', 'seconds'),
)
for panel, column, x_label, panel_title, mean_format, unit in length_panels:
    column_mean = sample_input[column].mean()
    panel.hist(sample_input[column].dropna(), bins=30, edgecolor='black')
    panel.set_xlabel(x_label)
    panel.set_ylabel('Frequency')
    panel.set_title(panel_title)
    panel.axvline(column_mean, color='red', linestyle='--',
                  label=f'Mean: {column_mean:{mean_format}} {unit}')
    panel.legend()

plt.tight_layout()
plt.show()

# Text summary of the flight-time column
flight_time = sample_input['time_in_air']
print("Statistics for Ball Flight Time:")
for stat_name, stat_value in (
    ("Minimum", flight_time.min()),
    ("Maximum", flight_time.max()),
    ("Mean", flight_time.mean()),
    ("Median", flight_time.median()),
):
    print(f"{stat_name}: {stat_value:.2f} seconds")

### 5.2 Play Analysis

In [None]:
# One row per (game_id, play_id), keeping the first value of each play-level column
play_level_columns = ['num_frames_output', 'play_direction', 'absolute_yardline_number',
                      'ball_land_x', 'ball_land_y']
unique_plays = (
    sample_input
    .groupby(['game_id', 'play_id'])
    .agg({column: 'first' for column in play_level_columns})
    .reset_index()
)

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Share of plays per play direction
direction_counts = unique_plays['play_direction'].value_counts()
direction_panel = axes[0, 0]
direction_panel.pie(direction_counts.values, labels=direction_counts.index, autopct='%1.1f%%')
direction_panel.set_title('Play Direction Distribution')

# Pass distance: |ball landing x - absolute yardline number|
unique_plays['pass_distance'] = (
    unique_plays['ball_land_x'] - unique_plays['absolute_yardline_number']
).abs()

# Two per-play histograms: starting yardline and pass distance
play_histograms = (
    (axes[0, 1], 'absolute_yardline_number', 'Absolute Yardline Number',
     'Starting Field Position Distribution'),
    (axes[1, 0], 'pass_distance', 'Pass Distance (yards)',
     'Pass Distance Distribution'),
)
for panel, column, x_label, panel_title in play_histograms:
    panel.hist(unique_plays[column].dropna(), bins=30, edgecolor='black')
    panel.set_xlabel(x_label)
    panel.set_ylabel('Number of Plays')
    panel.set_title(panel_title)

# Pass distance against flight time in seconds (frames / 10)
flight_panel = axes[1, 1]
flight_panel.scatter(unique_plays['pass_distance'], unique_plays['num_frames_output']/10, alpha=0.5)
flight_panel.set_xlabel('Pass Distance (yards)')
flight_panel.set_ylabel('Flight Time (seconds)')
flight_panel.set_title('Pass Distance vs Flight Time')

plt.tight_layout()
plt.show()

## 6. Target Variable Analysis

### 6.1 Movement Patterns

In [None]:
# Select the input and output rows of the first play in the sample.
# The rest of this notebook identifies a play by (game_id, play_id), so both keys
# are matched here; filtering on play_id alone can pull in rows from other games.
first_game_id = sample_input['game_id'].iloc[0]
first_play_id = sample_input['play_id'].iloc[0]
sample_play = sample_input[
    (sample_input['game_id'] == first_game_id) & (sample_input['play_id'] == first_play_id)
].copy()
sample_play_output = sample_output[
    (sample_output['game_id'] == first_game_id) & (sample_output['play_id'] == first_play_id)
].copy()

# Calculate movement statistics for players
movement_stats = []

for nfl_id in sample_play['nfl_id'].unique():
    # NOTE(review): this uses the player's first input row as the start position.
    # If the input holds several frames per player, confirm this is the intended one.
    player_input = sample_play[sample_play['nfl_id'] == nfl_id].iloc[0]
    # Sort so that the last row really is the final frame
    player_output = sample_play_output[sample_play_output['nfl_id'] == nfl_id].sort_values('frame_id')

    # Players without output rows have no end position to compare against
    if player_output.empty:
        continue

    final_row = player_output.iloc[-1]
    final_x, final_y = final_row['x'], final_row['y']

    # Straight-line displacement between the start and final positions
    distance_moved = np.sqrt(
        (final_x - player_input['x'])**2 +
        (final_y - player_input['y'])**2
    )

    movement_stats.append({
        'nfl_id': nfl_id,
        'role': player_input['player_role'],
        'initial_x': player_input['x'],
        'initial_y': player_input['y'],
        'final_x': final_x,
        'final_y': final_y,
        'distance_moved': distance_moved,
        'num_frames': len(player_output)
    })

movement_df = pd.DataFrame(movement_stats)

# Visualize movement patterns
if len(movement_df) > 0:
    fig, ax = plt.subplots(figsize=(12, 6))

    role_colors = {
        'Targeted Receiver': COLORS['targeted'],
        'Defensive Coverage': COLORS['defense'],
        'Other Route Runner': COLORS['other'],
        'Passer': COLORS['passer']
    }

    # One bar per role present in this play (mean displacement, std as error bar)
    for role, color in role_colors.items():
        role_data = movement_df[movement_df['role'] == role]
        if len(role_data) > 0:
            ax.bar(role, role_data['distance_moved'].mean(),
                  yerr=role_data['distance_moved'].std(),
                  color=color, capsize=5, label=role)

    ax.set_xlabel('Player Role')
    ax.set_ylabel('Average Distance Moved (yards)')
    ax.set_title('Average Movement Distance by Player Role During Pass')
    ax.legend()
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

## 7. Feature Engineering Ideas

### 7.1 Potential Features for Modeling

In [None]:
# Create sample engineered features
sample_features = sample_input.copy()

# Distance-based features
sample_features['dist_to_ball'] = np.sqrt(
    (sample_features['x'] - sample_features['ball_land_x'])**2 +
    (sample_features['y'] - sample_features['ball_land_y'])**2
)

# Angle to ball, in the mathematical convention (degrees counter-clockwise from +x)
sample_features['angle_to_ball'] = np.arctan2(
    sample_features['ball_land_y'] - sample_features['y'],
    sample_features['ball_land_x'] - sample_features['x']
) * 180 / np.pi

# NFL tracking angles are measured clockwise from the +y axis (0 deg = +y, 90 deg = +x),
# so the x component uses sin and the y component uses cos. This keeps vx/vy in the
# same Cartesian frame as x, y and angle_to_ball.
dir_rad = np.deg2rad(sample_features['dir'])

# Velocity components
sample_features['vx'] = sample_features['s'] * np.sin(dir_rad)
sample_features['vy'] = sample_features['s'] * np.cos(dir_rad)

# Acceleration components (acceleration magnitude projected along the motion direction)
sample_features['ax'] = sample_features['a'] * np.sin(dir_rad)
sample_features['ay'] = sample_features['a'] * np.cos(dir_rad)

# Is targeted receiver
sample_features['is_targeted'] = (sample_features['player_role'] == 'Targeted Receiver').astype(int)

# Is defender
sample_features['is_defender'] = (sample_features['player_side'] == 'Defense').astype(int)

# Display correlation matrix for engineered features
feature_cols = ['dist_to_ball', 'angle_to_ball', 's', 'a', 'vx', 'vy', 'ax', 'ay',
                'is_targeted', 'is_defender', 'num_frames_output']

correlation_matrix = sample_features[feature_cols].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
            square=True, linewidths=1, cbar_kws={"shrink": 0.8})
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

### 7.2 Feature Importance Indicators

In [None]:
# Quick visual checks of which engineered features separate players
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Distance to ball, split by the is_targeted flag
is_targeted_row = sample_features['is_targeted'] == 1
targeted = sample_features.loc[is_targeted_row, 'dist_to_ball']
not_targeted = sample_features.loc[sample_features['is_targeted'] == 0, 'dist_to_ball']

distance_panel = axes[0, 0]
distance_panel.hist([targeted, not_targeted], bins=30, label=['Targeted', 'Not Targeted'], alpha=0.7)
distance_panel.set_xlabel('Distance to Ball (yards)')
distance_panel.set_ylabel('Frequency')
distance_panel.set_title('Distance to Ball: Targeted vs Non-Targeted Players')
distance_panel.legend()

# Mean speed per is_targeted group (groupby sorts the keys: 0 first, then 1)
speed_by_role_targeted = sample_features.groupby('is_targeted')['s'].mean()
speed_panel = axes[0, 1]
speed_panel.bar(['Not Targeted', 'Targeted'], speed_by_role_targeted.values)
speed_panel.set_ylabel('Average Speed (yards/second)')
speed_panel.set_title('Average Speed: Targeted vs Non-Targeted')

# Velocity component along the player-to-ball direction
ball_angle_rad = sample_features['angle_to_ball'] * np.pi / 180
sample_features['velocity_towards_ball'] = (
    sample_features['vx'] * np.cos(ball_angle_rad) +
    sample_features['vy'] * np.sin(ball_angle_rad)
)

approach_panel = axes[1, 0]
approach_panel.hist(sample_features['velocity_towards_ball'].dropna(), bins=50, edgecolor='black')
approach_panel.set_xlabel('Velocity Towards Ball (yards/second)')
approach_panel.set_ylabel('Frequency')
approach_panel.set_title('Distribution of Velocity Towards Ball')
approach_panel.axvline(0, color='red', linestyle='--', alpha=0.5)

# Distance to ball against the number of frames to predict
length_panel = axes[1, 1]
length_density = length_panel.hexbin(sample_features['dist_to_ball'],
                                     sample_features['num_frames_output'],
                                     gridsize=20, cmap='YlOrRd')
length_panel.set_xlabel('Initial Distance to Ball (yards)')
length_panel.set_ylabel('Number of Frames to Predict')
length_panel.set_title('Distance vs Prediction Length')
plt.colorbar(length_density, ax=length_panel, label='Count')

plt.tight_layout()
plt.show()

## 8. Data Quality and Challenges
### 8.1 Data Quality Checks

In [None]:
# Row-level masks for each potential data quality problem
ball_location_missing = sample_input['ball_land_x'].isna() | sample_input['ball_land_y'].isna()
zero_speed = sample_input['s'] == 0
outside_x = (sample_input['x'] < 0) | (sample_input['x'] > 120)
outside_y = (sample_input['y'] < 0) | (sample_input['y'] > 53.3)
negative_frames = sample_input['num_frames_output'] < 0
attributes_missing = sample_input[['player_height', 'player_weight']].isna().any(axis=1)

# Number of flagged rows per check
quality_checks = {
    'Plays with missing ball location': ball_location_missing.sum(),
    'Players with zero speed': zero_speed.sum(),
    'Players outside field bounds (x)': outside_x.sum(),
    'Players outside field bounds (y)': outside_y.sum(),
    'Negative num_frames_output': negative_frames.sum(),
    'Missing player attributes': attributes_missing.sum()
}

quality_df = pd.DataFrame(list(quality_checks.items()), columns=['Check', 'Count'])
quality_df['Percentage'] = 100 * quality_df['Count'] / len(sample_input)

print("DATA QUALITY CHECKS:")
print(quality_df)

# Extremes of pass duration, pass distance and players per play
shortest_frames = sample_input['num_frames_output'].min()
longest_frames = sample_input['num_frames_output'].max()
players_per_play = sample_input.groupby(['game_id', 'play_id'])['nfl_id'].nunique()

print("EDGE CASES ANALYSIS:")
print("-" * 50)
print(f"Shortest pass duration: {shortest_frames} frames ({shortest_frames/10:.1f} seconds)")
print(f"Longest pass duration: {longest_frames} frames ({longest_frames/10:.1f} seconds)")
print(f"Minimum pass distance: {unique_plays['pass_distance'].min():.1f} yards")
print(f"Maximum pass distance: {unique_plays['pass_distance'].max():.1f} yards")
print(f"Players per play - Min: {players_per_play.min()}")
print(f"Players per play - Max: {players_per_play.max()}")

### 8.2 Modeling Challenges

In [None]:
# Identify modeling challenges
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Variability in number of frames to predict
frames_std = sample_input.groupby('player_role')['num_frames_output'].std()
axes[0].bar(frames_std.index, frames_std.values)
axes[0].set_xlabel('Player Role')
axes[0].set_ylabel('Std Dev of Frames to Predict')
axes[0].set_title('Prediction Length Variability by Role')
for tick in axes[0].get_xticklabels():
    tick.set_rotation(45)
    tick.set_ha('right')

# Class imbalance for player_to_predict
predict_counts = sample_input['player_to_predict'].value_counts()
# value_counts orders by frequency, so the label and colour of each slice are derived
# from the flag value itself instead of assuming the "not predicted" class comes first.
slice_style = {False: ('Not Predicted', 'lightgray'), True: ('Predicted', 'orange')}
slice_labels = [slice_style[bool(flag)][0] for flag in predict_counts.index]
slice_colors = [slice_style[bool(flag)][1] for flag in predict_counts.index]
axes[1].pie(predict_counts.values, labels=slice_labels,
           autopct='%1.1f%%', colors=slice_colors)
axes[1].set_title('Class Balance: Players to Predict')

plt.tight_layout()
plt.show()

## 9. Interactive Visualizations
### 9.1 Sample Play Visualization

In [None]:
# Create a static visualization of a sample play
def visualize_play_trajectory(input_df, output_df, game_id, play_id, max_players=10):
    """Plot the trajectories of the players of one play on a field diagram.

    Parameters
    ----------
    input_df : pd.DataFrame
        Input rows; read columns: game_id, play_id, nfl_id, x, y, ball_land_x,
        ball_land_y, player_role, player_side, player_position.
    output_df : pd.DataFrame
        Output rows; read columns: game_id, play_id, nfl_id, frame_id, x, y.
    game_id, play_id
        Keys identifying the play to draw.
    max_players : int, default 10
        Maximum number of trajectories to draw. Only players that have output
        rows count towards this limit.

    Returns
    -------
    matplotlib.figure.Figure or None
        The figure, or None when the play has no input rows.
    """

    # Filter for specific play
    play_input = input_df[(input_df['game_id'] == game_id) & (input_df['play_id'] == play_id)]
    play_output = output_df[(output_df['game_id'] == game_id) & (output_df['play_id'] == play_id)]

    if len(play_input) == 0:
        print(f"No data found for game_id={game_id}, play_id={play_id}")
        return None

    # Get ball landing position
    ball_x = play_input['ball_land_x'].iloc[0]
    ball_y = play_input['ball_land_y'].iloc[0]

    # Create figure
    fig, ax = plt.subplots(figsize=(14, 7))

    # Add field background
    field_rect = plt.Rectangle((0, 0), 120, 53.3,
                              linewidth=2, edgecolor='green',
                              facecolor='lightgreen', alpha=0.3)
    ax.add_patch(field_rect)

    # Add yard lines for reference
    for yard in range(10, 120, 10):
        ax.axvline(x=yard, color='white', alpha=0.3, linestyle='--', linewidth=0.5)

    # Add ball landing position
    ax.scatter(ball_x, ball_y, s=150, c='brown', marker='X',
              edgecolors='black', linewidth=2, label='Ball Landing', zorder=5)

    # Keep only players that have output rows BEFORE applying max_players, so the
    # limit is not used up by players for whom nothing can be drawn.
    candidate_ids = play_input['nfl_id'].drop_duplicates()
    drawable_ids = candidate_ids[candidate_ids.isin(play_output['nfl_id'])].iloc[:max_players]

    # Plot player trajectories
    for nfl_id in drawable_ids:
        player_in = play_input[play_input['nfl_id'] == nfl_id].iloc[0]
        player_out = play_output[play_output['nfl_id'] == nfl_id].sort_values('frame_id')

        # Trajectory = first input position followed by the output positions in frame order
        x_coords = [player_in['x']] + player_out['x'].tolist()
        y_coords = [player_in['y']] + player_out['y'].tolist()

        # Determine color based on role
        if player_in['player_role'] == 'Targeted Receiver':
            color = 'gold'
        elif player_in['player_side'] == 'Defense':
            color = 'red'
        else:
            color = 'blue'

        # Plot trajectory
        ax.plot(x_coords, y_coords, color=color, linewidth=2,
               marker='o', markersize=4, alpha=0.7,
               label=f"{player_in['player_position']} ({player_in['player_role']})")

        # Add starting position marker
        ax.scatter(player_in['x'], player_in['y'],
                  s=100, c=color, marker='o',
                  edgecolors='black', linewidth=1.5, zorder=4)

    # Set layout
    ax.set_xlim(0, 120)
    ax.set_ylim(0, 53.3)
    ax.set_xlabel('Field Position X (yards)', fontsize=12)
    ax.set_ylabel('Field Position Y (yards)', fontsize=12)
    ax.set_title(f'Player Trajectories - Game {game_id}, Play {play_id}', fontsize=14, fontweight='bold')
    ax.grid(True, alpha=0.2)
    ax.set_aspect('equal')

    # Add legend outside the plot area
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=9)

    plt.tight_layout()
    return fig

# Draw the play that the first row of the sample belongs to
sample_game_id, sample_play_id = (sample_input[key].iloc[0] for key in ('game_id', 'play_id'))
fig = visualize_play_trajectory(
    input_df=sample_input,
    output_df=sample_output,
    game_id=sample_game_id,
    play_id=sample_play_id,
)
# Nothing to show when the helper found no data and returned None
if fig:
    plt.show()

In [None]:
# Create an animated visualization of a play
def create_play_animation(input_df, output_df, game_id, play_id, max_players=10):
    """Create an animated visualization of player trajectories for a specific play.

    Parameters
    ----------
    input_df : pd.DataFrame
        Input rows; read columns: game_id, play_id, nfl_id, x, y, ball_land_x,
        ball_land_y, player_role, player_side.
    output_df : pd.DataFrame
        Output rows; read columns: game_id, play_id, nfl_id, frame_id, x, y.
    game_id, play_id
        Keys identifying the play to animate.
    max_players : int, default 10
        Maximum number of players (in order of first appearance in the input) to draw.

    Returns
    -------
    matplotlib.animation.FuncAnimation or None
        The animation (frame 0 = start positions, then one frame per output
        frame_id), or None when the play has no input rows or no output rows.
    """

    # Filter for specific play
    play_input = input_df[(input_df['game_id'] == game_id) & (input_df['play_id'] == play_id)]
    play_output = output_df[(output_df['game_id'] == game_id) & (output_df['play_id'] == play_id)]

    if len(play_input) == 0:
        print(f"No data found for game_id={game_id}, play_id={play_id}")
        return None
    if len(play_output) == 0:
        # Without output rows frame_id.max() is NaN and int() below would raise
        print(f"No output frames found for game_id={game_id}, play_id={play_id}")
        return None

    # Get ball landing position
    ball_x = play_input['ball_land_x'].iloc[0]
    ball_y = play_input['ball_land_y'].iloc[0]

    # Get max frames
    max_frame = int(play_output['frame_id'].max())

    # Prepare player data
    players_to_plot = play_input['nfl_id'].unique()[:max_players]

    # Create figure and axis
    fig, ax = plt.subplots(figsize=(12, 6))

    # Per-player data that does not change between frames is computed once here
    # instead of being re-filtered on every animation frame.
    player_starts = {}   # nfl_id -> (x, y) of the player's first input row
    player_tracks = {}   # nfl_id -> output rows sorted by frame_id
    player_styles = {}   # nfl_id -> marker style
    for nfl_id in players_to_plot:
        player_in = play_input[play_input['nfl_id'] == nfl_id].iloc[0]
        player_starts[nfl_id] = (player_in['x'], player_in['y'])
        player_tracks[nfl_id] = play_output[play_output['nfl_id'] == nfl_id].sort_values('frame_id')
        if player_in['player_role'] == 'Targeted Receiver':
            player_styles[nfl_id] = {'color': 'gold', 'size': 150, 'marker': 'o'}
        elif player_in['player_side'] == 'Defense':
            player_styles[nfl_id] = {'color': 'red', 'size': 100, 'marker': 's'}
        else:
            player_styles[nfl_id] = {'color': 'blue', 'size': 100, 'marker': 'o'}

    # Artists drawn for the current frame (markers, trails, frame counter). Tracking
    # them explicitly lets animate() remove exactly these and nothing else; slicing
    # ax.collections / ax.lines by position removed yard lines and left stale markers.
    frame_artists = []

    def init():
        """Draw the static background: field, yard lines and ball landing spot."""
        ax.clear()
        # ax.clear() already dropped every artist, so forget the stale references
        frame_artists.clear()

        # Add field background
        field_rect = patches.Rectangle((0, 0), 120, 53.3,
                                      linewidth=2, edgecolor='green',
                                      facecolor='lightgreen', alpha=0.3)
        ax.add_patch(field_rect)

        # Add yard lines
        for yard in range(10, 120, 10):
            ax.axvline(x=yard, color='white', alpha=0.3, linestyle='--', linewidth=0.5)

        # Add ball landing position
        ax.scatter(ball_x, ball_y, s=200, c='brown', marker='X',
                  edgecolors='black', linewidth=2, zorder=5, alpha=0.7)

        # Set limits and labels
        ax.set_xlim(-2, 122)
        ax.set_ylim(-2, 55.3)
        ax.set_xlabel('Field Position X (yards)', fontsize=12)
        ax.set_ylabel('Field Position Y (yards)', fontsize=12)
        ax.grid(True, alpha=0.2)
        ax.set_aspect('equal')

        return []

    def animate(frame):
        """Redraw player markers, trails and the frame counter for one frame."""
        # Remove everything drawn for the previous frame; the background stays
        for artist in frame_artists:
            artist.remove()
        frame_artists.clear()

        # Update title with time
        time_seconds = frame / 10.0  # 10 frames per second
        ax.set_title(f'Player Movement - Game {game_id}, Play {play_id} | Time: {time_seconds:.1f}s',
                    fontsize=14, fontweight='bold')

        # Plot each player
        for nfl_id in players_to_plot:
            start_x, start_y = player_starts[nfl_id]
            style = player_styles[nfl_id]

            # Default: the player sits at the start position with no trail
            # (frame 0, or a player with no output rows up to this frame)
            x, y = start_x, start_y
            trail_x, trail_y = [start_x], [start_y]

            if frame > 0:
                track = player_tracks[nfl_id]
                current_frames = track[track['frame_id'] <= frame]
                if len(current_frames) > 0:
                    # Current position = latest output row up to this frame
                    x = current_frames['x'].iloc[-1]
                    y = current_frames['y'].iloc[-1]
                    # Trail = start position followed by all positions so far
                    trail_x = [start_x] + current_frames['x'].tolist()
                    trail_y = [start_y] + current_frames['y'].tolist()

            # Draw trail
            if len(trail_x) > 1:
                frame_artists.extend(
                    ax.plot(trail_x, trail_y, color=style['color'],
                            linewidth=1.5, alpha=0.4, zorder=2)
                )

            # Draw current position
            frame_artists.append(
                ax.scatter(x, y, s=style['size'], c=style['color'],
                           marker=style['marker'], edgecolors='black',
                           linewidth=1.5, zorder=4, alpha=0.9)
            )

        # Add frame counter (tracked too, so counters do not pile up across frames)
        frame_artists.append(
            ax.text(0.02, 0.98, f'Frame: {frame}/{max_frame}',
                    transform=ax.transAxes, fontsize=10,
                    verticalalignment='top',
                    bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))
        )

        return list(frame_artists)

    # Create animation
    anim = FuncAnimation(fig, animate, init_func=init,
                        frames=range(0, max_frame + 1),
                        interval=100,  # 100ms between frames (10 fps)
                        blit=False, repeat=True)

    plt.close(fig)  # Prevent static display
    return anim

# Animate the play that the first row of the sample belongs to
sample_game_id, sample_play_id = (sample_input[key].iloc[0] for key in ('game_id', 'play_id'))

print(f"Creating animation for Game {sample_game_id}, Play {sample_play_id}")
print("Animation will play below...")

anim = create_play_animation(
    sample_input,
    sample_output,
    sample_game_id,
    sample_play_id,
    max_players=15,
)

# Render the animation as an HTML/JavaScript player in the cell output;
# skipped when the helper returned None
if anim:
    animation_html = anim.to_jshtml()
    display(HTML(animation_html))

## 10. Summary and Key Insights

In [None]:
banner = "=" * 60
print(banner)
print("NFL BIG DATA BOWL 2026 - EDA SUMMARY")
print(banner)

# Reused intermediates for the per-play figures
plays_grouped = sample_input.groupby(['game_id', 'play_id'])
players_per_play = plays_grouped['nfl_id'].nunique()
targeted_share = (sample_input['player_role'] == 'Targeted Receiver').mean()

# Headline numbers of the sample, in display order
summary_stats = {
    'Total Input Records': len(sample_input),
    'Unique Games': sample_input['game_id'].nunique(),
    'Unique Plays': len(plays_grouped),
    'Unique Players': sample_input['nfl_id'].nunique(),
    'Average Players per Play': players_per_play.mean(),
    'Average Ball Flight Time': f"{sample_input['num_frames_output'].mean()/10:.2f} seconds",
    'Average Pass Distance': f"{unique_plays['pass_distance'].mean():.1f} yards",
    'Percentage of Targeted Receivers': f"{100 * targeted_share:.1f}%",
    'Percentage of Players to Predict': f"{100 * sample_input['player_to_predict'].mean():.1f}%"
}

for stat_name in summary_stats:
    print(f"{stat_name}: {summary_stats[stat_name]}")

## 11. Baseline Model

In [None]:
# NFL Big Data Bowl 2026 - Complete Pipeline (Fixed Version)
# Handles the subset of plays that have output labels

import os
import warnings
import numpy as np
import pandas as pd
import polars as pl
from tqdm.auto import tqdm
import pickle
import json
import gc
from datetime import datetime
from pathlib import Path
import glob
from sklearn.model_selection import KFold, GroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

warnings.filterwarnings('ignore')

# ========== CONFIGURATION ==========
class Config:
    # Central settings for the tree-model pipeline: paths, CV options and
    # per-library hyper-parameters. All attributes are class-level constants.

    # ---- Paths ----
    DATA_DIR = "/kaggle/input/nfl-big-data-bowl-2026-prediction/"
    OUTPUT_BASE = "/kaggle/working/"
    OUTPUT_DIR = os.path.join(OUTPUT_BASE, "nfl2026-preprocessed")
    MODEL_DIR = os.path.join(OUTPUT_DIR, "tree-models")
    PROCESSED_DIR = os.path.join(OUTPUT_DIR, "processed-data-trees")

    # ---- Cross-validation / training control ----
    SEED = 42
    N_FOLDS = 5
    USE_GROUP_KFOLD = True
    EARLY_STOPPING_ROUNDS = 100
    ALGORITHMS = ['lightgbm', 'xgboost', 'catboost']

    # ---- LightGBM ----
    LGBM_PARAMS = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'n_estimators': 2000,
        'learning_rate': 0.01,
        'num_leaves': 31,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 1,
        'min_data_in_leaf': 20,
        'lambda_l1': 1.0,
        'lambda_l2': 1.0,
        'verbose': -1,
        'n_jobs': -1,
        'random_state': SEED,
    }

    # ---- XGBoost ----
    XGB_PARAMS = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'n_estimators': 2000,
        'learning_rate': 0.01,
        'max_depth': 6,
        'colsample_bytree': 0.8,
        'subsample': 0.8,
        'gamma': 0.1,
        'alpha': 1.0,
        'reg_lambda': 1.0,
        'n_jobs': -1,
        'random_state': SEED,
        'tree_method': 'hist',
        'verbosity': 0,
    }

    # ---- CatBoost ----
    CAT_PARAMS = {
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE',
        'iterations': 2000,
        'learning_rate': 0.01,
        'depth': 6,
        'l2_leaf_reg': 3.0,
        'subsample': 0.8,
        'colsample_bylevel': 0.8,
        'random_seed': SEED,
        'verbose': False,
        'thread_count': -1,
    }

# ========== UTILITY FUNCTIONS ==========
def set_seed(seed=42):
    """Seed NumPy's and Python's global RNGs and export PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to ``np.random``, ``random`` and written
            (as a string) to the ``PYTHONHASHSEED`` environment variable.
    """
    import random  # kept local, as in the original

    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (np.random.seed, random.seed):
        seeder(seed)

def create_directories():
    """Create every output folder the pipeline writes to.

    Makes ``Config.OUTPUT_DIR``, ``Config.MODEL_DIR`` and
    ``Config.PROCESSED_DIR``, then one ``<MODEL_DIR>/<algo>/fold_<k>`` folder
    for each algorithm in ``Config.ALGORITHMS`` and each fold index below
    ``Config.N_FOLDS``. Existing folders are left untouched.
    """
    base_dirs = (Config.OUTPUT_DIR, Config.MODEL_DIR, Config.PROCESSED_DIR)
    for base in base_dirs:
        Path(base).mkdir(parents=True, exist_ok=True)

    model_root = Path(Config.MODEL_DIR)
    fold_names = [f"fold_{fold}" for fold in range(Config.N_FOLDS)]
    for algo in Config.ALGORITHMS:
        for fold_name in fold_names:
            (model_root / algo / fold_name).mkdir(parents=True, exist_ok=True)

# ========== FEATURE ENGINEERING ==========
def add_temporal_features(df):
    """Append velocity and momentum component columns to a polars frame.

    Adds ``speed_x = s * sin(dir)`` and ``speed_y = s * cos(dir)`` (``dir``
    converted from degrees to radians), then ``momentum_x`` / ``momentum_y`` as
    those speeds times a mass term: ``player_weight / 2.20462`` with null
    weights replaced by 200 (presumably a pounds-to-kilograms conversion), or
    the constant 90.7 when there is no ``player_weight`` column.

    Args:
        df: polars DataFrame; must contain ``s`` and ``dir`` for anything to
            be added.

    Returns:
        The frame with the four extra columns, or ``df`` unchanged when ``s``
        or ``dir`` is missing.
    """
    existing = df.columns
    if not ('dir' in existing and 's' in existing):
        return df

    heading_rad = pl.col("dir") * np.pi / 180
    speed = pl.col("s")
    with_velocity = df.with_columns([
        (speed * heading_rad.sin()).alias("speed_x"),
        (speed * heading_rad.cos()).alias("speed_y"),
    ])

    # Mass term: per-player weight when available, otherwise a fixed default
    if 'player_weight' in with_velocity.columns:
        mass = pl.col("player_weight").fill_null(200) / 2.20462
    else:
        mass = 90.7

    return with_velocity.with_columns([
        (pl.col("speed_x") * mass).alias("momentum_x"),
        (pl.col("speed_y") * mass).alias("momentum_y"),
    ])

def get_target_receiver_info(df):
    """Summarise the targeted receiver of each play.

    Keeps rows whose ``player_role`` equals ``"Targeted Receiver"`` and, per
    ``(game_id, play_id)``, returns the last ``x``, ``y`` and ``s`` values plus
    the first ``player_position`` (in the frame's row order).

    Args:
        df: polars DataFrame of tracking rows.

    Returns:
        polars DataFrame with columns ``game_id``, ``play_id``,
        ``target_x_last``, ``target_y_last``, ``target_speed_last`` and
        ``target_position``; an empty ``pl.DataFrame()`` when there is no
        ``player_role`` column or no targeted-receiver rows.
    """
    no_info = pl.DataFrame()
    if 'player_role' not in df.columns:
        return no_info

    receivers = df.filter(pl.col("player_role") == "Targeted Receiver")
    if receivers.height == 0:
        return no_info

    last_value_names = {
        "x": "target_x_last",
        "y": "target_y_last",
        "s": "target_speed_last",
    }
    aggregations = [pl.col(src).last().alias(dst) for src, dst in last_value_names.items()]
    aggregations.append(pl.col("player_position").first().alias("target_position"))
    return receivers.group_by(["game_id", "play_id"]).agg(aggregations)

def calculate_temporal_aggregates(df_input):
    """Collapse each player's input frames into one row of summary statistics.

    Runs ``add_temporal_features`` on a clone of the input, then groups by
    ``(game_id, play_id, nfl_id)`` and computes, for every tracked signal that
    is present: mean, std, min, max, 25%/75% quantiles, skew, kurtosis (nulls
    replaced by 0 for the last two), last value and range. The number of input
    frames is returned as ``num_input_frames``.

    Args:
        df_input: polars DataFrame of per-frame tracking rows.

    Returns:
        polars DataFrame with one row per ``(game_id, play_id, nfl_id)``.
    """
    df_temporal = add_temporal_features(df_input.clone())

    candidate_signals = ("x", "y", "s", "a", "o", "dir",
                         "speed_x", "speed_y", "momentum_x", "momentum_y")
    present_signals = [name for name in candidate_signals if name in df_temporal.columns]

    def _stat_exprs(name):
        # The ten statistics for one signal, in the column order the pipeline expects
        col = pl.col(name)
        return [
            col.mean().alias(f"{name}_mean"),
            col.std().alias(f"{name}_std"),
            col.min().alias(f"{name}_min"),
            col.max().alias(f"{name}_max"),
            col.quantile(0.25).alias(f"{name}_q25"),
            col.quantile(0.75).alias(f"{name}_q75"),
            col.skew().fill_null(0).alias(f"{name}_skew"),
            col.kurtosis().fill_null(0).alias(f"{name}_kurt"),
            col.last().alias(f"{name}_last"),
            (col.max() - col.min()).alias(f"{name}_range"),
        ]

    agg_exprs = [pl.col("frame_id").count().alias("num_input_frames")]
    for name in present_signals:
        agg_exprs += _stat_exprs(name)

    player_keys = ["game_id", "play_id", "nfl_id"]
    return df_temporal.group_by(player_keys).agg(agg_exprs)

def encode_categoricals(df, cat_cols, fit=True, encoders=None):
    """Label-encode categorical columns, adding one ``<col>_encoded`` column each.

    Nulls are replaced by the string ``"unknown"`` and values are cast to
    strings before encoding. Columns listed in ``cat_cols`` but absent from
    ``df`` are skipped.

    Args:
        df: polars DataFrame holding the categorical columns.
        cat_cols: Names of the columns to encode.
        fit: When True, fit a fresh ``LabelEncoder`` per column. When False,
            reuse ``encoders`` and map values unseen at fit time to ``-1``.
        encoders: Dict ``{column: fitted LabelEncoder}``; required when
            ``fit`` is False and ignored (replaced) when ``fit`` is True.

    Returns:
        Tuple ``(df_encoded, encoders)``: a copy of ``df`` with the encoded
        columns appended, and the dict of encoders used.
    """
    df_encoded = df.clone()
    if fit:
        encoders = {}
    for col in cat_cols:
        if col not in df.columns:
            continue
        col_series = df_encoded.select(pl.col(col).fill_null("unknown").cast(pl.Utf8)).to_pandas()[col]
        if fit:
            le = LabelEncoder()
            encoded = le.fit_transform(col_series)
            encoders[col] = le
        else:
            le = encoders[col]
            # One dict lookup per value instead of a LabelEncoder.transform call
            # (plus a linear scan of classes_) per row; unseen labels become -1.
            code_by_label = {label: code for code, label in enumerate(le.classes_)}
            encoded = col_series.map(code_by_label).fillna(-1).astype("int64").to_numpy()
        df_encoded = df_encoded.with_columns(pl.Series(f"{col}_encoded", encoded))
    return df_encoded, encoders

# ========== TRAINING PIPELINE ==========
print("="*60)
print(" NFL BIG DATA BOWL 2026 - COMPLETE PIPELINE")
print("="*60)
print("Phase 1: Training Models")
print("-"*60)

set_seed(Config.SEED)
create_directories()

# Load training data: one input/output CSV pair per week (weeks 1-18 of 2023).
# Weeks for which either file is missing are skipped.
print("Loading training data...")
all_inputs, all_outputs = [], []
train_dir = os.path.join(Config.DATA_DIR, "train")

for week in range(1, 19):
    input_file = os.path.join(train_dir, f"input_2023_w{week:02d}.csv")
    output_file = os.path.join(train_dir, f"output_2023_w{week:02d}.csv")
    if os.path.exists(input_file) and os.path.exists(output_file):
        all_inputs.append(pl.read_csv(input_file))
        all_outputs.append(pd.read_csv(output_file))

# Fail with a clear message instead of an opaque concat error on an empty list
if not all_inputs:
    raise FileNotFoundError(f"No weekly input/output CSV pairs found in {train_dir}")

train_input = pl.concat(all_inputs)
train_output = pd.concat(all_outputs, ignore_index=True)
print(f"  Input shape: {train_input.shape}, Output shape: {train_output.shape}")

# IMPORTANT: Filter input data to only include plays that have outputs
print("Filtering to plays with labels...")
output_plays = train_output[['game_id', 'play_id', 'nfl_id']].drop_duplicates()
# Build "game_play_player" keys column-wise: faster than a row-wise apply, and each
# column keeps its own dtype when converted to text (a row-wise apply would upcast
# integer ids to floats if any key column were float).
output_plays_set = set(
    output_plays['game_id'].astype(str) + "_" +
    output_plays['play_id'].astype(str) + "_" +
    output_plays['nfl_id'].astype(str)
)

# Keep only input rows whose key appears in the label set; filter() preserves row
# order, which the later first()/last() aggregations rely on
train_input = train_input.filter(
    pl.concat_str([
        pl.col("game_id").cast(pl.Utf8),
        pl.lit("_"),
        pl.col("play_id").cast(pl.Utf8),
        pl.lit("_"),
        pl.col("nfl_id").cast(pl.Utf8)
    ]).is_in(list(output_plays_set))
)
print(f"  Filtered input shape: {train_input.shape}")

# Feature engineering: one row per (game_id, play_id, nfl_id)
print("Feature engineering...")
temporal_agg = calculate_temporal_aggregates(train_input)

# Per-player constants: take the first row of each group
const_cols = ["game_id", "play_id", "nfl_id", "num_frames_output",
              "ball_land_x", "ball_land_y", "absolute_yardline_number", "player_weight"]
const_cols = [c for c in const_cols if c in train_input.columns]
constant_features = train_input.group_by(["game_id", "play_id", "nfl_id"]).first().select(const_cols)

df_temp = add_temporal_features(train_input.clone())
target_info = get_target_receiver_info(df_temp)

cat_cols = ["player_position", "player_role", "player_side", "play_direction"]
cat_cols = [c for c in cat_cols if c in train_input.columns]
# Defined up front so the merge below (and later cells) never hit a NameError
# when none of the categorical columns exist in the input
cat_encoded, encoders = None, {}
if cat_cols:
    cat_df = train_input.group_by(["game_id", "play_id", "nfl_id"]).first()
    cat_df = cat_df.select(["game_id", "play_id", "nfl_id"] + cat_cols)
    cat_encoded, encoders = encode_categoricals(cat_df, cat_cols)
    # Persist the fitted encoders so inference can reuse the same codes
    with open(os.path.join(Config.PROCESSED_DIR, "categorical_encoders.pkl"), 'wb') as f:
        pickle.dump(encoders, f)

# Merge features
features = temporal_agg
features = features.join(constant_features, on=["game_id", "play_id", "nfl_id"], how="left")
if cat_encoded is not None:
    features = features.join(cat_encoded, on=["game_id", "play_id", "nfl_id"], how="left")
if not target_info.is_empty():
    features = features.join(target_info, on=["game_id", "play_id"], how="left")

# Add relative features: Euclidean distance and arctan2 angle from the player's last
# input position to (a) the ball landing point and (b) the targeted receiver's last
# position. Each pair is added only when all four source columns exist.
relative_specs = [
    ("ball_land_x", "ball_land_y", "distance_to_ball_landing", "angle_to_ball_landing"),
    ("target_x_last", "target_y_last", "distance_to_target", "angle_to_target"),
]
for ref_x, ref_y, distance_name, angle_name in relative_specs:
    if all(c in features.columns for c in [ref_x, ref_y, 'x_last', 'y_last']):
        offset_x = pl.col(ref_x) - pl.col("x_last")
        offset_y = pl.col(ref_y) - pl.col("y_last")
        features = features.with_columns([
            (offset_x.pow(2) + offset_y.pow(2)).sqrt().alias(distance_name),
            pl.arctan2(offset_y, offset_x).alias(angle_name)
        ])

features_df = features.to_pandas()

# Labels come one row per output frame; rename the coordinates to targets and
# expose frame_id as frame_offset
print("Processing labels...")
train_output_renamed = (
    train_output
    .rename(columns={'x': 'target_x', 'y': 'target_y'})
    .assign(frame_offset=lambda labels: labels['frame_id'])
)

# Attach the per-player features to every label row (inner join drops label rows
# without features)
print("Merging features with labels...")
print(f"  Features shape before merge: {features_df.shape}")
print(f"  Labels shape before merge: {train_output_renamed.shape}")

player_keys = ['game_id', 'play_id', 'nfl_id']
final_df = train_output_renamed.merge(features_df, how='inner', on=player_keys)
print(f"  Merged shape: {final_df.shape}")

# Time features: T is the output frame count (at least 1), or 5.0 when unknown
has_frame_count = 'num_frames_output' in final_df.columns
final_df['T'] = final_df['num_frames_output'].clip(lower=1.0) if has_frame_count else 5.0
final_df['time_offset'] = final_df['frame_offset'] / 10.0
final_df['t_rel'] = final_df['frame_offset'] / final_df['T']

# Regression targets: displacement from the last observed input position
if 'x_last' not in final_df.columns:
    print("WARNING: x_last not found in features!")
    final_df['dx'] = 0
    final_df['dy'] = 0
else:
    final_df['dx'] = final_df['target_x'] - final_df['x_last']
    final_df['dy'] = final_df['target_y'] - final_df['y_last']

# Rows without a usable target cannot be trained on
final_df = final_df.dropna(subset=['dx', 'dy'])
print(f"  Final training data shape: {final_df.shape}")

# Only proceed if we have training data
if len(final_df) > 0:
    print("\nTraining models...")
    # Identifiers, targets and raw categorical strings are never model inputs
    exclude = ['game_id', 'play_id', 'nfl_id', 'frame_id', 'frame_offset',
               'dx', 'dy', 'target_x', 'target_y',
               'player_position', 'player_role', 'player_side', 'play_direction']

    # NOTE(review): only these four dtypes are accepted, so unsigned-integer columns
    # (polars count() results are typically UInt32, e.g. num_input_frames) would be
    # skipped here - confirm whether that is intended.
    feature_cols = [c for c in final_df.columns if c not in exclude and 
                   final_df[c].dtype in ['float64', 'float32', 'int64', 'int32']]

    print(f"  Using {len(feature_cols)} features")

    # Save feature lists
    feature_lists = {'all_features': feature_cols}
    with open(os.path.join(Config.PROCESSED_DIR, "feature_lists.pkl"), 'wb') as f:
        pickle.dump(feature_lists, f)

    X = final_df[feature_cols].values
    y = final_df[['dx', 'dy']]

    # Cross-validation: group by game so frames of one game never span train and validation
    if 'game_id' in final_df.columns and Config.USE_GROUP_KFOLD:
        groups = final_df['game_id'].values
        cv = GroupKFold(n_splits=Config.N_FOLDS)
        splits = list(cv.split(X, y, groups))
    else:
        cv = KFold(n_splits=Config.N_FOLDS, shuffle=True, random_state=Config.SEED)
        splits = list(cv.split(X, y))

    for fold, (train_idx, val_idx) in enumerate(splits):
        print(f"  Fold {fold+1}/{Config.N_FOLDS}")
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        # One model per (algorithm, target); each is early-stopped on the validation fold
        for algo in Config.ALGORITHMS:
            for target in ['dx', 'dy']:
                y_fit = y_train[target].values
                y_eval = y_val[target].values
                if algo == 'lightgbm':
                    model = lgb.LGBMRegressor(**Config.LGBM_PARAMS)
                    model.fit(X_train, y_fit,
                             eval_set=[(X_val, y_eval)],
                             callbacks=[lgb.early_stopping(Config.EARLY_STOPPING_ROUNDS, verbose=False)])
                elif algo == 'xgboost':
                    # early_stopping_rounds belongs on the estimator: passing it to fit()
                    # was deprecated in XGBoost 1.6 and is rejected by newer releases.
                    xgb_params = {**Config.XGB_PARAMS,
                                  'early_stopping_rounds': Config.EARLY_STOPPING_ROUNDS}
                    model = XGBRegressor(**xgb_params)
                    model.fit(X_train, y_fit,
                             eval_set=[(X_val, y_eval)],
                             verbose=False)
                elif algo == 'catboost':
                    model = CatBoostRegressor(**Config.CAT_PARAMS)
                    model.fit(X_train, y_fit,
                             eval_set=[(X_val, y_eval)],
                             early_stopping_rounds=Config.EARLY_STOPPING_ROUNDS,
                             use_best_model=True, verbose=False)
                else:
                    # Without this, an unknown name would silently re-save the previous model
                    raise ValueError(f"Unsupported algorithm in Config.ALGORITHMS: {algo}")
                
                path = os.path.join(Config.MODEL_DIR, algo, f"fold_{fold}", f"model_{target}.pkl")
                with open(path, 'wb') as f:
                    pickle.dump(model, f)
        gc.collect()

    # Save metadata
    metadata = {'n_folds': Config.N_FOLDS, 'n_features': len(feature_cols)}
    with open(os.path.join(Config.MODEL_DIR, "training_metadata.json"), 'w') as f:
        json.dump(metadata, f, indent=2)

    print("Training complete!")

    # ========== INFERENCE PIPELINE ==========
    print("\nPhase 2: Generating Predictions")
    print("-"*60)

    # Process test data
    test_input_path = os.path.join(Config.DATA_DIR, "test_input.csv")
    test_template_path = os.path.join(Config.DATA_DIR, "test.csv")

    df_test_in = pl.read_csv(test_input_path)
    test_template = pd.read_csv(test_template_path)
    print(f"  Test input: {df_test_in.shape}, Template: {test_template.shape}")

    # Apply same processing as training
    constant_from_input = df_test_in.group_by(["game_id", "play_id", "nfl_id"]).first().select([
        "game_id", "play_id", "nfl_id", "num_frames_output", "ball_land_x", "ball_land_y", 
        "absolute_yardline_number", "player_weight"
    ])

    # The template has one row per frame to predict; attach the per-player constants
    df_test_template = pl.from_pandas(test_template)
    df_test_template = df_test_template.join(
        constant_from_input.select(["game_id", "play_id", "nfl_id", "num_frames_output", "player_weight"]), 
        on=["game_id", "play_id", "nfl_id"], how="left"
    )

    # Same time features as in training (T, time_offset, t_rel)
    df_test_template = df_test_template.with_columns([
        pl.col("frame_id").cast(pl.Float64).alias("frame_offset"),
        pl.col("num_frames_output").cast(pl.Float64).clip(1.0, None).alias("T")
    ])
    df_test_template = df_test_template.with_columns([
        (pl.col("frame_offset") / 10.0).alias("time_offset"),
        (pl.col("frame_offset") / pl.col("T")).alias("t_rel")
    ])

    temporal_agg_test = calculate_temporal_aggregates(df_test_in)
    constant_features_test = constant_from_input.select([
        "game_id", "play_id", "nfl_id", "ball_land_x", "ball_land_y", "absolute_yardline_number"
    ])

    df_temporal_for_target_test = add_temporal_features(df_test_in.clone())
    target_receiver_info_test = get_target_receiver_info(df_temporal_for_target_test)

    categorical_df_test = df_test_in.group_by(["game_id", "play_id", "nfl_id"]).first()
    # Stays None when there are no categorical columns, so the join below is skipped
    # instead of raising NameError
    categorical_encoded_test = None
    if cat_cols:
        categorical_df_test = categorical_df_test.select(["game_id", "play_id", "nfl_id"] + cat_cols)
        categorical_encoded_test, _ = encode_categoricals(categorical_df_test, cat_cols, fit=False, encoders=encoders)

    test_processed = df_test_template
    test_processed = test_processed.join(temporal_agg_test, on=["game_id", "play_id", "nfl_id"], how="left")
    test_processed = test_processed.join(constant_features_test, on=["game_id", "play_id", "nfl_id"], how="left") 
    if categorical_encoded_test is not None:
        test_processed = test_processed.join(categorical_encoded_test, on=["game_id", "play_id", "nfl_id"], how="left")
    if not target_receiver_info_test.is_empty():
        test_processed = test_processed.join(target_receiver_info_test, on=["game_id", "play_id"], how="left")

    # Add relative features
    if all(col in test_processed.columns for col in ['ball_land_x', 'ball_land_y', 'x_last', 'y_last']):
        test_processed = test_processed.with_columns([
            ((pl.col("ball_land_x") - pl.col("x_last")).pow(2) + 
             (pl.col("ball_land_y") - pl.col("y_last")).pow(2)).sqrt().alias("distance_to_ball_landing"),
            pl.arctan2(pl.col("ball_land_y") - pl.col("y_last"), 
                      pl.col("ball_land_x") - pl.col("x_last")).alias("angle_to_ball_landing")
        ])
    if all(col in test_processed.columns for col in ['target_x_last', 'target_y_last', 'x_last', 'y_last']):
        test_processed = test_processed.with_columns([
            ((pl.col("target_x_last") - pl.col("x_last")).pow(2) + 
             (pl.col("target_y_last") - pl.col("y_last")).pow(2)).sqrt().alias("distance_to_target"),
            pl.arctan2(pl.col("target_y_last") - pl.col("y_last"), 
                      pl.col("target_x_last") - pl.col("x_last")).alias("angle_to_target")
        ])

    test_df = test_processed.to_pandas()

    # Make predictions
    # The test matrix must have exactly the training columns in the training order.
    # Dropping absent columns would shrink or misalign the matrix, so they are
    # reported and filled with NaN instead.
    available_features = [feat for feat in feature_cols if feat in test_df.columns]
    missing_features = [feat for feat in feature_cols if feat not in test_df.columns]
    if missing_features:
        print(f"  WARNING: {len(missing_features)} training features missing from test data "
              f"(filled with NaN): {missing_features}")
    # Plain float array, matching the numpy array the models were fitted on
    X_test = test_df.reindex(columns=feature_cols).to_numpy(dtype=np.float64)

    # Average the fold models of each algorithm
    # NOTE: pickle.load is only safe here because these files were written by this same run
    predictions = {}
    for algo in Config.ALGORITHMS:
        pred_dx_folds, pred_dy_folds = [], []
        for fold in range(Config.N_FOLDS):
            with open(os.path.join(Config.MODEL_DIR, algo, f"fold_{fold}", "model_dx.pkl"), 'rb') as f:
                model_dx = pickle.load(f)
            with open(os.path.join(Config.MODEL_DIR, algo, f"fold_{fold}", "model_dy.pkl"), 'rb') as f:
                model_dy = pickle.load(f)
            pred_dx_folds.append(model_dx.predict(X_test))
            pred_dy_folds.append(model_dy.predict(X_test))
        predictions[algo] = {
            'dx': np.mean(pred_dx_folds, axis=0),
            'dy': np.mean(pred_dy_folds, axis=0)
        }

    # Ensemble predictions: unweighted mean across algorithms
    algorithms_to_use = ['lightgbm', 'xgboost', 'catboost']  # Use all three
    ensemble_dx = np.mean([predictions[algo]['dx'] for algo in algorithms_to_use], axis=0)
    ensemble_dy = np.mean([predictions[algo]['dy'] for algo in algorithms_to_use], axis=0)

    # Create submission: predicted position = last observed position + predicted displacement
    test_df['x'] = test_df['x_last'] + ensemble_dx
    test_df['y'] = test_df['y_last'] + ensemble_dy
    test_df['id'] = (test_df['game_id'].astype(str) + "_" +
                    test_df['play_id'].astype(str) + "_" +
                    test_df['nfl_id'].astype(str) + "_" +
                    test_df['frame_id'].astype(str))

    submission = test_df[['id', 'x', 'y']]
    submission.to_csv("/kaggle/working/submission.csv", index=False)

    print(f"✓ Submission saved: /kaggle/working/submission.csv")
    print(f"✓ Shape: {submission.shape}")
    print(f"\nSample predictions:")
    print(submission.head())
    print("\n✅ Ready to submit to Kaggle!")
else:
    print("ERROR: No training data after processing!")