In [9]:
# Cell 1: Imports and Setup
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from scipy.stats import entropy
import warnings
warnings.filterwarnings('ignore')

# Set style for all plots
# NOTE: these calls are order-dependent global configuration; each one may
# override settings made by the call before it, so keep the sequence as is.
plt.style.use('seaborn-v0_8')
sns.set_theme()
sns.set_palette("husl")
# Notebook-wide default figure size (matplotlib rcParam).
plt.rcParams['figure.figsize'] = [12, 8]

In [10]:
# Cell 2: Data Loading and Initial Exploration
# Get path to data files
# The CSVs are looked up in the PARENT of the current working directory;
# joining with '' only appends a trailing path separator to that folder.
# Because this is derived from os.getcwd(), it changes if the kernel is
# started from (or chdir'd to) a different folder.
data_dir = os.path.join(os.path.dirname(os.getcwd()), '')

def load_all_tracking_data(weeks=range(1, 9), directory=None):
    """Load the weekly tracking CSVs and stack them into one DataFrame.

    Files are expected to be named ``tracking_week_<week>.csv``.

    Parameters
    ----------
    weeks : iterable of int
        Week numbers to load. Defaults to weeks 1 through 8.
    directory : str or path-like, optional
        Folder containing the weekly files. When omitted, the notebook-level
        ``data_dir`` is used, which matches the previous behavior.

    Returns
    -------
    pandas.DataFrame
        All available weeks concatenated in the order given by ``weeks``,
        with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If none of the requested weekly files exist. (Previously this surfaced
        as pandas' generic "No objects to concatenate" ValueError.)
    """
    base_dir = data_dir if directory is None else directory

    weekly_frames = []
    missing_paths = []
    for week in weeks:
        file_path = os.path.join(base_dir, f'tracking_week_{week}.csv')
        if os.path.exists(file_path):
            weekly_frames.append(pd.read_csv(file_path))
        else:
            missing_paths.append(file_path)

    if not weekly_frames:
        raise ValueError(
            f"No tracking files found in {str(base_dir)!r}; looked for: {missing_paths}"
        )

    # Missing weeks are still tolerated, but say so: a silently partial load
    # would skew every downstream statistic. (print, not warnings.warn, because
    # the notebook globally silences warnings.)
    if missing_paths:
        print(f"Skipped {len(missing_paths)} missing tracking file(s): {missing_paths}")

    return pd.concat(weekly_frames, ignore_index=True)

# Tracking frames come from the weekly files; the three lookup tables are
# single CSVs that live in the same folder.
tracking_df = load_all_tracking_data()
plays_df, games_df, players_df = [
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in ('plays.csv', 'games.csv', 'players.csv')
]

# Echo the schemas so later cells can be checked against the real column names
print("Tracking Data Columns:", list(tracking_df.columns))
print("\nPlays Data Columns:", list(plays_df.columns))

Tracking Data Columns: ['gameId', 'playId', 'nflId', 'displayName', 'frameId', 'frameType', 'time', 'jerseyNumber', 'club', 'playDirection', 'x', 'y', 's', 'a', 'dis', 'o', 'dir', 'event']

Plays Data Columns: ['gameId', 'playId', 'playDescription', 'quarter', 'down', 'yardsToGo', 'possessionTeam', 'defensiveTeam', 'yardlineSide', 'yardlineNumber', 'gameClock', 'preSnapHomeScore', 'preSnapVisitorScore', 'playNullifiedByPenalty', 'absoluteYardlineNumber', 'preSnapHomeTeamWinProbability', 'preSnapVisitorTeamWinProbability', 'expectedPoints', 'offenseFormation', 'receiverAlignment', 'playClockAtSnap', 'passResult', 'passLength', 'targetX', 'targetY', 'playAction', 'dropbackType', 'dropbackDistance', 'passLocationType', 'timeToThrow', 'timeInTackleBox', 'timeToSack', 'passTippedAtLine', 'unblockedPressure', 'qbSpike', 'qbKneel', 'qbSneak', 'rushLocationType', 'penaltyYards', 'prePenaltyYardsGained', 'yardsGained', 'homeTeamWinProbabilityAdded', 'visitorTeamWinProbilityAdded', 'expectedPoin

In [12]:
# Cell 3: Data Preprocessing
# First, let's examine our data structure
# (Same two schema printouts as the data-loading cell, repeated so the column
# names used by the preprocessing below are visible in this cell's output.)
print("Tracking Data Columns:", tracking_df.columns.tolist())
print("\nPlays Data Columns:", plays_df.columns.tolist())

def get_pre_snap_data(tracking_df, plays_df, pre_snap_window=10):
    """Return defensive-player tracking rows from the frames just before the snap.

    Parameters
    ----------
    tracking_df : pandas.DataFrame
        Tracking rows; must provide ``gameId``, ``playId``, ``club``,
        ``frameId`` and ``frameType``.
    plays_df : pandas.DataFrame
        Play-level rows; must provide ``gameId``, ``playId``,
        ``possessionTeam`` and ``defensiveTeam``.
    pre_snap_window : int or None
        Number of trailing pre-snap frames to keep per play, counted back from
        the last pre-snap frame. ``None`` keeps every pre-snap frame.

    Returns
    -------
    pandas.DataFrame
        The tracking columns plus ``possessionTeam``, restricted to rows whose
        ``club`` is the play's ``defensiveTeam``.

    Raises
    ------
    ValueError
        If ``pre_snap_window`` is not positive, or if the tracking data has
        rows but none are labelled as pre-snap.
    """
    if pre_snap_window is not None and pre_snap_window < 1:
        raise ValueError("pre_snap_window must be a positive frame count or None")

    play_keys = ['gameId', 'playId']

    # NOTE(review): 'BEFORE_SNAP' is the pre-snap label per the dataset's data
    # dictionary -- confirm against the actual files. The guard below fails
    # loudly (listing the labels that do exist) if that assumption is wrong,
    # instead of silently returning an empty frame.
    before_snap = tracking_df[tracking_df['frameType'] == 'BEFORE_SNAP']
    if before_snap.empty and not tracking_df.empty:
        raise ValueError(
            "No rows with frameType == 'BEFORE_SNAP'; observed frameType values: "
            f"{tracking_df['frameType'].unique().tolist()}"
        )

    # Filter frames BEFORE merging so the join only touches the rows we keep.
    merged_df = before_snap.merge(
        plays_df[play_keys + ['possessionTeam', 'defensiveTeam']],
        on=play_keys,
    )

    # Match on defensiveTeam rather than "not the possession team": the latter
    # also keeps any row whose club is neither side (e.g. a ball-tracking row,
    # if the feed contains one).
    defense_df = merged_df[merged_df['club'] == merged_df['defensiveTeam']]
    # Drop the helper column so the output schema is unchanged for callers.
    defense_df = defense_df.drop(columns='defensiveTeam')

    if pre_snap_window is not None:
        # Keep only the last `pre_snap_window` frames of each play.
        last_frame = defense_df.groupby(play_keys)['frameId'].transform('max')
        defense_df = defense_df[defense_df['frameId'] > last_frame - pre_snap_window]

    return defense_df

# Process the data
# pre_snap_window is left at the function's default here.
pre_snap_df = get_pre_snap_data(tracking_df, plays_df)

# Show a sample of the data
# Only the first rows are printed, as a quick structural check of the result.
print("\nSample of merged data:")
print(pre_snap_df.head())

Tracking Data Columns: ['gameId', 'playId', 'nflId', 'displayName', 'frameId', 'frameType', 'time', 'jerseyNumber', 'club', 'playDirection', 'x', 'y', 's', 'a', 'dis', 'o', 'dir', 'event']

Plays Data Columns: ['gameId', 'playId', 'playDescription', 'quarter', 'down', 'yardsToGo', 'possessionTeam', 'defensiveTeam', 'yardlineSide', 'yardlineNumber', 'gameClock', 'preSnapHomeScore', 'preSnapVisitorScore', 'playNullifiedByPenalty', 'absoluteYardlineNumber', 'preSnapHomeTeamWinProbability', 'preSnapVisitorTeamWinProbability', 'expectedPoints', 'offenseFormation', 'receiverAlignment', 'playClockAtSnap', 'passResult', 'passLength', 'targetX', 'targetY', 'playAction', 'dropbackType', 'dropbackDistance', 'passLocationType', 'timeToThrow', 'timeInTackleBox', 'timeToSack', 'passTippedAtLine', 'unblockedPressure', 'qbSpike', 'qbKneel', 'qbSneak', 'rushLocationType', 'penaltyYards', 'prePenaltyYardsGained', 'yardsGained', 'homeTeamWinProbabilityAdded', 'visitorTeamWinProbilityAdded', 'expectedPoin

In [13]:
# Cell 4: Feature Calculation
def calculate_defensive_features(group, box_depth=8):
    """Calculate defensive formation features for one play's defenders.

    Parameters
    ----------
    group : pandas.DataFrame
        Defensive tracking rows for a single play; must provide ``x`` and
        ``y``. If a ``frameId`` column is present and the group spans several
        frames, only the latest frame is used, so every feature describes one
        alignment snapshot with one row per player.
    box_depth : float
        Maximum depth (same units as ``y``) for a defender to count as "in
        the box". Defaults to 8, the previously hard-coded value.

    Returns
    -------
    pandas.Series
        ``n_defenders``, ``avg_los_distance``, ``box_count``,
        ``defensive_width``, ``avg_spacing``, ``spacing_std``.
        The spacing features are 0 when fewer than two defenders are present.
    """
    # Collapse multi-frame input to a single snapshot. Without this, every
    # count is multiplied by the number of frames and the pairwise-distance
    # step grows quadratically with rows per play.
    if 'frameId' in group.columns and len(group):
        snapshot = group[group['frameId'] == group['frameId'].max()]
    else:
        snapshot = group

    # Basic count features
    n_defenders = len(snapshot)

    # NOTE(review): the original feature definitions are kept -- depth is |y|
    # and width is the x-range. If x is the downfield axis in this feed
    # (confirm against the data dictionary), these two are swapped and depth
    # should be measured from the play's line of scrimmage, not from y = 0.
    los_distances = snapshot['y'].abs()
    avg_los_distance = los_distances.mean()
    box_count = (los_distances <= box_depth).sum()

    # Horizontal spread
    defensive_width = snapshot['x'].max() - snapshot['x'].min()

    # All unordered player pairs at once: row indices of the strict upper
    # triangle enumerate (i, j) with i < j in the same order as a nested loop.
    positions = snapshot[['x', 'y']].to_numpy(dtype=float)
    first, second = np.triu_indices(len(positions), k=1)
    distances = np.linalg.norm(positions[first] - positions[second], axis=1)

    avg_spacing = distances.mean() if distances.size else 0
    spacing_std = distances.std() if distances.size else 0

    return pd.Series({
        'n_defenders': n_defenders,
        'avg_los_distance': avg_los_distance,
        'box_count': box_count,
        'defensive_width': defensive_width,
        'avg_spacing': avg_spacing,
        'spacing_std': spacing_std
    })

# Calculate features for each play
# Runs the Python-level feature function once per (gameId, playId) group, so
# the cost scales with the number of plays and with the rows inside each play.
play_features = pre_snap_df.groupby(['gameId', 'playId']).apply(calculate_defensive_features)

# Display sample results
print("Sample of calculated features:")
print(play_features.head())

KeyboardInterrupt: 

In [None]:
# Cell 5: Calculate DAVI Components
def calculate_formation_entropy(play_features):
    """Calculate the Shannon entropy of the box-count formation mix.

    Each play is bucketed by ``box_count`` into one of four labels
    (<=5 'Light Box', 6 'Standard', 7 'Heavy Box', >7 'Goal Line') and the
    entropy of the resulting label frequencies is returned (natural log, the
    default of ``scipy.stats.entropy``).

    Parameters
    ----------
    play_features : pandas.DataFrame
        Must provide a numeric ``box_count`` column, one row per play.

    Returns
    -------
    float
        0 when every play falls in the same bucket; larger values mean a more
        even mix of buckets.
    """
    # Open-ended outer edges: with the former [0, ..., 100] edges, pd.cut's
    # right-closed intervals left box_count == 0 (and anything above 100)
    # unlabelled, so those plays silently vanished from the distribution.
    formations = pd.cut(
        play_features['box_count'],
        bins=[-np.inf, 5, 6, 7, np.inf],
        labels=['Light Box', 'Standard', 'Heavy Box', 'Goal Line'],
    )
    probs = formations.value_counts(normalize=True)
    return entropy(probs)

def calculate_spacing_versatility(play_features):
    """Return the coefficient of variation of the per-play ``spacing_std``.

    That is, the standard deviation of the ``spacing_std`` column divided by
    its mean, taken across all rows of ``play_features``.
    """
    spacing_spread = play_features['spacing_std']
    dispersion = spacing_spread.std()
    typical_value = spacing_spread.mean()
    return dispersion / typical_value

def calculate_transition_rate(pre_snap_df, min_move=1):
    """Calculate the average rate of defensive position changes.

    For every (gameId, playId, nflId) track, a row counts as a "change" when
    the player's ``x`` or ``y`` differs from the previous row of that track by
    more than ``min_move``. Each track's rate is changes / rows in the track;
    the result is the mean of those rates over all tracks.

    Parameters
    ----------
    pre_snap_df : pandas.DataFrame
        Must provide ``gameId``, ``playId``, ``nflId``, ``x`` and ``y``. When a
        ``frameId`` column is present, rows are ordered by it first so that
        consecutive differences compare consecutive frames.
    min_move : float
        Per-row displacement threshold along either axis. Defaults to 1, the
        previously hard-coded value.

    Returns
    -------
    float
        Mean per-track change rate; NaN when there are no rows to evaluate.
    """
    player_keys = ['gameId', 'playId', 'nflId']
    has_frame_order = 'frameId' in pre_snap_df.columns

    needed = player_keys + ['x', 'y']
    if has_frame_order:
        needed.append('frameId')
    frames = pre_snap_df[needed]
    if frames.empty:
        return float('nan')

    # diff() compares each row with the one physically before it, so the rows
    # must be in frame order; relying on incoming row order yields spurious
    # "jumps" whenever the input was shuffled or re-concatenated.
    if has_frame_order:
        frames = frames.sort_values('frameId', kind='stable')
    # Fresh unique index so the boolean result aligns 1:1 with the key columns.
    frames = frames.reset_index(drop=True)

    # Vectorized per-track differences (first row of each track is NaN, which
    # compares False below -- same as the per-group version).
    steps = frames.groupby(player_keys)[['x', 'y']].diff().abs()
    moved = (steps > min_move).any(axis=1)

    # Mean of a boolean per track == changes / rows in that track.
    per_player_rate = moved.groupby([frames[key] for key in player_keys]).mean()
    return per_player_rate.mean()

# Calculate DAVI components
# League-wide values: each component is computed once over every play /
# tracking row currently held in play_features and pre_snap_df.
davi_components = {
    'formation_entropy': calculate_formation_entropy(play_features),
    'spacing_versatility': calculate_spacing_versatility(play_features),
    'transition_rate': calculate_transition_rate(pre_snap_df)
}

# One line per component, rounded to three decimals.
print("DAVI Components:")
for component, value in davi_components.items():
    print(f"{component}: {value:.3f}")

In [None]:
# Cell 6: Team-Level Analysis
def calculate_team_metrics(pre_snap_df, play_features):
    """Build a table of the three DAVI component scores for every club.

    Parameters
    ----------
    pre_snap_df : pandas.DataFrame
        Tracking rows with ``club``, ``gameId`` and ``playId`` columns (plus
        whatever ``calculate_transition_rate`` needs).
    play_features : pandas.DataFrame
        Per-play features; expected to be indexed by (gameId, playId) pairs,
        since plays are selected by testing membership against its index.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``club`` value, with columns
        ``formation_entropy``, ``spacing_versatility`` and ``transition_rate``.
    """
    def summarize(team):
        # Rows for this club, and the distinct plays those rows belong to.
        club_rows = pre_snap_df[pre_snap_df['club'] == team]
        club_play_ids = club_rows[['gameId', 'playId']].drop_duplicates().values.tolist()

        # Per-play features restricted to the plays this club appears in.
        in_club_plays = play_features.index.isin(club_play_ids)
        club_features = play_features[in_club_plays]

        return {
            'formation_entropy': calculate_formation_entropy(club_features),
            'spacing_versatility': calculate_spacing_versatility(club_features),
            'transition_rate': calculate_transition_rate(club_rows)
        }

    team_stats = {team: summarize(team) for team in pre_snap_df['club'].unique()}

    # Outer keys are clubs, so transpose to get one row per club.
    return pd.DataFrame(team_stats).T

# Calculate team metrics
# Same component functions as the DAVI cell, evaluated separately for each
# distinct club value found in pre_snap_df.
team_metrics = calculate_team_metrics(pre_snap_df, play_features)

# Only the first rows are printed as a preview.
print("Team Metrics Sample:")
print(team_metrics.head())