### Imports

In [1]:
import pandas as pd
import numpy as np
import time
import os
import json
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.collections import PatchCollection
from matplotlib.animation import FuncAnimation
from scipy.spatial import Voronoi, cKDTree
from concurrent.futures import ProcessPoolExecutor, as_completed

# imports from the utils.py script
import utils as NFLUtils

### Methods

In [None]:
def load_game_data(tracking_file_path: str, plays_file_path: str, game_id: int, chunk_size:int = 10000)->pd.DataFrame:
    """
    Load the tracking rows of a single game and annotate them with play-level data.

    The tracking CSV is streamed in chunks. Reading stops at the first chunk
    without a match that comes AFTER at least one matching chunk, so this
    relies on all rows of a game being stored contiguously in the file.

    Args:
    tracking_file_path (str): Path to the tracking CSV file (needs 'gameId', 'playId' and 'club' columns)
    plays_file_path (str): Path to the plays CSV file (needs 'gameId', 'playId', 'possessionTeam' and 'ballCarrierId' columns)
    game_id (int): the gameID to filter by
    chunk_size (int, optional): the number of rows per chunk, default 10000

    Returns:
    pd.DataFrame: the rows of the requested game (rows whose club is 'football' removed),
    merged with 'possessionTeam' and 'ballCarrierId' and with an added boolean 'is_offense' column.
    Empty (but with all columns) when the game is not present in the tracking file.
    """
    matching_chunks = []
    empty_template = None  # zero-row slice that keeps the tracking columns/dtypes

    # stream data in chunks
    for chunk in pd.read_csv(tracking_file_path, chunksize=chunk_size):
        filtered_chunk = chunk[chunk['gameId'] == game_id]
        if filtered_chunk.empty:
            # only stop once the game has been seen; chunks that precede the
            # game must be skipped, not treated as the end of the data
            if matching_chunks:
                break
            empty_template = filtered_chunk
            continue
        matching_chunks.append(filtered_chunk)

    if matching_chunks:
        # a single concat avoids re-copying the accumulated rows on every chunk
        data = pd.concat(matching_chunks, ignore_index=True)
    elif empty_template is not None:
        data = empty_template.reset_index(drop=True)
    else:
        # the file holds only a header: keep the column names so the merge below works
        data = pd.read_csv(tracking_file_path, nrows=0)

    plays_df = pd.read_csv(plays_file_path)
    data = pd.merge(data, plays_df[['gameId', 'playId', 'possessionTeam', 'ballCarrierId']], on=['gameId', 'playId'])
    # copy so that adding a column below does not write into a filtered view
    data = data.loc[data['club'] != 'football'].copy()
    data['is_offense'] = (data['possessionTeam'] == data['club'])
    return data

In [None]:
def organize_game_data(df: pd.DataFrame)->dict:
    """
    Organize game data into a nested dictionary structure.

    Only plays that contain a start event ('pass_outcome_caught', otherwise
    'handoff') and a 'tackle' event, and no 'fumble' event, are kept. For each
    kept play only the frames from the first start event up to and including
    the first tackle frame are retained. Plays with playDirection 'left' have
    their x/y coordinates mirrored (x -> 120 - x, y -> 53.3 - y).

    Args:
    df (pd.DataFrame): The DataFrame containing game data (as produced by load_game_data).

    Returns:
    dict: {playId: {frameId: DataFrame}} where each inner DataFrame holds the
    rows of that frame. Plays that end up with no frame in the window are omitted.
    """
    columns = ['nflId', 'time', 'playDirection', 'x', 'y', 's', 'a', 'dis', 'o', 'dir', 'event', 'is_offense', 'ballCarrierId']

    # Initialize the main dictionary
    game_dict = {}

    # One pass over the plays; sort=False keeps the order of first appearance
    for play_id, play_df in df.groupby('playId', sort=False):
        play_events = set(play_df['event'].dropna().unique())

        # for now, ignoring fumbles, but maybe later on we can count that as a tackle?
        if 'fumble' in play_events:
            continue

        # another potential type of event to include is 'run', but for now it is excluded
        if 'pass_outcome_caught' in play_events:
            start_event = 'pass_outcome_caught'
        elif 'handoff' in play_events:
            start_event = 'handoff'
        else:
            continue

        # this limits us to plays where a tackle is made; scoring plays and
        # 'out_of_bounds' are ignored for now
        if 'tackle' not in play_events:
            continue

        start_frame = play_df.loc[play_df['event'] == start_event, 'frameId'].min()
        end_frame = play_df.loc[play_df['event'] == 'tackle', 'frameId'].min()

        # keep only frames in [start_frame, end_frame]
        window_df = play_df.loc[play_df['frameId'].between(start_frame, end_frame)].copy()
        if window_df.empty:
            # e.g. the tackle is recorded before the start event: nothing to analyse,
            # and an empty play would only break the downstream per-play steps
            continue

        if play_df['playDirection'].iloc[0] == 'left':
            window_df['x'] = 120 - window_df['x']
            window_df['y'] = 53.3 - window_df['y']
            # NOTE(review): 'o' and 'dir' are left untouched while x/y are mirrored;
            # confirm whether downstream consumers expect them to be rotated too.

        window_df = window_df.astype({'nflId': int, 'ballCarrierId': int})

        # Split the window into one DataFrame per frame
        game_dict[play_id] = {
            frame_id: frame_df[columns]
            for frame_id, frame_df in window_df.groupby('frameId', sort=False)
        }

    return game_dict

In [None]:
def generate_color_map(nfl_ids):
    """
    Generates a color map for given NFL IDs.

    Parameters:
    - nfl_ids: list-like (list, numpy array or pandas Series) of NFL IDs; missing
      values and duplicates are ignored.

    Returns:
    - Dictionary mapping each unique NFL ID to a color taken from matplotlib's
      'rainbow' colormap, evenly spaced over the IDs in order of first appearance.
    """
    # wrapping in a Series lets plain lists work as the docstring promises;
    # an existing Series is handled the same way as before
    unique_ids = pd.Series(nfl_ids).dropna().unique()
    colors = plt.cm.rainbow(np.linspace(0, 1, len(unique_ids)))
    return dict(zip(unique_ids, colors))

In [None]:
def assign_squares_to_players_OLD(frame_data, x_min=0, x_max=120, y_min=0, y_max=53.3):
    """
    DEPRECATED. Use assign_squares_to_players instead.
    Assigns each 1-yard square of a football field to the nearest player.

    The distances are computed in one vectorised step and frame_data is never
    modified (no temporary 'distance' column is added to or dropped from it).

    Parameters:
    - frame_data (pd.DataFrame): DataFrame with columns ['nflId', 'x', 'y'] representing players' positions.
    - x_min, x_max (float): Optional. The minimum and maximum x-coordinates (in yards) of the field area to consider.(0-120 yards)
    - y_min, y_max (float): Optional. The minimum and maximum y-coordinates (in yards) of the field area to consider.(0-53.3 yards)

    Returns:
    - A DataFrame with columns ['square_x', 'square_y', 'closest_player_id'].
      On an exact distance tie the player that appears first in frame_data wins.
    """
    # Generate all 1-yard squares within specified limits (x varies slowest)
    x_range = np.arange(x_min, x_max + 1, 1)
    y_range = np.arange(y_min, y_max + 1, 1)
    grid_x, grid_y = np.meshgrid(x_range, y_range, indexing='ij')
    squares = pd.DataFrame({'square_x': grid_x.ravel(), 'square_y': grid_y.ravel()})

    player_x = frame_data['x'].to_numpy(dtype=float)
    player_y = frame_data['y'].to_numpy(dtype=float)

    # (n_squares, n_players) matrix of squared distances; the square root is
    # unnecessary because it does not change which player is nearest
    dx = squares['square_x'].to_numpy(dtype=float)[:, None] - player_x[None, :]
    dy = squares['square_y'].to_numpy(dtype=float)[:, None] - player_y[None, :]
    squared_distance = dx ** 2 + dy ** 2
    # players without a position can never be the nearest one
    squared_distance = np.where(np.isnan(squared_distance), np.inf, squared_distance)

    nearest = squared_distance.argmin(axis=1)
    squares['closest_player_id'] = frame_data['nflId'].to_numpy()[nearest]

    return squares

In [None]:
def visualize_field(player_assignments, color_map, min_x=0, max_x=120, min_y=0, max_y=53.3):
    """
    DEPRECATED: animations are produced by the create_animation method instead.
    Draws the field as 1-yard squares, each colored after the player it is assigned to.
    Squares whose player has no entry in color_map are drawn in grey.
    Timing information for each stage is printed along the way.

    Parameters:
    - player_assignments (pd.DataFrame): DataFrame with columns ['square_x', 'square_y', 'closest_player_id'].
    - color_map (dict): maps each NFL ID to a color, get this using the generate_color_map method
    - min_x, max_x (float): x-axis limits of the plot (long side of football field, 0-120)
    - min_y, max_y (float): y-axis limits of the plot (short axis of football field, 0-53.3)

    Return:
    - fig
    - ax
    """
    t_start = time.time()

    # Figure and axis
    fig, ax = plt.subplots(figsize=(12, 6))
    t_setup = time.time()
    print("Setup Time: ", t_setup - t_start)

    # Field dimensions
    ax.set_xlim(min_x, max_x)
    ax.set_ylim(min_y, max_y)

    t_dimensions = time.time()
    print("Set Dimensions Time: ", t_dimensions - t_setup)

    # One unit rectangle (centered on the square's coordinates) and one color per square
    rectangles = []
    rectangle_colors = []
    cells = zip(
        player_assignments['square_x'],
        player_assignments['square_y'],
        player_assignments['closest_player_id'],
    )
    for square_x, square_y, player_id in cells:
        rectangles.append(patches.Rectangle((square_x - 0.5, square_y - 0.5), 1, 1))
        rectangle_colors.append(color_map.get(player_id, 'grey'))

    # Draw all rectangles in a single collection
    ax.add_collection(PatchCollection(rectangles, facecolor=rectangle_colors, edgecolor=None))

    t_plotting = time.time()
    print("Plotting Time: ", t_plotting - t_dimensions)

    # Labels and title
    ax.set_xlabel('Yards (X-axis)')
    ax.set_ylabel('Yards (Y-axis)')
    ax.set_title('Bucketed Voronoi')

    t_end = time.time()
    print("Total Execution Time: ", t_end - t_start)

    plt.show()
    return fig, ax



In [None]:
def assign_squares_to_players(frame_data, x_min=0, x_max=120, y_min=0, y_max=53.3, x_step=1, y_step=1):
    """
    Assigns each x_step by y_step square of a football field to the nearest player
    (a bucketed Voronoi tessellation, computed with a KD-tree nearest-neighbor search).

    Parameters:
    - frame_data (pd.DataFrame): DataFrame with columns ['nflId', 'x', 'y', 'ballCarrierId'] representing players' positions.
    - x_min, x_max (float): Optional. The minimum and maximum x-coordinates (in yards) of the field area to consider.(0-120 yards)
    - y_min, y_max (float): Optional. The minimum and maximum y-coordinates (in yards) of the field area to consider.(0-53.3 yards)
    - x_step, y_step (float): Optional. The size of each Voronoi bucket, defaults to 1 yd by 1 yd

    Returns:
    - A DataFrame with columns ['square_x', 'square_y', 'closest_player_id', 'ball_carrier'].
      'ball_carrier' holds the ballCarrierId of the first row of frame_data for every square.
    """
    # Idea for later: let offensive players whose regions touch share one Voronoi unit
    # (previously attempted by relabelling every offensive player with the ballCarrierId).
    ball_carrier = frame_data.ballCarrierId.iloc[0]

    # No explicit scipy Voronoi diagram is built: it was never used for the
    # assignment and it fails for fewer than three (or collinear) players.
    points = frame_data[['x', 'y']].to_numpy()

    # Generate all squares within the specified limits (x varies slowest)
    x_range = np.arange(x_min, x_max + x_step, x_step)
    y_range = np.arange(y_min, y_max + y_step, y_step)
    grid_x, grid_y = np.meshgrid(x_range, y_range, indexing='ij')
    squares = pd.DataFrame({'square_x': grid_x.ravel(), 'square_y': grid_y.ravel()})

    # A single batched nearest-neighbor query replaces one query per square
    tree = cKDTree(points)
    _, nearest = tree.query(squares[['square_x', 'square_y']].to_numpy())
    squares['closest_player_id'] = frame_data['nflId'].to_numpy()[nearest]

    # carry the ID of the ball carrier along with every square
    squares['ball_carrier'] = ball_carrier

    return squares


In [None]:
def voronoi_area(squares: pd.DataFrame, weights: pd.DataFrame=None):
    """
    Return the area attributed to each unique player by nflID

    Params:
    - squares (pd.DataFrame): a dataframe with columns ['square_x', 'square_y', 'closest_player_id'].
    - weights (pd.DataFrame): reserved for weighting each Voronoi bin differently; not implemented yet

    Returns:
    - a dictionary with keys of closest_player_id and values equal to the number of squares
      assigned to that player (this equals square yards only when the squares are 1 yd by 1 yd)

    Raises:
    - NotImplementedError: if weights is provided
    """
    # `if weights:` is ambiguous for a DataFrame, so test against None explicitly
    if weights is not None:
        raise NotImplementedError("weighted Voronoi areas are not implemented yet")

    return squares.groupby('closest_player_id').size().to_dict()


In [None]:
def tackle_percentage_contribution_per_frame(frame_data:pd.DataFrame, weights: pd.DataFrame=None, x_step: int=1, y_step: int=1)->dict:
    """
    For every defender that owns at least one square, remove that defender and measure
    how much Voronoi area the ball carrier would gain.

    Params:
    - frame_data (pd.DataFrame): a dataframe from the organize_game_data method with columns ['nflId', 'ballCarrierId', 'is_offense', 'x', 'y']
    - weights (pd.DataFrame): reserved for weighted Voronoi bins; not implemented yet
    - x_step (int): the x-side of the voronoi bins when calling the assign_squares_to_players method
    - y_step (int): the y-side of the voronoi bins when calling the assign_squares_to_players method

    Returns:
    - dictionary with keys of nflId (defenders only) and value of the number of extra squares
      the ball carrier gains when that defender is removed. The values are NOT normalised per frame.

    Raises:
    - NotImplementedError: if weights is provided
    """
    # `if weights:` is ambiguous for a DataFrame, so test against None explicitly
    if weights is not None:
        raise NotImplementedError("weighted tackle percentage contribution is not implemented yet")

    # get the ball carrier and which players are on offense
    ball_carrier = frame_data.ballCarrierId.iloc[0]
    is_offense_by_id = dict(zip(frame_data.nflId, frame_data.is_offense))

    # the tessellation starts 10 yards behind the ball carrier, but never below x = 10
    carrier_x = frame_data.loc[frame_data.nflId == ball_carrier, 'x'].iloc[0]
    x_min = max(10, carrier_x - 10)

    squares = assign_squares_to_players(frame_data, x_min=x_min, x_step=x_step, y_step=y_step)
    # the ball carrier may own no square at all inside the window -> area 0
    baseline_area = voronoi_area(squares).get(ball_carrier, 0)

    area_protected = {}
    for player_id in squares.closest_player_id.unique():
        # offensive players (including the ball carrier) do not protect area
        if is_offense_by_id[player_id]:
            continue
        # the frame as it would look if that defender didn't exist
        frame_without_player = frame_data[frame_data.nflId != player_id]
        squares_without_player = assign_squares_to_players(frame_without_player, x_min=x_min, x_step=x_step, y_step=y_step)
        carrier_area = voronoi_area(squares_without_player).get(ball_carrier, 0)
        area_protected[player_id] = carrier_area - baseline_area  # how much more area does the carrier get?

    # Per-frame normalisation is deliberately left out: when no defender is close to the
    # ball carrier, dividing by the frame total would be misleading.
    return area_protected



In [None]:
def euclidean_distance_per_frame(frame_data:pd.DataFrame)->dict:
    """
    Compute the Euclidean distance of every defender to the ball carrier in one frame.

    Params:
    - frame_data (pd.DataFrame): a dataframe from the organize_game_data method with columns ['nflId', 'ballCarrierId', 'is_offense', 'x', 'y']
    Returns:
    - distance_dict (dict): a dict where the keys are the nflIds of the players that are not on
      offense and the values are their distances to the ball carrier
    Raises:
    - ValueError: if the ball carrier has no row in frame_data
    """
    ball_carrier = frame_data['ballCarrierId'].iloc[0]

    # locate the ball carrier by nflId (the DataFrame index is unrelated to player IDs)
    carrier_position = frame_data.loc[frame_data['nflId'] == ball_carrier, ['x', 'y']]
    if carrier_position.empty:
        raise ValueError(f'ball carrier {ball_carrier} is not present in the frame')
    carrier_x, carrier_y = carrier_position.iloc[0]

    defense = frame_data.loc[~frame_data['is_offense'].astype(bool)]
    distances = np.hypot(defense['x'].to_numpy(dtype=float) - carrier_x,
                         defense['y'].to_numpy(dtype=float) - carrier_y)

    # tolist() yields plain Python scalars, which keeps the result easy to serialise
    distance_dict = dict(zip(defense['nflId'].tolist(), distances.tolist()))

    return distance_dict

In [None]:
def euclidean_distance_per_play(frame_dict:dict, filepath:str)->dict:
    """
    Calculate the Euclidean distances of each of the defenders from the ball carrier
    for every frame of a play and cache them as '<filepath>/distances_per_frame.csv'.

    Params:
    - frame_dict: dict from the organize_game_data method for one play (frame id -> frame DataFrame)
    - filepath: the directory of the play, under which the data is cached (created if missing)

    Returns:
    - dict mapping each frame id to the dict returned by euclidean_distance_per_frame
    """
    # iterate through the frames of the play in ascending frame order
    frame_distances = {
        frame_id: euclidean_distance_per_frame(frame_dict[frame_id])
        for frame_id in sorted(frame_dict)
    }

    # Convert the dictionary with the frame data to a DataFrame to cache
    # The keys of the outer dict become the index, and the inner dicts' keys become the column names
    frame_distances_df = pd.DataFrame.from_dict(frame_distances, orient='index')

    # callers are not guaranteed to have created the play directory
    os.makedirs(filepath, exist_ok=True)

    # Save to CSV, with the index to make future multiplication easier
    frame_distances_df.to_csv(f'{filepath}/distances_per_frame.csv', index=True)

    return frame_distances

In [None]:
def analyze_game_distances(game_id, tracking_file, plays_file='./data/plays.csv', game_file='./data/games.csv')->None:
    """
    A method to cache the distances of the defenders from the ball carrier for every analysed play of a game.
    Each play is processed in a separate worker process and written to
    './games/<col0>_<col5>_<col6>/<playId>/distances_per_frame.csv'.

    Param:
    - game_id (int): the ID of the game as found in the Kaggle cleaned data
    - tracking_file (str): the address of the file in which the tracking data is stored
    - plays_file (str): the address of the plays file
    - game_file (str): the filepath of the file containing information about each game
    """

    games = pd.read_csv(game_file)
    # positional columns 0, 5 and 6: presumably the game ID, home team and visiting team; confirm against games.csv
    game_data = games[games.gameId==game_id].iloc[0, [0, 5, 6]]
    filepath = f'./games/{game_data.iloc[0]}_{game_data.iloc[1]}_{game_data.iloc[2]}'

    # Create a directory for the game if none exists
    os.makedirs(filepath, exist_ok=True)

    # Sort and organize the data
    game_data_organized = organize_game_data(load_game_data(tracking_file, plays_file, game_id))
    sorted_game_data_organized = sorted(game_data_organized.items(), key=lambda x: x[0])

    # Using ProcessPoolExecutor to parallelize the loop
    with ProcessPoolExecutor() as executor:
        play_by_future = {}
        for key, play in sorted_game_data_organized:
            # the per-play CSV is written into this directory, so it must exist first
            play_filepath = f'{filepath}/{key}'
            os.makedirs(play_filepath, exist_ok=True)
            play_by_future[executor.submit(euclidean_distance_per_play, play, play_filepath)] = key

        # inspect every future so that a failing play is reported instead of vanishing silently
        for future in as_completed(play_by_future):
            try:
                future.result()
            except Exception as e:
                print(f'Error processing play {play_by_future[future]}: {e}')


In [None]:
def tackle_percentage_contribution_per_play(frame_dict:dict, filepath:str, weights:pd.DataFrame=None, x_step:int=1, y_step:int=1):
    """
    This iterates through the frames in any given play and calculates the tackle percentage contribution of each player.
    The per-frame values are cached as '<filepath>/tpc_per_frame.csv', the normalised per-play
    values as '<filepath>/tpc.json', and an animation of the play is created via create_animation.
    TODO: FIX THE CALCULATION OF THE WEIGHTS

    Params:
    - frame_dict (dict): frames of one play from the organize_game_data method (frame id -> frame DataFrame)
    - filepath (str): existing directory of the play in which the results are cached
    - weights (pd.DataFrame): passed through to tackle_percentage_contribution_per_frame
    - x_step (int): the x-side of the voronoi bins when calling the assign_squares_to_players method
    - y_step (int): the y-side of the voronoi bins when calling the assign_squares_to_players method

    Returns:
    - dictionary with keys of nflId and value of the tackle percentage contribution for that play,
      normalised to sum to 1 (all zeros when no defender protected any area)
    """
    # empty dict, one indexed by player, the other indexed by frame
    total_tpc = {}
    tpc_per_frame = {}

    # iterate through the frames of the play in ascending frame order
    for key in sorted(frame_dict):

        # get protected areas, append to both dictionaries
        frame_tpc = tackle_percentage_contribution_per_frame(frame_dict[key], weights, x_step, y_step)
        tpc_per_frame[key] = frame_tpc

        # accumulate in the overall dict for the play
        for player, contribution in frame_tpc.items():
            total_tpc[player] = total_tpc.get(player, 0) + contribution

    # normalize every player's contribution such that it sums to 1
    total_protected_area = sum(total_tpc.values())
    if total_protected_area != 0:
        total_tpc = {player: value / total_protected_area for player, value in total_tpc.items()}
    else:
        # nobody protected any area: avoid dividing by zero and report zero contributions
        total_tpc = {player: 0.0 for player in total_tpc}

    # Convert the dictionary with the frame data to a DataFrame to cache
    # The keys of the outer dict become the index, and the inner dicts' keys become the column names
    tpc_per_frame_df = pd.DataFrame.from_dict(tpc_per_frame, orient='index')

    # Save to CSV, with the index to make future multiplication easier
    tpc_per_frame_df.to_csv(f'{filepath}/tpc_per_frame.csv', index=True)

    # cast the keys to strings from int64 (otherwise cannot store in JSON)
    total_tpc_converted = {str(key): value for key, value in total_tpc.items()}

    # cache this result as a JSON for each play; the context manager closes the file
    with open(filepath+'/tpc.json', 'w') as tpc_file:
        json.dump(total_tpc_converted, tpc_file)

    # create an animation
    create_animation(frame_dict=frame_dict, tpc_per_frame=tpc_per_frame, play_filepath=filepath, x_step=x_step, y_step=y_step)

    return total_tpc

In [None]:
def tackle_percentage_contribution_per_game(game_data_organized:dict, x_step:int=1, y_step:int=1, filepath:str='./games'):
    """
    Iterate through all plays in the game and sum the defensive contribution of each player

    Params:
    - game_data_organized (dict): output of the organize_game_data method (play id -> frames of the play)
    - x_step (int): the x-side of the voronoi bins
    - y_step (int): the y-side of the voronoi bins
    - filepath (str): directory under which one sub-directory per play is created to cache the per-play results

    Returns:
    - dictionary with keys of nflId and value of the summed per-play tackle percentage contribution
    """
    game_tpc = {}
    # go through the plays in ascending play-id order (sorting by the key, not by the play dict)
    for key in sorted(game_data_organized):
        print(key)  # for debugging purposes

        # the per-play method caches its results on disk and needs an existing directory
        play_filepath = f'{filepath}/{key}'
        os.makedirs(play_filepath, exist_ok=True)

        play_tpc = tackle_percentage_contribution_per_play(frame_dict=game_data_organized[key], filepath=play_filepath, x_step=x_step, y_step=y_step)
        # accumulate in the overall dict for the game
        for player, contribution in play_tpc.items():
            game_tpc[player] = game_tpc.get(player, 0) + contribution

    return game_tpc


In [None]:
def analyze_game(game_id:int, 
                 tracking_file:str, 
                 x_step:int=1, 
                 y_step:int=1,
                 plays_file:str='./data/plays.csv', 
                 players_file:str='./data/players.csv', 
                 game_file:str='./data/games.csv'):
    """ 
    Analyze a game by creating a directory to store the results of each play and the relevant animation, and a file of overall TPC.
    Plays are processed one after another; a play that raises is reported and skipped.

    Params:
    - game_id (int): the ID of the game, compared against the gameId column of game_file
    - tracking_file (str): the address of the file in which the tracking data is stored
    - x_step (int): the x-side of the voronoi bins
    - y_step (int): the y-side of the voronoi bins
    - plays_file (str): the address of the plays file
    - players_file (str): currently unused
    - game_file (str): the filepath of the file containing information about each game

    Returns:
    - dictionary with keys of nflId and value of the tackle percentage contribution summed over all plays
      (also cached as 'game_tpc.json' in the game directory)
    """
    games = pd.read_csv(game_file)
    # positional columns 0, 5 and 6: presumably the game ID/date, home team and visiting team; confirm against games.csv
    game_data = games[games.gameId==game_id].iloc[0, [0, 5, 6]]
    filepath = f'./games/{game_data.iloc[0]}_{game_data.iloc[1]}_{game_data.iloc[2]}'

    # make a directory for the game if none exists
    os.makedirs(filepath, exist_ok=True)

    # organize the data from the relevant game
    game_data = load_game_data(tracking_file, plays_file, game_id)
    game_data_organized = organize_game_data(game_data)
    
    # sort the plays in the game by play id
    game_tpc = {}
    sorted_game_data_organized = sorted(game_data_organized.items(), key=lambda x: x[0])

    # iterate through the plays
    for key, play in sorted_game_data_organized: 

        print(key)  #  for debugging purposes
        plt.close('all')  # close all open plots

        try: 
            # make a directory to store information from the play
            play_filepath = filepath + f'/{key}'
            os.makedirs(play_filepath, exist_ok=True)

            # calculate the tackle_percentage_contribution (this also caches the result as a JSON and creates an animation)
            play_tpc = tackle_percentage_contribution_per_play(frame_dict=play, filepath=play_filepath, x_step=x_step, y_step=y_step)

            # accumulate in the overall dict for the game
            for player, contribution in play_tpc.items():
                game_tpc[player] = game_tpc.get(player, 0) + contribution
                    
        except Exception as e: 
            # best effort: report the failing play and carry on with the next one
            print(key, e)
            continue

    
    # convert game_tpc keys from int64 to string to store in JSON
    game_tpc_converted = {str(key): value for key, value in game_tpc.items()}
    # cache this result as a JSON for each game; the context manager closes the file
    with open(filepath+'/game_tpc.json', 'w') as game_tpc_file:
        json.dump(game_tpc_converted, game_tpc_file)

    return game_tpc

    

In [None]:
def analyze_play(key, play, filepath, x_step, y_step):
    """
    Worker for one play, meant to be run in parallel: makes sure the play's
    directory '<filepath>/<key>' exists, computes the play's tackle percentage
    contribution and returns it as a plain dict. Any exception raised by the
    computation is printed and an empty dict is returned instead.
    """
    print(key)  # For debugging purposes

    # Directory in which the results of this play are stored
    play_filepath = f'{filepath}/{key}'
    directory_missing = not os.path.exists(play_filepath)
    if directory_missing:
        os.makedirs(play_filepath)

    try:
        # tackle_percentage_contribution_per_play must be available in the worker process
        contributions = tackle_percentage_contribution_per_play(
            frame_dict=play,
            filepath=play_filepath,
            x_step=x_step,
            y_step=y_step,
        )
        return dict(contributions.items())
    except Exception as e:
        print(f'Error processing play {key}: {e}')
        return {}

def analyze_game(game_id, tracking_file, x_step=1, y_step=1, plays_file='./data/plays.csv', players_file='./data/players.csv', game_file='./data/games.csv'):
    """
    Parallel version of the game analysis: every play is handed to analyze_play in a
    worker process and the per-play contributions are summed per player.

    Params:
    - game_id (int): the ID of the game, compared against the gameId column of game_file
    - tracking_file (str): the address of the file in which the tracking data is stored
    - x_step, y_step: the sides of the voronoi bins
    - plays_file (str): the address of the plays file
    - players_file (str): currently unused
    - game_file (str): the filepath of the file containing information about each game

    Returns:
    - dictionary with keys of nflId and value of the tackle percentage contribution summed over all plays
      (also cached as 'game_tpc.json' in the game directory)
    """
    games = pd.read_csv(game_file)
    # positional columns 0, 5 and 6: presumably the game ID/date, home team and visiting team; confirm against games.csv
    game_data = games[games.gameId==game_id].iloc[0, [0, 5, 6]]
    filepath = f'./games/{game_data.iloc[0]}_{game_data.iloc[1]}_{game_data.iloc[2]}'

    # Create a directory for the game if none exists
    os.makedirs(filepath, exist_ok=True)

    # Sort and organize the data
    game_data_organized = organize_game_data(load_game_data(tracking_file, plays_file, game_id))
    sorted_game_data_organized = sorted(game_data_organized.items(), key=lambda x: x[0])

    # Dictionary to store the overall tackle_percentage_contribution
    game_tpc = {}

    # Using ProcessPoolExecutor to parallelize the loop
    with ProcessPoolExecutor() as executor:
        futures = [executor.submit(analyze_play, key, play, filepath, x_step, y_step) for key, play in sorted_game_data_organized]

        # only `as_completed` itself is imported (the name `concurrent` is not bound),
        # so it has to be called directly
        for future in as_completed(futures):
            play_tpc = future.result()
            for player, contribution in play_tpc.items():
                game_tpc[player] = game_tpc.get(player, 0) + contribution

    # Convert game_tpc keys from int64 to string to store in JSON
    game_tpc_converted = {str(key): value for key, value in game_tpc.items()}
    # Cache this result as a JSON for each game; the context manager closes the file
    with open(filepath + '/game_tpc.json', 'w') as game_tpc_file:
        json.dump(game_tpc_converted, game_tpc_file)

    return game_tpc


In [None]:
def create_animation(frame_dict: dict, tpc_per_frame: dict, play_filepath:str, x_min=0, x_max=120, y_min=0, y_max=53.3, x_step=1, y_step=1):
    """
    Render one play as an animation of bucketed Voronoi areas and save it as an mp4.

    Parameters:
    - frame_dict (dict): DataFrames keyed by frame; the columns read here are nflId, is_offense, x, y and ballCarrierId.
    - tpc_per_frame (dict): keyed by frame; each value is a mapping looked up with .get(player_id, 0) to label a player's contribution in that frame
    - play_filepath (str): directory in which voronoi_visualizer.mp4 is written
    - x_min (float): the min x value in the graph (long side of football field, 0-120)
    - x_max (float): the max x
    - y_min (float): the min y value in the graph (short axis of football field, 0-53.3)
    - y_max (float): the max y
    - x_step (float): grid spacing along x, also used as the width of each drawn square
    - y_step (float): grid spacing along y, also used as the height of each drawn square

    Returns:
    - None

    """
    frame_numbers = sorted(frame_dict.keys())

    # assign a color map for all players in the play, based on which players were active in the first frame
    color_map = generate_color_map(frame_dict[frame_numbers[0]].nflId)

    # open plots were taking too much memory
    plt.close('all')

    # Create figure and axis for the animation
    fig, ax = plt.subplots(figsize=(24, 12))

    # Function to draw a single frame for the animation
    def draw_frame(frame_number):
        frame = frame_dict[frame_number]
        frame_tpc = tpc_per_frame[frame_number]

        # Process the frame data to get the assignments
        player_assignments = assign_squares_to_players(frame, x_min, x_max, y_min, y_max, x_step, y_step)
        ball_carrier = frame.ballCarrierId.iloc[0]

        ax.clear()
        ax.set_xlim(x_min, x_max)
        ax.set_ylim(y_min, y_max)

        # One rectangle per grid square, centred on the square and sized by the grid step
        # so that squares tile the field instead of overlapping when the step is not 1
        rectangles = []
        rectangle_colors = []
        square_columns = zip(player_assignments['square_x'], player_assignments['square_y'], player_assignments['closest_player_id'])
        for square_x, square_y, closest_player in square_columns:
            rectangles.append(patches.Rectangle((square_x - x_step / 2, square_y - y_step / 2), x_step, y_step))
            # squares whose closest player has no assigned color fall back to grey
            rectangle_colors.append(color_map.get(closest_player, 'grey'))

        # Add a dot and a label at every player's position
        player_positions = zip(frame.nflId, frame.is_offense, frame.x, frame.y)
        for player_id, is_offense, x, y in player_positions:

            # Get tackle percentage contribution, default 0 for offense
            tpc = frame_tpc.get(player_id, 0)

            # red=ball carrier, black=offense, white=defense
            if player_id == ball_carrier:
                dot_color = 'red'
            elif is_offense:
                dot_color = 'black'
            else:
                dot_color = 'white'

            # plot the dot for every player and their TPC
            ax.plot(x, y, marker='o', markersize=5, markerfacecolor=dot_color)
            ax.text(x, y, f'{player_id}: {tpc}', ha='center', va='center', fontsize=9)

        # Create a PatchCollection and add it to the axis
        pc = PatchCollection(rectangles, facecolor=rectangle_colors, edgecolor=None)
        ax.add_collection(pc)

        # Additional plot settings
        ax.set_xlabel('Yards (X-axis)')
        ax.set_ylabel('Yards (Y-axis)')
        ax.set_title(f'Bucketed Voronoi Areas (ball carrier: {ball_carrier})')

    # Create the animation
    anim = FuncAnimation(fig, draw_frame, frames=frame_numbers, interval=200, repeat=False)

    try:
        anim.save(play_filepath + '/voronoi_visualizer.mp4', writer='ffmpeg')
    finally:
        # nothing is returned, so release the figure instead of leaving it open
        plt.close(fig)



### Sanity check methods

In [None]:
# Load a single game (2022090800) from the week-1 tracking data, then bucket it by play and frame
week_1_tracking_file = './data/tracking_week_1.csv'
plays_file = './data/plays.csv'
game_data = load_game_data(
    tracking_file_path=week_1_tracking_file,
    plays_file_path=plays_file,
    game_id=2022090800,
)
game_data_organized = organize_game_data(df=game_data)
print("Number of plays: ", len(game_data_organized))

# the organized data is indexed as game_data_organized[play][frame]
test_frame = game_data_organized[56][7]
test_frame.head(10)

In [None]:
# types of events in any given game
# Accumulate across ALL plays: the set is created once, outside the loops, so the
# result is not limited to the last play, and nothing is printed per frame.
events = set()
for play in game_data_organized.values():
    for frame in play.values():
        # skip missing entries; only named events are of interest
        events.update(frame.event.dropna())
events

In [None]:
# voronoi tessellation of the player positions in the sample frame
# voronoi_plot_2d is not part of the notebook's top-level imports, so bring it in here
from scipy.spatial import voronoi_plot_2d

points = np.column_stack([test_frame['x'].to_numpy(), test_frame['y'].to_numpy()])
vor = Voronoi(points)
fig = voronoi_plot_2d(vor)
plt.show()
    

In [None]:
# Assign grid squares to players for the sample frame, using a 0.5 step on both axes,
# and preview the first rows of the result
squares = assign_squares_to_players(test_frame, x_step=.5, y_step=.5)
squares.head()

In [None]:
# Per-frame TPC for the sample frame; the return value is shown as the cell output
tackle_percentage_contribution_per_frame(test_frame)

In [None]:
# Build a color map from the closest-player ids of the assigned squares and
# hand both the squares and the color map to visualize_field
color_map = generate_color_map(squares.closest_player_id)
visualize_field(squares, color_map)

In [None]:
# All frames of play 56
test_dict = game_data_organized[56]
# NOTE(review): this call does not match the create_animation signature defined in this
# notebook, which requires tpc_per_frame and play_filepath and has no `frame` parameter.
# As written it raises a TypeError -- update the arguments before re-running this cell.
create_animation(test_dict, x_min=50, x_max=90, x_step=.3, y_step=.3, frame=56)

In [None]:
game_file = './data/games.csv'
players_file = './data/players.csv'
games = pd.read_csv(game_file)
players = pd.read_csv(players_file)

# Load the cached per-game TPC (keys are player ids stored as strings); the context
# manager closes the file instead of leaving the handle open
with open('./games/2022090800_LA_BUF/game_tpc.json') as game_tpc_file:
    game_tpc = json.load(game_tpc_file)

# highest contribution first
sorted_game_tpc = sorted(game_tpc.items(), key=lambda x: x[1], reverse=True)

# Look each player up once: record the score under the display name and report it.
# The loop variable is named player_id so the builtin `id` is not shadowed.
labeled_dict = {}
for player_id, score in sorted_game_tpc:
    jugador = players[players.nflId == int(player_id)].iloc[0]
    labeled_dict[jugador.displayName] = score
    print(jugador.displayName, jugador.position, 'TPC over game: ', score)

with open('./games/2022090800_LA_BUF/labeled_game_tpc.json', 'w') as labeled_file:
    json.dump(labeled_dict, labeled_file)

### Take 1: TPC

In [None]:
# Load game 2022090800 from the week-1 tracking data and organize it by play and frame
week_1_tracking_file = './data/tracking_week_1.csv'
plays_file = './data/plays.csv'
game_data = load_game_data(week_1_tracking_file, plays_file, 2022090800)
game_data_organized = organize_game_data(game_data)

In [None]:
# Game-level TPC computed from the organized game data; game_tpc is consumed by the report cell below
game_tpc = tackle_percentage_contribution_per_game(game_data_organized=game_data_organized)

In [None]:
# Report the per-player TPC of game 2022090800, highest contribution first
game_file = './data/games.csv'
players_file = './data/players.csv'
games = pd.read_csv(game_file)
players = pd.read_csv(players_file)

# show the games.csv row(s) for this game
this_game = games[games.gameId == 2022090800]
print(this_game)

sorted_game_tpc = sorted(game_tpc.items(), key=lambda pair: pair[1], reverse=True)
for nfl_id, total in sorted_game_tpc:
    # first players.csv row whose nflId matches
    matching_rows = players.loc[players.nflId == nfl_id]
    match = matching_rows.iloc[0]
    print(match.displayName, "TPC over game: ", total)


### Take 2: Parallelized code

In [5]:
# games.csv is read from ./data like every other input file in this notebook
games_file = './data/games.csv'
games = pd.read_csv(games_file)
games.head(10)

Unnamed: 0,gameId,season,week,gameDate,gameTimeEastern,homeTeamAbbr,visitorTeamAbbr,homeFinalScore,visitorFinalScore
0,2022090800,2022,1,09/08/2022,20:20:00,LA,BUF,10,31
1,2022091100,2022,1,09/11/2022,13:00:00,ATL,NO,26,27
2,2022091101,2022,1,09/11/2022,13:00:00,CAR,CLE,24,26
3,2022091102,2022,1,09/11/2022,13:00:00,CHI,SF,19,10
4,2022091103,2022,1,09/11/2022,13:00:00,CIN,PIT,20,23
5,2022091104,2022,1,09/11/2022,13:00:00,DET,PHI,35,38
6,2022091105,2022,1,09/11/2022,13:00:00,HOU,IND,20,20
7,2022091106,2022,1,09/11/2022,13:00:00,MIA,NE,20,7
8,2022091107,2022,1,09/11/2022,13:00:00,NYJ,BAL,9,24
9,2022091109,2022,1,09/11/2022,13:00:00,WAS,JAX,28,22


In [None]:
# Play-level metadata. This must point at plays.csv, not games.csv, because
# plays_file is also the path handed to load_game_data elsewhere in the notebook.
plays_file = './data/plays.csv'
plays = pd.read_csv(plays_file)
plays.head()

In [None]:
# Analyze game 2022090800 (home LA, visitor BUF in games.csv) using the week-1 tracking file
LA_BUF_20220908 = NFLUtils.analyze_game(game_id=2022090800, tracking_file='./data/tracking_week_1.csv')

In [None]:
# Analyze game 2022091100 (home ATL, visitor NO in games.csv) using the week-1 tracking file
ATL_NO_20220911 = NFLUtils.analyze_game(game_id=2022091100, tracking_file='./data/tracking_week_1.csv')

In [3]:
# Analyze game 2022091101 (home CAR, visitor CLE in games.csv) using the week-1 tracking file;
# the recorded output below lists play ids and the plays that failed
CAR_CLE_20220911 = NFLUtils.analyze_game(game_id=2022091101, tracking_file='./data/tracking_week_1.csv')

158
85
109
272
296
184
251
213
361
382
417
489
521
542
620
Error processing play 361: 44820
641
662
748
850
993
1077
1101
1516
1720
1744
Error processing play 993: 44898
1785
1901
1945
1980
Error processing play 1516: 46104
2051
2320
2341
2365
Error processing play 1101: 44820
2386
Error processing play 2051: 46104
2407
2478
2501
Error processing play 2386: 46104
2545
Error processing play 2407: 44898
2629
Error processing play 2501: 44898
2683
2783
2832
2909
2930
Error processing play 2629: 44820
2951
3040
3080
3101
3125
3221
Error processing play 3125: 44820
3263
Error processing play 3040: 44820
3315
3336
3357
3378
Error processing play 3336: 44898
3399
3545
3569
3591
Error processing play 3315: 46104
3615
3669
3707
3789
3841
3862
3923
Error processing play 3615: 46104
3961
Error processing play 3569: 46104
4068
4104
4150
Error processing play 3789: 46093


In [4]:
# Analyze game 2022091102 (home CHI, visitor SF in games.csv) using the week-1 tracking file;
# the recorded run below ended with a BrokenProcessPool error
CHI_SF_20220911 = NFLUtils.analyze_game(game_id=2022091102, tracking_file='./data/tracking_week_1.csv')

145
343
322
364
86
531
467
574
Error processing play 343: 47856
698
Error processing play 86: 47856
756
Error processing play 531: 53623
800
900
921
989
1029
Error processing play 921: 53623
1050
1162
Error processing play 900: 53623
1265
1363
Error processing play 989: 47856
1406
1472
1493
1517
1588
1631
Error processing play 1472: 46377
1794
1837
Error processing play 1050: 47856
1869
1954
Error processing play 1588: 46377
2065
Error processing play 1794: 53646
2132
2238
2281
2352
2394
2415
Error processing play 2394: 46377
2511
2532
Error processing play 2352: 46377
2556
2618
2717
2738
Error processing play 2511: 47856
2759
Error processing play 2618: 47856
2783
2945
Error processing play 2556: 47856
3022
3043
3336
3381
3428
Error processing play 2783: 46377
3470
3502
Error processing play 3502: 53646
3628
3695
Error processing play 2738: 47819
3783
3859
3943
3981
4019
Error processing play 3943: 47856


rosetta error: ThreadContext::resume failed 4


BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.

In [6]:
# Analyze game 2022091103 (home CIN, visitor PIT in games.csv) using the week-1 tracking file;
# the recorded run below was stopped with a KeyboardInterrupt
CIN_PIT_20220911 = NFLUtils.analyze_game(game_id=2022091103, tracking_file='./data/tracking_week_1.csv')

58
295
319
364
274
253
388
340
Error processing play 388: 44860
520
611
632
719
743
Error processing play 743: 53453
764
881
Error processing play 764: 53453
902
955
1037
Error processing play 881: 44860
1105
1126
1150
1171
1228
1315
Error processing play 1228: 52457
1336
1363
1384
1493
1565
1589
Error processing play 1384: 44860
1610
1648
Error processing play 1610: 44860
1700
Error processing play 1589: 44860
1926
1947
2037
Error processing play 1363: 44860
2058
2201
2222
2291
Error processing play 2201: 44860
2315
2483
2511
Error processing play 2511: 53453
2569
2611
2670
2719
2780
Error processing play 2483: 53453
2907
2931
Error processing play 2037: 53453
2955
2976
3011


rosetta error: ThreadContext::resume failed 268435459


KeyboardInterrupt: 

In [4]:
# Batch run: analyze every game listed in `games`, one after another
start_time = time.time()
for label, row in games.iterrows():
    # elapsed seconds since the batch started (time.time must be called, not referenced)
    print(time.time() - start_time)
    print(row)
    game_id = row.gameId
    week = row.week
    try:
        results = NFLUtils.analyze_game(game_id=game_id, tracking_file=f'./data/tracking_week_{week}.csv')
    except Exception as e:
        # best effort: report the failure and move on to the next game;
        # KeyboardInterrupt is deliberately not caught so the batch can be stopped
        print(f'Error processing game {game_id}: {e}')
        continue
    time.sleep(30)  # doing this to not melt my processor overnight

gameId               2022090800
season                     2022
week                          1
gameDate             09/08/2022
gameTimeEastern        20:20:00
homeTeamAbbr                 LA
visitorTeamAbbr             BUF
homeFinalScore               10
visitorFinalScore            31
Name: 0, dtype: object
56
101
167
122
146
191
212
299
343
393
414
Error processing play 191: 52494
486
Error processing play 299: 47853
529
569
593
Error processing play 393: 47853
617
646
692
775
818
933
1030
1102
1187
1230
1254
1334
Error processing play 1102: 52494
1358
1385
1406
1712
Error processing play 1187: 47853
1736
1836
Error processing play 1358: 44881
1946
1967
2043
2072
2163
2184
2208
2336
2360
2485
2506
2527
2551
2572
2599
2688
2815
2860
2884
2934
Error processing play 2506: 52494
3121
3145
3166
Error processing play 2551: 52494
3190
3283
Error processing play 3121: 43399
3341
3362
3383
3407
3431
Error processing play 3283: 47857
3489
3513
Error processing play 3145: 47853
3576
3636
gameI

rosetta error: thread_suspend failed


gameId               2022091101
season                     2022
week                          1
gameDate             09/11/2022
gameTimeEastern        13:00:00
homeTeamAbbr                CAR
visitorTeamAbbr             CLE
homeFinalScore               24
visitorFinalScore            26
Name: 2, dtype: object
213
158
184
85
109
251
296
272
361
382
417
489
521
542
620
Error processing play 361: 44820
641
662
748
850


rosetta error: ThreadContext::resume failed 268435459


gameId               2022091102
season                     2022
week                          1
gameDate             09/11/2022
gameTimeEastern        13:00:00
homeTeamAbbr                CHI
visitorTeamAbbr              SF
homeFinalScore               19
visitorFinalScore            10
Name: 3, dtype: object
364
343
86
322
145
531
574
467
