### Import packages

In [1]:
from matplotlib.animation import FuncAnimation
import matplotlib.pyplot as plt
from mplsoccer import Pitch
import pandas as pd
import glob
import os

from settings import *
from visualize_game import visualize_frame_prediction, visualize_game_snippet, visualize_offside_frame

### General functions

In [2]:
# Flip the coordinates to match the team direction
def flip_xy_based_on_team_direction(frames_df):
    # TODO: What do to with the ball?
    for period in [1, 2]:
        # Flip the x coordinates for the team attacking to the left
        home_team_attacking_to_right = frames_df[(frames_df['period'] == period) & (frames_df['team'] == 'home_team')].iloc[0]['team_direction'] == 'right'
        if home_team_attacking_to_right:
            frames_df.loc[(frames_df['period'] == period) & (frames_df['team'] == 'away_team'), 'x'] = pitch_length - frames_df[(frames_df['period'] == period) & (frames_df['team'] == 'away_team')]['x']
        else:
            frames_df.loc[(frames_df['period'] == period) & (frames_df['team'] == 'home_team'), 'x'] = pitch_length - frames_df[(frames_df['period'] == period) & (frames_df['team'] == 'home_team')]['x']
    return frames_df

### Functions for adding features

In [3]:
# Add the features x_future and y_future (the x and y coordinate of each player n frames into the future)
def add_xy_future(frames_df, n=50):
    # Shift the DataFrame by n frames for each player
    future_df = frames_df.groupby(['team', 'jersey_number']).shift(-n)

    # Merge the original DataFrame with the shifted DataFrame to get future coordinates
    frames_df[['x_future', 'y_future']] = future_df[['x', 'y']]

In [4]:
# Add the features v_x and v_y (current velocity (m/s) in the x and y axis respectivly). delta_frames determines the time stamp
def add_velocity_xy(frames_df, delta_frames=1):
    # Create a copy of the DataFrame and shift it by delta_frames
    past_df = frames_df.copy()
    past_df['frame'] += delta_frames

    # Merge the original DataFrame with the shifted DataFrame to get future coordinates
    past_coordinates = frames_df.merge(past_df, on=['frame', 'team', 'jersey_number'], suffixes=('', '_past'), how='outer')

    # Use the past coordinates to calculate the current velocity
    v_x = (frames_df['x'] - past_coordinates['x_past']) * FPS / delta_frames
    v_y = (frames_df['y'] - past_coordinates['y_past']) * FPS / delta_frames
    
    # The player can't surely run faster than Usian Bolt's max speed 
    usain_bolt_max_speed = 13
    frames_df['v_x'] = v_x.clip(lower=-usain_bolt_max_speed, upper=usain_bolt_max_speed)
    frames_df['v_y'] = v_y.clip(lower=-usain_bolt_max_speed, upper=usain_bolt_max_speed)

In [5]:
# Add the features a_x and a_y (current velocity (m/s²) in the x and y axis respectivly). delta_frames determines the time stamp
def add_acceleration_xy(frames_df, delta_frames=1):
    # Create a copy of the DataFrame and shift it by delta_frames twice
    past_df = frames_df.copy()
    past_df['frame'] += delta_frames
    more_past_df = frames_df.copy()
    more_past_df['frame'] += 2 * delta_frames

    # Merge the original DataFrame with the shifted DataFrames to get past and future coordinates
    past_coordinates = frames_df.merge(past_df, on=['frame', 'team', 'jersey_number'], suffixes=('', '_past'), how='outer')
    more_past_coordinates = frames_df.merge(more_past_df, on=['frame', 'team', 'jersey_number'], suffixes=('', '_more_past'), how='outer')

    # Use past and future coordinates to calculate current acceleration
    a_x = ((frames_df['x'] - 2 * past_coordinates['x_past'] + more_past_coordinates['x_more_past']) * FPS / (delta_frames ** 2)).fillna(0)
    a_y = ((frames_df['y'] - 2 * past_coordinates['y_past'] + more_past_coordinates['y_more_past']) * FPS / (delta_frames ** 2)).fillna(0)

    # Clip acceleration values to reasonable limits
    max_acceleration = 10  # This is a very high acceleration
    frames_df['a_x'] = a_x.clip(lower=-max_acceleration, upper=max_acceleration)
    frames_df['a_y'] = a_y.clip(lower=-max_acceleration, upper=max_acceleration)

In [6]:
# Add a vector indicating if the ball is in motion
def add_ball_in_motion(frames_df):
    # Initialize variables
    ball_in_motion_vec = []
    x_ball = 0
    y_ball = 0
    i = - 1

    # For all objects in each frame
    while (i < len(frames_df)-1):        
        # Update i to be the last row in the next frame
        objects_tracked = frames_df.iloc[i+1]['objects_tracked']
        i += objects_tracked

        # Determine if the ball is motion
        ball_in_motion = False
        # If the ball exists, it will surely be the last row
        if frames_df.iloc[i]['team'] == 'ball':
            # If either x_ball or y_ball has changed since the last recorded positions
            if x_ball != frames_df.iloc[i]['x'] or y_ball != frames_df.iloc[i]['y']:
                # Update varibles
                x_ball = frames_df.iloc[i]['x']
                y_ball = frames_df.iloc[i]['y']
                ball_in_motion = True

        # Store the result in ball_in_motion_vec
        [ball_in_motion_vec.append(ball_in_motion) for _ in range(objects_tracked)]

    # Add the new column based on the vector
    frames_df['ball_in_motion'] = ball_in_motion_vec

In [7]:
# Add a vector for determining which players that are standing behind the offside line
def add_offside(frames_df):
    # Create the empty column
    frames_df["offside"] = None

    # Group the DataFrame by frame
    grouped_frames = frames_df.groupby("frame")

    # Iterate over each unique frame
    for frame, frame_df in grouped_frames:
        # Ball has to exist in order for the calculation to work
        if frame_df.iloc[-1]['team'] == 'ball':
            # Find x_ball
            x_ball = frame_df.iloc[-1]['x']
        else:
            # Setting ball to half way line will make the position of the ball irrelevant
            x_ball = pitch_length / 2

        # Find the x coordinates of each team
        x_players_attacking_right = sorted(frame_df[frame_df["team_direction"] == 'left']['x'].tolist())
        x_players_attacking_left = sorted(frame_df[frame_df["team_direction"] == 'right']['x'].tolist())

        # Find offside for the team attacking to the right
        if len(x_players_attacking_left) >= 2:
            # Find the x of the second to last defender
            x_second_to_last_defender = x_players_attacking_left[-2]

            # The offside will be determined by the second to last defender, half way line, or ball
            x_offside_line = max(x_second_to_last_defender, pitch_length / 2, x_ball)

            # Determine which players that are standing behind the offside line
            for x_player in x_players_attacking_right:
                if x_player > x_offside_line:
                    # Set 'offside' to value of x_offside_line
                    frames_df.loc[(frames_df["frame"] == frame) & (frames_df["team_direction"] == 'left') & (frames_df["x"] == x_player), "offside"] = x_offside_line

        # Find offsides on the left side of the pitch
        if len(x_players_attacking_right) >= 2:
            # Find the x of the second to last defender
            x_second_to_last_defender = x_players_attacking_right[1]

            # The offside will be determined by the second to last defender, half way line, or ball
            x_offside_line = min(x_second_to_last_defender, pitch_length / 2, x_ball)

            # Determine which players that are standing behind the offside line
            for x_player in x_players_attacking_left:
                if x_player < x_offside_line:
                    # Set 'offside' to value of x_offside_line
                    frames_df.loc[(frames_df["frame"] == frame) & (frames_df["team_direction"] == 'right') & (frames_df["x"] == x_player), "offside"] = x_offside_line

# TODO: Examine that this is correct
# Create a small df for testing
# small_frames_df = frames_dfs[0].head(1000).copy()
# add_offside(small_frames_df)
# offsides_df = small_frames_df.groupby("frame").filter(lambda x: x['offside'].notna().any()).copy()
# offsides_df[["team_name", "player", "jersey_number", "x", "y", "second", "frame"]]
# frame = 1546
# visualize_offside_frame(offsides_df, frame)

In [17]:
small_frames_df = frames_dfs[0].head(40000).copy()
add_offside(small_frames_df)
visualize_game_snippet(small_frames_df, 0, 1500)

### Predictive models

In [9]:
# NAIVE: Always predict that all players will stand still
# The calculations are based on x, y
def predict_two_seconds_naive_static(frames_df):
    frames_df['x_future_pred'] = frames_df['x']
    frames_df['y_future_pred'] = frames_df['y']

# NAIVE: Always predict that all players will continue with the same velocity
# The calculations are based on x, y, v_x, and v_y
def predict_two_seconds_naive_velocity(frames_df):
    frames_df['x_future_pred'] = frames_df['x'] + frames_df['v_x'] * seconds_into_the_future
    frames_df['y_future_pred'] = frames_df['y'] + frames_df['v_y'] * seconds_into_the_future

# Make a prediction with a LSTM neural network model
def predict_two_seconds_LSTM(frames_df):
    return 

### Calculate error loss function

In [10]:
# Add a column for distance wrongly predicted (in metres) for each object. Also return average_pred_error
def total_error_loss(frames_df, include_ball=False, ball_has_to_be_in_motion=False):
    # Create a vector with the Eculidian distance between the true position and the predicted position
    frames_df['pred_error'] = round(((frames_df['x_future_pred'] - frames_df['x_future'])**2 + (frames_df['y_future_pred'] - frames_df['y_future'])**2)**0.5, 2)
    
    # If ball_has_to_be_in_motion, filter to only look at frames where the ball is in motion
    if ball_has_to_be_in_motion:
        frames_df = frames_df[frames_df["ball_in_motion"]].copy()

    # Calculate average pred_error
    if include_ball:
        average_pred_error = frames_df['pred_error'].mean()
    else:
        # Calculate average pred_error for all entries where team != 'ball'
        average_pred_error = frames_df[frames_df['team'] != 'ball']['pred_error'].mean()

    return round(average_pred_error, 2)

# Find a frame with approximatly the same error as the average_pred_error, with an interval
def find_frame_with_average_error(frames_df, average_pred_error, error_margin):
    # For all frames
    frames = frames_df['frame'].unique()
    for frame in frames:
        current_error = frames_df[frames_df['frame'] == frame]['pred_error'].mean()
        # If the current error is within the error_margin,
        if (current_error >= average_pred_error - error_margin) and (current_error <= average_pred_error + error_margin):
            # Return the result
            return frame

    # If no frame was found
    print(f"No frame found within the error margin of {error_margin}")
    return None

# Calculate the average error for a list of games
def calculate_average_error(frames_dfs, predict_function, include_ball, ball_has_to_be_in_motion):
    errors = []  # Initialize a list to store errors for each game

    # Iterate over each DataFrame in frames_dfs for testing the model
    for frames_df in frames_dfs:
        # Predict game outcome
        predict_function(frames_df)
        # Calculate and append the error for the predicted outcome
        error = total_error_loss(frames_df, include_ball, ball_has_to_be_in_motion)
        errors.append(error)

    # Calculate the average error
    avg_error = round(sum(errors) / len(errors), 2)
    return avg_error

### Functions for processing and loading frames

In [11]:
# Process the unprocessed/ frames, and store the results to the processed/ fodler
def process_frames():
    # Load frames_df
    for selected_season in seasons:
        for selected_competition in competitions:
            # Define paths
            DATA_FOLDER_UNPROCESSED = f"{DATA_LOCAL_FOLDER}/data/{selected_season}/{selected_competition}/unprocessed"
            FOLDER_OUT = f"{DATA_LOCAL_FOLDER}/data/{selected_season}/{selected_competition}/processed"
            
            # Create output folder if not exists
            if not os.path.exists(FOLDER_OUT):
                    os.makedirs(FOLDER_OUT)

            # Find all frames parquet files
            match_paths = glob.glob(os.path.join(DATA_FOLDER_UNPROCESSED, "*.parquet"))

            # Extract IDs without the ".parquet" extension
            # TODO: Uncomment this line in production
            # match_ids = [os.path.splitext(os.path.basename(path))[0] for path in match_paths]
            match_ids = [os.path.splitext(os.path.basename(path))[0] for path in match_paths][0:2]

            # For all matches
            for match_id in match_ids:
                # Convert parquet file to a DataFrame
                file_path_match = f"{DATA_FOLDER_UNPROCESSED}/{match_id}.parquet"
                frames_df = pd.read_parquet(file_path_match)

                # Process frames_df
                flip_xy_based_on_team_direction(frames_df)
                add_velocity_xy(frames_df, 1)
                add_acceleration_xy(frames_df, 1)
                add_xy_future(frames_df, FPS*seconds_into_the_future)
                add_ball_in_motion(frames_df)

                # Add match_id
                frames_df["match_id"] = match_id

                # Convert DataFrame to a parquet file
                frames_df.to_parquet(f"{FOLDER_OUT}/{match_id}.parquet")

# Load the processed/frames
def load_all_processed_frames():
    # Create DataFrame for storing all frames
    frames_dfs = []
    # Load frames_df
    for selected_season in seasons:
        for selected_competition in competitions:
            # Define paths
            DATA_FOLDER_PROCESSED = f"{DATA_LOCAL_FOLDER}/data/{selected_season}/{selected_competition}/processed"

            # Find all frames parquet files
            match_paths = glob.glob(os.path.join(DATA_FOLDER_PROCESSED, "*.parquet"))

            # Extract IDs without the ".parquet" extension
            match_ids = [os.path.splitext(os.path.basename(path))[0] for path in match_paths]

            # For all matches
            for match_id in match_ids:
                # Convert parquet file to a DataFrame
                file_path_match = f"{DATA_FOLDER_PROCESSED}/{match_id}.parquet"
                frames_df = pd.read_parquet(file_path_match)
                
                # Append the DataFrame to frames_dfs
                frames_dfs.append(frames_df)

    return frames_dfs

## Run an example

In [12]:
# Load frames_df
# process_frames()
frames_dfs = load_all_processed_frames()

In [13]:
# Test model 1
error_naive_static = calculate_average_error(frames_dfs, predict_two_seconds_naive_static, include_ball=False, ball_has_to_be_in_motion=True)
print(f"Model error naive static: {error_naive_static}")

# Test model 2
error_naive_velocity = calculate_average_error(frames_dfs, predict_two_seconds_naive_velocity, include_ball=False, ball_has_to_be_in_motion=True)
print(f"Model error naive velocity: {error_naive_velocity}")

# Visualize one frame with average error for the first model, together with the corresponding frame for the second model
frames_df = frames_dfs[0]
frame_with_average_error = find_frame_with_average_error(frames_df, error_naive_static, error_margin=0.1)
visualize_frame_prediction(frames_df, frame_with_average_error, "naive_static")
visualize_frame_prediction(frames_df, frame_with_average_error, "naive_velocity")

Model error naive static: 4.04
Model error naive velocity: 2.17


### Print results for different models and parameters

In [14]:
# # Define the prediction functions (models) you want to test
# prediction_functions = {
#     "Naive Static": predict_two_seconds_naive_static,
#     "Naive Velocity": predict_two_seconds_naive_velocity
# }

# # Initialize an empty list to store the results
# results = []

# # Define the combinations of include_ball and ball_has_to_be_in_motion
# combinations = [(True, True), (True, False), (False, True), (False, False)]

# # Load all processed frames
# frames_dfs = load_all_processed_frames()

# # Loop through each combination
# for include_ball, ball_has_to_be_in_motion in combinations:
#     # Add the combination of parameters
#     result = {"Include Ball": include_ball, "Ball in Motion": ball_has_to_be_in_motion}
#     # Loop through each prediction function (model)
#     for model_name, predict_function in prediction_functions.items():
#         # Calculate average error for the current prediction function (model)
#         avg_error = calculate_average_error(frames_dfs, predict_function, include_ball, ball_has_to_be_in_motion)
#         result[model_name] = avg_error
    
#     # Append the results to the list
#     results.append(result)

# # Create a DataFrame from the list of results
# results_df = pd.DataFrame(results)

# # Print the results DataFrame
# results_df

### Store df as xlsx

In [15]:
# # Store frames_df as xslx
# frames_df_head = frames_df.head(19979)

# # Specify the file path for the Excel file
# excel_file_path = f"{DATA_LOCAL_FOLDER}/Brommapojkarna_vs_Sirius.xlsx"

# # Write the DataFrame to an Excel file
# frames_df_head.to_excel(excel_file_path, index=False)

# print(f"DataFrame saved to {excel_file_path}")

In [16]:
from collections import OrderedDict
import json
import os

DATA_FOLDER_UNPROCESSED = f"{DATA_LOCAL_FOLDER}/signality/2022/Allsvenskan/"

# Find all frames parquet files
match_paths = glob.glob(os.path.join(DATA_FOLDER_UNPROCESSED, "*.json"))

# Initialize a set to store unique team names
unique_team_names = set()

# Iterate over each JSON file
for json_file in match_paths:
    # Load JSON data
    with open(json_file, 'r') as f:
        data = json.load(f, object_pairs_hook=OrderedDict)

    # Extract team_home_name
    team_home_name = data.get('team_home_name')
    
    # Add team_home_name to the set of unique team names
    if team_home_name:
        unique_team_names.add(team_home_name)

# Convert set to list for easier manipulation if needed
unique_team_names_list = list(unique_team_names)

# Print unique team names
print(unique_team_names_list)

['BK Häcken', 'Varbergs BoIS FC', 'IF Elfsborg', 'Kalmar FF', 'GIF Sundsvall', 'Helsingborgs IF', 'Östers IF', 'Mjällby AIF', 'IFK Norrköping FK', 'Malmö FF', 'AIK', 'IFK Värnamo', 'IFK Göteborg', 'Djurgården', 'IK Sirius FK', 'Hammarby', 'Degerfors IF']
