### Import packages

In [1]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from matplotlib.animation import FuncAnimation
import matplotlib.pyplot as plt
from mplsoccer import Pitch
# import tensorflow as tf
import pandas as pd
import numpy as np
import glob
import os


from settings import *
from visualize_game import visualize_frame_prediction, visualize_game_snippet, visualize_offside_frame

### General functions

In [2]:
# Mirror the coordinates of left-attacking teams and of the ball
def flip_xy_based_on_team_direction(frames_df):
    """Mirror 'x' and 'y' in place for rows whose 'team_direction' is 'left' or 'ball'.

    Mirrored rows get x -> pitch_length - x and y -> pitch_width - y; all other
    rows are left untouched. `pitch_length` and `pitch_width` are module-level
    names (presumably provided by `from settings import *`).

    Args:
        frames_df: DataFrame with 'team_direction', 'x' and 'y' columns. Modified in place.
    """
    # Left-attacking rows and ball rows are mirrored identically, so one mask covers both
    rows_to_mirror = frames_df['team_direction'].isin(['left', 'ball'])

    frames_df.loc[rows_to_mirror, 'x'] = pitch_length - frames_df['x']
    frames_df.loc[rows_to_mirror, 'y'] = pitch_width - frames_df['y']

### Helper functions

In [3]:
def _ball_coordinate_per_row(frames_df, coordinate, frames_to_shift=0):
    """Return the ball's `coordinate` for the frame of every row of frames_df.

    Args:
        frames_df: DataFrame with 'frame', 'team' and `coordinate` columns.
        coordinate: Name of the coordinate column to read from the ball rows ('x' or 'y').
        frames_to_shift: Number of positions to shift the ball rows by before
            broadcasting. The shift is positional over the ball rows in their
            stored order, so it equals "n frames earlier" only when those rows
            are ordered by consecutive frames.

    Returns:
        A float array with one value per row of frames_df; NaN where no ball
        value is available for that row's frame.
    """
    # One slot per row, labelled by the row's frame, so the ball value can be broadcast to it
    per_row = pd.Series(dtype=float, index=frames_df['frame'])

    # Ball coordinate indexed by frame
    is_ball = frames_df['team'] == 'ball'
    ball_by_frame = frames_df.loc[is_ball, ['frame', coordinate]].set_index('frame')[coordinate]
    if frames_to_shift:
        ball_by_frame = ball_by_frame.shift(frames_to_shift)

    # update() copies every non-NaN ball value onto all rows of the matching frame
    per_row.update(ball_by_frame)
    return per_row.values


def add_x_ball(frames_df):
    """Add an 'x_ball' column holding the ball's 'x' in the same frame (in place)."""
    frames_df["x_ball"] = _ball_coordinate_per_row(frames_df, 'x')


def add_x_ball_prev(frames_df, frames_to_shift=1):
    """Add an 'x_ball_prev' column holding the ball's 'x' `frames_to_shift` ball rows earlier (in place)."""
    frames_df["x_ball_prev"] = _ball_coordinate_per_row(frames_df, 'x', frames_to_shift)


def add_y_ball(frames_df):
    """Add a 'y_ball' column holding the ball's 'y' in the same frame (in place)."""
    frames_df["y_ball"] = _ball_coordinate_per_row(frames_df, 'y')


def add_y_ball_prev(frames_df, frames_to_shift=1):
    """Add a 'y_ball_prev' column holding the ball's 'y' `frames_to_shift` ball rows earlier (in place)."""
    frames_df["y_ball_prev"] = _ball_coordinate_per_row(frames_df, 'y', frames_to_shift)

### Functions for adding features

In [4]:
# Add the features x_future and y_future (the x and y coordinate of each player n rows ahead)
def add_xy_future(frames_df, n=50):
    """Add 'x_future' and 'y_future' columns to frames_df in place.

    Rows are grouped by ('team', 'jersey_number') and each group's 'x' and 'y'
    are shifted up by `n` rows, so a row receives the position stored `n` rows
    later in its own group. The last `n` rows of every group get NaN.

    Args:
        frames_df: DataFrame with 'team', 'jersey_number', 'x' and 'y' columns.
        n: Number of rows to look ahead within each group.
    """
    per_player = frames_df.groupby(['team', 'jersey_number'])
    shifted_xy = per_player[['x', 'y']].shift(-n)

    # The shifted frame keeps the original row labels, so assignment aligns row by row
    frames_df['x_future'] = shifted_xy['x']
    frames_df['y_future'] = shifted_xy['y']

In [5]:
# Add the features v_x and v_y (current velocity (m/s) along the x and y axes respectively). delta_frames determines the time step
def add_velocity_xy(frames_df, delta_frames=1, fps=None):
    """Add clipped per-axis velocity columns 'v_x' and 'v_y' to frames_df in place.

    The velocity of a row is the change in position since the row with the same
    ('team', 'jersey_number') whose 'frame' is `delta_frames` lower, multiplied
    by fps / delta_frames. Rows without such an earlier row get NaN.

    Args:
        frames_df: DataFrame with 'frame', 'team', 'jersey_number', 'x' and 'y' columns.
        delta_frames: Frame distance between the two positions that are compared.
        fps: Frames per second. Defaults to the module-level FPS when omitted.
    """
    if fps is None:
        fps = FPS

    key_cols = ['frame', 'team', 'jersey_number']

    # Relabel every position with the frame in which it counts as "delta_frames ago"
    past_df = frames_df[key_cols + ['x', 'y']].copy()
    past_df['frame'] += delta_frames
    # Keep one candidate per key so the left merge below cannot multiply rows
    past_df = past_df.drop_duplicates(subset=key_cols, keep='last')

    # A left merge returns exactly one row per row of frames_df, in the same order.
    # That makes the positional arithmetic below valid whatever index or row order
    # frames_df has (an outer merge re-sorts the keys and resets the index).
    past_coordinates = frames_df[key_cols].merge(past_df, on=key_cols, how='left')

    v_x = (frames_df['x'].to_numpy(dtype=float) - past_coordinates['x'].to_numpy(dtype=float)) * fps / delta_frames
    v_y = (frames_df['y'].to_numpy(dtype=float) - past_coordinates['y'].to_numpy(dtype=float)) * fps / delta_frames

    # A player surely can't run faster than Usain Bolt's max speed
    usain_bolt_max_speed = 13
    frames_df['v_x'] = np.clip(v_x, -usain_bolt_max_speed, usain_bolt_max_speed)
    frames_df['v_y'] = np.clip(v_y, -usain_bolt_max_speed, usain_bolt_max_speed)

In [6]:
# Add the features a_x and a_y (current acceleration (m/s²) along the x and y axes respectively). delta_frames determines the time step
def add_acceleration_xy(frames_df, delta_frames=1, fps=None):
    """Add clipped per-axis acceleration columns 'a_x' and 'a_y' to frames_df in place.

    The acceleration of a row is the second backward difference of the position
    of the same ('team', 'jersey_number') over steps of `delta_frames` frames:
    (p[t] - 2 * p[t - d] + p[t - 2d]) / dt**2, with dt = delta_frames / fps.
    Rows for which one of the two earlier positions is missing get 0.

    Args:
        frames_df: DataFrame with 'frame', 'team', 'jersey_number', 'x' and 'y' columns.
        delta_frames: Frame distance between consecutive positions in the difference.
        fps: Frames per second. Defaults to the module-level FPS when omitted.
    """
    if fps is None:
        fps = FPS

    key_cols = ['frame', 'team', 'jersey_number']

    def _lagged_xy(lag):
        # Positions relabelled with the frame in which they count as "lag frames ago"
        lagged = frames_df[key_cols + ['x', 'y']].copy()
        lagged['frame'] += lag
        # Keep one candidate per key so the left merge cannot multiply rows
        lagged = lagged.drop_duplicates(subset=key_cols, keep='last')
        # Left merge: one output row per row of frames_df, in the same order
        return frames_df[key_cols].merge(lagged, on=key_cols, how='left')

    past_coordinates = _lagged_xy(delta_frames)
    more_past_coordinates = _lagged_xy(2 * delta_frames)

    # Clip acceleration values to reasonable limits
    max_acceleration = 10  # This is a very high acceleration

    for axis in ('x', 'y'):
        current = frames_df[axis].to_numpy(dtype=float)
        past = past_coordinates[axis].to_numpy(dtype=float)
        more_past = more_past_coordinates[axis].to_numpy(dtype=float)

        # Dividing by dt**2 means multiplying by fps**2 / delta_frames**2
        acceleration = (current - 2 * past + more_past) * fps ** 2 / (delta_frames ** 2)
        # Missing history -> treat as no acceleration
        acceleration = np.where(np.isnan(acceleration), 0.0, acceleration)

        frames_df[f'a_{axis}'] = np.clip(acceleration, -max_acceleration, max_acceleration)

In [7]:
# Add a boolean column indicating whether the ball is in motion
def add_ball_in_motion(frames_df):
    """Add a boolean 'ball_in_motion' column to frames_df in place.

    A row is marked True when a ball position exists for its frame and the
    ball's 'x' or 'y' differs from the previous ball position; every other row
    is False. A missing previous position (NaN) compares as different, so such
    rows are marked True as well.

    Args:
        frames_df: DataFrame with 'frame', 'team', 'x' and 'y' columns.
    """
    # Initialize the same column that is updated below, so non-moving rows are False (not NaN)
    frames_df['ball_in_motion'] = False

    # Add 'x_ball' and 'y_ball' columns
    add_x_ball(frames_df)
    add_y_ball(frames_df)

    # Add 'x_ball_prev' and 'y_ball_prev' columns
    add_x_ball_prev(frames_df)
    add_y_ball_prev(frames_df)

    # The ball moved if a coordinate exists and differs from its previous value
    x_changed = frames_df['x_ball'].notna() & (frames_df['x_ball'] != frames_df['x_ball_prev'])
    y_changed = frames_df['y_ball'].notna() & (frames_df['y_ball'] != frames_df['y_ball_prev'])
    frames_df.loc[x_changed | y_changed, 'ball_in_motion'] = True

    # Drop the temporary helper columns
    frames_df.drop(columns=["x_ball", "x_ball_prev", "y_ball", "y_ball_prev"], inplace=True)

In [8]:
# Add the 'x' position of the second to last defender, for both team directions
def add_second_to_last_defender(frames_df):
    """Add 'x_second_to_last_player_left' and 'x_second_to_last_player_right' in place.

    For every frame the 'x' values of the rows with team_direction 'left'
    (respectively 'right') are collected in ascending ('team', 'frame', 'x')
    order. The 'left' column takes the second value from the end of that list,
    the 'right' column the second value from the start. A frame with fewer than
    two such rows falls back to pitch_length / 2; a frame with none gets NaN.

    Args:
        frames_df: DataFrame with 'team', 'frame', 'x' and 'team_direction' columns.
    """
    ordered = frames_df.sort_values(by=['team', 'frame', 'x']).copy()

    def _xs_per_frame(direction):
        # List of x values per frame for the rows attacking in `direction`
        rows = ordered[ordered["team_direction"] == direction]
        return rows.groupby("frame")["x"].apply(list)

    def _picker(position):
        # Build a function that selects xs[position], or the half way line for short lists
        def _pick(xs):
            if len(xs) >= 2:
                return xs[position]
            return pitch_length / 2
        return _pick

    second_to_last_left = _xs_per_frame('left').apply(_picker(-2))
    second_to_last_right = _xs_per_frame('right').apply(_picker(1))

    # Broadcast the per-frame value back onto every row of that frame
    row_frames = frames_df['frame']
    frames_df["x_second_to_last_player_left"] = second_to_last_left.reindex(row_frames).values
    frames_df["x_second_to_last_player_right"] = second_to_last_right.reindex(row_frames).values

# Add a vector with the 'offside_line'
def add_offside_line(frames_df):
    """Add an 'offside_line' column (an x coordinate) to frames_df in place.

    For rows with team_direction 'right' the line is the element-wise maximum
    of 'x_second_to_last_player_left', the ball's x and the half way line; for
    all other rows it is the element-wise minimum of
    'x_second_to_last_player_right', the ball's x and the half way line. Rows
    whose 'team' is 'ball' are then set to the half way line. Frames without a
    ball position use the half way line as the ball's x.

    Args:
        frames_df: DataFrame with 'frame', 'team', 'team_direction' and 'x' columns.
    """
    # Create a vector for the values of the half way line
    frames_df["half_way_line"] = pitch_length / 2

    # Add 'x_ball' column and fill missing values with the half way line.
    # The filled column is assigned back explicitly: fillna(inplace=True) on a
    # column selection does not update the DataFrame under pandas Copy-on-Write.
    add_x_ball(frames_df)
    frames_df['x_ball'] = frames_df['x_ball'].fillna(pitch_length / 2)

    # Add 'x_second_to_last_player_left' and 'x_second_to_last_player_right' columns
    add_second_to_last_defender(frames_df)

    # Update "offside_line" column based on team direction
    frames_df["offside_line"] = np.where(
        frames_df["team_direction"] == 'right',
        # If team_direction is 'right', the offside line is the max of the second to last defender, ball, and half way line
        np.maximum.reduce([frames_df["x_second_to_last_player_left"], frames_df["x_ball"], frames_df["half_way_line"]]),
        # Otherwise, the offside line is the min of the second to last defender, ball, and half way line
        np.minimum.reduce([frames_df["x_second_to_last_player_right"], frames_df["x_ball"], frames_df["half_way_line"]])
    )

    # Set offside line to half way line if the 'team' is ball
    frames_df.loc[frames_df['team'] == 'ball', 'offside_line'] = pitch_length / 2

    # Drop the temporary helper columns
    frames_df.drop(columns=["half_way_line", "x_ball", "x_second_to_last_player_left", "x_second_to_last_player_right"], inplace=True)

# Add a column that holds the offside line for every player standing in an offside position
def add_offside(frames_df):
    """Add an 'offside' column to frames_df in place.

    The column is None by default. It holds the row's offside line when the row
    has team_direction 'right' and an 'x' greater than that line, or
    team_direction 'left' and an 'x' smaller than that line. The intermediate
    'offside_line' column is removed again before returning.

    Args:
        frames_df: DataFrame accepted by add_offside_line, with 'team_direction' and 'x' columns.
    """
    add_offside_line(frames_df)

    # Start from an empty (None) column
    frames_df["offside"] = None

    line = frames_df['offside_line']
    beyond_line_attacking_right = (frames_df['team_direction'] == 'right') & (frames_df['x'] > line)
    beyond_line_attacking_left = (frames_df['team_direction'] == 'left') & (frames_df['x'] < line)

    # Copy the offside line onto the rows that are beyond it
    for offside_rows in (beyond_line_attacking_right, beyond_line_attacking_left):
        frames_df.loc[offside_rows, 'offside'] = line

    frames_df.drop(columns=["offside_line"], inplace=True)

In [9]:
# TODO: Fix 'team_direction'

### Functions for processing and loading frames

In [10]:
# Process the unprocessed/ frames, and store the results in the processed/ folder
def process_frames(match_slice=slice(10, 11)):
    """Build the feature columns for raw matches and write them to the processed/ folder.

    For every season/competition combination, each selected
    `<match_id>.parquet` file in the unprocessed/ folder is read, run through
    the feature functions, tagged with a 'match_id' column and written under
    the same file name to the processed/ folder.

    Args:
        match_slice: Slice applied to the list of match ids found in each
            unprocessed/ folder. The default, slice(10, 11), keeps the
            single-match development subset this function previously had
            hard-coded; pass None to process every match.
    """
    for selected_season in seasons:
        for selected_competition in competitions:
            # Define paths
            DATA_FOLDER_UNPROCESSED = f"{DATA_LOCAL_FOLDER}/data/{selected_season}/{selected_competition}/unprocessed"
            FOLDER_OUT = f"{DATA_LOCAL_FOLDER}/data/{selected_season}/{selected_competition}/processed"

            # Create the output folder; exist_ok avoids a separate existence check
            os.makedirs(FOLDER_OUT, exist_ok=True)

            # Find all frames parquet files
            match_paths = glob.glob(os.path.join(DATA_FOLDER_UNPROCESSED, "*.parquet"))

            # Extract IDs without the ".parquet" extension
            match_ids = [os.path.splitext(os.path.basename(path))[0] for path in match_paths]

            # TODO: call process_frames(match_slice=None) in production
            # NOTE(review): glob does not sort its results, so which match a slice selects may differ between machines
            if match_slice is not None:
                match_ids = match_ids[match_slice]

            # For all selected matches
            for match_id in match_ids:
                # Convert parquet file to a DataFrame
                file_path_match = f"{DATA_FOLDER_UNPROCESSED}/{match_id}.parquet"
                frames_df = pd.read_parquet(file_path_match)

                # Process frames_df (every step adds columns in place)
                flip_xy_based_on_team_direction(frames_df)
                add_velocity_xy(frames_df, 1)
                add_acceleration_xy(frames_df, 1)
                add_xy_future(frames_df, FPS*seconds_into_the_future)
                add_ball_in_motion(frames_df)
                add_offside(frames_df)

                # Add match_id
                frames_df["match_id"] = match_id

                # Convert DataFrame to a parquet file
                frames_df.to_parquet(f"{FOLDER_OUT}/{match_id}.parquet")

                # Print that the match is processed
                print(f"Match {match_id} is processed")

# Load the processed/ frames
def load_all_processed_frames():
    """Read every processed match into memory.

    Walks all season/competition combinations, and reads each `*.parquet` file
    found in the corresponding processed/ folder.

    Returns:
        A list with one DataFrame per processed match file, in the order the
        files were found.
    """
    frames_dfs = []

    for selected_season in seasons:
        for selected_competition in competitions:
            DATA_FOLDER_PROCESSED = f"{DATA_LOCAL_FOLDER}/data/{selected_season}/{selected_competition}/processed"

            # File names without the ".parquet" extension are the match ids
            parquet_paths = glob.glob(os.path.join(DATA_FOLDER_PROCESSED, "*.parquet"))
            found_match_ids = (os.path.splitext(os.path.basename(path))[0] for path in parquet_paths)

            # Read each match and collect the resulting DataFrames
            frames_dfs.extend(
                pd.read_parquet(f"{DATA_FOLDER_PROCESSED}/{match_id}.parquet")
                for match_id in found_match_ids
            )

    return frames_dfs

In [11]:
# Uncomment to (re)generate the processed parquet files before loading them
# process_frames()
# Load every processed match: a list with one DataFrame per match file
frames_dfs = load_all_processed_frames()

### Predictive models

In [12]:
# Split the games into train, test, and validation sets. This way, each game is treated separately
def split_match_ids(match_ids, train_size=0.7, test_size=0.1, val_size=0.2, random_state=42):
    """Split match ids into train, test and validation subsets.

    The ids are split twice with `train_test_split`: first into train and
    hold-out, then the hold-out into test and validation.

    Args:
        match_ids: Sequence of match identifiers to split.
        train_size: Fraction of all matches used for training.
        test_size: Fraction of all matches used for testing.
        val_size: Fraction of all matches used for validation.
        random_state: Seed passed to both splits.

    Returns:
        Tuple (train_ids, test_ids, val_ids).

    Raises:
        ValueError: If a size is negative or the sizes do not sum to 1.
    """
    # Share of all matches that is not used for training
    holdout_size = test_size + val_size

    # Check that the sizes are valid fractions that sum to 1
    if min(train_size, test_size, val_size) < 0 or abs(train_size + holdout_size - 1.0) > 1e-6:
        raise ValueError("The sum of train_size, test_size, and val_size must be equal to 1.")

    # First split: train versus hold-out (test + validation)
    train_ids, holdout_ids = train_test_split(match_ids, train_size=train_size, random_state=random_state)

    # Second split: the test share is expressed relative to the hold-out set,
    # i.e. test_size / (test_size + val_size), not relative to the train share
    test_ids, val_ids = train_test_split(holdout_ids, test_size=test_size / holdout_size, random_state=random_state)

    return train_ids, test_ids, val_ids

In [13]:
import tensorflow as tf

def prepare_data(frames_dfs, pipeline=None, return_pipeline=False):
    """Turn a list of per-match frames into a feature matrix and a label frame.

    Side effect: every DataFrame in `frames_dfs` is modified in place. NaN in
    the numerical feature columns is replaced by 0, and rows without
    'x_future' or 'y_future' are dropped. After the call each DataFrame
    therefore has exactly the rows it contributed, in order, to the result.

    Args:
        frames_dfs: List of per-match DataFrames containing the feature and label columns.
        pipeline: Optional preprocessing pipeline that was already fitted (for
            example on the training data). When given it is only applied
            (`transform`), so the data is scaled and encoded exactly like the
            data it was fitted on. When None, a new pipeline is fitted on this data.
        return_pipeline: When True, also return the pipeline that was used.

    Returns:
        (X_data_scaled, y_data), or (X_data_scaled, y_data, pipeline) when
        `return_pipeline` is True. y_data holds 'x_future' and 'y_future'.
    """
    # Define numerical and categorical columns
    numerical_cols = ['x', 'y', 'v_x', 'v_y', 'a_x', 'a_y', 'distance_ran', 'minute']
    categorical_cols = ['role', 'team_direction']

    # Per-match features and labels
    X_parts = []
    y_parts = []

    for frames_df in frames_dfs:
        # Fill NaN values with zeros for numerical columns (in place)
        frames_df[numerical_cols] = frames_df[numerical_cols].fillna(0)

        # Drop rows with NaN values in the labels (in place)
        frames_df.dropna(subset=['x_future', 'y_future'], inplace=True)

        X_parts.append(frames_df[numerical_cols + categorical_cols])
        y_parts.append(frames_df[['x_future', 'y_future']])

    # Concatenate the per-match parts, keeping the order of frames_dfs
    X_data = pd.concat(X_parts)
    y_data = pd.concat(y_parts)

    if pipeline is None:
        # Scale the numerical columns and one-hot encode the categorical ones.
        # handle_unknown='ignore' lets the fitted encoder be applied to data
        # containing a category that was not seen during fitting.
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numerical_cols),
                ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
            ],
            remainder='passthrough'
        )
        pipeline = Pipeline(steps=[('preprocessor', preprocessor)])
        X_data_scaled = pipeline.fit_transform(X_data)
    else:
        # Reuse the statistics and categories learned when the pipeline was fitted
        X_data_scaled = pipeline.transform(X_data)

    if return_pipeline:
        return X_data_scaled, y_data, pipeline
    return X_data_scaled, y_data

def train_NN(train_frames_dfs, val_frames_dfs):
    """Prepare the training and validation data, then train and validate the model.

    The preprocessing is fitted on the training matches only and reused for the
    validation matches, so both sets are scaled and encoded identically and end
    up with the same number of feature columns.

    Args:
        train_frames_dfs: List of per-match DataFrames used for training (modified in place by prepare_data).
        val_frames_dfs: List of per-match DataFrames used for validation (modified in place by prepare_data).
    """
    # Fit the preprocessing on the training data
    X_train, y_train, fitted_pipeline = prepare_data(train_frames_dfs, return_pipeline=True)

    # Apply the already fitted preprocessing to the validation data
    X_val, y_val = prepare_data(val_frames_dfs, pipeline=fitted_pipeline)

    # Train the model
    train_model(X_train, y_train, X_val, y_val, val_frames_dfs)

def define_model(input_shape):
    """Build the (uncompiled) feed-forward network.

    Args:
        input_shape: Number of input features per sample.

    Returns:
        A `tf.keras.Sequential` model with two ReLU hidden layers (64 and 32
        units) and a 2-unit output layer for x_future and y_future.
    """
    layers = tf.keras.layers

    model = tf.keras.Sequential()
    model.add(layers.Dense(64, activation='relu', input_shape=(input_shape,)))
    model.add(layers.Dense(32, activation='relu'))
    # Output layer with 2 units for x_future and y_future
    model.add(layers.Dense(2))

    return model

def train_model(X_train, y_train, X_val, y_val, val_frames_dfs):
    """Build, compile, fit and validate the neural network.

    Args:
        X_train: Training feature matrix; its second dimension sets the model's input size.
        y_train: Training labels.
        X_val: Validation feature matrix.
        y_val: Validation labels.
        val_frames_dfs: Per-match validation DataFrames, forwarded to validate_model.
    """
    n_features = X_train.shape[1]
    network = define_model(n_features)

    # Adam optimizer with mean squared error as loss
    network.compile(optimizer='adam', loss='mse')

    fit_options = dict(validation_data=(X_val, y_val), epochs=10, batch_size=32)
    network.fit(X_train, y_train, **fit_options)

    # Report the error on the validation matches
    validate_model(network, X_val, y_val, val_frames_dfs)

def validate_model(model, X_val, y_val, val_frames_dfs):
    """Predict future positions for the validation matches and print each match's error.

    `X_val` is expected to hold the rows of `val_frames_dfs` concatenated in
    list order, which is what prepare_data produces. The predictions are
    therefore computed once and handed out to the matches slice by slice.

    Args:
        model: Fitted model exposing `predict`, returning one (x, y) pair per row of X_val.
        X_val: Validation feature matrix.
        y_val: Validation labels. Not used here; kept for interface compatibility.
        val_frames_dfs: Per-match validation DataFrames; 'x_future_pred',
            'y_future_pred' and 'pred_error' are written to them in place.

    Raises:
        ValueError: If the number of predictions differs from the total number
            of rows in val_frames_dfs.
    """
    # Predict future positions once for all validation rows
    y_pred = model.predict(X_val)

    total_rows = sum(len(frames_df) for frames_df in val_frames_dfs)
    if len(y_pred) != total_rows:
        raise ValueError(
            f"Got {len(y_pred)} predictions for {total_rows} validation rows; "
            "X_val must contain the rows of val_frames_dfs in order."
        )

    start = 0
    for frames_df in val_frames_dfs:
        # Slice out the predictions that belong to this match
        stop = start + len(frames_df)
        frames_df['x_future_pred'] = y_pred[start:stop, 0]  # Predicted x_future values
        frames_df['y_future_pred'] = y_pred[start:stop, 1]  # Predicted y_future values
        start = stop

        # Recompute the per-row error so it reflects these predictions
        add_pred_error(frames_df)

        # Calculate and print the error using the provided error function
        error = total_error_loss(frames_df, include_ball=False, ball_has_to_be_in_motion=True)
        print("Total error loss:", error)

2024-03-01 22:23:34.191790: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [14]:
# Identify each game by its position in frames_dfs (these are list indices, not the 'match_id' column values)
match_ids = range(len(frames_dfs))

# Split the game indices into train, test, and validation sets (default sizes and seed of split_match_ids)
train_ids, test_ids, val_ids = split_match_ids(match_ids=match_ids)

# Select the per-game DataFrames for training, testing, and validation
train_frames_dfs = [frames_dfs[i] for i in train_ids]
test_frames_dfs = [frames_dfs[i] for i in test_ids]
val_frames_dfs = [frames_dfs[i] for i in val_ids]

In [15]:
# Train the NN model on the training games and print the validation error per validation game
# NOTE(review): this ends in total_error_loss, which is defined in a later cell of this notebook
train_NN(train_frames_dfs, val_frames_dfs)

2024-03-01 22:24:22.619814: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 79266 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-80GB, pci bus id: 0000:ca:00.0, compute capability: 8.0


Epoch 1/10


In [15]:
# NAIVE: Always predict that all players will stand still
# The calculations are based on x, y
def predict_two_seconds_naive_static(frames_df):
    """Write 'x_future_pred' and 'y_future_pred' as copies of the current 'x' and 'y' (in place)."""
    for axis in ('x', 'y'):
        frames_df[f'{axis}_future_pred'] = frames_df[axis]

# NAIVE: Always predict that all players will continue with the same velocity
# The calculations are based on x, y, v_x, and v_y
def predict_two_seconds_naive_velocity(frames_df):
    """Write 'x_future_pred' and 'y_future_pred' by linear extrapolation (in place).

    Each prediction is the current coordinate plus the matching velocity column
    multiplied by the module-level `seconds_into_the_future`.
    """
    for axis in ('x', 'y'):
        displacement = frames_df[f'v_{axis}'] * seconds_into_the_future
        frames_df[f'{axis}_future_pred'] = frames_df[axis] + displacement

# Make a prediction with an LSTM neural network model
def predict_two_seconds_LSTM(frames_df):
    """Placeholder for an LSTM-based predictor; currently does nothing and returns None.

    NOTE(review): unlike the naive predictors, this does not write
    'x_future_pred' / 'y_future_pred', so any values already stored in those
    columns are left unchanged.
    """
    return 

### Calculate error loss function

In [28]:
# Add a column with the distance between the predicted and the true future position of each object
def add_pred_error(frames_df):
    """Add a 'pred_error' column to frames_df in place.

    The value is the Euclidean distance between ('x_future_pred',
    'y_future_pred') and ('x_future', 'y_future'), rounded to 2 decimals.
    """
    dx = frames_df['x_future_pred'] - frames_df['x_future']
    dy = frames_df['y_future_pred'] - frames_df['y_future']

    distance = (dx ** 2 + dy ** 2) ** 0.5
    frames_df['pred_error'] = distance.round(2)
    
# Return the average prediction error over the selected rows (adding 'pred_error' first when it is missing)
def total_error_loss(frames_df, include_ball=False, ball_has_to_be_in_motion=True):
    """Return the mean 'pred_error' over the selected rows, rounded to 2 decimals.

    Args:
        frames_df: DataFrame with 'team' and 'ball_in_motion' columns, plus
            either a 'pred_error' column or the columns add_pred_error needs to
            create it. Only modified when 'pred_error' has to be added.
        include_ball: When False, rows whose 'team' is 'ball' are excluded.
        ball_has_to_be_in_motion: When True, only rows whose 'ball_in_motion' is True are used.

    Returns:
        The mean of the selected, non-NaN 'pred_error' values rounded to 2
        decimals; NaN when no value is selected.
    """
    # Add the 'pred_error' column if it does not exist yet
    if 'pred_error' not in frames_df.columns:
        add_pred_error(frames_df)

    # Positional row selection, so duplicate index labels (e.g. after pd.concat) are harmless
    selected = np.ones(len(frames_df), dtype=bool)

    # If specified, ignore rows where the ball is not in motion.
    # `== True` is deliberate: the column may hold NaN/None, which must count as "not in motion".
    if ball_has_to_be_in_motion:
        selected &= (frames_df["ball_in_motion"] == True).to_numpy(dtype=bool)

    # If specified, ignore rows where 'team' is 'ball'
    if not include_ball:
        selected &= (frames_df['team'] != 'ball').to_numpy(dtype=bool)

    # mean() skips NaN errors
    average_pred_error = frames_df.loc[selected, 'pred_error'].mean()

    return round(average_pred_error, 2)

# Calculate the average error of a prediction function over a list of games
def calculate_average_error(frames_dfs, predict_function, include_ball, ball_has_to_be_in_motion):
    """Predict on every game, then return the average prediction error over all games.

    Args:
        frames_dfs: List of per-game DataFrames; the prediction and 'pred_error' columns are written in place.
        predict_function: Callable taking one DataFrame and writing its predictions into it.
        include_ball: Forwarded to total_error_loss.
        ball_has_to_be_in_motion: Forwarded to total_error_loss.

    Returns:
        The value returned by total_error_loss for the concatenation of all games.
    """
    # Predict the future positions for every game
    for frames_df in frames_dfs:
        predict_function(frames_df)

    # Derive the per-row error from those predictions
    for frames_df in frames_dfs:
        add_pred_error(frames_df)

    # Evaluate all games as one DataFrame
    all_frames_df = pd.concat(frames_dfs)
    return total_error_loss(all_frames_df, include_ball, ball_has_to_be_in_motion)

# Find a frame whose mean error is approximately the average_pred_error, within an error margin
def find_frame_with_average_error(frames_df, average_pred_error, error_margin):
    """Return the first frame whose mean 'pred_error' lies within the margin.

    Frames are considered in order of first appearance in frames_df. A frame
    qualifies when its mean 'pred_error' is in
    [average_pred_error - error_margin, average_pred_error + error_margin].

    Args:
        frames_df: DataFrame with 'frame' and 'pred_error' columns.
        average_pred_error: Target error value.
        error_margin: Allowed absolute deviation from the target.

    Returns:
        The label of the first qualifying frame, or None (after printing a
        message) when no frame qualifies.
    """
    # One pass over the data; sort=False keeps the frames in order of first appearance
    mean_error_per_frame = frames_df.groupby('frame', sort=False)['pred_error'].mean()

    lower_bound = average_pred_error - error_margin
    upper_bound = average_pred_error + error_margin
    within_margin = (mean_error_per_frame >= lower_bound) & (mean_error_per_frame <= upper_bound)

    matching_frames = mean_error_per_frame.index[within_margin.to_numpy()]
    if len(matching_frames) > 0:
        return matching_frames[0]

    # If no frame was found
    print(f"No frame found within the error margin of {error_margin}")
    return None

## Run an example

In [18]:
# Test model 1: average error over all loaded games when predicting that nobody moves
error_naive_static = calculate_average_error(frames_dfs, predict_two_seconds_naive_static, include_ball=False, ball_has_to_be_in_motion=True)
print(f"Model error naive static: {error_naive_static}")

# Test model 2: average error when extrapolating the current velocity.
# This overwrites the prediction and 'pred_error' columns written by model 1.
error_naive_velocity = calculate_average_error(frames_dfs, predict_two_seconds_naive_velocity, include_ball=False, ball_has_to_be_in_motion=True)
print(f"Model error naive velocity: {error_naive_velocity}")

# Visualize one frame of the first game for both models.
# NOTE(review): at this point 'pred_error' holds the naive velocity errors, while the frame is searched with the naive static average
# NOTE(review): find_frame_with_average_error returns None when no frame is within the margin; that None is passed on to the visualization
frames_df = frames_dfs[0]
frame_with_average_error = find_frame_with_average_error(frames_df, error_naive_static, error_margin=0.1)
visualize_frame_prediction(frames_df, frame_with_average_error, "naive_static")
visualize_frame_prediction(frames_df, frame_with_average_error, "naive_velocity")

Model error naive static: 3.97
Model error naive velocity: 2.04


### Print results for different models and parameters

In [19]:
# # Define the prediction functions (models) you want to test
# prediction_functions = {
#     "Naive Static": predict_two_seconds_naive_static,
#     "Naive Velocity": predict_two_seconds_naive_velocity
# }

# # Initialize an empty list to store the results
# results = []

# # Define the combinations of include_ball and ball_has_to_be_in_motion
# combinations = [(True, True), (True, False), (False, True), (False, False)]

# # Load all processed frames
# frames_dfs = load_all_processed_frames()

# # Loop through each combination
# for include_ball, ball_has_to_be_in_motion in combinations:
#     # Add the combination of parameters
#     result = {"Include Ball": include_ball, "Ball in Motion": ball_has_to_be_in_motion}
#     # Loop through each prediction function (model)
#     for model_name, predict_function in prediction_functions.items():
#         # Calculate average error for the current prediction function (model)
#         avg_error = calculate_average_error(frames_dfs, predict_function, include_ball, ball_has_to_be_in_motion)
#         result[model_name] = avg_error
    
#     # Append the results to the list
#     results.append(result)

# # Create a DataFrame from the list of results
# results_df = pd.DataFrame(results)

# # Print the results DataFrame
# results_df