In [1]:
%matplotlib inline

# Notebook for training predictive models
### Import packages

In [2]:
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate
from tensorflow.keras.models import load_model as keras_load_model
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from tensorflow.keras.models import Model
from sklearn.pipeline import Pipeline
from keras.models import Sequential

import tensorflow as tf
import pandas as pd
import numpy as np
import random
import glob
import os

from utils import load_processed_frames, split_match_ids, euclidean_distance_loss, total_error_loss, smooth_predictions_xy, extract_variables, load_tf_model, prepare_df, prepare_EL_input_data, add_can_be_sequentialized
from visualize_game import visualize_frame_prediction, visualize_prediction_animation, visualize_game_animation
from settings import *

2024-04-08 17:47:21.244687: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


### Global variables

In [3]:
n_matches = 10

### Load frames

In [4]:
# Load every frames_df to a list
frames_dfs = load_processed_frames(n_matches=n_matches)

# Create an internal match_id for each game
match_ids = range(n_matches)

# Split match IDs into train, test, and validation sets
train_ids, test_ids, val_ids = split_match_ids(match_ids=match_ids)

# Select frames data for training, testing, and validation
# train_frames_dfs = [frames_dfs[i] for i in train_ids]
test_frames_dfs = [frames_dfs[i] for i in test_ids]
# val_frames_dfs = [frames_dfs[i] for i in val_ids]

## Define NAIVE models

In [5]:
# NAIVE: Always predict that all players will stand still
# The calculations are based on x, y
def predict_two_seconds_naive_static(frames_df):
    frames_df['x_future_pred'] = frames_df['x']
    frames_df['y_future_pred'] = frames_df['y']

# NAIVE: Always predict that all players will continue with the same velocity
# The calculations are based on x, y, v_x, and v_y
def predict_two_seconds_naive_velocity(frames_df):
    frames_df['x_future_pred'] = frames_df['x'] + frames_df['v_x'] * seconds_into_the_future
    frames_df['y_future_pred'] = frames_df['y'] + frames_df['v_y'] * seconds_into_the_future

    # Clip values to stay on the pitch
    frames_df['x_future_pred'] = frames_df['x_future_pred'].clip(lower=0, upper=pitch_length)
    frames_df['y_future_pred'] = frames_df['y_future_pred'].clip(lower=0, upper=pitch_width)

    # Smooth the predicted coordinates
    smooth_predictions_xy(frames_df, alpha=0.95)

# NAIVE: Always predict that all players will continue with the same velocity and acceleration
# The calculations are based on x, y, v_x, v_y, a_x, and a_y
def predict_two_seconds_naive_acceleration(frames_df):
    # Calculate future positions using kinematic equationsnaive_
    frames_df['x_future_pred'] = frames_df['x'] + frames_df['v_x'] * seconds_into_the_future + 0.5 * frames_df['a_x'] * (seconds_into_the_future ** 2)
    frames_df['y_future_pred'] = frames_df['y'] + frames_df['v_y'] * seconds_into_the_future + 0.5 * frames_df['a_y'] * (seconds_into_the_future ** 2)

    # Clip values to stay on the pitch
    frames_df['x_future_pred'] = frames_df['x_future_pred'].clip(lower=0, upper=pitch_length)
    frames_df['y_future_pred'] = frames_df['y_future_pred'].clip(lower=0, upper=pitch_width)

    # Smooth the predicted coordinates
    smooth_predictions_xy(frames_df, alpha=0.95)

### Helper functions

In [6]:
# Calculate the average error for a list of games
def calculate_average_error(frames_dfs, predict_function, include_ball, ball_has_to_be_in_motion):
    # Predict the future positions
    [predict_function(frames_df) for frames_df in frames_dfs]
    [add_pred_error(frames_df) for frames_df in frames_dfs]

    # Concatenate all frames dataframes into a single dataframe
    concatted_frames_df = pd.concat(frames_dfs)    
    
    # Calculate the total error loss
    error = total_error_loss(concatted_frames_df, include_ball, ball_has_to_be_in_motion)
    
    return error

# Find a frame with approximatly the same error as the average_pred_error, with an interval
def find_frame_with_average_error(frames_df, average_pred_error, error_margin):
    # For all frames
    frames = frames_df['frame'].unique()
    for frame in frames:
        current_error = frames_df[frames_df['frame'] == frame]['pred_error'].mean()
        # If the current error is within the error_margin,
        if (current_error >= average_pred_error - error_margin) and (current_error <= average_pred_error + error_margin):
            # Return the result
            return frame

    # If no frame was found
    print(f"No frame found within the error margin of {error_margin}")
    return None

# Use a naive model to make predictions on a set of games, and calculate the error
def predict_and_evaluate_naive_model(naive_model, frames_dfs, include_ball=False, ball_has_to_be_in_motion=True):
    # Concatenate the frames DataFrames into a single large DataFrame
    frames_concatenated_df = pd.concat(frames_dfs, ignore_index=True)

    # Use the custom function to make the predictions
    naive_model(frames_concatenated_df)

    # Calculate error
    error = total_error_loss(frames_concatenated_df, include_ball, ball_has_to_be_in_motion)

    return error

## Evaulate NAIVE models
### Visualize prediction errors

In [7]:
# # Visualize the predictions of the naive velocity model in an animation
# test_frames_df = train_frames_dfs[3]
# predict_two_seconds_naive_static(test_frames_df)
# total_error_loss(test_frames_df)
# visualize_prediction_animation(test_frames_df, 250, 750, "naive_static")

### Evaulate the NAIVE models with different parameters

In [8]:
# Define the prediction functions (models) you want to test
prediction_functions = {
    "Naive Static": predict_two_seconds_naive_static,
    "Naive Velocity": predict_two_seconds_naive_velocity,
    "Naive Acceleration": predict_two_seconds_naive_acceleration
}

# Define the combinations of include_ball and ball_has_to_be_in_motion
# combinations = [(True, True), (True, False), (False, True), (False, False)]
combinations = [(False, True)]

# Initialize an empty list to store the results
results = []

# Loop through each combination
for include_ball, ball_has_to_be_in_motion in combinations:
    # Add the combination of parameters
    result = {"Include Ball": include_ball, "Ball in Motion": ball_has_to_be_in_motion}
    # Loop through each prediction function (model)
    for model_name, predict_function in prediction_functions.items():
        # Calculate error for the current prediction function (model)
        error = predict_and_evaluate_naive_model(predict_function, test_frames_dfs, include_ball, ball_has_to_be_in_motion)
        result[model_name] = error
    
    # Append the results to the list
    results.append(result)

# Create a DataFrame from the list of results
results_df = pd.DataFrame(results)

# Print the resulting DataFrame
results_df

Unnamed: 0,Include Ball,Ball in Motion,Naive Static,Naive Velocity,Naive Acceleration
0,False,True,4.069,2.148,2.117


## Evaluate NN models

In [62]:
# Define denominators for normalization
denominators = {
    'x': pitch_length,
    'y': pitch_width,
    'v_y': 13,
    'v_y': 13,
    'a_y': 10,
    'a_y': 10,
    'acc': 20,
    'pac': 20,
    'sta': 20,
    'height': 2.10,
    'weight': 110,
    'distance_to_ball': round(np.sqrt((pitch_length**2 + pitch_width**2)), 2),
    'angle_to_ball': 360,
    'orientation': 360,
    'tiredness': 10,
    'minute': 45,
    'period': 2,
}
y_cols = ['x_future', 'y_future']

""" Dont include the above """

# Add the columns 'sequential_numerical_data', 'sequential_categorical_data', and 'future_xy' as arrays
def add_sequentialized_data(X_df, numerical_cols, categorical_cols, sequence_length, downsampling_factor):
    # Combine the values in y_df with X_df
    X_df['future_xy'] = X_df[y_cols].values.tolist()

    # Create a vector containing a list of all values in the numerical columns
    X_df['numerical_data_list'] = X_df[numerical_cols].values.tolist()
    
    # Create a similar list for the categorical columns, if any
    if categorical_cols:
        X_df['categorical_data_list'] = X_df[categorical_cols].apply(lambda x: x.tolist(), axis=1)

    # Add vector 'can_be_sequentialized'
    add_can_be_sequentialized(X_df, sequence_length=sequence_length)

    # Group by each unique player
    grouped = X_df.groupby(['team', 'jersey_number', 'match_id'])

    # Iterate through each player and create sequences
    for _, group in grouped:
        # Create temporary columns with shifted version of 'numerical_cols' and 'categorical_cols'
        # Shift it with the downsampling_factor since that is how the training process works
        for i in range(sequence_length):
            group[f'numerical_data_list_{i}'] = group["numerical_data_list"].shift(i * downsampling_factor)

        # Concatenate the temporary columns to create the column 'sequential_numerical_data'
        columns_to_sequentialize = [f'numerical_data_list_{i}' for i in range(sequence_length)][::-1]
        group['sequential_numerical_data'] = group[columns_to_sequentialize].values.tolist()
        
        print(group[['player', 'frame', 'future_xy', 'can_be_sequentialized']])

        # Only consider rows that can be sequentialized
        group = group[group['can_be_sequentialized']]

        # Add the sequentialized data to X_df using loc
        X_df.loc[group.index, 'sequential_numerical_data'] = group['sequential_numerical_data']
        
        if categorical_cols:
            X_df.loc[group.index, 'sequential_categorical_data'] = group['categorical_data_list']

    return X_df

def prepare_LSTM_evaluation_input_data(X_df, numerical_cols, categorical_cols, sequence_length, positions=[]):
    # Apply label encoding to categorical variables
    for col in categorical_cols:
        label_encoder = LabelEncoder()
        X_df[col] = label_encoder.fit_transform(X_df[col])

    # Apply custom normalization
    for col in numerical_cols:
        if col in denominators:
            X_df[col] = X_df[col] / denominators[col]

    # Convert categorical columns to int
    X_df[categorical_cols] = X_df[categorical_cols].astype('int8')

    # Convert numerical columns to float
    X_df[numerical_cols] = X_df[numerical_cols].astype('float32')

    # Add vectors with sequentialized data
    X_df = add_sequentialized_data(X_df, numerical_cols, categorical_cols, sequence_length, downsampling_factor)

    # print(X_df[['player', 'frame', 'future_xy', 'sequential_numerical_data']])

    # Create Pandas Series
    y_seq = X_df[X_df['can_be_sequentialized']]['future_xy']
    X_seq_num = X_df[X_df['can_be_sequentialized']]['sequential_numerical_data']
    X_seq_cat = X_df[X_df['can_be_sequentialized']]['sequential_categorical_data']

    # Convert the Pandas Series of lists to a NumPy array
    X_seq_num_np = np.array(X_seq_num.tolist()).astype('float32')
    y_seq_np = np.array(y_seq.tolist()).astype('float32')

    # Add the data from categorical columns to X_seq_np
    if categorical_cols:
        X_seq_cat_np = np.array(X_seq_cat.tolist()).astype('float32')
        X_seq_np = [X_seq_cat_np, X_seq_num_np]

        return X_seq_np, y_seq_np

    # Return the resuls without adding categorical data
    else:
        return X_seq_num_np, y_seq_np

In [57]:
# Example usage: run_model(test_frames_dfs, "NN_model_v1") 
def run_model(frames_dfs, model_name):
    # Load varibles
    numerical_cols, categorical_cols, positions, sequence_length = extract_variables(model_name)

    # Load model
    model = load_tf_model(f"models/{model_name}.h5", euclidean_distance_loss=True)

    # Prepare the input data for LSTM model
    if "LSTM" in model_name:
        # Prepared the DataFrames and concatenate into a single large DataFrame
        prepared_frames_dfs = [prepare_df(frames_df, numerical_cols, categorical_cols, positions=positions, downsampling_factor=1) for frames_df in frames_dfs]
        frames_concat_df = pd.concat(prepared_frames_dfs, ignore_index=True)

        # Preapre input data for LSTM evaluation 
        X_test_input, y_test = prepare_LSTM_evaluation_input_data(frames_concat_df, numerical_cols, categorical_cols, sequence_length)

        print("Prepare step completed")

        # Make predictions using the loaded tf model
        predictions = model.predict(X_test_input)

        # Extract the predicted values
        x_future_pred = predictions[:, 0]
        y_future_pred = predictions[:, 1]

        # Add the predicted values to 'frames_concat_df'
        frames_concat_df.loc[frames_concat_df['can_be_sequentialized'], 'x_future_pred'] = x_future_pred#.clip(lower=0, upper=pitch_length)
        frames_concat_df.loc[frames_concat_df['can_be_sequentialized'], 'y_future_pred'] = y_future_pred#.clip(lower=0, upper=pitch_width)

    # Prepare the input data for non-LSTM model
    else:
        # Prepared the DataFrames and concatenate into a single large DataFrame
        prepared_frames_dfs = [prepare_df(frames_df, numerical_cols, categorical_cols, positions=positions, downsampling_factor=downsampling_factor) for frames_df in frames_dfs]
        frames_concat_df = pd.concat(prepared_frames_dfs, ignore_index=True)

        X_test_input, y_test = prepare_EL_input_data(frames_dfs, numerical_cols, categorical_cols, positions)

        # Make predictions using the loaded tf model
        predictions = model.predict(X_test_input)

        # Extract the predicted values
        x_future_pred = predictions[:, 0]
        y_future_pred = predictions[:, 1]

        # Add the predicted values to 'frames_concat_df'
        frames_concat_df.loc['x_future_pred'] = x_future_pred#.clip(lower=0, upper=pitch_length)
        frames_concat_df.loc['y_future_pred'] = y_future_pred#.clip(lower=0, upper=pitch_width)

    return frames_concat_df

In [60]:
frames_df = test_frames_dfs[0]
# frames_df = frames_df[frames_df['frame'] % 5 == 0]
frames_df = frames_df[frames_df['team_direction'] == 'right']
frames_df = frames_df[frames_df['position'] == "Goalkeeper"].iloc[0:15]
frames_df['x'] = round(frames_df['x'], 1)
frames_df['y'] = round(frames_df['y'], 1)
frames_df['v_x'] = round(frames_df['v_x'], 1)

add_can_be_sequentialized(frames_df, 10)

# frames_df[['player', 'frame', 'x', 'y', 'v_x', 'x_future', 'y_future', 'can_be_sequentialized']]

In [61]:
frames_concat_df = run_model([frames_df], "LSTM_model_v3")

          player  frame      future_xy  can_be_sequentialized
0   Johan Dahlin      0  [8.16, 32.98]                  False
1   Johan Dahlin      1  [8.16, 32.96]                  False
2   Johan Dahlin      2  [8.18, 32.94]                  False
3   Johan Dahlin      3  [8.21, 32.94]                  False
4   Johan Dahlin      4  [8.25, 32.95]                  False
5   Johan Dahlin      5   [8.3, 32.96]                  False
6   Johan Dahlin      6  [8.35, 32.98]                  False
7   Johan Dahlin      7  [8.42, 33.01]                  False
8   Johan Dahlin      8  [8.48, 33.04]                  False
9   Johan Dahlin      9  [8.54, 33.07]                  False
10  Johan Dahlin     10    [8.6, 33.1]                  False
11  Johan Dahlin     11  [8.66, 33.13]                  False
12  Johan Dahlin     12   [8.7, 33.16]                  False
13  Johan Dahlin     13  [8.74, 33.17]                  False
14  Johan Dahlin     14  [8.76, 33.19]                  False
Prepare 

ValueError: Unexpected result of `predict_function` (Empty batch_outputs). Please use `Model.compile(..., run_eagerly=True)`, or `tf.config.run_functions_eagerly(True)` for more information of where went wrong, or file a issue/bug to `tf.keras`.

## Evaluate on different parameters

In [None]:
# # Print column variance for position
# column_to_analyze = 'position'
# model_name = "LSTM_model_v9"
# print_column_variance(val_frames_dfs, model_name, column_to_analyze)