In [1]:
%matplotlib inline

# Notebook for training predictive models
### Import packages

In [2]:
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate
from tensorflow.keras.models import load_model as keras_load_model
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from tensorflow.keras.models import Model
from sklearn.pipeline import Pipeline
from keras.models import Sequential

import tensorflow as tf
import pandas as pd
import numpy as np
import random
import glob
import os

from visualize_game import visualize_frame_prediction, visualize_prediction_animation, visualize_game_animation
from settings import *

2024-03-12 15:38:10.921469: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


### Global variables

In [3]:
# Feature columns that prepare_data() fills with 0 where missing and standard-scales
numerical_cols = ['x', 'y', 'v_x', 'v_y', 'a_x', 'a_y']
# Feature columns that prepare_data() label-encodes and casts to int8
categorical_cols = ['team_direction', 'role']
# Label columns; prepare_data() drops rows where any of these is NaN
y_cols = ['x_future', 'y_future']

### Helper functions

In [4]:
# Split the games into train, test, and validation sets, so that each game is treated separately
def split_match_ids(match_ids, train_size=0.7, test_size=0.1, val_size=0.2, random_state=42):
    """Partition match IDs into train, test, and validation subsets.

    Args:
        match_ids: Sequence of match identifiers to split.
        train_size: Fraction of the matches assigned to the training set.
        test_size: Fraction of the matches assigned to the test set.
        val_size: Fraction of the matches assigned to the validation set.
        random_state: Seed forwarded to both `train_test_split` calls.

    Returns:
        Tuple `(train_ids, test_ids, val_ids)` -- note that test comes before validation.

    Raises:
        ValueError: If `train_size` is greater than 1 or the three fractions do not sum to 1.
    """
    # Share of the matches that is left once the training share has been taken out
    remaining_size = 1.0 - train_size
    fractions_total = train_size + test_size + val_size

    sum_is_off = abs(fractions_total - 1.0) > 1e-6
    if remaining_size < 0 or sum_is_off:
        raise ValueError("The sum of train_size, test_size, and val_size must be equal to 1.")

    # First cut: training matches versus everything else
    train_ids, leftover_ids = train_test_split(
        match_ids,
        train_size=train_size,
        random_state=random_state,
    )

    # Second cut: test_size is rescaled because it is now a fraction of the leftover matches only
    val_ids, test_ids = train_test_split(
        leftover_ids,
        test_size=test_size / remaining_size,
        random_state=random_state,
    )

    return train_ids, test_ids, val_ids
    
# Helper function for NN-with-embeddings models
def adjust_for_embeddings(X_data_df):
    """Separate a feature DataFrame into the two parts fed to an embedding model.

    Args:
        X_data_df: Feature DataFrame containing every column listed in `categorical_cols`.

    Returns:
        Tuple `(X_numerical, X_categorical)`. `X_numerical` is `X_data_df` without the
        `categorical_cols` columns; `X_categorical` maps each of those column names to that
        column's `.values` array.
    """
    # Everything that is not categorical stays together in one DataFrame
    X_numerical = X_data_df.drop(columns=categorical_cols)

    # One raw value array per categorical column, keyed by column name
    X_categorical = {}
    for cat_col in categorical_cols:
        X_categorical[cat_col] = X_data_df[cat_col].values

    return X_numerical, X_categorical

# Get the latest model, for a given 'model_name', based on the number of current models
def get_latest_model_filename(model_name):
    """Return the path of the last saved '.h5' model whose filename starts with `model_name`.

    Candidates are the entries of './models/' whose names start with `model_name` and end
    with '.h5'. The candidate that sorts last by filename is returned.

    Args:
        model_name: Filename prefix identifying the model family.

    Returns:
        './models/' joined with the selected filename.

    Raises:
        ValueError: If no matching model file exists. (Previously the exception object was
            created but never raised, so the function failed with UnboundLocalError instead.)
        OSError: Propagated from `os.listdir` if './models/' cannot be listed.
    """
    models_folder = "./models/"

    # Collect the matching model filenames, sorted by name
    existing_models = sorted(
        filename
        for filename in os.listdir(models_folder)
        if filename.startswith(model_name) and filename.endswith('.h5')
    )

    if not existing_models:
        raise ValueError(f"No existing model available with the given name: models/{model_name}_XX.h5")

    # NOTE: this is a plain string sort, so "..._10.h5" sorts before "..._2.h5" unless the
    # numeric suffix is zero-padded
    return os.path.join(models_folder, existing_models[-1])

# Load a tf model
def load_model(model_path):
    """Load a saved Keras model from `model_path`.

    Args:
        model_path: Path handed to Keras's `load_model`.

    Returns:
        The loaded model, or None if loading raised a ValueError (the error is printed).
    """
    try:
        loaded_model = keras_load_model(model_path)
    except ValueError as load_error:
        # Best effort: report the problem and let the caller deal with a missing model
        print(load_error)
        loaded_model = None

    return loaded_model

In [5]:
# Prepare data before training
def prepare_data(frames_dfs):
    """Build a scaled/encoded feature DataFrame and a label DataFrame from a list of games.

    Side effects: every DataFrame in `frames_dfs` is modified in place -- NaNs in
    `numerical_cols` are replaced by 0 and rows with a NaN in any of `y_cols` are dropped.

    The label encoders and the standard scaler are fitted on the data passed in.

    Args:
        frames_dfs: Iterable of per-game DataFrames containing `numerical_cols`,
            `categorical_cols` and `y_cols`.

    Returns:
        Tuple `(X_data_scaled_df, y_data_df)`. The feature frame has the columns
        `numerical_cols + categorical_cols` (numerical ones standard-scaled, categorical ones
        label-encoded as int8) and a fresh default index. The label frame keeps the
        concatenated original row indices of the input games.
    """
    feature_cols = numerical_cols + categorical_cols

    feature_parts = []
    label_parts = []

    for game_df in frames_dfs:
        # Missing numerical values become 0 (written back to the caller's frame)
        game_df[numerical_cols] = game_df[numerical_cols].fillna(0)

        # Rows without a label cannot be used; they are removed from the caller's frame
        game_df.dropna(subset=y_cols, inplace=True)

        feature_parts.append(game_df[feature_cols])
        label_parts.append(game_df[y_cols])

    # One big features frame and one big labels frame across all games
    features_df = pd.concat(feature_parts)
    labels_df = pd.concat(label_parts)

    # Replace each categorical value by an integer code (a new encoder per column)
    for cat_col in categorical_cols:
        features_df[cat_col] = LabelEncoder().fit_transform(features_df[cat_col])

    # Standard-scale the numerical columns; the categorical codes pass through unchanged
    scaling_pipeline = Pipeline(steps=[
        ('preprocessor', ColumnTransformer(
            transformers=[('num', StandardScaler(), numerical_cols)],
            remainder='passthrough',
        )),
    ])
    scaled_values = scaling_pipeline.fit_transform(features_df)

    # The transformer emits the scaled numerical columns first, then the passthrough ones
    scaled_df = pd.DataFrame(scaled_values, columns=feature_cols)

    # The passthrough went through a float array, so restore an integer dtype for the codes
    scaled_df[categorical_cols] = scaled_df[categorical_cols].astype('int8')

    return scaled_df, labels_df

### Load frames

In [6]:
# Load the processed/frames
def load_all_processed_frames(max_matches=10):
    """Load the processed per-match frame DataFrames for every season and competition.

    For each combination of `seasons` and `competitions` (names defined outside this block,
    presumably by `from settings import *`), the parquet files found in
    `{DATA_LOCAL_FOLDER}/data/{season}/{competition}/processed` are read.

    Args:
        max_matches: Maximum number of matches loaded per season/competition folder; None
            loads all of them. Defaults to 10, the limit that used to be hard-coded.

    Returns:
        List with one DataFrame per parquet file that was read.
    """
    frames_dfs = []

    for selected_season in seasons:
        for selected_competition in competitions:
            # Define paths
            DATA_FOLDER_PROCESSED = f"{DATA_LOCAL_FOLDER}/data/{selected_season}/{selected_competition}/processed"

            # glob returns paths in arbitrary order; sort them so that the selected matches
            # (and the index-based train/test/validation split built on this list) are
            # the same on every run
            match_paths = sorted(glob.glob(os.path.join(DATA_FOLDER_PROCESSED, "*.parquet")))

            # Read each selected match file into its own DataFrame
            for match_path in match_paths[:max_matches]:
                frames_dfs.append(pd.read_parquet(match_path))

    return frames_dfs

# Load every frames_df to a list
frames_dfs = load_all_processed_frames()

# Create an internal match_id for each game (its position in frames_dfs)
match_ids = range(len(frames_dfs))

# Split match IDs into train, test, and validation sets
train_ids, test_ids, val_ids = split_match_ids(match_ids=match_ids)

# Select frames data for training, testing, and validation
# Only the test games are selected here; the train and validation selections are disabled
# train_frames_dfs = [frames_dfs[i] for i in train_ids]
test_frames_dfs = [frames_dfs[i] for i in test_ids]
# val_frames_dfs = [frames_dfs[i] for i in val_ids]

In [7]:
# Prepare the test data
# NOTE: prepare_data() fills NaNs and drops unlabeled rows of the frames in test_frames_dfs
# in place, and fits its encoders and scaler on the data it is given (here: the test games)
X_test, y_test = prepare_data(test_frames_dfs)

# Adjust for embeddings
X_test_numerical, X_test_categorical = adjust_for_embeddings(X_test)

# Construct input data suitable for the embedding layers, in this order:
# team_direction as a column vector, role as a column vector, then the numerical features
X_test_input = [X_test_categorical['team_direction'].reshape(-1, 1), X_test_categorical['role'].reshape(-1, 1), X_test_numerical]

### Load predictive models

In [8]:
# NAIVE: Always predict that all players will stand still
# The calculations are based on x, y
def predict_two_seconds_naive_static(frames_df):
    """Use the current position as the predicted future position.

    Adds (or overwrites) the columns 'x_future_pred' and 'y_future_pred' on `frames_df`
    in place and returns None.
    """
    for axis in ('x', 'y'):
        frames_df[f'{axis}_future_pred'] = frames_df[axis]

# NAIVE: Always predict that all players will continue with the same velocity
# The calculations are based on x, y, v_x, and v_y
def predict_two_seconds_naive_velocity(frames_df):
    """Extrapolate each position linearly along its current velocity.

    The horizon is `seconds_into_the_future` (defined outside this block). Adds (or
    overwrites) 'x_future_pred' and 'y_future_pred' on `frames_df` in place; returns None.
    """
    for axis in ('x', 'y'):
        # position + velocity * time
        frames_df[f'{axis}_future_pred'] = frames_df[axis] + frames_df[f'v_{axis}'] * seconds_into_the_future

# NAIVE: Always predict that all players will continue with the same velocity and acceleration
# The calculations are based on x, y, v_x, v_y, a_x, and a_y
def predict_two_seconds_naive_acceleration(frames_df):
    """Extrapolate each position assuming constant acceleration.

    Uses p + v*t + 0.5*a*t^2 per axis, with t = `seconds_into_the_future` (defined outside
    this block). Adds (or overwrites) 'x_future_pred' and 'y_future_pred' on `frames_df`
    in place; returns None.
    """
    for axis in ('x', 'y'):
        position = frames_df[axis]
        velocity_term = frames_df[f'v_{axis}'] * seconds_into_the_future
        acceleration_term = 0.5 * frames_df[f'a_{axis}'] * (seconds_into_the_future ** 2)
        frames_df[f'{axis}_future_pred'] = position + velocity_term + acceleration_term

# Make a prediction with a LSTM neural network model
def predict_two_seconds_LSTM(frames_df):
    """Placeholder for an LSTM-based predictor: not implemented, always returns None."""
    return None

In [9]:
# Make a prediction with a neural network model
def predict_two_seconds_NN_model(frames_dfs, model, X_data):
    """Predict future positions with `model` and attach them to the concatenated games.

    Args:
        frames_dfs: List of per-game DataFrames; they are concatenated with a fresh index.
        model: Object whose `predict(X_data)` returns a 2-D array with the predicted x in
            column 0 and the predicted y in column 1.
        X_data: Model input; it must yield one prediction row per concatenated frame row.

    Returns:
        New DataFrame: the concatenated games plus 'x_future_pred' and 'y_future_pred'.
    """
    # Stack all games into one frame first, then run the model once over all inputs
    all_frames_df = pd.concat(frames_dfs, ignore_index=True)
    predicted_xy = model.predict(X_data)

    return all_frames_df.assign(
        x_future_pred=predicted_xy[:, 0],
        y_future_pred=predicted_xy[:, 1],
    )

### Functions for calculating error

In [10]:
# Add a column for distance wrongly predicted (in metres) for each object
def add_pred_error(frames_df):
    """Add a 'pred_error' column holding the distance between prediction and truth.

    The value is the Euclidean distance between ('x_future_pred', 'y_future_pred') and
    ('x_future', 'y_future'), rounded to two decimals. `frames_df` is modified in place;
    returns None.
    """
    delta_x = frames_df['x_future_pred'] - frames_df['x_future']
    delta_y = frames_df['y_future_pred'] - frames_df['y_future']

    squared_distance = delta_x ** 2 + delta_y ** 2
    frames_df['pred_error'] = (squared_distance ** 0.5).round(2)
    
# Return the average prediction error, adding the per-row 'pred_error' column first if it is missing
def total_error_loss(frames_df, include_ball=False, ball_has_to_be_in_motion=True):
    """Average the 'pred_error' column over the rows selected by the two filters.

    If `frames_df` has no 'pred_error' column it is added via `add_pred_error` (in place).
    An existing 'pred_error' column is used as-is and is NOT recomputed.

    No other change is made to `frames_df`: the filtering works on a detached Series rather
    than on a scratch 'pred_error_tmp' column. The scratch column used to be left behind
    if a column lookup failed, and it overwrote any existing column of that name.

    Args:
        frames_df: DataFrame with 'pred_error' (or the columns `add_pred_error` needs),
            plus 'ball_in_motion' and/or 'team' depending on the filters used.
        include_ball: If False, rows whose 'team' equals 'ball' are excluded.
        ball_has_to_be_in_motion: If True, only rows whose 'ball_in_motion' is True count.

    Returns:
        The mean error of the remaining rows rounded to two decimals (NaN if none remain).

    Raises:
        KeyError: If a column required by an active filter is missing.
    """
    # Add 'pred_error' column if it does not exist yet
    if 'pred_error' not in frames_df:
        add_pred_error(frames_df)

    errors = frames_df['pred_error']

    if ball_has_to_be_in_motion:
        # `!= True` (instead of negating the column) also excludes rows where the flag is missing
        not_in_motion = (frames_df["ball_in_motion"] != True).to_numpy()
        errors = errors.mask(not_in_motion)

    if not include_ball:
        is_ball = (frames_df['team'] == 'ball').to_numpy()
        errors = errors.mask(is_ball)

    # Masked rows are NaN and are skipped by mean()
    average_pred_error = errors.mean()

    return round(average_pred_error, 2)

# Calculate the average error for a list of games
def calculate_average_error(frames_dfs, predict_function, include_ball, ball_has_to_be_in_motion):
    """Run `predict_function` on every game and return the average prediction error.

    Every DataFrame in `frames_dfs` is modified in place: `predict_function` writes its
    prediction columns and `add_pred_error` then adds 'pred_error'.

    Args:
        frames_dfs: List of per-game DataFrames.
        predict_function: Callable taking one DataFrame and adding the prediction columns.
        include_ball: Forwarded to `total_error_loss`.
        ball_has_to_be_in_motion: Forwarded to `total_error_loss`.

    Returns:
        The value returned by `total_error_loss` for all games concatenated.
    """
    # First pass: predict the future positions for every game
    for game_df in frames_dfs:
        predict_function(game_df)

    # Second pass: per-row error for every game
    for game_df in frames_dfs:
        add_pred_error(game_df)

    # Evaluate on all games at once
    all_games_df = pd.concat(frames_dfs)
    return total_error_loss(all_games_df, include_ball, ball_has_to_be_in_motion)

# Find a frame with approximately the same error as the average_pred_error, within an interval
def find_frame_with_average_error(frames_df, average_pred_error, error_margin):
    """Return the first frame whose mean 'pred_error' is within the margin of the average.

    The per-frame means are computed in one grouped pass instead of re-filtering the whole
    DataFrame once per frame.

    Args:
        frames_df: DataFrame with the columns 'frame' and 'pred_error'.
        average_pred_error: Target error value.
        error_margin: Allowed absolute deviation from `average_pred_error` (inclusive).

    Returns:
        The matching 'frame' value that appears first in `frames_df`, or None (after
        printing a message) if no frame qualifies.
    """
    # sort=False keeps the frames in order of first appearance, so "first match" means the
    # same as when iterating over frames_df['frame'].unique()
    frame_errors = frames_df.groupby('frame', sort=False)['pred_error'].mean()

    lower_bound = average_pred_error - error_margin
    upper_bound = average_pred_error + error_margin
    within_margin = ((frame_errors >= lower_bound) & (frame_errors <= upper_bound)).to_numpy()

    matching_frames = frame_errors.index[within_margin]
    if len(matching_frames) > 0:
        return matching_frames[0]

    # If no frame was found
    print(f"No frame found within the error margin of {error_margin}")
    return None

# Use a model to make predictions on a set of games, and calculate the error
def predict_and_evaluate(model, X_data, frames_dfs, include_ball=False, ball_has_to_be_in_motion=True):
    """Predict future positions for a set of games and return the average prediction error.

    Args:
        model: Either a `tf.keras.Model` (predictions come from `model.predict(X_data)`) or a
            callable that takes the concatenated frames DataFrame and writes the columns
            'x_future_pred' and 'y_future_pred' into it.
        X_data: Input for the Keras model; not used when `model` is a plain callable.
        frames_dfs: List of per-game DataFrames. They are concatenated into a new DataFrame,
            so the prediction columns are not written back to the inputs.
        include_ball: Forwarded to `total_error_loss`.
        ball_has_to_be_in_motion: Forwarded to `total_error_loss`.

    Returns:
        The value returned by `total_error_loss` for the concatenated games.
    """
    if isinstance(model, tf.keras.Model):
        # Shared helper: concatenates the games and attaches the model's predictions.
        # (The stray extra `model(frames_concatenated_df, )` call that used to follow the
        # prediction step has been removed -- it invoked the network on the raw frames table.)
        frames_concatenated_df = predict_two_seconds_NN_model(frames_dfs, model, X_data)
    else:
        # Concatenate the frames DataFrames into a single large DataFrame
        frames_concatenated_df = pd.concat(frames_dfs, ignore_index=True)

        # Use the custom function to make the predictions
        model(frames_concatenated_df)

    # Always recompute the per-row error: the input frames may already carry a 'pred_error'
    # column from a previously evaluated model, which total_error_loss would otherwise reuse
    add_pred_error(frames_concatenated_df)

    # Calculate error
    return total_error_loss(frames_concatenated_df, include_ball, ball_has_to_be_in_motion)

## Evaluate models
### Visualize prediction errors

In [11]:
# # Test model 1
# error_naive_static = calculate_average_error(test_frames_dfs, predict_two_seconds_naive_static, include_ball=False, ball_has_to_be_in_motion=True)
# print(f"Model error naive static: {error_naive_static}")

# # Test model 2
# error_naive_velocity = calculate_average_error(test_frames_dfs, predict_two_seconds_naive_velocity, include_ball=False, ball_has_to_be_in_motion=True)
# print(f"Model error naive velocity: {error_naive_velocity}")

# # Visualize one frame with average error for the first model, together with the corresponding frame for the second model
# frames_df = test_frames_dfs[0]
# frame_with_average_error = find_frame_with_average_error(frames_df, error_naive_static, error_margin=0.1)
# visualize_frame_prediction(frames_df, frame_with_average_error, "naive_static")
# visualize_frame_prediction(frames_df, frame_with_average_error, "naive_velocity")

In [20]:
# # Visualize the predictions of the naive velocity model in an animation
# test_frames_df = test_frames_dfs[0]
# predict_two_seconds_naive_velocity(test_frames_df)
# total_error_loss(test_frames_df)
# visualize_prediction_animation(test_frames_df, 250, 750, "naive_velocity")

### Evaluate the NAIVE models with different parameters

In [13]:
# # Define the prediction functions (models) you want to test
# prediction_functions = {
#     "Naive Static": predict_two_seconds_naive_static,
#     "Naive Velocity": predict_two_seconds_naive_velocity,
#     "Naive Acceleration": predict_two_seconds_naive_acceleration
# }

# # Define the combinations of include_ball and ball_has_to_be_in_motion
# combinations = [(True, True), (True, False), (False, True), (False, False)]

# # Initialize an empty list to store the results
# results = []

# # Loop through each combination
# for include_ball, ball_has_to_be_in_motion in combinations:
#     # Add the combination of parameters
#     result = {"Include Ball": include_ball, "Ball in Motion": ball_has_to_be_in_motion}
#     # Loop through each prediction function (model)
#     for model_name, predict_function in prediction_functions.items():
#         # Calculate error for the current prediction function (model)
#         error = predict_and_evaluate(predict_function, None, test_frames_dfs, include_ball, ball_has_to_be_in_motion)
#         result[model_name] = error
    
#     # Append the results to the list
#     results.append(result)

# # Create a DataFrame from the list of results
# results_df = pd.DataFrame(results)

# # Print the resulting DataFrame
# results_df

### Evaluate non-NAIVE models with parameters

In [14]:
# # Define the prediction functions (models) you want to test
# NN_prediction_functions = {
#     "NN_model_3": {"model": load_model("models/NN_model_3.h5"), "X_data": X_test[numerical_cols]},
#     "NN_embedding_model_1": {"model": load_model("models/NN_embedding_model_1.h5"), "X_data": X_test_input}
# }

# # Define the combinations of include_ball and ball_has_to_be_in_motion
# combinations = [(True, True), (True, False), (False, True), (False, False)]

# # Initialize an empty list to store the results
# results = []

# # Pre-calculate predictions for each model only once
# predictions_dict = {}
# for model_name, model_info in NN_prediction_functions.items():
#     # Make predictions for the current model and store them in the dictionary
#     predictions_dict[model_name] = predict_two_seconds_NN_model(test_frames_dfs, model_info["model"], model_info["X_data"])

# # Loop through each combination
# for include_ball, ball_has_to_be_in_motion in combinations:
#     # Initialize a result dictionary for the current combination
#     result = {"Include Ball": include_ball, "Ball in Motion": ball_has_to_be_in_motion}
    
#     # Calculate and store the error for each model within the same combination
#     for model_name in NN_prediction_functions.keys():
#         # Retrieve the pre-calculated predictions for the current model
#         test_with_predictions_df = predictions_dict[model_name]
#         # Calculate error
#         error = total_error_loss(test_with_predictions_df, include_ball, ball_has_to_be_in_motion)
#         # Store the error in the result dictionary
#         result[model_name] = error
    
#     # Append the result to the results list
#     results.append(result)

# # Create a DataFrame from the list of results
# results_df = pd.DataFrame(results)

# # Print the resulting DataFrame
# results_df