In [1]:
%matplotlib inline

# Notebook for training predictive models
### Import packages

In [2]:
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate
from tensorflow.keras.models import load_model as keras_load_model
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from tensorflow.keras.models import Model
from sklearn.pipeline import Pipeline
from keras.models import Sequential

import tensorflow as tf
import pandas as pd
import numpy as np
import random
import glob
import os

from utils import load_processed_frames, split_match_ids, get_next_model_filename, euclidean_distance_loss, adjust_for_embeddings
from visualize_game import visualize_frame_prediction, visualize_prediction_animation, visualize_game_animation
from settings import *

2024-03-25 11:34:14.080779: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


### Global variables

In [3]:
numerical_cols = ['x', 'y', 'v_x', 'v_y', 'a_x', 'a_y']
categorical_cols = ['team_direction', 'role']
y_cols = ['x_future', 'y_future']
n_matches = 10

### Helper functions

In [4]:
# Load a tf model
def load_model(model_path, euclidean_distance_loss=False):
    try:
        # Load the model using Keras's load_model function
        if euclidean_distance_loss:
            # Define custom_objects dictionary with the custom loss function
            custom_objects = {'euclidean_distance_loss': euclidean_distance_loss}
            return keras_load_model(model_path, custom_objects=custom_objects) 
        else:
            return keras_load_model(model_path)
    
    except ValueError as e:
        print(e)
        return None

# Smooth the vectors 'x_future_pred' and 'y_future_pred'
def smooth_predictions_xy(frames_df, alpha=0.93):
    # Group by unique combinations of 'team', 'jersey_number', and 'match_id'
    grouped = frames_df.groupby(['team', 'jersey_number', 'match_id'])
    
    # Apply the Exponential Moving Average filter to smooth the predictions
    def apply_ema(x):
        return x.ewm(alpha=alpha, adjust=False).mean()

    frames_df['x_future_pred'] = grouped['x_future_pred'].transform(apply_ema)
    frames_df['y_future_pred'] = grouped['y_future_pred'].transform(apply_ema)

### Load frames

In [6]:
# Load every frames_df to a list
# frames_dfs = load_processed_frames(n_matches=n_matches)

# Create an internal match_id for each game
match_ids = range(n_matches)

# Split match IDs into train, test, and validation sets
train_ids, test_ids, val_ids = split_match_ids(match_ids=match_ids)

# Select frames data for training, testing, and validation
# train_frames_dfs = [frames_dfs[i] for i in train_ids]
# test_frames_dfs = [frames_dfs[i] for i in test_ids]
# val_frames_dfs = [frames_dfs[i] for i in val_ids]
# test_frames_dfs = [load_processed_frames(match_id=test_id)[0] for test_id in test_ids]
test_frames_dfs = load_processed_frames(n_matches=n_matches)

### Load predictive models

In [13]:
# NAIVE: Always predict that all players will stand still
# The calculations are based on x, y
def predict_two_seconds_naive_static(frames_df):
    frames_df['x_future_pred'] = frames_df['x']
    frames_df['y_future_pred'] = frames_df['y']

# NAIVE: Always predict that all players will continue with the same velocity
# The calculations are based on x, y, v_x, and v_y
def predict_two_seconds_naive_velocity(frames_df):
    frames_df['x_future_pred'] = frames_df['x'] + frames_df['v_x'] * seconds_into_the_future
    frames_df['y_future_pred'] = frames_df['y'] + frames_df['v_y'] * seconds_into_the_future

    # Clip values to stay on the pitch
    frames_df['x_future_pred'] = frames_df['x_future_pred'].clip(lower=0, upper=pitch_length)
    frames_df['y_future_pred'] = frames_df['y_future_pred'].clip(lower=0, upper=pitch_width)

    # Smooth the predicted coordinates
    smooth_predictions_xy(frames_df, alpha=0.95)

# NAIVE: Always predict that all players will continue with the same velocity and acceleration
# The calculations are based on x, y, v_x, v_y, a_x, and a_y
def predict_two_seconds_naive_acceleration(frames_df):
    # Calculate future positions using kinematic equationsnaive_
    frames_df['x_future_pred'] = frames_df['x'] + frames_df['v_x'] * seconds_into_the_future + 0.5 * frames_df['a_x'] * (seconds_into_the_future ** 2)
    frames_df['y_future_pred'] = frames_df['y'] + frames_df['v_y'] * seconds_into_the_future + 0.5 * frames_df['a_y'] * (seconds_into_the_future ** 2)

    # Clip values to stay on the pitch
    frames_df['x_future_pred'] = frames_df['x_future_pred'].clip(lower=0, upper=pitch_length)
    frames_df['y_future_pred'] = frames_df['y_future_pred'].clip(lower=0, upper=pitch_width)

    # Smooth the predicted coordinates
    smooth_predictions_xy(frames_df, alpha=0.95)

In [None]:
# Make a prediction with a neural network model
def predict_two_seconds_NN_model(frames_dfs, model, X_data):
    # Concatenate the frames DataFrames into a single large DataFrame
    frames_concatenated_df = pd.concat(frames_dfs, ignore_index=True)

    # Make predictions using the loaded tf model
    predictions = model.predict(X_data)

    # Extract the predicted values
    x_future_pred = predictions[:, 0]
    y_future_pred = predictions[:, 1]

    # Add the predicted values to 'frames_concatenated_df'
    frames_concatenated_df['x_future_pred'] = x_future_pred
    frames_concatenated_df['y_future_pred'] = y_future_pred

    return frames_concatenated_df

### Functions for calculating error

In [8]:
# Add a column for distance wrongly predicted (in metres) for each object
def add_pred_error(frames_df):
    # Create a vector with the Eculidian distance between the true position and the predicted position
    frames_df['pred_error'] = round(((frames_df['x_future_pred'] - frames_df['x_future'])**2 + (frames_df['y_future_pred'] - frames_df['y_future'])**2)**0.5, 2)
    
# Add a column for distance wrongly predicted (in metres) for each object. Also return average_pred_error
def total_error_loss(frames_df, include_ball=False, ball_has_to_be_in_motion=True):
    # Add 'pred_error' column
    add_pred_error(frames_df)
    
    # Create a new column to store modified pred_error values
    frames_df['pred_error_tmp'] = frames_df['pred_error']
    
    # If specified, set pred_error to None for frames where the ball is not in motion
    if ball_has_to_be_in_motion:
        frames_df.loc[frames_df["ball_in_motion"] != True, 'pred_error_tmp'] = None

    # If specified, set pred_error to None for rows where 'team' is 'ball'
    if not include_ball:
        frames_df.loc[frames_df['team'] == 'ball', 'pred_error_tmp'] = None

    # Calculate average pred_error_tmp, excluding rows where pred_error is None
    average_pred_error = frames_df['pred_error_tmp'].mean()

    # Drop the temporary column
    frames_df.drop(columns=['pred_error_tmp'], inplace=True)

    return round(average_pred_error, 4)

# Calculate the average error for a list of games
def calculate_average_error(frames_dfs, predict_function, include_ball, ball_has_to_be_in_motion):
    # Predict the future positions
    [predict_function(frames_df) for frames_df in frames_dfs]
    [add_pred_error(frames_df) for frames_df in frames_dfs]

    # Concatenate all frames dataframes into a single dataframe
    concatted_frames_df = pd.concat(frames_dfs)    
    
    # Calculate the total error loss
    error = total_error_loss(concatted_frames_df, include_ball, ball_has_to_be_in_motion)
    
    return error

# Find a frame with approximatly the same error as the average_pred_error, with an interval
def find_frame_with_average_error(frames_df, average_pred_error, error_margin):
    # For all frames
    frames = frames_df['frame'].unique()
    for frame in frames:
        current_error = frames_df[frames_df['frame'] == frame]['pred_error'].mean()
        # If the current error is within the error_margin,
        if (current_error >= average_pred_error - error_margin) and (current_error <= average_pred_error + error_margin):
            # Return the result
            return frame

    # If no frame was found
    print(f"No frame found within the error margin of {error_margin}")
    return None

# Use a model to make predictions on a set of games, and calculate the error
def predict_and_evaluate(model, frames_dfs, include_ball=False, ball_has_to_be_in_motion=True):
    # Concatenate the frames DataFrames into a single large DataFrame
    frames_concatenated_df = pd.concat(frames_dfs, ignore_index=True)

    # Use the custom function to make the predictions
    model(frames_concatenated_df)

    # Calculate error
    error = total_error_loss(frames_concatenated_df, include_ball, ball_has_to_be_in_motion)

    return error

## Evaulate models
### Visualize prediction errors

In [None]:
# # Visualize the predictions of the naive velocity model in an animation
# test_frames_df = train_frames_dfs[3]
# predict_two_seconds_naive_static(test_frames_df)
# total_error_loss(test_frames_df)
# visualize_prediction_animation(test_frames_df, 250, 750, "naive_static")

### Evaulate the NAIVE models with different parameters

In [14]:
# Define the prediction functions (models) you want to test
prediction_functions = {
    # "Naive Static": predict_two_seconds_naive_static,
    "Naive Velocity": predict_two_seconds_naive_velocity,
    "Naive Acceleration": predict_two_seconds_naive_acceleration
}

# Define the combinations of include_ball and ball_has_to_be_in_motion
combinations = [(True, True), (True, False), (False, True), (False, False)]

# Initialize an empty list to store the results
results = []

# Loop through each combination
for include_ball, ball_has_to_be_in_motion in combinations:
    # Add the combination of parameters
    result = {"Include Ball": include_ball, "Ball in Motion": ball_has_to_be_in_motion}
    # Loop through each prediction function (model)
    for model_name, predict_function in prediction_functions.items():
        # Calculate error for the current prediction function (model)
        error = predict_and_evaluate(predict_function, test_frames_dfs, include_ball, ball_has_to_be_in_motion)
        result[model_name] = error
    
    # Append the results to the list
    results.append(result)

# Create a DataFrame from the list of results
results_df = pd.DataFrame(results)

# Print the resulting DataFrame
results_df

Unnamed: 0,Include Ball,Ball in Motion,Naive Velocity,Naive Acceleration
0,True,True,2.691,2.672
1,True,False,2.3104,2.2966
2,False,True,2.1372,2.1095
3,False,False,1.9466,1.9276


### Evaluate non-NAIVE models with parameters

In [13]:
# Define the prediction functions (models) you want to test
NN_prediction_functions = {
    # "NN_model_7": {"model": load_model("models/NN_model_7.h5", euclidean_distance_loss=True), "X_data": X_test[numerical_cols]},
    # "NN_embedding_model_8": {"model": load_model("models/NN_embedding_model_8.h5", euclidean_distance_loss=True), "X_data": X_test_input},
    # "NN_embedding_model_9": {"model": load_model("models/NN_embedding_model_9.h5", euclidean_distance_loss=True), "X_data": X_test_input},
    # "NN_embedding_model_10": {"model": load_model("models/NN_embedding_model_10.h5", euclidean_distance_loss=True), "X_data": X_test_input},
    "NN_embedding_model_12": {"model": load_model("models/NN_embedding_model_12.h5", euclidean_distance_loss=True), "X_data": X_test_input}
}

# Define the combinations of include_ball and ball_has_to_be_in_motion
combinations = [(True, True), (True, False), (False, True), (False, False)]

# Initialize an empty list to store the results
results = []

# Pre-calculate predictions for each model only once
predictions_dict = {}
for model_name, model_info in NN_prediction_functions.items():
    # Make predictions for the current model and store them in the dictionary
    predictions_dict[model_name] = predict_two_seconds_NN_model(test_frames_dfs, model_info["model"], model_info["X_data"])

# Loop through each combination
for include_ball, ball_has_to_be_in_motion in combinations:
    # Initialize a result dictionary for the current combination
    result = {"Include Ball": include_ball, "Ball in Motion": ball_has_to_be_in_motion}
    
    # Calculate and store the error for each model within the same combination
    for model_name in NN_prediction_functions.keys():
        # Retrieve the pre-calculated predictions for the current model
        test_with_predictions_df = predictions_dict[model_name]
        # Calculate error
        error = total_error_loss(test_with_predictions_df, include_ball, ball_has_to_be_in_motion)
        # Store the error in the result dictionary
        result[model_name] = error
    
    # Append the result to the results list
    results.append(result)

# Create a DataFrame from the list of results
results_df = pd.DataFrame(results)

# Print the resulting DataFrame
results_df



2024-03-22 12:58:48.583257: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38546 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:4b:00.0, compute capability: 8.0
2024-03-22 12:58:48.583868: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 38546 MB memory:  -> device: 1, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:e3:00.0, compute capability: 8.0


   169/402249 [..............................] - ETA: 6:01

2024-03-22 12:58:52.498103: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:630] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.




Unnamed: 0,Include Ball,Ball in Motion,NN_embedding_model_12
0,True,True,5.17
1,True,False,5.07
2,False,True,4.76
3,False,False,4.79


In [14]:
# Visualize the predictions of a NN model in an animation

# frames_df = train_frames_dfs[3]

# # Fill NaN values with zeros for numerical columns
# frames_df[numerical_cols] = frames_df[numerical_cols].fillna(0)

# # Drop rows with NaN values in the labels (y)
# frames_df.dropna(subset=y_cols, inplace=True)

# # Drop rows where 'team' is ball, if specified
# frames_df = frames_df[frames_df['team'] != 'ball']

# # Drop rows where ball is not in motion, if specified
# frames_df = frames_df[frames_df['ball_in_motion']]

# # Extract features and labels from group
# X_data_df = frames_df[numerical_cols + categorical_cols]
# y_data_df = frames_df[y_cols]

# # Apply label encoding to categorical variables
# for col in categorical_cols:
#     label_encoder = LabelEncoder()
#     X_data_df[col] = label_encoder.fit_transform(X_data_df[col])

# # Define column transformer for standard scaling numerical features
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), numerical_cols)
#     ],
#     remainder='passthrough'
# )

# # Create pipeline for preprocessing and apply it to X_data
# pipeline = Pipeline(steps=[('preprocessor', preprocessor)])
# X_data_scaled = pipeline.fit_transform(X_data_df)

# # Retrieve the transformed feature names from ColumnTransformer
# transformed_column_names = numerical_cols + categorical_cols

# # Create a DataFrame from the preprocessed feature data
# X_data_scaled_df = pd.DataFrame(X_data_scaled, columns=transformed_column_names)

# # Convert categorical columns to int
# X_data_scaled_df[categorical_cols] = X_data_scaled_df[categorical_cols].astype('int8')

# X_train = X_data_scaled_df
# y_train = y_data_df

# # Adjust for embeddings
# X_train_numerical, X_train_categorical = adjust_for_embeddings(X_train, categorical_cols)

# # Construct input data suitable for the embedding layers
# X_train_input = [X_train_categorical['team_direction'].reshape(-1, 1), X_train_categorical['role'].reshape(-1, 1), X_train_numerical]

# frames_df = predict_two_seconds_NN_model([frames_df], load_model("models/NN_embedding_model_5.h5", euclidean_distance_loss=True), X_train_input)

# error = total_error_loss(frames_df)

# print(error)

# The frame interval cant contain missing frames
# visualize_prediction_animation(frames_df, 3800, 4200, "NN_embedding_model_5")

## Evaluate

In [15]:
# Calculates how the average 'pred_error' varies with each value in 'column_to_analyze'
def find_column_variance(frames_df, column_to_analyze):
    # Convert 'pred_error' to numeric, coercing non-numeric values to NaN
    frames_df['pred_error'] = pd.to_numeric(frames_df['pred_error'], errors='coerce')

    # Group by 'column_to_analyze' and calculate the average 'pred_error'
    column_variance_df = frames_df.groupby(column_to_analyze)['pred_error'].mean().reset_index()

    # Round to 2 decimal places
    column_variance_df['pred_error'] = round(column_variance_df['pred_error'], 2)

    # Sort by 'column_to_analyze' in ascending order
    column_variance_df = column_variance_df.sort_values(by=column_to_analyze, ascending=True)

    # Return DataFrame with results
    return column_variance_df

column_to_analyze = 'position'
# test_with_predictions_df

# Call the function
column_variance_df = find_column_variance(test_frames_concat_df, column_to_analyze)

# Print the DataFrame with results
print(f"Average Pred Error per {column_to_analyze}:")
print(column_variance_df)

NameError: name 'test_naive_df' is not defined

In [14]:
# Concatenate the frames DataFrames into a single large DataFrame
test_frames_concat_df = pd.concat(test_frames_dfs, ignore_index=True)

predict_two_seconds_naive_acceleration(test_frames_concat_df)

2.08

In [16]:
column_to_analyze = 'position'

# Call the function
column_variance_df = find_column_variance(test_frames_concat_df, column_to_analyze)

# Print the DataFrame with results
print(f"Average Pred Error per {column_to_analyze}:")
print(column_variance_df)

Average Pred Error per position:
               position  pred_error
0  Attacking Midfielder        2.07
1    Central Midfielder        2.05
2           Centre-Back        1.87
3  Defensive Midfielder        2.00
4               Forward        1.81
5             Full-Back        2.05
6            Goalkeeper        1.17
7               Unknown        1.84
8       Wide Midfielder        2.02
9                Winger        2.00
