# Notebook for training predictive models
### Import packages

In [1]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import pandas as pd
import numpy as np
import random
import glob
import os

from settings import *

### Helper functions

In [2]:
# Split the games into train, test, and validation sets, so that each game is treated separately
def split_match_ids(match_ids, train_size=0.7, test_size=0.1, val_size=0.2, random_state=42):
    """Partition match IDs into train, test, and validation subsets.

    The IDs are first split into a train part and a hold-out part; the
    hold-out part is then split again into validation and test parts.

    Args:
        match_ids: Collection of match identifiers accepted by train_test_split.
        train_size: Fraction of all matches used for training.
        test_size: Fraction of all matches used for testing.
        val_size: Fraction of all matches used for validation.
        random_state: Seed forwarded to both train_test_split calls.

    Returns:
        A tuple (train_ids, test_ids, val_ids) -- note that test comes before validation.

    Raises:
        ValueError: If train_size exceeds 1 or the three fractions do not sum to 1.
    """
    # Share of the matches that is left once the training matches are taken out
    holdout_fraction = 1.0 - train_size
    fractions_do_not_add_up = abs(train_size + test_size + val_size - 1.0) > 1e-6

    if holdout_fraction < 0 or fractions_do_not_add_up:
        raise ValueError("The sum of train_size, test_size, and val_size must be equal to 1.")

    # First cut: training matches versus everything else
    train_ids, holdout_ids = train_test_split(match_ids, train_size=train_size, random_state=random_state)

    # Second cut: test_size is expressed relative to the hold-out part, not to the full set
    test_share_of_holdout = test_size / holdout_fraction
    val_ids, test_ids = train_test_split(holdout_ids, test_size=test_share_of_holdout, random_state=random_state)

    return train_ids, test_ids, val_ids
    
# Get the latest model, for a given 'model_name', based on the sorted filenames in the models folder
def get_latest_model_filename(model_name):
    """Return the path of the most recent saved model for `model_name`.

    Candidates are the files in ./models/ whose name starts with `model_name`
    and ends with '.h5'. "Latest" means last in plain string order, so numeric
    suffixes only rank correctly when they are zero-padded to the same width.

    Args:
        model_name: Filename prefix identifying the model family.

    Returns:
        The path (models folder joined with the filename) of the last candidate.

    Raises:
        ValueError: If no matching model file exists.
    """
    models_folder = "./models/"

    # Collect the matching filenames in sorted (lexicographic) order
    existing_models = sorted(
        filename
        for filename in os.listdir(models_folder)
        if filename.startswith(model_name) and filename.endswith('.h5')
    )

    # Fail loudly: the original built this exception without raising it
    if not existing_models:
        raise ValueError(f"No existing model available with the given name: models/{model_name}_XX.h5")

    return os.path.join(models_folder, existing_models[-1])

### Load frames

In [3]:
# Load the processed/frames
def load_all_processed_frames():
    """Read every processed match parquet file into its own DataFrame.

    Walks all combinations of `seasons` and `competitions` (module-level
    settings) and loads each '*.parquet' file found in the corresponding
    'processed' folder under `DATA_LOCAL_FOLDER`.

    Returns:
        A list with one DataFrame per match file. Files are read in sorted
        path order within each season/competition folder, so the position of
        a match in the list is stable between runs and machines.
    """
    frames_dfs = []

    for selected_season in seasons:
        for selected_competition in competitions:
            processed_folder = f"{DATA_LOCAL_FOLDER}/data/{selected_season}/{selected_competition}/processed"

            # glob returns paths in arbitrary order; sort them because list positions
            # are later used as match ids for the seeded train/test/validation split
            match_paths = sorted(glob.glob(os.path.join(processed_folder, "*.parquet")))

            # One DataFrame per match file
            frames_dfs.extend(pd.read_parquet(match_path) for match_path in match_paths)

    return frames_dfs

# Load every match into a list of DataFrames (one entry per game)
frames_dfs = load_all_processed_frames()

# The position of a game in frames_dfs serves as its internal match_id
match_ids = range(len(frames_dfs))

# Assign each match_id to exactly one of the train, test, and validation sets
train_ids, test_ids, val_ids = split_match_ids(match_ids=match_ids)

# Look up the frames of every game in each of the three sets
train_frames_dfs, test_frames_dfs, val_frames_dfs = (
    [frames_dfs[match_id] for match_id in id_group]
    for id_group in (train_ids, test_ids, val_ids)
)

### Load predictive models

In [None]:
# NAIVE: Always predict that all players will stand still
# The calculations are based on x, y
def predict_two_seconds_naive_static(frames_df):
    """Predict that every object stays at its current position.

    Writes the columns 'x_future_pred' and 'y_future_pred' into `frames_df`
    in place, as copies of 'x' and 'y'. Returns None.
    """
    # (prediction column, source column) pairs
    column_pairs = (('x_future_pred', 'x'), ('y_future_pred', 'y'))
    for pred_column, position_column in column_pairs:
        frames_df[pred_column] = frames_df[position_column]

# NAIVE: Always predict that all players will continue with the same velocity
# The calculations are based on x, y, v_x, and v_y
def predict_two_seconds_naive_velocity(frames_df):
    """Predict future positions by extrapolating the current velocity.

    Writes 'x_future_pred' and 'y_future_pred' into `frames_df` in place as
    position + velocity * `seconds_into_the_future` (a module-level setting).
    Returns None.
    """
    horizon = seconds_into_the_future

    # (prediction column, position column, velocity column) per axis
    axis_columns = (
        ('x_future_pred', 'x', 'v_x'),
        ('y_future_pred', 'y', 'v_y'),
    )
    for pred_column, position_column, velocity_column in axis_columns:
        frames_df[pred_column] = frames_df[position_column] + frames_df[velocity_column] * horizon

# Make a prediction with a LSTM neural network model
def predict_two_seconds_LSTM(frames_df):
    """Placeholder for the LSTM-based predictor.

    Not implemented yet: `frames_df` is left untouched and None is returned.
    """
    return None

### Functions for calculating error

In [None]:
# Add a column for distance wrongly predicted (in metres) for each object
def add_pred_error(frames_df):
    """Store the per-row prediction error in a 'pred_error' column.

    The error is the Euclidean distance between the predicted position
    ('x_future_pred', 'y_future_pred') and the true future position
    ('x_future', 'y_future'), rounded to two decimals. `frames_df` is
    modified in place; nothing is returned.
    """
    # Offset between prediction and truth along each axis
    delta_x = frames_df['x_future_pred'] - frames_df['x_future']
    delta_y = frames_df['y_future_pred'] - frames_df['y_future']

    # Euclidean distance between the true position and the predicted position
    distance = (delta_x**2 + delta_y**2)**0.5
    frames_df['pred_error'] = distance.round(2)
    
# Return the average prediction error, optionally ignoring the ball and frames where the ball is not in motion
def total_error_loss(frames_df, include_ball=False, ball_has_to_be_in_motion=True):
    """Compute the mean 'pred_error' over the selected rows.

    If `frames_df` has no 'pred_error' column yet, it is added first via
    add_pred_error (which modifies `frames_df` in place). No other column
    of `frames_df` is touched.

    Args:
        frames_df: DataFrame with 'pred_error' (or the columns needed to
            compute it), plus 'ball_in_motion' and/or 'team' when the
            corresponding filter is active.
        include_ball: When False, rows whose 'team' equals 'ball' are ignored.
        ball_has_to_be_in_motion: When True, rows whose 'ball_in_motion' is
            not True are ignored.

    Returns:
        The mean error of the remaining rows, rounded to two decimals
        (NaN when no row remains).
    """
    # The original tested `.empty`, which raises KeyError when the column is
    # missing and is only True for a frame without rows
    if 'pred_error' not in frames_df.columns:
        add_pred_error(frames_df)

    # Positional boolean mask of rows to leave out; using a mask instead of a
    # temporary column keeps frames_df unmodified
    excluded = np.zeros(len(frames_df), dtype=bool)

    # If specified, leave out frames where the ball is not in motion
    if ball_has_to_be_in_motion:
        excluded |= (frames_df["ball_in_motion"] != True).to_numpy()

    # If specified, leave out the rows that describe the ball itself
    if not include_ball:
        excluded |= (frames_df['team'] == 'ball').to_numpy()

    # Mean over the rows that are left (missing errors are skipped)
    average_pred_error = frames_df['pred_error'].loc[~excluded].mean()

    return round(average_pred_error, 2)

# Calculate the average error for a list of games
def calculate_average_error(frames_dfs, predict_function, include_ball, ball_has_to_be_in_motion):
    """Run a prediction model over several games and return its average error.

    Every DataFrame in `frames_dfs` is modified in place: `predict_function`
    is applied to each game first, then add_pred_error fills 'pred_error'.

    Args:
        frames_dfs: List of per-game DataFrames.
        predict_function: Callable taking one DataFrame and writing its predictions into it.
        include_ball: Forwarded to total_error_loss.
        ball_has_to_be_in_motion: Forwarded to total_error_loss.

    Returns:
        The value returned by total_error_loss for all games combined.
    """
    # First pass: predict the future positions for every game
    for match_frames in frames_dfs:
        predict_function(match_frames)

    # Second pass: derive the per-row error from those predictions
    for match_frames in frames_dfs:
        add_pred_error(match_frames)

    # Evaluate all games together as one combined DataFrame
    all_frames = pd.concat(frames_dfs)
    return total_error_loss(all_frames, include_ball, ball_has_to_be_in_motion)

# Find a frame with approximately the same error as the average_pred_error, within a margin
def find_frame_with_average_error(frames_df, average_pred_error, error_margin):
    """Return the first frame whose mean 'pred_error' is close to a target value.

    Args:
        frames_df: DataFrame with a 'frame' column and a 'pred_error' column.
        average_pred_error: Target error to look for.
        error_margin: Accepted deviation from the target (bounds are inclusive).

    Returns:
        The first frame, in order of appearance in `frames_df`, whose mean
        error lies in [average_pred_error - error_margin,
        average_pred_error + error_margin]; None (after printing a message)
        if there is no such frame.
    """
    # Mean error of every frame in a single pass; sort=False keeps the frames in
    # order of first appearance, so "first match" means the same as scanning unique()
    mean_error_per_frame = frames_df.groupby('frame', sort=False)['pred_error'].mean()

    lower_bound = average_pred_error - error_margin
    upper_bound = average_pred_error + error_margin
    within_margin = mean_error_per_frame[
        (mean_error_per_frame >= lower_bound) & (mean_error_per_frame <= upper_bound)
    ]

    if not within_margin.empty:
        return within_margin.index[0]

    # If no frame was found
    print(f"No frame found within the error margin of {error_margin}")
    return None

## Evaluate models
### Evaluate the NAIVE models

In [None]:
# Test model 1
error_naive_static = calculate_average_error(test_frames_dfs, predict_two_seconds_naive_static, include_ball=False, ball_has_to_be_in_motion=True)
print(f"Model error naive static: {error_naive_static}")

# Pick the example frame now, while 'pred_error' in the test frames still holds the
# naive-static errors: evaluating the next model overwrites the prediction and
# error columns of the same DataFrames in place
frames_df = test_frames_dfs[0]
frame_with_average_error = find_frame_with_average_error(frames_df, error_naive_static, error_margin=0.1)

# Test model 2
error_naive_velocity = calculate_average_error(test_frames_dfs, predict_two_seconds_naive_velocity, include_ball=False, ball_has_to_be_in_motion=True)
print(f"Model error naive velocity: {error_naive_velocity}")

# Visualize one frame with average error for the first model, together with the corresponding frame for the second model
# NOTE(review): visualize_frame_prediction is not defined in this part of the file, and at this
# point the prediction columns of frames_df hold the naive-velocity values -- confirm that the
# helper recomputes the predictions for the model name it is given
if frame_with_average_error is not None:
    visualize_frame_prediction(frames_df, frame_with_average_error, "naive_static")
    visualize_frame_prediction(frames_df, frame_with_average_error, "naive_velocity")

### Evaluate the NAIVE models with different parameters

In [None]:
# Models under evaluation, keyed by the column label they get in the results table
prediction_functions = {
    "Naive Static": predict_two_seconds_naive_static,
    "Naive Velocity": predict_two_seconds_naive_velocity,
}

# Every (include_ball, ball_has_to_be_in_motion) pairing, in the order the rows are reported
combinations = [
    (ball_flag, motion_flag)
    for ball_flag in (True, False)
    for motion_flag in (True, False)
]

# Re-read all processed frames from disk; this replaces the earlier frames_dfs,
# so the evaluation below covers every loaded match
frames_dfs = load_all_processed_frames()

# One row per parameter combination, one column per model
results = []
for include_ball, ball_has_to_be_in_motion in combinations:
    # Start the row with the parameter values themselves
    result = {"Include Ball": include_ball, "Ball in Motion": ball_has_to_be_in_motion}

    # Add the average error of each model under these parameters
    for model_name, predict_function in prediction_functions.items():
        avg_error = calculate_average_error(frames_dfs, predict_function, include_ball, ball_has_to_be_in_motion)
        result.update({model_name: avg_error})

    results.append(result)

# Turn the collected rows into a table
results_df = pd.DataFrame(results)

# Display the resulting DataFrame
results_df