# Notebook for training predictive models
### Import packages

In [1]:
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense, BatchNormalization, Dropout, Reshape, LSTM
from tensorflow.keras.models import Model, load_model
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from datetime import date

import tensorflow as tf
import pandas as pd
import numpy as np
import random
import glob
import os

from utils import load_processed_frames, split_match_ids, get_next_model_filename, euclidean_distance_loss, total_error_loss, define_regularizers, prepare_EL_input_data, create_embeddings, smooth_predictions_xy, run_model, evaluate_model
from visualize_game import visualize_training_results
from settings import *

2024-04-03 17:56:16.234227: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


### Global variables

In [2]:
# Define numerical, categorical, and y columns.
# NOTE: `numerical_cols` and `categorical_cols` are read as notebook-level globals by the
# training functions at call time, and both are rebound by later cells in this notebook.
numerical_cols = ['x', 'y', 'v_x', 'v_y', 'a_x', 'a_y']
categorical_cols = ['team_direction', 'role']
y_cols = ['x_future', 'y_future']

# Define parameters for model training.
# NOTE: `n_epochs` and `sequence_length` are overridden by the LSTM training cell further down.
n_epochs = 1
batch_size = 32
n_matches = 40
downsampling_factor = 5     # Keep every n:th frame
sequence_length = 1    # Sequence length for LSTM model

In [3]:
# Define denominators for normalization (one divisor per column name).
# Every key must be unique: a repeated key in a dict literal silently keeps only the
# last value, which would leave a column (e.g. 'v_x' or 'a_x') without a denominator.
# NOTE(review): no cell visible in this notebook reads this dict directly; confirm that
# the data-preparation helpers use a mapping with the same entries.
denominators = {
    'x': pitch_length,
    'y': pitch_width,
    'v_x': 13,
    'v_y': 13,
    'a_x': 10,
    'a_y': 10,
    'acc': 20,
    'pac': 20,
    'sta': 20,
    'height': 2.10,
    'weight': 110,
    # Pitch diagonal, i.e. the largest distance between two points on the pitch
    'distance_to_ball': round(np.sqrt((pitch_length**2 + pitch_width**2)), 2),
    'angle_to_ball': 360,
    'orientation': 360,
    'tiredness': 10,
    'minute': 45,
    'period': 2,
}

### Load frames

In [6]:
# Read the processed frames of up to `n_matches` matches into a list
frames_dfs = load_processed_frames(n_matches=n_matches)

# A match is identified internally by its position in `frames_dfs`
match_ids = range(len(frames_dfs))

# Partition the match IDs (whole matches, not single frames) into train, test, and validation
train_ids, test_ids, val_ids = split_match_ids(match_ids=match_ids)

# Collect the frames that belong to each partition
train_frames_dfs, test_frames_dfs, val_frames_dfs = (
    [frames_dfs[match_id] for match_id in partition_ids]
    for partition_ids in (train_ids, test_ids, val_ids)
)

## Predictive model 1
### NN with Embedding layers
Player-based model

In [8]:
# Define the neural network model with embeddings for categorical features.
def define_NN_model_with_embedding(numerical_input_shape, l1=0, l2=0):
    """Build the (uncompiled) dense network that predicts `x_future` and `y_future`.

    Whether embeddings are used is decided by the notebook-level global
    `categorical_cols` at call time.

    Args:
        numerical_input_shape: Number of numerical features per sample.
        l1: L1 factor forwarded to `define_regularizers`.
        l2: L2 factor forwarded to `define_regularizers`.

    Returns:
        A Keras `Model` with a 2-unit linear output. With categorical columns its
        inputs are the embedding inputs followed by the numerical input; without
        them the single numerical input tensor is used.
    """
    use_embeddings = bool(categorical_cols)

    # Embedding inputs come first, mirroring the order of the model inputs below
    if use_embeddings:
        embedding_inputs, embedding_outputs = create_embeddings(categorical_cols)

    numeric_in = Input(shape=(numerical_input_shape,), name='numerical_input')

    if use_embeddings:
        features = Concatenate()([*embedding_outputs, numeric_in])
        model_inputs = [*embedding_inputs, numeric_in]
    else:
        features = numeric_in
        model_inputs = numeric_in

    # Two hidden layers (64 and 32 units) sharing the same kernel regularizer
    weight_penalty = define_regularizers(l1, l2)
    hidden = features
    for units in (64, 32):
        hidden = Dense(units, activation='relu', kernel_regularizer=weight_penalty)(hidden)

    # Linear output layer for x_future and y_future
    return Model(inputs=model_inputs, outputs=Dense(2)(hidden))

def train_NN_model_with_embedding(train_frames_dfs, val_frames_dfs, positions=None, l1=0, l2=0, special_text=None):
    """Train the embedding-based NN model, save it and write a training log.

    Feature columns and hyper-parameters are read from the notebook-level globals
    `numerical_cols`, `categorical_cols`, `n_epochs`, `batch_size` and `n_matches`
    at call time.

    Args:
        train_frames_dfs: Frames used to fit the model.
        val_frames_dfs: Frames used as validation data while fitting.
        positions: Positions forwarded to `prepare_EL_input_data`. Defaults to an
            empty list.
        l1: L1 factor forwarded to the model definition.
        l2: L2 factor forwarded to the model definition.
        special_text: Optional free-text line appended to the log header.

    Side effects:
        Saves the model under the name returned by
        `get_next_model_filename("NN_model")` and writes a `.txt` file with the
        same base name holding the run settings and the per-epoch history.
    """
    # Build a fresh list per call; a mutable default would be shared between calls
    if positions is None:
        positions = []

    # Prepare inputs
    X_train_input, y_train = prepare_EL_input_data(train_frames_dfs, numerical_cols, categorical_cols, positions=positions)
    X_val_input, y_val = prepare_EL_input_data(val_frames_dfs, numerical_cols, categorical_cols, positions=positions)

    # Define and compile the model
    model = define_NN_model_with_embedding(numerical_input_shape=len(numerical_cols), l1=l1, l2=l2)
    model.compile(optimizer='adam', loss=euclidean_distance_loss)

    # Train the model
    history = model.fit(X_train_input, y_train, validation_data=(X_val_input, y_val), epochs=n_epochs, batch_size=batch_size, verbose=2)

    # Save the trained model to disk
    model_filename = get_next_model_filename("NN_model")
    model.save(model_filename)

    # The log file shares the model's base name, with a .txt extension
    output_txt_filename = os.path.splitext(model_filename)[0] + ".txt"

    # General info written at the beginning of the file
    header_lines = [
        date.today().strftime("%Y-%m-%d"),
        f"epochs={n_epochs}",
        f"matches={n_matches}",
        f"numerical_cols={numerical_cols}",
        f"categorical_cols={categorical_cols}",
        f"positions={positions}",
    ]
    # Optional settings are only logged when they were actually used
    if l1 != 0:
        header_lines.append(f"l1={l1}")
    if l2 != 0:
        header_lines.append(f"l2={l2}")
    if special_text:
        header_lines.append(f"{special_text}")

    # Explicit encoding keeps the log identical across platforms (e.g. non-ASCII special_text)
    with open(output_txt_filename, 'w', encoding='utf-8') as f:
        f.write("\n".join(header_lines) + "\n")

        # Write the training results, one line per tracked metric
        f.write("\nTraining results:\n")
        for key, value in history.history.items():
            rounded_values = [round(v, 2) for v in value]
            f.write(f"{key}: {rounded_values}\n")

In [9]:
# # Train the NN model with embedding layers
# n_epochs = 5
# categorical_cols = ['position']
# # positions = ["Attacking Midfielder", "Central Midfielder", "Centre-Back", "Defensive Midfielder", "Forward", "Full-Back", "Goalkeeper", "Wide Midfielder", "Winger"]
# positions = ["Central Midfielder", "Winger"]
# # positions = ["Goalkeeper"]

# numerical_cols = ['x', 'y', 'v_x', 'v_y', 'a_x', 'a_y', 'distance_to_ball']
# train_NN_model_with_embedding(train_frames_dfs, val_frames_dfs, positions=positions)

# numerical_cols = ['x', 'y', 'v_x', 'v_y', 'a_x', 'a_y', 'distance_to_ball', 'minute', 'distance_ran', 'sta']
# train_NN_model_with_embedding(train_frames_dfs, val_frames_dfs, positions=positions)

### Test for different values

In [20]:
# model_names = ["NN_model_v2"]
# for model_name in model_names:
#     error = evaluate_model(test_frames_dfs, model_name, LSTM=False)
#     print(f"{model_name}: {error}")

# for i in range(2, 5):
#     model_name = f"NN_model_v{i}"
#     error = evaluate_model(test_frames_dfs, model_name, LSTM=False)
#     print(f"{model_name}: {error}\n")

# Test different alpha values

# frames_df = run_model(test_frames_dfs, "LSTM_model_v5", LSTM=True)
# alpha = 1.0
# for i in range(10):
#     frames_df = frames_df.copy()
#     smooth_predictions_xy(frames_df, alpha=alpha)
#     error = total_error_loss(frames_df)
#     print(f"{round(alpha, 2)}: {error}")
#     alpha -= 0.01

In [12]:
# Calculates how the average 'pred_error' varies with each value in 'column_to_analyze'
def find_column_variance(frames_df, column_to_analyze):
    """Return the mean 'pred_error' for every value of `column_to_analyze`.

    The input DataFrame is not modified.

    Args:
        frames_df: DataFrame containing 'pred_error' and the column(s) to analyze.
        column_to_analyze: Column label, or list of column labels, to group by.

    Returns:
        DataFrame with the grouping column(s) and a 'pred_error' column holding the
        group mean rounded to 2 decimals, sorted ascending by the grouping column(s).
    """
    # Coerce non-numeric values to NaN on a separate Series so the caller's frame is untouched
    numeric_error = pd.to_numeric(frames_df['pred_error'], errors='coerce')

    # Group the coerced errors by the requested column(s)
    key_labels = column_to_analyze if isinstance(column_to_analyze, list) else [column_to_analyze]
    grouping_keys = [frames_df[label] for label in key_labels]
    column_variance_df = numeric_error.groupby(grouping_keys).mean().reset_index()

    # Round to 2 decimal places
    column_variance_df['pred_error'] = round(column_variance_df['pred_error'], 2)

    # Sort by 'column_to_analyze' in ascending order
    return column_variance_df.sort_values(by=column_to_analyze, ascending=True)

# alpha = 1
# frames_df = run_model("NN_model_v3")
# total_error_loss(frames_df)
# column_to_analyze = 'position'

# # Call the function
# column_variance_df = find_column_variance(frames_df, column_to_analyze)

# # Print the DataFrame with results
# print(f"Average Pred Error per {column_to_analyze}:")
# print(column_variance_df)

## Predictive model 2
### LSTM model
Player-based model

In [13]:
# Add a vector indicating if the row can be sequentialized, i.e. the player has 'sequence_length' consecutive frames
def add_can_be_sequentialized(frames_df, sequence_length):
    """Add a boolean 'can_be_sequentialized' column to `frames_df` in place.

    A row is marked True when the row `sequence_length` positions earlier for the
    same player has exactly the frame number expected for an unbroken sequence.
    Players are identified by ('team', 'jersey_number') and, when the frame has a
    'match_id' column, additionally by match, so that a sequence never continues
    from one match into the next.

    Reads the globals `FPS` and `downsampling_factor`.

    Args:
        frames_df: DataFrame with 'frame', 'team' and 'jersey_number' columns
            (and optionally 'match_id'). Modified in place.
        sequence_length: Number of rows to look back.
    """
    # Frame number the row `sequence_length` steps back must have.
    # NOTE(review): this equals sequence_length * downsampling_factor only when
    # FPS == downsampling_factor ** 2 - confirm the intended frame spacing.
    frame_offset = sequence_length * FPS // downsampling_factor
    expected_start_frame = frames_df['frame'] - frame_offset

    # Identify each unique player, per match when that information is available
    player_keys = ['team', 'jersey_number']
    if 'match_id' in frames_df.columns:
        player_keys.append('match_id')

    # Frame number actually found `sequence_length` rows earlier for the same player
    # (NaN when the player has no such row, which compares as False below)
    earlier_frame = frames_df.groupby(player_keys)['frame'].shift(sequence_length)

    frames_df['can_be_sequentialized'] = expected_start_frame == earlier_frame

# Sequentialize the numerical and categorical columns
def sequentialize_data(X_df, y_df, numerical_cols, categorical_cols, sequence_length):
    """Turn per-frame rows into per-player sequences for the LSTM model.

    For every player and match (grouped by 'team', 'jersey_number', 'match_id'),
    each row is paired with the numerical values of the `sequence_length - 1` rows
    before it (oldest first). Only rows flagged by `add_can_be_sequentialized`
    are kept.

    Side effects: `X_df` is modified in place. The columns 'future_xy',
    'can_be_sequentialized', 'numerical_data_list' and (if categorical columns are
    given) 'categorical_data_list' are added to it.

    Args:
        X_df: Feature DataFrame. Besides the feature columns it must contain
            'team', 'match_id', 'player', 'jersey_number' and 'frame'.
        y_df: Target DataFrame, row-aligned with `X_df`.
        numerical_cols: Names of the numerical feature columns.
        categorical_cols: Names of the categorical feature columns (may be empty).
        sequence_length: Number of rows per sequence.

    Returns:
        `(X_seq, y_seq)`. `y_seq` is a float32 array of the targets. `X_seq` is the
        float32 array of numerical sequences when there are no categorical columns,
        otherwise the list `[categorical_array, numerical_array]`.
        In the notebook's sample output the numerical array has shape
        (n_rows, sequence_length, len(numerical_cols)).
    """
    # Initialize empty lists with sequentialized data
    X_seq_num_data = []
    X_seq_cat_data = []
    y_seq_data = []
    
    # Combine the values in y_df with X_df (one [x_future, y_future]-style list per row)
    X_df['future_xy'] = y_df.values.tolist()

    # Add vector 'can_be_sequentialized'
    add_can_be_sequentialized(X_df, sequence_length=sequence_length)

    # Create a vector containing a list of all values in the numerical columns
    X_df['numerical_data_list'] = X_df[numerical_cols].values.tolist()
    
    # Create a similar list for the categorical columns, if any
    if categorical_cols:
        # X_df['categorical_data_list'] = X_df[categorical_cols].values.tolist()
        X_df['categorical_data_list'] = X_df[categorical_cols].apply(lambda x: x.tolist(), axis=1)

    # Sort the DataFrame by 'team', 'match_id', and most importantly 'player'
    # NOTE(review): 'frame' is not a sort key, so the shifts below assume each player's
    # rows already arrive in ascending frame order - TODO confirm against prepare_data.
    X_df_sorted = X_df.sort_values(by=['team', 'match_id', 'player'])

    # Group by each unique player (per match)
    grouped = X_df_sorted.groupby(['team', 'jersey_number', 'match_id'])

    # Iterate through each player and create sequences
    for _, group in grouped:
        # Create temporary columns with shifted versions of 'numerical_data_list':
        # column i holds the values from i rows earlier (NaN where no such row exists)
        for i in range(sequence_length):
            group['numerical_data_list_' + str(i)] = group["numerical_data_list"].shift(i)

        # Concatenate the temporary columns to create the column 'sequential_numerical_data'.
        # The list is reversed so that each sequence runs from the oldest to the current row.
        columns_to_sequentialize = ['numerical_data_list_' + str(i) for i in range(sequence_length)][::-1]
        group['sequential_numerical_data'] = group[columns_to_sequentialize].values.tolist()

        # Only consider rows that can be sequentialized
        group = group[group['can_be_sequentialized']]

        # Add the X data to the sequentialized lists
        X_seq_num_data.append(group['sequential_numerical_data'])
        
        if categorical_cols:
            X_seq_cat_data.append(group['categorical_data_list'])

        # Add the y data to the sequentialized lists
        y_seq_data.append(group['future_xy'])

    # Combine all the sequentialized data to create Series
    # NOTE(review): the lists are empty when X_df yields no groups; check how pd.concat
    # behaves on an empty list before calling this with possibly empty data.
    X_seq_num = pd.concat(X_seq_num_data)
    y_seq = pd.concat(y_seq_data)

    # Convert the Pandas Series of lists to a NumPy array
    X_seq_num_np = np.array(X_seq_num.tolist()).astype('float32')
    y_seq_np = np.array(y_seq.tolist()).astype('float32')
    
    # Add the data from categorical columns to X_seq_np
    if categorical_cols:
        X_seq_cat = pd.concat(X_seq_cat_data)
        # All categorical columns end up in ONE array (one column per categorical feature).
        # NOTE(review): verify this matches the number of inputs the model built from
        # create_embeddings expects when more than one categorical column is used.
        X_seq_cat_np = np.array(X_seq_cat.tolist()).astype('float32')
        X_seq_np = [X_seq_cat_np, X_seq_num_np]

        return X_seq_np, y_seq_np
    
    # Return the results without adding categorical data
    else:
        return X_seq_num_np, y_seq_np

def prepare_LSTM_input_data(frames_dfs, numerical_cols, categorical_cols, sequence_length, positions=None):
    """Prepare sequentialized model inputs and targets for the LSTM model.

    Args:
        frames_dfs: List of frame DataFrames to prepare.
        numerical_cols: Names of the numerical feature columns.
        categorical_cols: Names of the categorical feature columns (may be empty).
        sequence_length: Number of rows per sequence.
        positions: Positions forwarded to `prepare_data`. Defaults to an empty list.

    Returns:
        The `(X_seq, y_seq)` pair produced by `sequentialize_data`.
    """
    # Build a fresh list per call; a mutable default would be shared between calls
    if positions is None:
        positions = []

    # Identifier columns that prepare_data() must pass through untouched;
    # sequentialize_data() needs them to group and order the rows per player
    unchanged_cols = ['player', 'frame', 'team', 'jersey_number', 'match_id']

    # Prepare data
    # NOTE(review): `prepare_data` is not among the names imported explicitly in this
    # notebook's import cell - confirm it is provided (e.g. via `from settings import *`)
    # or add it to the imports.
    X_df, y_df = prepare_data(
        frames_dfs,
        numerical_cols=numerical_cols,
        categorical_cols=categorical_cols,
        unchanged_cols=unchanged_cols,
        positions=positions,
        include_ball=False,
        ball_has_to_be_in_motion=True,
    )

    # Sequentialize the data
    return sequentialize_data(X_df, y_df, numerical_cols, categorical_cols, sequence_length)

In [14]:
# Define the NN model with LSTM layer
def define_LSTM_model(numerical_input_shape, sequence_length, l1=0, l2=0):
    """Build the (uncompiled) LSTM network that predicts `x_future` and `y_future`.

    Whether embeddings are used is decided by the notebook-level global
    `categorical_cols` at call time.

    Args:
        numerical_input_shape: Number of numerical features per time step.
        sequence_length: Number of time steps per input sequence.
        l1: L1 factor forwarded to `define_regularizers`.
        l2: L2 factor forwarded to `define_regularizers`.

    Returns:
        A Keras `Model` with a 2-unit linear output. Its inputs are the embedding
        inputs (if any) followed by the numerical sequence input.
    """
    use_embeddings = bool(categorical_cols)

    # Create embeddings for categorical data, if there is any
    if use_embeddings:
        embedding_inputs, embedding_outputs = create_embeddings(categorical_cols)

    # The numerical sequence is always summarized by a 64-unit LSTM
    sequence_in = Input(shape=(sequence_length, numerical_input_shape), name='numerical_input')
    sequence_summary = LSTM(64)(sequence_in)

    if use_embeddings:
        # The LSTM output is concatenated with the categorical embeddings
        # Note: This might need adjustment based on how you want to use categorical data
        features = Concatenate()([sequence_summary] + embedding_outputs)
        model_inputs = embedding_inputs + [sequence_in]
    else:
        # Only numerical columns: use the LSTM output directly
        features = sequence_summary
        model_inputs = [sequence_in]

    # Two hidden layers (64 and 32 units) sharing the same kernel regularizer
    weight_penalty = define_regularizers(l1, l2)
    hidden = features
    for units in (64, 32):
        hidden = Dense(units, activation='relu', kernel_regularizer=weight_penalty)(hidden)

    # Linear output layer for x_future and y_future
    return Model(inputs=model_inputs, outputs=Dense(2)(hidden))

def train_LSTM_model(train_frames_dfs, val_frames_dfs, sequence_length, positions=None, l1=0, l2=0, special_text=None):
    """Train the LSTM model, save it and write a training log.

    Feature columns and hyper-parameters are read from the notebook-level globals
    `numerical_cols`, `categorical_cols`, `n_epochs`, `batch_size` and `n_matches`
    at call time.

    Args:
        train_frames_dfs: Frames used to fit the model.
        val_frames_dfs: Frames used as validation data while fitting.
        sequence_length: Number of time steps per input sequence.
        positions: Positions forwarded to `prepare_LSTM_input_data`. Defaults to an
            empty list.
        l1: L1 factor forwarded to the model definition.
        l2: L2 factor forwarded to the model definition.
        special_text: Optional free-text line appended to the log header.

    Side effects:
        Saves the model under the name returned by
        `get_next_model_filename("LSTM_model")` and writes a `.txt` file with the
        same base name holding the run settings and the per-epoch history.
    """
    # Build a fresh list per call; a mutable default would be shared between calls
    if positions is None:
        positions = []

    # Prepare inputs
    X_train_input, y_train = prepare_LSTM_input_data(train_frames_dfs, numerical_cols, categorical_cols, sequence_length, positions=positions)
    X_val_input, y_val = prepare_LSTM_input_data(val_frames_dfs, numerical_cols, categorical_cols, sequence_length, positions=positions)

    # Define and compile the model
    model = define_LSTM_model(numerical_input_shape=len(numerical_cols), sequence_length=sequence_length, l1=l1, l2=l2)
    model.compile(optimizer='adam', loss=euclidean_distance_loss)

    # Train the model
    history = model.fit(X_train_input, y_train, validation_data=(X_val_input, y_val), epochs=n_epochs, batch_size=batch_size, verbose=2)

    # Save the trained model to disk
    model_filename = get_next_model_filename("LSTM_model")
    model.save(model_filename)

    # The log file shares the model's base name, with a .txt extension
    output_txt_filename = os.path.splitext(model_filename)[0] + ".txt"

    # General info written at the beginning of the file
    header_lines = [
        date.today().strftime("%Y-%m-%d"),
        f"epochs={n_epochs}",
        f"matches={n_matches}",
        f"sequence_length={sequence_length}",
        f"numerical_cols={numerical_cols}",
        f"categorical_cols={categorical_cols}",
        f"positions={positions}",
    ]
    # Optional settings are only logged when they were actually used
    if l1 != 0:
        header_lines.append(f"l1={l1}")
    if l2 != 0:
        header_lines.append(f"l2={l2}")
    if special_text:
        header_lines.append(f"{special_text}")

    # Explicit encoding keeps the log identical across platforms (e.g. non-ASCII special_text)
    with open(output_txt_filename, 'w', encoding='utf-8') as f:
        f.write("\n".join(header_lines) + "\n")

        # Write the training results, one line per tracked metric
        f.write("\nTraining results:\n")
        for key, value in history.history.items():
            rounded_values = [round(v, 2) for v in value]
            f.write(f"{key}: {rounded_values}\n")

In [21]:
# Settings for this LSTM training run.
# NOTE: these assignments rebind notebook-level globals (`n_epochs`, `categorical_cols`,
# `sequence_length`, `numerical_cols`) that the training functions read at call time,
# so they stay in effect for every cell executed afterwards.
n_epochs = 4
positions=['Attacking Midfielder', 'Central Midfielder', 'Centre-Back', 'Defensive Midfielder', 'Forward', 'Full-Back', 'Goalkeeper', 'Wide Midfielder', 'Winger']
categorical_cols = []  # Train on numerical features only
sequence_length = 10

# Train the LSTM model; it is saved to disk together with a .txt training log
numerical_cols=['x', 'y', 'v_x', 'v_y', 'a_x', 'a_y', 'distance_to_ball']
train_LSTM_model(train_frames_dfs, val_frames_dfs, sequence_length, positions=positions)

Epoch 1/4
426189/426189 - 993s - loss: 1.7130 - val_loss: 1.5130 - 993s/epoch - 2ms/step
Epoch 2/4
426189/426189 - 996s - loss: 1.6162 - val_loss: 1.4690 - 996s/epoch - 2ms/step
Epoch 3/4
426189/426189 - 1004s - loss: 1.6021 - val_loss: 1.4677 - 1004s/epoch - 2ms/step
Epoch 4/4


In [38]:
# Evaluate a previously saved LSTM model on the test matches.
# NOTE: the model name is hard-coded; make sure it refers to the model you intend to
# evaluate (training picks the saved name via get_next_model_filename).
model_name = "LSTM_model_v4"
error = evaluate_model(test_frames_dfs, model_name, LSTM=True)
print(f"{model_name}: {error}")

LSTM_model_v4: 1.331


### Test the LSTM data preparation on a smaller dataset

In [30]:
# Build a small sample: 10 downsampled goalkeeper rows from the first validation match.
# The explicit .copy() makes the sample an independent DataFrame, so the column
# assignments below neither trigger pandas' chained-assignment warning nor depend on
# whether the filtered result is a view of the original frames.
frames_df = val_frames_dfs[0]
frames_df = frames_df[frames_df['frame'] % downsampling_factor == 0]  # Keep every n:th frame
frames_df = frames_df[frames_df['position'] == "Goalkeeper"].iloc[10:20].copy()

# Round the displayed columns to make the table easier to read
rounded_cols = ['x', 'y', 'v_x']
frames_df[rounded_cols] = frames_df[rounded_cols].round(1)

add_can_be_sequentialized(frames_df, sequence_length=2)

# NOTE: this rebinds the notebook-level `numerical_cols` that the training functions
# read; re-run the configuration / training-settings cell before training again.
numerical_cols = ['x', 'y', 'v_x']
frames_df['numerical_cols_list'] = frames_df[numerical_cols].values.tolist()
frames_df[['player', 'frame', 'x', 'y', 'v_x', 'x_future', 'y_future', 'can_be_sequentialized']]

Unnamed: 0,player,frame,x,y,v_x,x_future,y_future,can_be_sequentialized
560,Peter Abrahamsson,25,10.4,33.5,-1.0,8.35,32.28,False
570,Noel Törnqvist,25,97.2,35.5,-0.2,95.2,34.93,False
670,Peter Abrahamsson,30,10.1,33.3,-1.0,8.18,32.22,False
680,Noel Törnqvist,30,97.1,35.5,-0.3,94.85,34.88,False
780,Peter Abrahamsson,35,9.9,33.1,-1.0,8.01,32.18,True
790,Noel Törnqvist,35,97.0,35.5,-0.3,94.44,34.85,True
890,Peter Abrahamsson,40,9.8,33.0,-0.8,7.79,32.15,True
900,Noel Törnqvist,40,97.0,35.4,-0.3,94.01,34.86,True
996,Peter Abrahamsson,45,9.6,32.9,-0.8,7.51,32.1,True
1006,Noel Törnqvist,45,96.9,35.3,-0.7,93.64,34.88,True


In [54]:
# Run the full LSTM data preparation on the 10-row sample, using sequences of 2 rows.
# `numerical_cols`, `categorical_cols` and `positions` are whatever the notebook-level
# globals hold at this point.
X_seq_data, y_seq_data = prepare_LSTM_input_data([frames_df], numerical_cols, categorical_cols, sequence_length=2, positions=positions)

In [55]:
X_seq_data

[array([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]], dtype=float32),
 array([[[ 0.9247619 ,  0.52205884, -0.3       ],
         [ 0.9238095 ,  0.52205884, -0.3       ]],
 
        [[ 0.9238095 ,  0.52205884, -0.3       ],
         [ 0.9238095 ,  0.5205882 , -0.3       ]],
 
        [[ 0.9238095 ,  0.5205882 , -0.3       ],
         [ 0.92285717,  0.51911765, -0.7       ]],
 
        [[ 0.09619047,  0.4897059 , -1.        ],
         [ 0.09428571,  0.4867647 , -1.        ]],
 
        [[ 0.09428571,  0.4867647 , -1.        ],
         [ 0.09333333,  0.4852941 , -0.8       ]],
 
        [[ 0.09333333,  0.4852941 , -0.8       ],
         [ 0.09142857,  0.48382354, -0.8       ]]], dtype=float32)]

### Visualize training results

In [None]:
# # Visualize training results
# model_name = 'NN_embedding_model_3'
# training_results = {
#     'loss': [2.0478146076202393, 2.0088889598846436, 2.0007753372192383, 1.9968146085739136, 1.9937269687652588, 1.9921172857284546, 1.990675687789917, 1.9893001317977905, 1.9881930351257324, 1.9875684976577759, 1.9872304201126099, 1.9865171909332275, 1.9859004020690918, 1.985435128211975, 1.9848004579544067, 1.983401894569397, 1.9824390411376953, 1.9820188283920288, 1.981824517250061, 1.9817743301391602],
#     'val_loss': [4.535243034362793, 4.51762580871582, 4.469428539276123, 4.436275482177734, 4.456634521484375, 4.815524578094482, 4.3103556632995605, 4.498797416687012, 4.790141582489014, 4.464589595794678, 4.674554347991943, 4.561259746551514, 4.533383369445801, 4.472135066986084, 4.466953754425049, 4.478504180908203, 4.723540782928467, 4.859069347381592, 4.496937274932861, 4.377903461456299]
# }

# visualize_training_results(training_results, model_name)