### Import packages

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import glob
import os

from add_features import add_xy_future, add_velocity_xy, add_acceleration_xy, add_average_velocity, add_orientation, add_ball_in_motion, add_distance_to_ball, add_angle_to_ball, add_offside, add_distance_to_onside, load_FM_data, add_FM_data, add_tiredness, add_tiredness_short_term
from utils import google_sheet_to_df, load_processed_frames
from settings import *

2024-05-08 11:52:51.294885: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


## Features explanation

### Tiredness
The tiredness of a player can be calculated using the formula:
$$\text{Tiredness} = \left( \frac{\text{distance\_ran}}{1000} + \frac{\text{minute}}{20} + \text{period} - 1 \right) \times \left( 1 - \frac{\text{sta}}{20} \right)$$
where $\text{distance\_ran}$ is the distance run in meters, $\text{minute}$ is the minute of the game, $\text{period}$ is the period (half) of the game, and $\text{sta}$ is the player's stamina rated from 1 to 20.

Explanation of formula:
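For every kilometer the player runs and for every 20 minutes that pass, the tiredness increases by 1; in addition, the tiredness of every frame in the second half is 1 higher. Everything is then scaled based on the stamina of the player: the higher the stamina, the smaller the factor $1 - \frac{\text{sta}}{20}$.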

## Functions for processing frames

In [None]:
def _add_frame_features(frames_df, fm_players_df):
    """Apply the feature pipeline to the frames of one match.

    Args:
        frames_df: DataFrame read from one unprocessed match parquet file.
        fm_players_df: Football Manager data as returned by load_FM_data().

    Returns:
        The DataFrame returned by the last step of the pipeline.
    """
    # The steps run in a fixed order; each one receives the output of the
    # previous one.
    # NOTE(review): the tiredness formula documented in this notebook uses the
    # player's stamina, which presumably comes from add_FM_data - keep the
    # tiredness steps after it.
    frames_df = add_xy_future(frames_df, FPS * seconds_into_the_future)
    frames_df = add_velocity_xy(frames_df, 1, smooth=True)
    frames_df = add_acceleration_xy(frames_df, 1, smooth=True)
    frames_df = add_average_velocity(frames_df)
    frames_df = add_orientation(frames_df)
    frames_df = add_ball_in_motion(frames_df)
    frames_df = add_distance_to_ball(frames_df)
    frames_df = add_angle_to_ball(frames_df)
    # frames_df = add_offside(frames_df)
    frames_df = add_distance_to_onside(frames_df)
    frames_df = add_FM_data(frames_df, fm_players_df)
    frames_df = add_tiredness(frames_df)
    frames_df = add_tiredness_short_term(frames_df, window=FPS*20)
    return frames_df


# Process the unprocessed/ frames, and store the results to the processed/ folder
def process_frames():
    """Process every unprocessed match and write it to the processed/ folder.

    Iterates over all combinations of `seasons` and `competitions`, reads each
    `<match_id>.parquet` from the `unprocessed` folder, runs the feature
    pipeline, adds a `match_id` column and writes the result to the
    `processed` folder under the same file name.

    A match whose processed file already exists is skipped unless
    `reload_data` is truthy.

    Relies on names that are not defined in this cell (`seasons`,
    `competitions`, `DATA_LOCAL_FOLDER`, `reload_data`, `FPS`,
    `seconds_into_the_future`) - presumably provided by `settings`.
    """
    # Loaded on first use and shared by all matches, so nothing is read from
    # disk when every match is skipped.
    fm_players_df = None

    for selected_season in seasons:
        for selected_competition in competitions:
            # Define paths
            competition_dir = f"{DATA_LOCAL_FOLDER}/data/{selected_season}/{selected_competition}"
            unprocessed_dir = f"{competition_dir}/unprocessed"
            processed_dir = f"{competition_dir}/processed"

            # Create output folder if not exists
            os.makedirs(processed_dir, exist_ok=True)

            # Find all frames parquet files and extract the IDs without the
            # ".parquet" extension; sorted so the processing order is stable
            match_paths = glob.glob(os.path.join(unprocessed_dir, "*.parquet"))
            match_ids = sorted(
                os.path.splitext(os.path.basename(path))[0] for path in match_paths
            )

            for match_id in match_ids:
                out_path = f"{processed_dir}/{match_id}.parquet"

                # Skip if processed game already exists, if specified
                if not reload_data and os.path.exists(out_path):
                    print(f"Match {match_id} already processed. Skipping...")
                    continue

                # Load Football Manager data (only once)
                if fm_players_df is None:
                    fm_players_df = load_FM_data()

                # Convert parquet file to a DataFrame
                frames_df = pd.read_parquet(f"{unprocessed_dir}/{match_id}.parquet")

                # Process frames_df
                frames_df = _add_frame_features(frames_df, fm_players_df)

                # Add match_id
                frames_df["match_id"] = match_id

                # Write to a temporary file first and then move it into place:
                # an interrupted run must not leave a partial "<match_id>.parquet"
                # behind, because the skip check above would treat it as done.
                # A leftover ".tmp" file is simply overwritten on the next run.
                tmp_path = f"{out_path}.tmp"
                frames_df.to_parquet(tmp_path)
                os.replace(tmp_path, out_path)

                # Print that the match is processed
                print(f"Match {match_id} is processed")

# Takes the processed frames and adds more features
def add_data_to_processed_frames():
    """Re-open every processed match file, add data to it and save it back.

    Iterates over all combinations of `seasons` and `competitions`, reads each
    parquet file in the `processed` folder, applies `add_FM_data` and
    overwrites the file with the result.

    Relies on `seasons`, `competitions` and `DATA_LOCAL_FOLDER`, which are not
    defined in this cell - presumably provided by `settings`.

    NOTE(review): process_frames() already applies add_FM_data before writing
    the processed files; confirm that applying it a second time to the same
    frames is safe before running this.
    """
    # Loaded on first use and shared by all files, so nothing is read from
    # disk when there are no processed files at all.
    fm_players_df = None

    for selected_season in seasons:
        for selected_competition in competitions:
            # Define the paths
            processed_dir = f"{DATA_LOCAL_FOLDER}/data/{selected_season}/{selected_competition}/processed"

            # Find all processed frames parquet files (sorted for a stable order)
            processed_paths = sorted(glob.glob(os.path.join(processed_dir, "*.parquet")))

            for processed_path in processed_paths:
                # Load Football Manager data (only once)
                if fm_players_df is None:
                    fm_players_df = load_FM_data()

                # Load the processed DataFrame
                frames_df = pd.read_parquet(processed_path)

                # Perform the operation to add data to the processed frames
                # For example:
                # frames_df = add_additional_data(frames_df)
                frames_df = add_FM_data(frames_df, fm_players_df)

                # Save the updated DataFrame back to the same file. Write to a
                # temporary file first so an interrupted run cannot corrupt
                # the existing processed file.
                tmp_path = f"{processed_path}.tmp"
                frames_df.to_parquet(tmp_path)
                os.replace(tmp_path, processed_path)

                # Print that the match is processed.
                # The id is the file name without its extension. Do not use
                # str.rstrip('.parquet') here: it strips a *set of characters*,
                # so ids ending in 'a' or 'e' would be truncated.
                match_id = os.path.splitext(os.path.basename(processed_path))[0]
                print(f"Match {match_id} is processed")

# Process and load frames: run the processing over every configured
# season/competition and write the results to the processed/ folders.
process_frames()
# Optional follow-ups, disabled by default:
# frames_dfs = load_processed_frames(n_matches=1)
# add_data_to_processed_frames()

## Find missing FM players

In [12]:
# Get the match ids of the train / test / validation splits.
# NOTE(review): the meaning of 280 is not visible here - presumably the number
# of matches to split; confirm against utils.split_match_ids.
from utils import split_match_ids
train_ids, test_ids, val_ids = split_match_ids(280)

In [24]:
# Load the processed frames of a single match to inspect its missing FM data
test_id = 'd9c8a786-f725-432d-b288-4ae741cd2124'
frames_df = load_processed_frames(match_id=test_id)[0]

# Show every row (not just the first) where 'nationality' is NaN
nationality_is_missing = frames_df['nationality'].isna()
frames_df[nationality_is_missing]

Unnamed: 0,team,team_name,team_direction,jersey_number,player,role,distance_ran,x,y,frame,...,nationality,height,weight,acc,pac,sta,position,tiredness,tiredness_short,match_id
2503787,home_team,IF Brommapojkarna,right,19,Leonard Zuta,0,0.00,53.94,46.50,111337,...,,,,,,,,,0.00,d9c8a786-f725-432d-b288-4ae741cd2124
2503807,home_team,IF Brommapojkarna,right,19,Leonard Zuta,0,0.02,53.95,46.51,111338,...,,,,,,,,,0.00,d9c8a786-f725-432d-b288-4ae741cd2124
2503827,home_team,IF Brommapojkarna,right,19,Leonard Zuta,0,0.04,53.96,46.53,111339,...,,,,,,,,,0.39,d9c8a786-f725-432d-b288-4ae741cd2124
2503847,home_team,IF Brommapojkarna,right,19,Leonard Zuta,0,0.07,53.97,46.55,111340,...,,,,,,,,,0.43,d9c8a786-f725-432d-b288-4ae741cd2124
2503867,home_team,IF Brommapojkarna,right,19,Leonard Zuta,0,0.09,53.98,46.58,111341,...,,,,,,,,,0.51,d9c8a786-f725-432d-b288-4ae741cd2124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3233201,home_team,IF Brommapojkarna,right,19,Leonard Zuta,0,2403.69,76.04,46.92,144153,...,,,,,,,,,1.77,d9c8a786-f725-432d-b288-4ae741cd2124
3233224,home_team,IF Brommapojkarna,right,19,Leonard Zuta,0,2403.73,76.01,46.93,144154,...,,,,,,,,,1.76,d9c8a786-f725-432d-b288-4ae741cd2124
3233247,home_team,IF Brommapojkarna,right,19,Leonard Zuta,0,2403.76,75.97,46.94,144155,...,,,,,,,,,1.76,d9c8a786-f725-432d-b288-4ae741cd2124
3233269,home_team,IF Brommapojkarna,right,19,Leonard Zuta,0,2403.79,75.94,46.95,144156,...,,,,,,,,,1.76,d9c8a786-f725-432d-b288-4ae741cd2124


In [28]:
# Collect the validation matches whose processed frames contain at least one
# missing value in the 'position' column
missing_match_ids = []
for match_id in val_ids:
    frames_dfs = load_processed_frames(match_id=match_id)

    # Nothing was loaded for this match, so there is nothing to check
    if len(frames_dfs) == 0:
        continue

    position_has_missing = frames_dfs[0]['position'].isna().any()
    if position_has_missing:
        print(f"Error in match_id: {match_id}")
        missing_match_ids.append(match_id)

In [25]:
# Re-run the processing for the matches collected in `missing_match_ids`.
# NOTE(review): `missing_match_ids` is built by the cell above, whose execution
# count is higher than this cell's - run that cell first on a fresh kernel.
# NOTE(review): season and competition are hard-coded to 2023 / Allsvenskan.
DATA_FOLDER_UNPROCESSED = f"{DATA_LOCAL_FOLDER}/data/2023/Allsvenskan/unprocessed"
FOLDER_OUT = f"{DATA_LOCAL_FOLDER}/data/2023/Allsvenskan/processed"
match_ids = missing_match_ids

# Football Manager data, shared by all matches below
fm_players_df = load_FM_data()

# The feature pipeline as an ordered list of steps; every step takes the
# DataFrame produced by the previous step and returns a new one
feature_steps = (
    lambda df: add_xy_future(df, FPS * seconds_into_the_future),
    lambda df: add_velocity_xy(df, 1, smooth=True),
    lambda df: add_acceleration_xy(df, 1, smooth=True),
    add_average_velocity,
    add_orientation,
    add_ball_in_motion,
    add_distance_to_ball,
    add_angle_to_ball,
    # add_offside is intentionally left out
    add_distance_to_onside,
    lambda df: add_FM_data(df, fm_players_df),
    add_tiredness,
    lambda df: add_tiredness_short_term(df, window=FPS*20),
)

for match_id in match_ids:
    processed_file = f"{FOLDER_OUT}/{match_id}.parquet"

    # An existing processed file is only overwritten when reload_data is set.
    # NOTE(review): the matches in missing_match_ids were found by loading
    # their processed frames, so they presumably all exist already - this
    # cell then only does work when reload_data is truthy.
    if Path(processed_file).exists() and not reload_data:
        print(f"Match {match_id} already processed. Skipping...")
        continue

    # Read the unprocessed frames of this match
    file_path_match = f"{DATA_FOLDER_UNPROCESSED}/{match_id}.parquet"
    frames_df = pd.read_parquet(file_path_match)

    # Run the pipeline, step by step
    for step in feature_steps:
        frames_df = step(frames_df)

    # Tag every row with its match and store the result
    frames_df["match_id"] = match_id
    frames_df.to_parquet(processed_file)

    print(f"Match {match_id} is processed")

Match 36fc9209-61bb-45b2-9d52-f05a3c03c23e is processed
Match 624e550b-930f-4d32-9f2f-2bd1b2a6c390 is processed
Match 8f87325f-bc76-4779-aadb-75bd19a71847 is processed
Match d9c8a786-f725-432d-b288-4ae741cd2124 is processed
