# Playground for Exploring the Data

### Import packages

In [1]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from collections import OrderedDict
from datetime import datetime

import pandas as pd
import numpy as np
import random
import json
import glob
import os

from utils import load_processed_frames, prepare_LSTM_input_data
from settings import *

2024-05-10 16:30:32.042970: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


## Load frames

In [2]:
# Load the processed frames of one test match
test_id = 'd762199f-8457-4066-b744-09e115f6884d'
processed_frames = load_processed_frames(match_id=test_id)

# The loader result is indexable; its first entry is the frames table used below
frames_df = processed_frames[0]

### Start Playing Around

### Store as xlsx

In [None]:
# Store the leading rows of frames_df as xlsx
EXPORT_ROW_LIMIT = 19979
frames_df_head = frames_df.iloc[:EXPORT_ROW_LIMIT]

# Destination of the Excel file under DATA_LOCAL_FOLDER
excel_file_path = f"{DATA_LOCAL_FOLDER}/Brommapojkarna_vs_Sirius.xlsx"

# Export the rows without the index column
frames_df_head.to_excel(excel_file_path, index=False)

print(f"DataFrame saved to {excel_file_path}")

### Extract all unique player names

In [None]:
# Collect every unique (player, team) pair seen across the games
player_names = set()

# NOTE(review): `frames_list` is not defined in any cell shown in this notebook —
# presumably a list holding one frames DataFrame per game; confirm where it is loaded.
# The loop variable is deliberately not called `frames_df`, so the single-match
# `frames_df` loaded earlier in the notebook is not overwritten by running this cell.
for game_frames_df in frames_list:
    # Unique (player, team_name) rows of this game
    players = game_frames_df[['player', 'team_name']].drop_duplicates()

    # Merge them into the global set; duplicates across games collapse here
    player_names.update(zip(players['player'], players['team_name']))

# Convert to a DataFrame, sort by player then team, and renumber the rows so the
# displayed index follows the sorted order instead of the arbitrary set order
players_df = (
    pd.DataFrame(list(player_names), columns=['Player', 'Team'])
    .sort_values(by=['Player', 'Team'], ascending=[True, True])
    .reset_index(drop=True)
)

# Store as xlsx
players_df.to_excel(f"{DATA_LOCAL_FOLDER}/data/players/Players_2023.xlsx", index=False)

players_df

## Use only the build-up data

In [None]:
# Locations of the two build-up CSV files under DATA_LOCAL_FOLDER
events_csv_path = f"{DATA_LOCAL_FOLDER}/data/buildup_events_2023.csv"
synced_csv_path = f"{DATA_LOCAL_FOLDER}/data/buildup_synced_2023.csv"

# Read each file into its own DataFrame
build_up_events_df = pd.read_csv(events_csv_path)
build_up_df = pd.read_csv(synced_csv_path)

In [None]:
# Possession-type columns shown in the single-match table at the end of this cell
possession_cols = [
    'possession_set_piece_attack',
    'possession_attack',
    'possession_free_kick',
    'possession_corner',
    'possession_throw_in',
    'possession_transition_low',
    'possession_free_kick_cross',
    'possession_transition_high',
    'possession_transition_medium',
    'possession_counterattack',
    'possession_direct_free_kick',
    'possession_penalty'
]

# Timing columns shown next to the possession types
timing_cols = ['minute', 'second', 'match_time_event_start', 'match_time_event_end', 'possession_duration']

# Match inspected in the table below
INSPECTED_MATCH_ID = 5420660

# Keep only the rows flagged by 'first_event'. The explicit copy makes the column
# assignments below act on an independent frame rather than on a filtered slice
# (which would trigger pandas' chained-assignment warning).
build_up_ev_ef = build_up_events_df[build_up_events_df['first_event']].copy()

# Whole-number possession duration, and the resulting start/end match time of each event
build_up_ev_ef['possession_duration'] = np.floor(build_up_ev_ef['possession_duration']).astype(int)
build_up_ev_ef['match_time_event_start'] = build_up_ev_ef['match_time']
build_up_ev_ef['match_time_event_end'] = build_up_ev_ef['match_time'] + build_up_ev_ef['possession_duration']

# Inspect one match under a separate name: `build_up_ev_ef` keeps all matches, so the
# per-match aggregation in the following cell is not silently reduced to a single match
single_match_ev_df = build_up_ev_ef[build_up_ev_ef['match_id'] == INSPECTED_MATCH_ID]
single_match_ev_df[possession_cols + timing_cols]

In [None]:
# Total 'possession_duration' per match, returned as a flat table with one row per 'match_id'
match_possession_duration = build_up_ev_ef.groupby('match_id', as_index=False)['possession_duration'].sum()

# Mean of those per-match totals
average_possession_duration = match_possession_duration['possession_duration'].mean()

# Show the result
print("Average Possession Duration:", average_possession_duration)

In [None]:
# Print the name of every column of the build-up events table, one per line
for column_name in list(build_up_events_df.columns):
    print(column_name)

## Use a smaller frames_df

In [3]:
import old_utils as old

# Sequence parameters used by the cells below
sequence_length = 10
downsampling_factor = 5

# Feature columns, split by type
numerical_cols = [
    'x', 'y',
    'v_x', 'v_y',
    'a_x', 'a_y',
    'distance_to_ball',
    'tiredness',
]
categorical_cols = ['position']

# All values the 'position' column is expected to take
positions = [
    'Attacking Midfielder',
    'Central Midfielder',
    'Centre-Back',
    'Defensive Midfielder',
    'Forward',
    'Full-Back',
    'Goalkeeper',
    'Wide Midfielder',
    'Winger',
]

In [4]:
# Build a small working frame: rows whose frame number is a multiple of
# `downsampling_factor`, restricted to central midfielders. The explicit copy keeps
# the column assignments below off a filtered slice of `frames_df`
# (avoids pandas' chained-assignment warning and leaves `frames_df` untouched).
small_df = frames_df[
    (frames_df['frame'] % downsampling_factor == 0)
    & (frames_df['position'] == 'Central Midfielder')
].copy()

# Round the coordinates and the x-velocity to one decimal to keep the table readable
rounded_cols = ['x', 'y', 'v_x']
small_df[rounded_cols] = small_df[rounded_cols].round(1)

# Pack the selected numerical features of each row into one list-valued column
numerical_cols = ['x', 'y', 'v_x']
small_df['numerical_cols_list'] = small_df[numerical_cols].values.tolist()

### Optimize add_can_be_sequentialized()

In [9]:
def add_can_be_sequentialized(frames_df, sequence_length, downsampling_factor):
    """Flag the rows from which a full backward sequence of frames exists (reference version).

    A row with frame number ``f`` gets ``can_be_sequentialized = True`` when the same
    player (rows sharing ``team``, ``jersey_number`` and ``match_id``) also has a row
    for every frame ``f - i * downsampling_factor`` with ``i`` in
    ``range(sequence_length)``. Rows whose grouping key contains a missing value are
    skipped by the groupby and therefore stay ``False``.

    This is the plain row-by-row implementation, kept as the baseline that the
    optimized variant is compared against.

    Args:
        frames_df: DataFrame with at least the columns ``team``, ``jersey_number``,
            ``match_id`` and ``frame``. It is modified in place.
        sequence_length: Number of steps each sequence must contain.
        downsampling_factor: Frame-number distance between two consecutive steps.

    Returns:
        The same ``frames_df`` object, with the boolean column
        ``can_be_sequentialized`` added (or overwritten).
    """
    # Frame-number offsets of every sequence step relative to the row's own frame
    offsets = [i * downsampling_factor for i in range(sequence_length)]

    # Work on a positionally indexed copy of the needed columns only. Results are
    # written back by position, so repeated index labels in `frames_df` (e.g. after
    # concatenating several matches) cannot redirect flags to the wrong rows, and no
    # temporary columns are ever added to the caller's frame.
    keyed = frames_df[['team', 'jersey_number', 'match_id', 'frame']].reset_index(drop=True)
    flags = np.zeros(len(keyed), dtype=bool)

    # One group per unique player within a match
    for _, frames in keyed.groupby(['team', 'jersey_number', 'match_id'])['frame']:
        # Set of this player's frame numbers for efficient lookups
        frame_set = set(frames)

        # A row qualifies when every expected earlier frame exists for the player
        flags[frames.index.to_numpy()] = [
            all((frame - offset) in frame_set for offset in offsets)
            for frame in frames
        ]

    frames_df['can_be_sequentialized'] = flags

    return frames_df

# Add a vector indicating if the row can be sequentialized, i.e. the player has 'sequence_length' consecutive frames
def add_can_be_sequentialized_opt(frames_df, sequence_length, downsampling_factor):
    """Flag the rows from which a full backward sequence of frames exists (vectorized version).

    A row with frame number ``f`` gets ``can_be_sequentialized = True`` when the same
    player (rows sharing ``team``, ``jersey_number`` and ``match_id``) also has a row
    for every frame ``f - i * downsampling_factor`` with ``i`` in
    ``range(sequence_length)``. Rows whose grouping key contains a missing value are
    skipped by the groupby and therefore stay ``False``.

    The membership test of each step is done with one vectorized ``isin`` call per
    player instead of a Python-level check per row.

    Args:
        frames_df: DataFrame with at least the columns ``team``, ``jersey_number``,
            ``match_id`` and ``frame``. It is modified in place.
        sequence_length: Number of steps each sequence must contain.
        downsampling_factor: Frame-number distance between two consecutive steps.

    Returns:
        The same ``frames_df`` object, with the boolean column
        ``can_be_sequentialized`` added (or overwritten).
    """
    # Frame-number offsets of every sequence step relative to the row's own frame
    offsets = [i * downsampling_factor for i in range(sequence_length)]

    # Work on a positionally indexed copy of the needed columns only. Results are
    # written back by position, so repeated index labels in `frames_df` (e.g. after
    # concatenating several matches) cannot redirect flags to the wrong rows, and no
    # temporary columns are ever added to the caller's frame.
    keyed = frames_df[['team', 'jersey_number', 'match_id', 'frame']].reset_index(drop=True)
    flags = np.zeros(len(keyed), dtype=bool)

    # One group per unique player within a match
    for _, frames in keyed.groupby(['team', 'jersey_number', 'match_id'])['frame']:
        # Distinct frame numbers recorded for this player
        known_frames = frames.unique()

        # Start from "every step exists" and knock out rows missing any expected frame
        all_steps_exist = np.ones(len(frames), dtype=bool)
        for offset in offsets:
            all_steps_exist &= (frames - offset).isin(known_frames).to_numpy()

        flags[frames.index.to_numpy()] = all_steps_exist

    frames_df['can_be_sequentialized'] = flags

    return frames_df

In [14]:
# Keep only the rows whose frame number is a multiple of `downsampling_factor`.
# The explicit copy matters: add_can_be_sequentialized*() write a column into the
# frame they receive, which should not happen on a filtered slice of the loaded data.
frames_df = frames_df[frames_df['frame'] % downsampling_factor == 0].copy()

In [17]:
# Run both implementations on independent inputs before comparing them. Each function
# modifies and returns the frame it receives, so calling both on the same object would
# make the comparison trivially True.
frames_df_opt = add_can_be_sequentialized_opt(frames_df.copy(), sequence_length, downsampling_factor)
frames_df = add_can_be_sequentialized(frames_df, sequence_length, downsampling_factor)

# True when the optimized version reproduces the reference result exactly
frames_df.equals(frames_df_opt)

True