# Playground for Exploring the Data

### Import packages

In [21]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from collections import OrderedDict
from datetime import datetime

import pandas as pd
import numpy as np
import random
import json
import glob
import os

from utils import load_processed_frames, split_match_ids, prepare_LSTM_input_data
from add_features import add_velocity_xy
from settings import *

## Load frames

In [22]:
# Match whose processed frames are explored in the cells below
test_id = '2bc54dd0-f030-11ec-b6f2-f966be93d878'

# Load the processed frames for that match and keep the first element of the result
frames_list = load_processed_frames(match_id=test_id)
frames_df = frames_list[0]

In [23]:
# Display the column labels of the loaded frames
frames_df.columns

Index(['team', 'team_name', 'team_direction', 'jersey_number', 'player',
       'role', 'distance_ran', 'x', 'y', 'frame', 'minute', 'second', 'period',
       'events', 'objects_tracked', 'v_x', 'v_y', 'a_x', 'a_y', 'orientation',
       'ball_in_motion', 'distance_to_ball', 'angle_to_ball', 'offside',
       'distance_to_onside', 'nationality', 'height', 'weight', 'acc', 'pac',
       'sta', 'position', 'specific_position', 'tiredness', 'tiredness_short',
       'x_future_25', 'y_future_25', 'x_future_50', 'y_future_50', 'x_future',
       'y_future', 'x_future_75', 'y_future_75', 'match_id', 'v_x_avg',
       'v_y_avg', 'age'],
      dtype='object')

## Start Playing Around

### Store as xlsx

In [None]:
# Destination of the Excel export
excel_file_path = f"{DATA_LOCAL_FOLDER}/Brommapojkarna_vs_Sirius.xlsx"

# Export only the first 19979 rows of frames_df as xlsx
frames_df_head = frames_df.iloc[:19979]

# Write the rows without the index column and report where the file went
frames_df_head.to_excel(excel_file_path, index=False)
print(f"DataFrame saved to {excel_file_path}")

### Extract all unique player names

In [None]:
# Flatten the groups returned by split_match_ids into one list of match ids
match_ids = [match_id for id_group in split_match_ids(560) for match_id in id_group]

# Unique (player, team_name) pairs collected over all matches
player_names = set()

for match_id in match_ids:
    frames_list = load_processed_frames(match_id=match_id)

    # Nothing could be loaded for this match; move on to the next one
    if not frames_list:
        continue

    # Distinct player/team combinations that appear in this match
    players = frames_list[0][['player', 'team_name']].drop_duplicates()
    player_names |= set(zip(players['player'], players['team_name']))

# Turn the pairs into a table, ordered by player and then team (ascending)
players_df = pd.DataFrame(list(player_names), columns=['Player', 'Team'])
players_df = players_df.sort_values(by=['Player', 'Team'])

# Store as xlsx, without the index column
players_df.to_excel(f"{DATA_LOCAL_FOLDER}/data/players/Signality_players.xlsx", index=False)

players_df

### Only use build-up (this was never implemented)

In [None]:
# Load the 2023 build-up events CSV into a DataFrame
build_up_events_df = pd.read_csv(
    filepath_or_buffer=f"{DATA_LOCAL_FOLDER}/data/buildup_events_2023.csv",
)

# Load the 2023 synced build-up CSV into a DataFrame
build_up_df = pd.read_csv(
    filepath_or_buffer=f"{DATA_LOCAL_FOLDER}/data/buildup_synced_2023.csv",
)

In [None]:
# Possession-type columns shown next to the timing columns at the end of the cell
possession_cols = [
    'possession_set_piece_attack',
    'possession_attack',
    'possession_free_kick',
    'possession_corner',
    'possession_throw_in',
    'possession_transition_low',
    'possession_free_kick_cross',
    'possession_transition_high',
    'possession_transition_medium',
    'possession_counterattack',
    'possession_direct_free_kick',
    'possession_penalty',
]

# Keep only the rows flagged as 'first_event'. The explicit .copy() AFTER the
# filter gives an independent frame, so the column assignments below neither
# touch build_up_events_df nor trigger pandas' SettingWithCopyWarning.
build_up_ev_ef = build_up_events_df[build_up_events_df['first_event']].copy()

# Round the possession duration down to an integer
# NOTE(review): astype(int) raises if 'possession_duration' contains NaN - confirm the data has none
build_up_ev_ef['possession_duration'] = np.floor(build_up_ev_ef['possession_duration']).astype(int)

# Start and end of each possession, derived from 'match_time' and the duration
build_up_ev_ef['match_time_event_start'] = build_up_ev_ef['match_time']
build_up_ev_ef['match_time_event_end'] = build_up_ev_ef['match_time'] + build_up_ev_ef['possession_duration']

# Narrow the frame down to one specific match
build_up_ev_ef = build_up_ev_ef[build_up_ev_ef['match_id'] == 5420660]

# Display the possession flags together with the timing columns
build_up_ev_ef[possession_cols + ['minute', 'second', 'match_time_event_start', 'match_time_event_end', 'possession_duration']]

In [None]:
# Total 'possession_duration' per match, with 'match_id' kept as a regular column
# NOTE(review): build_up_ev_ef is narrowed to a single match_id earlier in this
# file; if that filter is still in place, the mean below is taken over one match only.
match_possession_duration = (
    build_up_ev_ef
    .groupby('match_id', as_index=False)['possession_duration']
    .sum()
)

# Mean of the per-match totals
average_possession_duration = match_possession_duration['possession_duration'].mean()

# Show the result
print("Average Possession Duration:", average_possession_duration)

### Use a smaller frames_df

In [None]:
# Rows to keep: every fifth frame number, for one team, one position and one player
keep_rows = (
    (frames_df['frame'] % 5 == 0)
    & (frames_df['team_name'] == 'Djurgården')
    & (frames_df['position'] == 'Winger')
    & (frames_df['player'] == 'Joel Asoro')
)

# Apply all conditions in a single selection
small_df = frames_df[keep_rows]

## Usain Bolt
Play around to see how often players hit Usain Bolt's speed

In [None]:
# Flatten the groups returned by split_match_ids into one list of match ids
match_ids = [match_id for id_group in split_match_ids(560) for match_id in id_group]

# usain_bolts_speed: number of rows whose absolute velocity is at least 13
# denemonator: number of matches that could be loaded (spelling kept as-is so
# that any code relying on this name keeps working)
usain_bolts_speed = 0
denemonator = 0

for match_id in match_ids:
    frames_list = load_processed_frames(match_id=match_id)

    # Nothing could be loaded for this match; move on to the next one
    if not frames_list:
        continue

    # Recompute the velocity components with smoothing switched off.
    # NOTE: this rebinds the notebook-level frames_df to the current match.
    frames_df = add_velocity_xy(frames_list[0], smooth=False)

    # Magnitude of the velocity vector (v_x, v_y)
    frames_df['v_abs'] = np.sqrt(frames_df['v_x'].pow(2) + frames_df['v_y'].pow(2))

    # NOTE(review): 13 is presumably a speed in m/s - confirm the units of v_x / v_y
    usain_bolts_speed += int((frames_df['v_abs'] >= 13).sum())
    denemonator += 1

### Missing FM data
Find matches with missing FM data

In [None]:
# Flatten the groups returned by split_match_ids into one list of match ids
match_ids = [match_id for id_group in split_match_ids(560) for match_id in id_group]

# Columns that are checked for missing values
fm_columns = ['player', 'team', 'position', 'specific_position', 'nationality', 'height', 'weight', 'acc', 'pac', 'sta']

# Ids of the matches in which at least one of those columns has a Null value
ids_with_missing_data = []

for match_id in match_ids:
    frames_list = load_processed_frames(match_id=match_id)

    # Nothing could be loaded for this match; move on to the next one
    if not frames_list:
        continue

    # NOTE: this rebinds the notebook-level frames_df to the current match
    frames_df = frames_list[0]

    # Record the match as soon as any cell in the checked columns is Null
    if frames_df[fm_columns].isna().to_numpy().any():
        ids_with_missing_data.append(match_id)

ids_with_missing_data