# Playground for Exploring the Data

### Import packages

In [1]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from collections import OrderedDict
from datetime import datetime

import pandas as pd
import numpy as np
import random
import json
import glob
import os

from settings import *

## Load files

In [2]:
# Load the processed frames (one parquet file per match)
def load_all_processed_frames(max_matches=60):
    """Load the processed frames of every season/competition into a list.

    For each combination of ``seasons`` and ``competitions`` the ``*.parquet``
    files in ``{DATA_LOCAL_FOLDER}/data/<season>/<competition>/processed`` are
    read with ``pd.read_parquet``. ``seasons``, ``competitions`` and
    ``DATA_LOCAL_FOLDER`` are not defined in this file; they are expected to
    come from ``from settings import *``.

    Args:
        max_matches: Maximum number of parquet files to load per
            season/competition folder. Defaults to 60, the limit that was
            previously hard-coded. Pass ``None`` to load every file.

    Returns:
        A list with one DataFrame per loaded parquet file. The list is empty
        when no parquet file is found.
    """
    frames_dfs = []

    for selected_season in seasons:
        for selected_competition in competitions:
            # Folder holding the processed parquet files of this competition
            DATA_FOLDER_PROCESSED = f"{DATA_LOCAL_FOLDER}/data/{selected_season}/{selected_competition}/processed"

            # glob returns paths in a filesystem-dependent order; sort them so
            # that the same subset of matches is selected on every run.
            match_paths = sorted(glob.glob(os.path.join(DATA_FOLDER_PROCESSED, "*.parquet")))

            # Read the globbed paths directly; slicing with None keeps all of them
            for file_path_match in match_paths[:max_matches]:
                frames_dfs.append(pd.read_parquet(file_path_match))

    return frames_dfs

# Load the frames of every processed match into a list of DataFrames
# (load_all_processed_frames appends one DataFrame per parquet file it reads)
frames_dfs = load_all_processed_frames()

### Start Playing Around

### Store as xlsx

In [None]:
# Export the leading rows of frames_df to an xlsx file.
# NOTE(review): frames_df is not assigned anywhere above this cell in the
# file, so it has to exist in the session already - confirm before running.
frames_df_head = frames_df.iloc[:19979]  # first 19979 rows only

# Destination of the Excel export
excel_file_path = f"{DATA_LOCAL_FOLDER}/Brommapojkarna_vs_Sirius.xlsx"

# Write the rows without the index column, then report where they went
frames_df_head.to_excel(excel_file_path, index=False)
print(f"DataFrame saved to {excel_file_path}")

### Extract all unique player names

In [3]:
# Collect every distinct (player, team_name) pair seen in any loaded match
player_names = set()

for frames_df in frames_dfs:
    # De-duplicate within the match first so only a few rows get zipped
    players = frames_df.loc[:, ['player', 'team_name']].drop_duplicates()

    # Merge this match's pairs into the overall set
    player_names |= set(zip(players['player'], players['team_name']))

# One row per unique pair
players_df = pd.DataFrame(list(player_names), columns=['Player', 'Team'])

# Order by player name, then by team (both ascending)
players_df = players_df.sort_values(['Player', 'Team'])

# Write the overview to xlsx without the index column
players_df.to_excel(f"{DATA_LOCAL_FOLDER}/data/players/Players_2023.xlsx", index=False)

players_df

Unnamed: 0,Player,Team
114,Abdelkarim Mammar Chaouche,Degerfors IF
176,Abdelrahman Boudah Saidi,Hammarby
411,Abdelrahman Saidi,Hammarby
96,Abdihakin Ali,AIK
145,Abdussalam Magashy,AIK
...,...,...
407,Yassine El Ouatki,Varbergs BoIS FC
390,Zachary Elbouzedi,AIK
243,Zeidane Inoussa,IF Brommapojkarna
165,ball,ball


In [6]:
# Gather the distinct values of the 'role' column across all loaded matches
roles = set()

for frames_df in frames_dfs:
    # Distinct roles occurring in this match
    current_roles = frames_df['role'].unique()

    # Merge them into the overall set
    roles |= set(current_roles)

# One row per distinct role
roles_df = pd.DataFrame(list(roles), columns=['Role'])

# Order the roles ascending
roles_df = roles_df.sort_values('Role')

# Report how many distinct roles were found
print(roles_df.shape[0])

13
