In [1]:
from statsbombpy import sb
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

### Raw event data

In [2]:
# Fetch the raw event table for a single match via statsbombpy and let the
# notebook render it. Judging by the team names in the output below, this
# match_id appears to be Hertha Berlin vs Hamburger SV — TODO confirm.
sb.events(match_id=3890324)



Unnamed: 0,50_50,ball_receipt_outcome,ball_recovery_recovery_failure,block_deflection,carry_end_location,clearance_aerial_won,clearance_body_part,clearance_head,clearance_left_foot,clearance_other,...,substitution_outcome,substitution_outcome_id,substitution_replacement,substitution_replacement_id,tactics,team,team_id,timestamp,type,under_pressure
0,,,,,,,,,,,...,,,,,"{'formation': 442, 'lineup': [{'player': {'id'...",Hertha Berlin,173,00:00:00.000,Starting XI,
1,,,,,,,,,,,...,,,,,"{'formation': 4411, 'lineup': [{'player': {'id...",Hamburger SV,171,00:00:00.000,Starting XI,
2,,,,,,,,,,,...,,,,,,Hamburger SV,171,00:00:00.000,Half Start,
3,,,,,,,,,,,...,,,,,,Hertha Berlin,173,00:00:00.000,Half Start,
4,,,,,,,,,,,...,,,,,,Hertha Berlin,173,00:00:00.000,Half Start,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3864,,,,,,,,,,,...,,,,,,Hamburger SV,171,00:46:02.648,Half End,
3865,,,,,,,,,,,...,,,,,,Hertha Berlin,173,00:45:08.897,Half End,
3866,,,,,,,,,,,...,,,,,,Hamburger SV,171,00:45:08.897,Half End,
3867,,,,,,,,,,,...,,,,,"{'formation': 4411, 'lineup': [{'player': {'id...",Hamburger SV,171,00:16:41.210,Tactical Shift,


### Load Features, Apply PCA, and Compute Player Similarity

In [20]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def load_and_process_datasets(file_paths, player_column='player', n_components=10):
    """
    Load multiple CSV files, perform PCA on each, and merge the PCA components.

    Every file is reduced independently: its numeric columns are standardized
    and projected onto at most ``n_components`` principal components, stored in
    columns named ``<dataset>_pc<k>``. The per-file results are then aligned on
    the player index and concatenated column-wise.

    Parameters:
    file_paths (list): List of CSV file paths
    player_column (str): Name of the column containing player names
    n_components (int): Maximum number of PCA components per dataset; fewer are
        used when a dataset has too few rows or numeric columns

    Returns:
    pandas.DataFrame: DataFrame with merged PCA components, one row per player
    dict: Dictionary mapping dataset name to total explained variance (percent)

    Raises:
    ValueError: If n_components is smaller than 1, or if no file yields a
        usable dataset
    """
    if n_components < 1:
        raise ValueError("n_components must be a positive integer")

    all_pca_dfs = []
    explained_variance_dict = {}

    for file_path in file_paths:
        # Extract dataset name from file path
        dataset_name = os.path.basename(file_path).replace('.csv', '')

        # Load the dataset
        df = pd.read_csv(file_path)

        # Check if player column exists
        if player_column not in df.columns:
            print(f"Warning: '{player_column}' column not found in {dataset_name}. Skipping.")
            continue

        # Index by player and keep only the numeric columns (non-numeric
        # columns are dropped, not converted)
        numeric_df = df.set_index(player_column).select_dtypes(include=['number'])

        # If no numeric columns remain, skip this dataset
        if numeric_df.shape[1] == 0:
            print(f"Warning: No numeric columns found in {dataset_name} after filtering. Skipping.")
            continue

        # A player listed more than once would leave a non-unique index, which
        # makes the column-wise concat below fail when aligning datasets.
        # Collapse such rows to one per player by averaging them (NaNs are
        # ignored by mean; rows with a missing player name are dropped by the
        # groupby).
        if numeric_df.index.has_duplicates:
            n_duplicates = int(numeric_df.index.duplicated().sum())
            print(f"Warning: {n_duplicates} duplicate '{player_column}' rows in {dataset_name}; averaging per player.")
            numeric_df = numeric_df.groupby(level=0, sort=False).mean()

        # Fill missing values with 0
        numeric_df = numeric_df.fillna(0)

        # Check if we have enough rows/features for the requested components
        actual_components = min(n_components, min(numeric_df.shape) - 1)
        if actual_components < 1:
            # A single row or a single feature leaves nothing for PCA to
            # extract; skip instead of fitting a zero-component model.
            print(f"Warning: {dataset_name} is too small for PCA (shape {numeric_df.shape}). Skipping.")
            continue
        if actual_components < n_components:
            print(f"Warning: Reduced PCA components to {actual_components} for {dataset_name} due to data dimensions")

        # Standardize the data
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(numeric_df)

        # Apply PCA
        pca = PCA(n_components=actual_components)
        pca_result = pca.fit_transform(scaled_data)

        # Create a DataFrame with the PCA results, adding dataset name as prefix
        column_names = [f"{dataset_name}_pc{i+1}" for i in range(actual_components)]
        pca_df = pd.DataFrame(
            data=pca_result,
            index=numeric_df.index,
            columns=column_names
        )

        # Store the explained variance (as a percentage)
        explained_variance_dict[dataset_name] = np.sum(pca.explained_variance_ratio_) * 100

        # Add to the list of PCA DataFrames
        all_pca_dfs.append(pca_df)

    # Merge all PCA DataFrames on player index
    if not all_pca_dfs:
        raise ValueError("No valid datasets to process")

    merged_pca_df = pd.concat(all_pca_dfs, axis=1)

    # Players missing from a dataset get 0 for that dataset's components
    merged_pca_df = merged_pca_df.fillna(0)

    return merged_pca_df, explained_variance_dict


def calculate_player_similarity(merged_pca_df, player_name):
    """
    Rank every player in the merged PCA table by cosine similarity to one player.

    Parameters:
    merged_pca_df (pandas.DataFrame): DataFrame with merged PCA components,
        indexed by player name
    player_name (str): Name of the player to compare with others

    Returns:
    pandas.DataFrame: Columns 'player' and 'similarity_score', sorted by score
        in descending order. The selected player is included in the result.

    Raises:
    ValueError: If player_name is not present in the index of merged_pca_df
    """
    known_players = merged_pca_df.index
    if player_name not in known_players:
        raise ValueError(f"Player '{player_name}' not found in the merged dataset")

    # cosine_similarity works on 2-D inputs, so present the selected player's
    # components as a single-row matrix.
    target_vector = merged_pca_df.loc[player_name].values.reshape(1, -1)

    # One row of scores: the selected player against every row of the table.
    scores = cosine_similarity(target_vector, merged_pca_df)[0]

    ranking = pd.DataFrame({
        'player': known_players,
        'similarity_score': scores
    })
    return ranking.sort_values('similarity_score', ascending=False)



# Driver cell: build the merged PCA feature table from the per-category CSVs
# and rank all players by similarity to `selected_player`.

# List of CSV files to process (one PCA is fitted per file)
file_paths = [
    "../data/defending.csv",
    "../data/possession.csv",
    "../data/shooting.csv",
    "../data/passing.csv",
    "../data/goal_keeping.csv"
]

# standard stats
# NOTE(review): not used anywhere in this cell; presumably consumed by a
# later cell — confirm before removing.
df_standard_stats = pd.read_csv("../data/standard_stats.csv")


# Number of PCA components per dataset
n_components = 10

# Select a player
selected_player = "Joshua Kimmich"  # Replace with your player of interest


try:
    # Load and process datasets
    print(f"Loading and processing {len(file_paths)} datasets...")
    merged_pca_df, explained_variance_dict = load_and_process_datasets(
        file_paths,
        player_column='player',
        n_components=n_components
    )

    print(f"Merged PCA dataset shape: {merged_pca_df.shape}")
    print(f"Number of players in merged dataset: {len(merged_pca_df.index)}")

    # Display explained variance for each dataset
    print("\nExplained variance by dataset:")
    for dataset, variance in explained_variance_dict.items():
        print(f"  {dataset}: {variance:.2f}%")

    # Calculate similarities
    print(f"\nCalculating similarities for {selected_player}...")
    similarity_results = calculate_player_similarity(merged_pca_df, selected_player).round(3)

    # Display results
    print(f"\nPlayers most similar to {selected_player}:")
    display(similarity_results)

except (OSError, ValueError) as e:
    # Only the user-fixable failures get the short message: a missing or
    # unreadable CSV (OSError), or a ValueError such as an unknown player or
    # no usable dataset. Anything else is most likely a programming error and
    # is allowed to propagate so the full traceback is visible.
    print(f"Error: {e}")



Loading and processing 5 datasets...
Merged PCA dataset shape: (473, 50)
Number of players in merged dataset: 473

Explained variance by dataset:
  defending: 76.03%
  possession: 86.50%
  shooting: 88.02%
  passing: 81.37%
  goal_keeping: 100.00%

Calculating similarities for Joshua Kimmich...

Players most similar to Joshua Kimmich:


Unnamed: 0,player,similarity_score
192,Joshua Kimmich,1.000
406,Stefan Reinartz,0.749
453,Xabier Alonso Olano,0.684
371,Roman Neustädter,0.606
215,Kevin Vogt,0.595
...,...,...
26,André Hahn,-0.505
233,Levin Mete Öztunali,-0.506
411,Sven Schipplock,-0.511
127,Franco Matías Di Santo,-0.525
