In [125]:
import pandas as pd
import utils

# Load the team-level tables from the project-local `utils` module. The call
# returns four objects, unpacked here as squad info, opponent info, squad
# features and opponent features.
# NOTE(review): presumably each is a DataFrame indexed by team name — confirm in utils.
df_team_info_squad, df_team_info_opponent, df_squad_features, df_opponent_features = utils.load_team_statistics()
# Load the player-level tables: a feature table and an info table.
# Later cells look players up by name with `df_features.loc[...]`.
df_features, df_player_info = utils.load_player_statistics()


### Euclidean, Cosine, Mahalanobis Distance
Mahalanobis distance seems quite promising in combination with PCA.

In [126]:
import pandas as pd
from sklearn.preprocessing import StandardScaler,Normalizer
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
from scipy.spatial.distance import mahalanobis
import numpy as np
from sklearn.decomposition import PCA

# Step 1: Normalize the data

def normalize_data(player_stats, team_stats, n_components=50):
    """
    Standardise player and team stats on a shared scale, then reduce both with PCA.

    Both the scaler and the PCA are fitted on the team stats only; the player
    vector is merely transformed with the fitted objects so that it lands in
    the same space as the teams.

    Args:
        player_stats: Feature vector of a single player. It is wrapped in a
            one-element list before being transformed, so it must have one
            value per column of ``team_stats``.
        team_stats: 2-D table of team features (one row per team).
        n_components (int): Number of principal components to keep.

    Returns:
        tuple: ``(player_stats_reduced, team_stats_reduced)`` where the first
        item is the player's 1-D projected vector and the second is the 2-D
        projected team matrix.
    """
    scaler = StandardScaler()
    pca = PCA(n_components=n_components)

    # Teams define both the z-score statistics and the principal axes.
    teams_scaled = scaler.fit_transform(team_stats)
    teams_projected = pca.fit_transform(teams_scaled)

    # The player is pushed through the already-fitted scaler and PCA.
    player_scaled = scaler.transform([player_stats])
    player_projected = pca.transform(player_scaled)

    # Drop the leading batch axis so the player comes back as a flat vector.
    return player_projected[0], teams_projected

# Step 2: Calculate Euclidean Distance and Cosine Similarity

def calculate_distances(player_stats_normalized, team_stats_normalized):
    """
    Compare one player's vector with every team's vector using three measures.

    Args:
        player_stats_normalized: 1-D feature vector of the player, in the same
            feature space as the rows of ``team_stats_normalized``.
        team_stats_normalized: 2-D matrix with one row per team.

    Returns:
        tuple: ``(euclidean_dist, cosine_sim, maha_distances)``. The first two
        are 1-D arrays and the third is a list of floats; all have one entry
        per team, in row order.
    """
    player_row = [player_stats_normalized]

    # Euclidean distance between player and teams
    euclidean_dist = euclidean_distances(player_row, team_stats_normalized)[0]

    # Cosine similarity between player and teams
    cosine_sim = cosine_similarity(player_row, team_stats_normalized)[0]

    # Covariance of the team features (columns are the variables).
    # atleast_2d keeps the single-feature case a matrix instead of a scalar.
    cov_matrix = np.atleast_2d(np.cov(team_stats_normalized, rowvar=False))

    # The pseudo-inverse equals the inverse for a well-conditioned matrix but
    # does not raise when the covariance is singular (e.g. fewer teams than
    # features, or perfectly correlated features).
    inv_cov_matrix = np.linalg.pinv(cov_matrix)

    # Mahalanobis distance sqrt(d^T * VI * d) for every team at once. It is
    # computed with numpy so the function does not depend on the module-level
    # name `mahalanobis` still referring to the scipy function.
    deltas = (
        np.asarray(team_stats_normalized, dtype=float)
        - np.asarray(player_stats_normalized, dtype=float)
    )
    squared = np.sum((deltas @ inv_cov_matrix) * deltas, axis=1)
    # Clip tiny negative rounding errors before taking the square root.
    maha_distances = np.sqrt(np.maximum(squared, 0.0)).tolist()

    return euclidean_dist, cosine_sim, maha_distances

# Step 3: Rank Teams by Similarity

def rank_teams(euclidean_dist, cosine_sim, maha_distances, team_names):
    """
    Rank teams based on Euclidean distance, Cosine similarity, and Mahalanobis distance.

    Args:
        euclidean_dist (array): Euclidean distances between player and teams.
        cosine_sim (array): Cosine similarities between player and teams.
        maha_distances (array): Mahalanobis distances between player and teams.
        team_names (list): List of team names corresponding to the stats.

    Returns:
        pd.DataFrame: One row per team with the three measures and their ranks
        (rank 1 = most similar; ties share the lowest rank). Rows are ordered
        by Euclidean distance, with ties broken by higher cosine similarity
        and then lower Mahalanobis distance.
    """
    similarity_df = pd.DataFrame({
        'Team': team_names,
        'Euclidean_Distance': euclidean_dist,
        'Cosine_Similarity': cosine_sim,
        'Mahalanobis_Distance': maha_distances
    })

    # Distances: lower is more similar, so the default ascending rank applies.
    similarity_df['Euclidean_Rank'] = similarity_df['Euclidean_Distance'].rank(method='min')
    similarity_df['Mahalanobis_Rank'] = similarity_df['Mahalanobis_Distance'].rank(method='min')

    # Similarity: higher is better, so rank in descending order.
    similarity_df['Cosine_Rank'] = similarity_df['Cosine_Similarity'].rank(method='min', ascending=False)

    # Euclidean distance is the primary key. The cosine tie-breaker must be
    # descending, otherwise the *less* similar team would come first on ties.
    similarity_df = similarity_df.sort_values(
        by=['Euclidean_Distance', 'Cosine_Similarity', 'Mahalanobis_Distance'],
        ascending=[True, False, True],
    ).reset_index(drop=True)

    return similarity_df




In [127]:
# Pick the player to compare against all squads.
player = df_features.loc['Dani Olmo']

# Standardise and project player and teams into a 13-component PCA space.
player_stats_normalized, team_stats_normalized = normalize_data(player, df_squad_features, n_components=13)

# Calculate distances
# The third result is deliberately NOT called `mahalanobis`: that name is the
# scipy function imported above, and rebinding it to a list makes this cell
# fail when it is run a second time.
euclidean_dist, cosine_sim, maha_distances = calculate_distances(player_stats_normalized, team_stats_normalized)

# Rank teams
ranked_teams = rank_teams(euclidean_dist, cosine_sim, maha_distances, df_squad_features.index)

ranked_teams.head(50)



Unnamed: 0,Team,Euclidean_Distance,Cosine_Similarity,Mahalanobis_Distance,Euclidean_Rank,Mahalanobis_Rank,Cosine_Rank
0,Boavista,45.423127,0.818916,17.333974,1.0,4.0,1.0
1,Vizela,45.699992,0.697833,17.249744,2.0,2.0,4.0
2,Vitória,45.829232,0.732765,17.293077,3.0,3.0,2.0
3,Paços,45.930038,0.602127,17.765422,4.0,7.0,5.0
4,Estoril,46.48379,0.551122,17.993658,5.0,13.0,13.0
5,Santa Clara,46.538522,0.514355,17.856898,6.0,8.0,17.0
6,Hertha BSC,46.563936,0.526677,17.963002,7.0,12.0,16.0
7,Freiburg,46.797183,0.578904,18.594225,8.0,24.0,9.0
8,Hoffenheim,46.910338,0.712093,18.147366,9.0,16.0,3.0
9,Casa Pia,46.976767,0.571183,17.900687,10.0,10.0,11.0


First, predict the best cluster for the player using Euclidean distance; then compute the cosine similarity only for the teams within that predicted cluster.

In [128]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.spatial.distance import euclidean, mahalanobis
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Normalize and Reduce Data
def normalize_and_reduce_data(player_stats, team_stats, n_components=25):
    """
    Normalize player and team stats and apply PCA for dimensionality reduction.

    This is the same pipeline as ``normalize_data`` (scaler and PCA fitted on
    the team stats, player only transformed) with a different default number
    of components, so it delegates instead of repeating the implementation.

    Args:
        player_stats: Feature vector of a single player, one value per column
            of ``team_stats``.
        team_stats: 2-D table of team features (one row per team).
        n_components (int): Number of principal components to keep.

    Returns:
        tuple: ``(player_stats_reduced, team_stats_reduced)``.
    """
    return normalize_data(player_stats, team_stats, n_components=n_components)

# Step 2: Cluster Teams using K-means
def cluster_teams(team_stats_normalized, n_clusters=12, n_init=10):
    """
    Cluster teams using K-means and return labels and cluster centers.

    Args:
        team_stats_normalized: 2-D matrix with one row per team.
        n_clusters (int): Number of clusters to form.
        n_init (int): Number of K-means restarts with different centroid
            seeds. It is passed explicitly so the result does not depend on
            the installed scikit-learn version's default, and so scikit-learn
            does not warn about the default changing.

    Returns:
        tuple: ``(labels, cluster_centers)`` — the cluster index assigned to
        each team row, and the centroid coordinates of every cluster.
    """
    # The fixed random_state keeps the clustering reproducible across runs.
    kmeans = KMeans(n_clusters=n_clusters, n_init=n_init, random_state=42)
    kmeans.fit(team_stats_normalized)

    return kmeans.labels_, kmeans.cluster_centers_

# Step 3: Calculate distances between player and clusters
def calculate_cluster_distances(player_stats, cluster_centers):
    """
    Measure how close the player is to every cluster centroid.

    Args:
        player_stats: 1-D feature vector of the player.
        cluster_centers: Iterable of centroid vectors in the same feature space.

    Returns:
        tuple: ``(euclidean_dist, cosine_sim)`` as numpy arrays with one entry
        per centroid, in the order the centroids were given.
    """
    distances = []
    for centroid in cluster_centers:
        distances.append(euclidean(player_stats, centroid))

    similarities = []
    for centroid in cluster_centers:
        # cosine_similarity works on 2-D inputs and returns a 1x1 matrix here.
        pair_matrix = cosine_similarity([player_stats], [centroid])
        similarities.append(pair_matrix[0][0])

    return np.array(distances), np.array(similarities)

# Step 4: Find best cluster and then best team within that cluster
def find_best_cluster_and_team(player_stats, team_stats, team_names, cluster_labels, cluster_centers):
    """
    Pick the cluster whose centroid is nearest to the player, then rank that cluster's teams.

    Args:
        player_stats: 1-D feature vector of the player.
        team_stats: 2-D array with one row per team; indexed with a boolean mask.
        team_names: Team names aligned with the rows of ``team_stats``; must
            support boolean-mask indexing.
        cluster_labels: Cluster index of each team row.
        cluster_centers: Centroid vectors, one per cluster.

    Returns:
        tuple: ``(ranking_df, best_cluster_idx)`` where ``ranking_df`` has the
        columns ``Team`` and ``Cosine_Similarity`` sorted from most to least
        similar, and ``best_cluster_idx`` is the index of the chosen cluster.
    """
    # Nearest centroid by Euclidean distance decides the cluster; the cosine
    # similarities to the centroids are not used here.
    centroid_distances, _ = calculate_cluster_distances(player_stats, cluster_centers)
    best_cluster_idx = np.argmin(centroid_distances)

    # Keep only the teams assigned to that cluster.
    in_best_cluster = cluster_labels == best_cluster_idx
    member_stats = team_stats[in_best_cluster]
    member_names = team_names[in_best_cluster]

    # Cosine similarity between the player and each remaining team.
    member_similarities = []
    for member in member_stats:
        member_similarities.append(cosine_similarity([player_stats], [member])[0][0])

    ranking_df = pd.DataFrame({
        'Team': member_names,
        'Cosine_Similarity': member_similarities
    })

    # Most similar team first.
    ranking_df = ranking_df.sort_values(by='Cosine_Similarity', ascending=False)
    ranking_df = ranking_df.reset_index(drop=True)

    return ranking_df, best_cluster_idx

# Example usage
# Assuming player is a 137-dimensional vector and df_squad_features is (134, 137)
# NOTE(review): these shapes are the author's assumption and are not checked here.

# `player` is the Series selected in the earlier ranking cell, so that cell must run first.
player_stats = player  # Assuming this is already loaded as a vector of shape (137,)
team_stats = df_squad_features.values  # Convert team DataFrame to numpy array

# Normalize and apply PCA (uses the function's default of 25 components)
player_stats_reduced, team_stats_reduced = normalize_and_reduce_data(player_stats, team_stats)

# Apply clustering to team stats (10 clusters here, overriding the default of 12)
cluster_labels, cluster_centers = cluster_teams(team_stats_reduced, n_clusters=10)

# Find the best cluster and rank teams within that cluster
ranked_teams_in_best_cluster, best_cluster_idx = find_best_cluster_and_team(player_stats_reduced, team_stats_reduced, df_squad_features.index, cluster_labels, cluster_centers)

# Display every team in the best matching cluster, most similar first
ranked_teams_in_best_cluster


  super()._check_params_vs_input(X, default_n_init=10)


Unnamed: 0,Team,Cosine_Similarity
0,Boavista,0.651892
1,Vizela,0.641245
2,Vitória,0.616073
3,Rio Ave,0.581834
4,Casa Pia,0.562854
5,Paços,0.55259
6,Estoril,0.527773
7,Santa Clara,0.467174
8,Famalicão,0.463756
9,Portimonense,0.436268


The scaled components have roughly the same value range.

In [129]:
# Show the player's vector after scaling and PCA projection.
player_stats_reduced

array([-16.04214419,  33.76527115,  23.54844534,  -2.12260982,
        17.20117579,   2.73009127, -11.75001741, -12.05305133,
        -2.0685334 ,   4.84269055,   8.40954838,   3.13258029,
        -4.68764095,   5.38840282,   0.91313912,  -4.84889588,
        -2.90729919,   2.32464599,   6.16045638,   0.78169654,
         1.28956955,  -3.07772395,   8.96577556,  -2.06297833,
        -2.02899739])

In [130]:
# Show the first PCA component of every team, to compare its range with the player's values.
team_stats_reduced[:,0]

array([ 17.8956769 ,  11.54779714,   5.70155027,   5.48888721,
        11.85935654,  10.06765328,  -0.08902285,   3.51417981,
        -3.96223771,  -0.25482879,  -1.44267548,   5.37459051,
        -2.97765703,  -3.29578063,  -7.4791988 , -10.47685249,
        -6.02440992,  -2.17681776,  -4.31298591,  -5.8849563 ,
        14.89294534,   9.92094548,   6.31518238,  -5.05724956,
        -3.07832135,   1.93190343,   0.63320962,  -1.41354021,
        -5.86083428,  -0.22096508,  -2.92061858,  -4.04070215,
        -4.737393  , -11.15352439, -10.59581093,  -0.58754856,
        -9.43387715,  -9.5479362 ,  16.00117672,  16.09264473,
         5.7332445 ,   2.90114795,   6.76818541,  -0.69807344,
        -2.46462145,   3.85766834,  -7.5624691 ,   1.10841224,
        -0.46232567,   0.26149284,  -0.11298228,  -9.47615275,
        -9.83032573,   0.08119767,  -4.50203495,  -4.37541614,
        -6.00799836,  -6.30071332,  17.11940556,   9.60221557,
         8.17780894,   8.91670141,  10.78910018,   4.01