In [None]:
# Enable IPython's autoreload extension so edits to imported project modules are picked up live.
%load_ext autoreload
%autoreload 2

First clustering test, with KMeans and DBSCAN on the specific contexts: very low variance, no relevant clusters are detected.
✅ Perché la varianza è bassa nel tuo caso
Stai analizzando comportamenti in contesti molto specifici (es. "1st serve su terra").

In questi contesti, le strategie sono naturalmente simili tra giocatori: tutti affrontano condizioni e dinamiche simili.

Quindi è normale che le feature (rate di slice, vincenti, lunghezza media degli scambi, ecc.) abbiano variazioni minime, da cui una PCA schiacciata e clustering difficile.

🧭 Strategia: allargare progressivamente il contesto
Ottima idea. Puoi procedere così:

1. Contesto medio-specializzato
Es. "tutti i punti al servizio su terra", unendo 1st e 2nd.

2. Contesto più ampio
Es. "tutti i punti al servizio" indipendentemente dalla superficie.

3. Contesto generale
Tutti i colpi del giocatore, aggregando tutto (serve/response, superfici).

🎯 Obiettivo finale:
Capire a che livello di granularità emergono:

differenze significative tra stili (più cluster distinti, silhouette score alto)

oppure confermare che alcuni contesti sono intrinsecamente “uniformanti” (→ bassa varianza, stile unico richiesto).

Loading all 6 datasets. Each dataset represents a context of the form {point type}-{surface}.

In [None]:
import os
import pandas as pd

# Directory holding the per-context feature CSV files.
features_dataset_dir = '../feature_datasets_csv_reduced_contexts&features/'

# List of (file name, DataFrame) pairs, one per CSV found in the directory.
dfs = []

# os.listdir() returns entries in arbitrary order; sorting makes the dataset
# order (and therefore every loop and report below) reproducible across runs.
for dataset_name in sorted(os.listdir(features_dataset_dir)):
    if dataset_name.endswith('.csv'):
        file_path = os.path.join(features_dataset_dir, dataset_name)
        dfs.append((dataset_name, pd.read_csv(file_path, low_memory=False)))

# UMAP target dimensionalities iterated over by the cells below.
umap_components = (2, 8)

Variance visualization and Hopkins statistic. A Hopkins statistic of 0.75 indicates a clustering tendency at the 90% confidence level.

In [None]:
from sklearn.preprocessing import StandardScaler
from run_clustering import hopkins

# For every dataset: show the per-feature variance of the raw features, then
# the Hopkins statistic computed on the z-scored features.
for name, dataframe in dfs:
    # Every column except the player identifier is treated as a feature.
    feature_cols = [col for col in dataframe.columns if col not in ["player"]]
    print(name)

    # A bare expression inside a loop is never displayed by the notebook,
    # so the sorted variances must be printed explicitly.
    print(dataframe[feature_cols].var().sort_values())

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(dataframe[feature_cols].values)

    # hopkins() returns three values; only the first two are reported here.
    mean, std_dev, _values = hopkins(X_scaled)
    print(f"Hopkins statistic (df): {mean:.3f} ± {std_dev:.3f}\n")

UMAP.

In [None]:
from sklearn.preprocessing import StandardScaler
from run_clustering import continuity
from sklearn.manifold import trustworthiness
import umap

# Neighbourhood size shared by both embedding-quality metrics.
QUALITY_N_NEIGHBORS = 15
# Fixed seed so the UMAP embeddings (and the metrics derived from them) are
# reproducible between runs.
UMAP_RANDOM_STATE = 42

# For every dataset and every target dimensionality, embed the z-scored
# features with UMAP and report continuity and trustworthiness.
for name, dataframe in dfs:
    # Print the dataset name so the metric lines below can be attributed.
    print(name)
    feature_cols = [col for col in dataframe.columns if col not in ["player"]]

    # z-score normalization
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(dataframe[feature_cols])

    for umap_comp in umap_components:
        # UMAP reduction
        reducer = umap.UMAP(n_components=umap_comp, random_state=UMAP_RANDOM_STATE)
        X_umap = reducer.fit_transform(X_scaled)

        c_score = continuity(X_scaled, X_umap, n_neighbors=QUALITY_N_NEIGHBORS)
        # Include the dimensionality so this line matches the trustworthiness one.
        print(f"Continuity UMAP {umap_comp}D: {c_score:.3f}")

        trust = trustworthiness(X_scaled, X_umap, n_neighbors=QUALITY_N_NEIGHBORS)
        print(f"Trustworthiness UMAP {umap_comp}D: {trust:.3f}")

KMeans clustering:

In [None]:
from run_clustering import run_kmeans_clustering

# Collected (dataset name, clustered DataFrame) pairs.
# NOTE(review): one entry is appended per UMAP dimensionality under the same
# dataset name, so the save cell below writes them to the same file and the
# last one wins — confirm which configuration should be kept.
dfs_clustered = []

for name, dataframe in dfs:
    print(name)
    # Must be created once per dataset: resetting it inside the inner loop
    # would leave only the last UMAP dimensionality for max() to choose from.
    metrics = []
    for umap_comp in umap_components:
        df_clustered, silhouette, dbi = run_kmeans_clustering(dataframe, 4, True, umap_comp)
        dfs_clustered.append((name, df_clustered))
        metrics.append((silhouette, dbi, umap_comp))
    # Report the (silhouette, dbi, umap_comp) tuple with the highest silhouette.
    print({name: max(metrics, key=lambda m: m[0])})


Agglomerative clustering:

In [None]:
from run_clustering import run_umap_agglomerative_clustering

for name, dataframe in dfs:
    print(name)
    # Must be created once per dataset: resetting it inside the inner loop
    # would leave only the last iteration for max() to choose from.
    silhouette_scores = []
    for umap_comp in umap_components:
        # NOTE(review): umap_comp is never passed to the call, so every
        # iteration runs with the same arguments. The literal 2 may have been
        # meant to be umap_comp — confirm against the function's signature.
        df_clustered, silhouette, dbi = run_umap_agglomerative_clustering(dataframe, "player", 4, 2, 'complete',
                                                                          True)
        silhouette_scores.append((silhouette, umap_comp))
    # Select by silhouette (index 0); index 1 is the UMAP dimensionality and
    # would always pick the largest dimension regardless of quality.
    print({name: max(silhouette_scores, key=lambda s: s[0])})

OPTICS reachability plot:

In [None]:
# numpy is imported explicitly instead of relying on `np` leaking in through
# a star import of run_clustering.
import numpy as np
from run_clustering import run_umap_optics_clustering

# Run OPTICS on every dataset for each (min_pts, xi) combination.
# The ranges below currently cover a single value each; widen them to sweep.
for name, dataframe in dfs:
    print(name)
    for min_pts in range(8, 9):
        for xi in np.arange(start=0.1, stop=0.11, step=0.01):
            run_umap_optics_clustering(dataframe, min_pts=min_pts, xi=xi, context_cols="player")

GMM clustering:

In [None]:
from run_clustering import run_umap_gmm_clustering

# Neighbourhood sizes announced in the log output for each run.
# NOTE(review): umap_knn is only printed and never passed to
# run_umap_gmm_clustering — confirm whether a neighbours argument is missing.
umap_knn_values = [10, 15, 20, 25, 30]

# Run GMM clustering for every UMAP dimensionality and every dataset.
for umap_comp in umap_components:
    for name, dataframe in dfs:
        print(name)
        for umap_knn in umap_knn_values:
            print("nearest neighbors umap: ", umap_knn)
            run_umap_gmm_clustering(
                dataframe,
                context_cols="player",
                umap_dim=umap_comp,
                visualize=False,
            )


Visualization of the aggregated statistics by cluster:

In [None]:
from visualize_results import aggregate_features_by_cluster, cluster_feature_deltas
import seaborn as sns
import matplotlib.pyplot as plt

# NOTE(review): df_clustered is whatever the last clustering loop executed
# above left behind — confirm this is the frame meant to be inspected.

# Print the features aggregated per cluster.
aggregated = aggregate_features_by_cluster(df_clustered)
print(aggregated)

# Draw the heatmap of the differences between each cluster's statistics and
# the global mean.
feature_deltas = cluster_feature_deltas(df_clustered)
heatmap_ax = sns.heatmap(feature_deltas, annot=True, cmap='coolwarm')
heatmap_ax.set_title("Scostamenti medi delle feature per cluster")
plt.show()


Save the clustered DataFrames to CSV files.

In [None]:
import os

# Output directory for the clustered datasets.
features_dataset_clustered_dir = '../feature_datasets_csv_reduced_contexts&features_clustered/'

# DataFrame.to_csv() does not create missing parent directories, so make sure
# the output directory exists (no-op when it is already there).
os.makedirs(features_dataset_clustered_dir, exist_ok=True)

# Each frame is written under its dataset file name; entries sharing a name
# overwrite each other, so the last one in dfs_clustered is what stays on disk.
for name, df_clustered in dfs_clustered:
    file_path = os.path.join(features_dataset_clustered_dir, name)
    df_clustered.to_csv(file_path, index=False)

Load the clustered DataFrames.

In [None]:
import os
import pandas as pd

# Directory holding the clustered datasets written by the save cell.
features_dataset_clustered_dir = '../feature_datasets_csv_reduced_contexts&features_clustered/'

# List of (file name, clustered DataFrame) pairs, one per CSV in the directory.
dfs_clustered = []

# os.listdir() returns entries in arbitrary order; sorting keeps the context
# order stable for everything built from this list below.
for dataset_name in sorted(os.listdir(features_dataset_clustered_dir)):
    if dataset_name.endswith('.csv'):
        file_path = os.path.join(features_dataset_clustered_dir, dataset_name)
        dfs_clustered.append((dataset_name, pd.read_csv(file_path)))

Calculate for each context, for each cluster, the top n representative features, i.e. the ones that have the highest difference between the cluster centroid feature and the overall feature mean.

In [None]:
from visualize_results import analyze_cluster_profiles

# Profile the clusters of every context; the context label is the dataset
# file name with the ".csv" text removed.
for name, df_clustered in dfs_clustered:
    print(name)
    context_label = name.replace(".csv", "")
    analyze_cluster_profiles(df_clustered, context=context_label)

Visualize the similarity matrix as a heatmap. The similarity matrix contains the distance between each cluster in a context and every cluster in the other contexts. For each cluster, the 3 most similar clusters across the 3 surfaces represent the same style.

In [None]:
from visualize_results import calculate_centroid_similarity

# Step 3: centroid similarity.
# Split the clustered datasets by point type: contexts whose name contains
# "on serve" go to the serve group, every other context to the response group.
clustered_data_on_serve = []
clustered_data_on_response = []
for context, df_clustered in dfs_clustered:
    target_group = clustered_data_on_serve if "on serve" in context else clustered_data_on_response
    target_group.append((context, df_clustered))

# One similarity table per point type.
similarity_df_on_serve = calculate_centroid_similarity(clustered_data_on_serve)
similarity_df_on_response = calculate_centroid_similarity(clustered_data_on_response)

In [None]:
from visualize_results import visualize_similarity_matrix

# Step 4: visualize the similarity matrix, first for the serve contexts and
# then for the response contexts.
for similarity_df, clustered_data in (
    (similarity_df_on_serve, clustered_data_on_serve),
    (similarity_df_on_response, clustered_data_on_response),
):
    visualize_similarity_matrix(similarity_df, clustered_data)

For each point context (serve, response), calculate 4 triplets, where 4 is the number of playing styles. Each triplet is composed of one cluster per surface. The goal is to find, for each cluster on a surface, the most similar clusters on the other surfaces.

In [None]:
# TODO: probably not useful as is; change it so that, for each context/cluster,
# it prints the two most similar clusters on the other surfaces.
from visualize_results import find_closest_cluster_triplets

# Context names (first element of each pair) grouped by point type.
on_serve_contexts = [entry[0] for entry in dfs_clustered if "on serve" in entry[0]]
on_response_contexts = [entry[0] for entry in dfs_clustered if "on response" in entry[0]]

serve_triplets = find_closest_cluster_triplets(similarity_df_on_serve, on_serve_contexts)
print(serve_triplets)
response_triplets = find_closest_cluster_triplets(similarity_df_on_response, on_response_contexts)
print(response_triplets)


Visualize top players trajectories.

In [None]:
from visualize_results import create_player_trajectories, display_top_player_trajectories

# Build the trajectories from the clustered datasets, then display the top players.
player_trajectories = create_player_trajectories(dfs_clustered)
display_top_player_trajectories(player_trajectories)

Visualize player trajectory by name.

In [None]:
from visualize_results import create_player_trajectories, visualize_player_trajectory

# Build the trajectories from the clustered datasets, then show the one of the
# selected player together with the top features.
player_trajectories = create_player_trajectories(dfs_clustered)
visualize_player_trajectory(
    player_trajectories,
    "Roger_Federer",
    dfs_clustered,
    top_features=True,
)

Visualize most frequent patterns.

In [None]:
from costants import PLAYER_SURFACES_DICT
from decode_point import decode_point
from get_freq_shots_seqs import frequent_shots_py_player

# Point-by-point dataset used to mine the frequent shot sequences.
df_points = pd.read_csv('../points_datasets/charting-m-points-2020s.csv', low_memory=False)

# Player whose frequent patterns are printed.
player_name = "Carlos_Alcaraz"

print("Legend: * = winner, # = forced error, @ = unforced error\n")
# Iterate over the surfaces registered for the player (none if the player is
# missing from the dictionary) and over both point contexts.
for surface in PLAYER_SURFACES_DICT.get(player_name, []):
    for context in ["on serve", "on response"]:
        section_header = "\n" + context + " on " + surface + "\n"
        print(section_header)
        frequent_sequences = frequent_shots_py_player(player_name, df_points, surface, context)
        for sequence, support, win_percentage, most_frequent_outcome in frequent_sequences:
            # Decode every shot of the sequence before printing it.
            shots = [decode_point(shot) for shot in sequence]
            print(
                f"Sequence: {shots} | Support: {support} | Win percentage: {win_percentage} | Most frequent outcome: {most_frequent_outcome}\n")
