# 05 – Play Style Clustering

Group team-seasons into style clusters using pace, efficiency, shooting, and ball movement.

## Goals
- Standardize key style features and fit K-Means to reveal distinct archetypes.
- Profile clusters by era and success rate.
- Visualize clusters on 2D projections (e.g., pace vs three-point rate).

In [None]:
from pathlib import Path
import sys

# Resolve the repository root so the `src` package imports below work whether
# the kernel starts in the project root or inside a `notebooks` folder.
_cwd = Path.cwd().resolve()
PROJECT_ROOT = _cwd.parent if _cwd.name == 'notebooks' else _cwd

# Prepend the root to the import path once; skip if it is already present.
_root_entry = str(PROJECT_ROOT)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

import pandas as pd

from src.pipeline.season_summary import generate_team_season_summary
from src.data_ingest import NBADataIngestor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


In [None]:
# Load team-season data.
# The cache path is anchored on PROJECT_ROOT (resolved in the setup cell) rather
# than the current working directory: when the kernel starts inside notebooks/,
# a cwd-relative path never matches and the summary is regenerated on every run.
summary_path = PROJECT_ROOT / "data" / "processed" / "team_season_regular.csv"
if summary_path.exists():
    summary = pd.read_csv(summary_path)
else:
    # No cached CSV: rebuild the regular-season summary through the ingestor.
    # save=False is passed — presumably nothing is written to disk; confirm
    # against generate_team_season_summary.
    ingestor = NBADataIngestor()
    summary = generate_team_season_summary(ingestor, regular_season_only=True, save=False)

summary.head()

In [None]:
# Feature matrix for clustering.
feature_cols = [
    "PACE",
    "OFF_EFF_PER_100",
    "DEF_EFF_PER_100",
    "THREE_POINT_RATE",
    "AST_TOV_RATIO",
]

# Drop team-seasons with any missing style feature; K-Means cannot handle NaN.
data = summary.dropna(subset=feature_cols).reset_index(drop=True)
X = data[feature_cols]

# Choose cluster count via silhouette scan over k = 3..7.
scores = {}
for k in range(3, 8):
    model = make_pipeline(StandardScaler(), KMeans(n_clusters=k, n_init=20, random_state=42))
    labels = model.fit_predict(X)
    # Score in the same standardized space K-Means clustered in. Scoring on raw X
    # would let the largest-scale columns dominate the distances and would not
    # measure the quality of the partition that was actually fit.
    X_scaled = model.named_steps["standardscaler"].transform(X)
    scores[k] = silhouette_score(X_scaled, labels)

display(pd.DataFrame.from_dict(scores, orient="index", columns=["silhouette"]).rename_axis("k"))

In [None]:
# Final K-Means fit: k is the candidate with the highest silhouette in `scores`.
optimal_k = max(scores, key=lambda candidate: scores[candidate])
final_kmeans = KMeans(n_clusters=optimal_k, n_init=50, random_state=42)
cluster_model = make_pipeline(StandardScaler(), final_kmeans)
labels = cluster_model.fit_predict(X)
data["CLUSTER"] = labels

# The KMeans step sits after StandardScaler in the pipeline, so the centers
# below are expressed in standardized units, not raw feature units.
centers = cluster_model.named_steps["kmeans"].cluster_centers_
centers_df = pd.DataFrame(centers, columns=feature_cols).assign(CLUSTER=range(optimal_k))
centers_df

In [None]:
# Visualize clusters on key planes.
# One color per cluster, shared by both panels so cluster hues match across plots.
palette = sns.color_palette("tab10", n_colors=optimal_k)
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# (x column, y column, panel title) for each subplot, left to right.
panels = [
    ("PACE", "THREE_POINT_RATE", "Pace vs 3P Rate by Cluster"),
    ("OFF_EFF_PER_100", "DEF_EFF_PER_100", "Off vs Def Efficiency by Cluster"),
]
for panel_ax, (x_col, y_col, panel_title) in zip(axes, panels):
    sns.scatterplot(
        data=data,
        x=x_col,
        y=y_col,
        hue="CLUSTER",
        palette=palette,
        alpha=0.7,
        ax=panel_ax,
    )
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Cluster prevalence by era and win rate.
# Skipped when the summary lacks era labels or win percentages.
if {"ERA_LABEL", "WIN_PCT"}.issubset(data.columns):
    cluster_era = (
        data.groupby(["ERA_LABEL", "CLUSTER"])
        .agg(count=("TEAM_ID", "size"), avg_win=("WIN_PCT", "mean"))
        .reset_index()
    )
    display(cluster_era.sort_values(["ERA_LABEL", "CLUSTER"]))

# Save assignments for reuse if desired.
# Anchor on PROJECT_ROOT (resolved in the setup cell) so the file lands in the
# project's data folder even when the kernel runs from notebooks/, and create
# the directory first: to_csv raises if the parent folder does not exist.
assignments_path = PROJECT_ROOT / "data" / "processed" / "team_clusters.csv"
assignments_path.parent.mkdir(parents=True, exist_ok=True)
data[["TEAM_ID", "SEASON_YEAR", "CLUSTER"]].to_csv(assignments_path, index=False)
print(f"Cluster assignments saved to {assignments_path}")

### Gaussian Mixture sanity check
A soft-clustering alternative, used for reporting and to test how robust the K-Means clusters are.

In [None]:
# Fit GMM across candidate cluster counts using BIC/AIC.
gmm_scores = []
for k in range(3, 8):
    gmm = make_pipeline(StandardScaler(), GaussianMixture(n_components=k, random_state=42, n_init=5))
    gmm.fit(X)
    # The mixture step was fit on standardized features, so its likelihood-based
    # criteria must be evaluated on the same standardized data. Passing raw X
    # would score points far outside the fitted density.
    X_scaled = gmm.named_steps["standardscaler"].transform(X)
    mixture = gmm.named_steps["gaussianmixture"]
    gmm_scores.append({"k": k, "bic": mixture.bic(X_scaled), "aic": mixture.aic(X_scaled)})

# One row per candidate k; lower BIC/AIC is better.
gmm_df = pd.DataFrame(gmm_scores).set_index("k")
gmm_df


In [None]:
# Responsibilities for the best BIC model.
best_k = gmm_df["bic"].idxmin()
best_gmm = make_pipeline(StandardScaler(), GaussianMixture(n_components=int(best_k), random_state=42, n_init=10))
best_gmm.fit(X)
# Call predict_proba on the pipeline, not on the bare mixture step, so X passes
# through the fitted scaler first. The mixture was trained on standardized
# features and gives meaningless responsibilities for raw inputs.
probs = best_gmm.predict_proba(X)

# Keep the K-Means frame untouched; attach one responsibility column per component.
data_gmm = data.copy()
data_gmm[[f"GMM_{i}" for i in range(int(best_k))]] = probs

# 2D PCA projection colored by argmax responsibility.
pca = PCA(n_components=2, random_state=42)
coords = pca.fit_transform(StandardScaler().fit_transform(X))
data_gmm["PC1"], data_gmm["PC2"] = coords[:, 0], coords[:, 1]
data_gmm["GMM_LABEL"] = probs.argmax(axis=1)

sns.scatterplot(data=data_gmm, x="PC1", y="PC2", hue="GMM_LABEL", palette="tab10", alpha=0.7)
plt.title(f"GMM responsibilities (k={best_k})")
plt.tight_layout()
plt.show()

### K-Means stability across seeds
Check how much the silhouette score varies across random seeds to report robustness.

In [None]:
# Refit K-Means at the chosen k under several seeds and compare silhouettes.
stability = []
for seed in [1, 7, 21, 42, 84]:
    model = make_pipeline(StandardScaler(), KMeans(n_clusters=optimal_k, n_init=20, random_state=seed))
    labels_seed = model.fit_predict(X)
    # Score in the standardized space the clustering was fit in; raw X would let
    # large-scale features dominate the distances and distort the comparison.
    X_scaled = model.named_steps["standardscaler"].transform(X)
    stability.append({"seed": seed, "silhouette": silhouette_score(X_scaled, labels_seed)})

# Best seed first; a narrow spread indicates a stable partition.
stability_df = pd.DataFrame(stability).sort_values("silhouette", ascending=False)
stability_df


### Case study table (cluster exemplars)
Top win% team-seasons per cluster for narrative examples in the report.

In [None]:
def _exemplar_record(cluster_id, row):
    """Flatten one team-season row into the exemplar-table schema."""
    return {
        "cluster": cluster_id,
        "team": row.get("TEAM_ID", "NA"),
        "season": int(row.get("SEASON_YEAR", 0)),
        "win_pct": row.get("WIN_PCT", np.nan),
        "pace": row.get("PACE", np.nan),
        "three_pt_rate": row.get("THREE_POINT_RATE", np.nan),
    }


# Collect the two highest-WIN_PCT team-seasons from every cluster, in cluster order.
case_rows = []
for cluster_id in sorted(data["CLUSTER"].unique()):
    members = data.loc[data["CLUSTER"] == cluster_id]
    best_two = members.sort_values("WIN_PCT", ascending=False).iloc[:2]
    case_rows.extend(_exemplar_record(cluster_id, row) for _, row in best_two.iterrows())

case_table = pd.DataFrame(case_rows).sort_values(["cluster", "win_pct"], ascending=[True, False])
case_table


These outputs (GMM BIC/AIC, seed stability, exemplar teams) can be dropped into LaTeX tables to satisfy the rubric’s comparison and case-study expectations.

### Cluster transitions by franchise
Track how selected franchises move across clusters over time for narrative context.

In [None]:
# Count cluster switches per team and show a few exemplars.
# diff() yields NaN for each team's first season, and NaN != 0 evaluates True,
# so without fillna(0) every franchise would be credited with one phantom switch.
# Selecting the CLUSTER column before apply keeps the lambda on a plain Series.
team_switches = (
    data.sort_values(["TEAM_ID", "SEASON_YEAR"])
    .groupby("TEAM_ID")["CLUSTER"]
    .apply(lambda clusters: int(clusters.diff().fillna(0).ne(0).sum()))
    .reset_index(name="cluster_switches")
    .sort_values("cluster_switches", ascending=False)
)
# Only a cell's last expression is rendered automatically, so display explicitly.
display(team_switches.head())

# Plot transitions for a handful of franchises if available.
# NOTE(review): the watch list holds team abbreviations; confirm TEAM_ID stores
# abbreviations rather than numeric IDs, otherwise the filter matches nothing.
watch_list = ["GSW", "HOU", "SAS", "BOS", "LAL", "DAL"]
# Sort by season so each franchise's line is drawn in chronological order
# instead of following the arbitrary row order of `data`.
plot_df = data[data["TEAM_ID"].isin(watch_list)].sort_values(["TEAM_ID", "SEASON_YEAR"])
if not plot_df.empty:
    fig, ax = plt.subplots(figsize=(10, 5))
    for team, group in plot_df.groupby("TEAM_ID"):
        ax.plot(group["SEASON_YEAR"], group["CLUSTER"], marker="o", label=team)
    ax.set_ylabel("Cluster ID")
    ax.set_xlabel("Season year")
    ax.set_title("Cluster transitions for selected franchises")
    ax.legend(ncol=3)
    plt.tight_layout()
    plt.show()
else:
    print("No watch-list teams present in the dataset sample.")


## Next Steps
- Extend the Gaussian Mixture check above with further alternative clustering methods (e.g., HDBSCAN) to capture soft boundaries.
- Include defensive profile features (opponent 3P rate, turnover forcing) if available.
- Extend the franchise cluster-transition tracking above to more teams and seasons to narrate stylistic evolution.