In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from collections import defaultdict
from sklearn.metrics import silhouette_score, davies_bouldin_score
from kmedoids import KMedoids

import warnings
warnings.filterwarnings("ignore")

In [3]:
X = pd.read_csv('../../../data/processed/role_scores_normalized.csv')
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [4]:
Ks = range(2, 11)
n_iterations = 100

In [5]:
silhouette_wins = {k: 0 for k in Ks}
dbi_wins = {k: 0 for k in Ks}

total_sil_trials = 0
total_dbi_trials = 0

In [11]:
for it in range(n_iterations):
        run_results = []
        for k in Ks:
            kmedoids = KMedoids(
                n_clusters=k,
                metric="cosine",
                init="build",
                random_state=42      
            )

            kmedoids.fit(X_scaled)
            labels = kmedoids.labels_

            sil = silhouette_score(X_scaled, labels, metric="cosine")
            db = davies_bouldin_score(X_scaled, labels)

            run_results.append({
                "k": k,
                "silhouette": sil,
                "davies_bouldin": db,
                "metric": "cosine"
            })

        best_sil_entry = max(run_results, key=lambda r: r["silhouette"])
        best_sil_k = best_sil_entry["k"]
        silhouette_wins[best_sil_k] += 1
        total_sil_trials += 1

        best_dbi_entry = min(run_results, key=lambda r: r["davies_bouldin"])
        best_dbi_k = best_dbi_entry["k"]
        dbi_wins[best_dbi_k] += 1
        total_dbi_trials += 1

In [12]:
print("Silhouette wins (higher is better):")
for k in Ks:
    count = silhouette_wins[k]
    pct = 100.0 * count / total_sil_trials if total_sil_trials > 0 else 0.0
    print(f"k = {k}: {count} wins ({pct:.2f}%)")

Silhouette wins (higher is better):
k = 2: 200 wins (100.00%)
k = 3: 0 wins (0.00%)
k = 4: 0 wins (0.00%)
k = 5: 0 wins (0.00%)
k = 6: 0 wins (0.00%)
k = 7: 0 wins (0.00%)
k = 8: 0 wins (0.00%)
k = 9: 0 wins (0.00%)
k = 10: 0 wins (0.00%)


In [13]:
print("\nDavies–Bouldin wins (lower is better):")
for k in Ks:
    count = dbi_wins[k]
    pct = 100.0 * count / total_dbi_trials if total_dbi_trials > 0 else 0.0
    print(f"k = {k}: {count} wins ({pct:.2f}%)")


Davies–Bouldin wins (lower is better):
k = 2: 200 wins (100.00%)
k = 3: 0 wins (0.00%)
k = 4: 0 wins (0.00%)
k = 5: 0 wins (0.00%)
k = 6: 0 wins (0.00%)
k = 7: 0 wins (0.00%)
k = 8: 0 wins (0.00%)
k = 9: 0 wins (0.00%)
k = 10: 0 wins (0.00%)
