In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from collections import defaultdict
from sklearn.metrics import silhouette_score, davies_bouldin_score
from kmedoids import KMedoids

import warnings
warnings.filterwarnings("ignore")

In [68]:
# Load the feature table and standardize it with StandardScaler.
# NOTE(review): the file name suggests the features are already normalized;
# confirm that standardizing them again here is intended.
X = pd.read_csv('../../dataset/processed/normalized_features.csv')

# Fit the scaler first, then apply it, so the fitted scaler stays available
# under its own name for later use.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [69]:
# Search space for the clustering comparison.
Ks = range(2, 11)  # candidate cluster counts: 2 through 10 inclusive
metrics = [
    "cosine",
    "manhattan",
    "euclidean",
]
n_iterations = 100  # how many times the full (metric, k) sweep is repeated

In [70]:
# Win tallies, all starting at zero.
# Keyed by candidate cluster count:
silhouette_wins_k = dict.fromkeys(Ks, 0)
dbi_wins_k = dict.fromkeys(Ks, 0)

# Keyed by distance metric name:
silhouette_wins_metric = dict.fromkeys(metrics, 0)
dbi_wins_metric = dict.fromkeys(metrics, 0)

# Denominators used when the win counts are reported as percentages.
total_sil_trials = total_dbi_trials = 0

In [None]:
# Repeated k-medoids sweep over every (distance metric, k) combination.
#
# For each iteration and metric, the k with the highest silhouette score and
# the k with the lowest Davies-Bouldin index each receive one "k win".
# For each iteration, the metric that produced the best score across ALL
# metrics and ks receives one "metric win".

# Reset the tallies here so that re-running this cell starts from zero
# instead of accumulating on top of the counts of a previous run.
silhouette_wins_k = {k: 0 for k in Ks}
dbi_wins_k = {k: 0 for k in Ks}
silhouette_wins_metric = {m: 0 for m in metrics}
dbi_wins_metric = {m: 0 for m in metrics}
total_sil_trials = 0
total_dbi_trials = 0

for it in range(n_iterations):
    # Every (metric, k) result of this iteration. The metric winner has to be
    # chosen from this combined list: choosing it from a single metric's
    # results would trivially hand each metric one win per iteration.
    iteration_results = []

    for metric in metrics:
        run_results = []

        for k in Ks:
            kmedoids = KMedoids(
                n_clusters=k,
                metric=metric,
                method="alternate",
                # Vary the seed per iteration. With a constant seed every
                # iteration reproduces the same clustering, so repeating the
                # sweep would only multiply one result by n_iterations.
                random_state=42 + it
            )

            kmedoids.fit(X_scaled)
            labels = kmedoids.labels_

            sil = silhouette_score(X_scaled, labels, metric=metric)
            # NOTE: davies_bouldin_score takes no metric argument, so this
            # index is computed the same way for every clustering metric.
            db = davies_bouldin_score(X_scaled, labels)

            run_results.append({
                "k": k,
                "silhouette": sil,
                "davies_bouldin": db,
                "metric": metric
            })

        # Best k for this (iteration, metric) run: highest silhouette wins.
        best_sil_entry = max(run_results, key=lambda r: r["silhouette"])
        silhouette_wins_k[best_sil_entry["k"]] += 1
        total_sil_trials += 1

        # Best k for this (iteration, metric) run: lowest Davies-Bouldin wins.
        best_dbi_entry = min(run_results, key=lambda r: r["davies_bouldin"])
        dbi_wins_k[best_dbi_entry["k"]] += 1
        total_dbi_trials += 1

        iteration_results.extend(run_results)

    # Best metric for this iteration, compared across all metrics and ks.
    # NOTE(review): silhouette values are each computed under their own
    # distance metric; confirm that comparing them directly is acceptable.
    if iteration_results:
        best_sil_metric = max(
            iteration_results, key=lambda r: r["silhouette"]
        )["metric"]
        silhouette_wins_metric[best_sil_metric] += 1

        best_dbi_metric = min(
            iteration_results, key=lambda r: r["davies_bouldin"]
        )["metric"]
        dbi_wins_metric[best_dbi_metric] += 1


In [None]:
# Report how often each candidate k achieved the best silhouette score,
# both as a raw count and as a share of all recorded silhouette trials.
print("Silhouette wins per k (higher is better):")
for k in Ks:
    count = silhouette_wins_k[k]
    if total_sil_trials > 0:
        pct = 100.0 * count / total_sil_trials
    else:
        # No trials recorded: report 0% rather than dividing by zero.
        pct = 0.0
    print(f"k = {k}: {count} wins ({pct:.2f}%)")


Davies–Bouldin wins (lower is better):
k = 2: 100 wins (33.33%)
k = 3: 200 wins (66.67%)
k = 4: 0 wins (0.00%)
k = 5: 0 wins (0.00%)
k = 6: 0 wins (0.00%)
k = 7: 0 wins (0.00%)
k = 8: 0 wins (0.00%)
k = 9: 0 wins (0.00%)
k = 10: 0 wins (0.00%)


In [None]:
# Report how often each candidate k achieved the best (lowest) Davies-Bouldin
# index, both as a raw count and as a share of all recorded DBI trials.
print("\nDavies–Bouldin wins per k (lower is better):")
for k in Ks:
    count = dbi_wins_k[k]
    if total_dbi_trials > 0:
        pct = 100.0 * count / total_dbi_trials
    else:
        # No trials recorded: report 0% rather than dividing by zero.
        pct = 0.0
    print(f"k = {k}: {count} wins ({pct:.2f}%)")

In [None]:
# Print the per-metric win tallies: first for the silhouette score, then for
# the Davies-Bouldin index. Both tables share the same row layout.
for header, wins_by_metric in (
    ("\nSilhouette wins per metric:", silhouette_wins_metric),
    ("\nDavies–Bouldin wins per metric:", dbi_wins_metric),
):
    print(header)
    for m in metrics:
        print(f"{m}: {wins_by_metric[m]} wins")

In [None]:
# Grouped bar chart of wins per distance metric: for every metric one bar for
# silhouette wins and one for Davies-Bouldin wins, drawn side by side.
width = 0.35
x = np.arange(len(metrics))  # one group position per metric

# Bar heights, in the same order as the metric names on the x axis.
sil_counts = [silhouette_wins_metric[name] for name in metrics]
dbi_counts = [dbi_wins_metric[name] for name in metrics]

fig, ax = plt.subplots()

# Shift each series half a bar width away from the group centre so the two
# bars of a group sit next to each other instead of overlapping.
ax.bar(x - width/2, sil_counts, width, label="Silhouette wins")
ax.bar(x + width/2, dbi_counts, width, label="DBI wins")

ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.set(
    ylabel="Number of wins",
    title="Number of wins per distance metric",
)
ax.legend()

fig.tight_layout()
plt.show()