# Evaluation approaches - Unsupervised Learning

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score, adjusted_rand_score

In [2]:
# Synthetic blob dataset. `y` holds the generated ground-truth cluster labels,
# which the notebook later compares against predicted labels.
n_samples, n_features, n_clusters = 300, 2, 4
random_state = 42  # fixed seed so the generated data is reproducible

X, y = make_blobs(
    n_samples=n_samples,
    n_features=n_features,
    centers=n_clusters,
    random_state=random_state,
)

## Elbow method plot

In [3]:
def plot_elbow_method(X, model_class,
                      max_k=10, figsize=(8, 6),
                      title='Elbow Method For Optimal K',
                      xlabel='Number of clusters (K)',
                      ylabel='Inertia (SSE)',
                      grid=True, **kwargs):
    """Plot model inertia against the number of clusters (elbow plot).

    A fresh ``model_class(n_clusters=k, **kwargs)`` is fitted on ``X`` for
    every ``k`` from 1 to ``max_k`` inclusive, and its ``inertia_`` attribute
    is collected. If a fitted model does not expose ``inertia_``, a message is
    printed and the sweep stops at that ``k``.

    Parameters
    ----------
    X : data passed unchanged to ``model.fit``.
    model_class : callable accepting ``n_clusters`` plus ``**kwargs`` and
        returning an object with a ``fit`` method.
    max_k : largest number of clusters to try.
    figsize, title, xlabel, ylabel : forwarded to matplotlib.
    grid : when true, draw a grid on the plot.
    **kwargs : extra keyword arguments for ``model_class`` (for example
        ``random_state``).

    Returns
    -------
    None. The figure is shown with ``plt.show()``; nothing is drawn when no
    inertia value could be collected.
    """
    fitted_ks = []
    inertia = []
    for k in range(1, max_k + 1):
        model = model_class(n_clusters=k, **kwargs)
        model.fit(X)
        if not hasattr(model, 'inertia_'):
            print(f"{model_class.__name__} doesn't have 'inertia_' for k={k}. Skipping...")
            break
        # Record k alongside its inertia so x and y always have equal length,
        # even when the sweep stops before reaching max_k.
        fitted_ks.append(k)
        inertia.append(model.inertia_)

    if not inertia:
        return

    plt.figure(figsize=figsize)
    plt.plot(fitted_ks, inertia, marker='o')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    if grid:
        plt.grid(True)
    plt.show()

In [None]:
# Seed K-Means so the elbow curve is reproducible between runs; extra keyword
# arguments are forwarded to the model constructor by plot_elbow_method.
plot_elbow_method(X=X, model_class=KMeans, max_k=10, random_state=random_state)

## K-Means method

In [None]:
# One accumulator per evaluation metric, each a separate empty list.
print('Initialising empty lists for storing evaluation metrics')
silhouette_scores, davies_bouldin_scores, ari_scores = [], [], []

### Loop over candidate cluster counts to find the optimal number of clusters

In [5]:
k_vals = range(2, 11)  # Candidate cluster counts: 2 to 10 inclusive

# Start from empty lists every time this cell runs. Appending to lists created
# in a different cell makes them grow on each re-run, leaving them longer than
# k_vals and breaking any plot that pairs the two.
silhouette_scores = []
davies_bouldin_scores = []
ari_scores = []

for k in k_vals:
    # Reuse the notebook-wide seed instead of repeating the literal value
    model = KMeans(n_clusters=k, random_state=random_state)
    labels = model.fit_predict(X)

    # Metrics computed from the data and the predicted labels only
    silhouette_scores.append(silhouette_score(X, labels))
    davies_bouldin_scores.append(davies_bouldin_score(X, labels))
    # Metric computed against the ground-truth labels `y`
    ari_scores.append(adjusted_rand_score(y, labels))

# NOTE: after the loop, `model` and `labels` refer to the last k tried (k=10).

In [None]:
# Sanity check: each metric list is expected to have one entry per k value,
# otherwise the metric-vs-k plots cannot pair them up.
print("\n".join(
    f"Length of {label}: {len(seq)}"
    for label, seq in (
        ("k_vals", k_vals),
        ("silhouette_scores", silhouette_scores),
        ("davies_bouldin_scores", davies_bouldin_scores),
        ("ari_scores", ari_scores),
    )
))

## Plot results

In [None]:
# One stacked panel per metric: (scores, line colour, title, y-axis label).
metric_panels = [
    (silhouette_scores, 'blue',
     'Silhouette Score vs. Number of Clusters (K-Means)',
     'Silhouette Score'),
    (davies_bouldin_scores, 'red',
     'Davies-Bouldin Index vs. Number of Clusters',
     'Davies-Bouldin Index'),
    (ari_scores, 'green',
     'Adjusted Rand Index vs. Number of Clusters',
     'Adjusted Rand Index'),
]

fig, axes = plt.subplots(3, 1, figsize=(10, 15))
for ax, (scores, line_colour, panel_title, y_label) in zip(axes, metric_panels):
    # Every panel shares the same x axis: the candidate cluster counts
    ax.plot(k_vals, scores, marker='o', color=line_colour)
    ax.set_title(panel_title)
    ax.set_xlabel('Number of clusters (K)')
    ax.set_ylabel(y_label)
    ax.grid(True)

plt.tight_layout()
plt.show()

## Silhouette plot

In [8]:
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt
import numpy as np

def silhouette_plot(X, labels, n_clusters,
                    title="Silhouette Plot for Clustering",
                    fig_size=(8,6),
                    x_lab='Coefficient Values',
                    y_lab='Cluster',
                    silh_colors=None,
                    x_axis_line_col='black'):
    """Draw a per-cluster silhouette plot for a clustering of ``X``.

    For each cluster index ``0 .. n_clusters - 1`` the silhouette values of
    the samples carrying that label are sorted and drawn as a horizontal
    filled band; bands are stacked vertically with a fixed gap. A dashed
    vertical line marks the overall ``silhouette_score``.

    Parameters
    ----------
    X : data passed to ``silhouette_samples`` / ``silhouette_score``.
    labels : cluster label for each sample (any array-like). Only labels in
        ``range(n_clusters)`` are drawn, so ``n_clusters`` should match the
        clustering that produced ``labels``.
    n_clusters : number of cluster indices to draw.
    title, x_lab, y_lab : plot title and axis labels.
    fig_size : figure size forwarded to ``plt.subplots``.
    silh_colors : sequence of colours, cycled when shorter than
        ``n_clusters``; defaults to matplotlib's ``tab10`` palette.
    x_axis_line_col : colour of the dashed mean-score line.

    Returns
    -------
    None. The figure is shown with ``plt.show()``.
    """
    # ``labels == i`` is only an element-wise mask on arrays; with a plain
    # list it collapses to a single bool and selects the wrong data.
    labels = np.asarray(labels)

    sil_vals = silhouette_samples(X, labels)
    mean_score = silhouette_score(X, labels)

    _, ax = plt.subplots(figsize=fig_size)

    # Use default colours if none are provided
    if silh_colors is None:
        silh_colors = plt.cm.tab10.colors

    band_gap = 10  # vertical spacing between consecutive cluster bands
    y_lower = band_gap
    for i in range(n_clusters):
        cluster_sil_vals = np.sort(sil_vals[labels == i])
        y_upper = y_lower + len(cluster_sil_vals)

        # Cycle through the palette when there are more clusters than colours
        colour = silh_colors[i % len(silh_colors)]

        ax.fill_betweenx(
            np.arange(y_lower, y_upper),
            0, cluster_sil_vals, alpha=0.7, color=colour
        )
        # Cluster index, placed left of zero at the band's vertical midpoint
        ax.text(-0.05, y_lower + 0.5 * len(cluster_sil_vals), str(i))

        y_lower = y_upper + band_gap

    ax.axvline(x=mean_score, color=x_axis_line_col, linestyle="--")
    ax.set_title(title)
    ax.set_xlabel(x_lab)
    ax.set_ylabel(y_lab)
    ax.set_yticks([])  # y positions are only stacking offsets
    plt.show()


In [None]:
# `labels` left over from the K sweep belongs to the last model fitted there
# (k=10), not to `n_clusters`. Fit a dedicated model so the labels handed to
# the plot really contain `n_clusters` clusters.
final_labels = KMeans(n_clusters=n_clusters,
                      random_state=random_state).fit_predict(X)
silhouette_plot(X, final_labels, n_clusters,
                silh_colors=['red', 'black', 'navy','orange'])