In [61]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import datasets
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
sns.set_theme()

In [28]:
# Load the wine dataset as a pandas-backed bunch and grab its combined frame
# (feature columns plus a 'target' column).
data = datasets.load_wine(as_frame=True)
df = data.frame

# Split the frame into the label vector and the feature matrix as NumPy arrays.
y = df['target'].to_numpy()
X = df.drop('target', axis=1).to_numpy()

In [20]:
def initialize_centers(X, k, random_state=None):
    """Pick k distinct rows of X to serve as the initial cluster centers.

    Parameters
    ----------
    X : array-like, indexed by sample along the first axis
        Data to draw the initial centers from.
    k : int
        Number of centers to draw; must satisfy 1 <= k <= len(X).
    random_state : None, int, or numpy.random.Generator, optional
        Seed or generator for a reproducible draw. When None (the default)
        the legacy global NumPy random state is used, so a prior
        ``np.random.seed(...)`` call still controls the result.

    Returns
    -------
    numpy.ndarray
        The k selected rows of X (sampled without replacement).

    Raises
    ------
    ValueError
        If k is smaller than 1 or larger than the number of samples.
    """
    X = np.asarray(X)
    n_samples = len(X)
    if not 1 <= k <= n_samples:
        raise ValueError(
            f"k must be between 1 and the number of samples ({n_samples}), got {k}"
        )

    if random_state is None:
        # Keep using the global state so existing seeding habits keep working.
        indices = np.random.choice(n_samples, size=k, replace=False)
    else:
        rng = np.random.default_rng(random_state)
        indices = rng.choice(n_samples, size=k, replace=False)
    return X[indices]

In [22]:

def assign_clusters(X, centers):
    """Return, for every row of X, the index of its nearest center.

    Nearness is measured with the Euclidean norm; when two centers are
    equally close the lower index wins (np.argmin semantics).
    """
    # Broadcast to a (points, centers, features) array of coordinate differences.
    offsets = X[:, np.newaxis, :] - centers[np.newaxis, :, :]
    # Collapse the feature axis into one distance per (point, center) pair.
    distances = np.linalg.norm(offsets, axis=-1)
    nearest = distances.argmin(axis=1)
    return nearest

In [23]:

def update_centers(X, labels, k, prev_centers=None):
    """Recompute each cluster center as the mean of the points assigned to it.

    Parameters
    ----------
    X : array-like, one row per sample
        The data points.
    labels : array-like of int, one entry per row of X
        Cluster index assigned to each point.
    k : int
        Number of clusters; centers are produced for indices 0..k-1.
    prev_centers : array-like with k rows, optional
        Centers from the previous iteration. If given, a cluster that received
        no points keeps its previous center.

    Returns
    -------
    numpy.ndarray
        Array with one row per cluster holding the updated centers.

    Notes
    -----
    A cluster with no assigned points would otherwise produce a NaN center
    (mean of an empty slice). Such a cluster falls back to ``prev_centers[j]``
    when available, and to the mean of all of X otherwise, so the result is
    always finite for finite, non-empty X.
    """
    X = np.asarray(X)
    labels = np.asarray(labels)

    new_centers = []
    for j in range(k):
        members = X[labels == j]
        if len(members) > 0:
            new_centers.append(members.mean(axis=0))
        elif prev_centers is not None:
            # Empty cluster: leave the center where it was.
            new_centers.append(np.asarray(prev_centers)[j])
        else:
            # Empty cluster and no history: use the overall data mean.
            new_centers.append(X.mean(axis=0))
    return np.array(new_centers)

In [47]:

def k_means_clustering(X, k, max_iter=100):
    """Cluster the rows of X into k groups with Lloyd's k-means iteration.

    Starting from centers chosen by ``initialize_centers``, the loop alternates
    between assigning every point to its nearest center and recomputing the
    centers, stopping early once the centers no longer move (``np.allclose``)
    or after ``max_iter`` iterations.

    Parameters
    ----------
    X : numpy.ndarray, one row per sample
        Data to cluster.
    k : int
        Number of clusters.
    max_iter : int, default 100
        Maximum number of assign/update iterations. With 0 the initial
        centers are returned together with the assignment they induce.

    Returns
    -------
    centers : numpy.ndarray
        Final cluster centers, one row per cluster.
    labels : numpy.ndarray
        Index of the nearest returned center for every row of X.
    """
    centers = initialize_centers(X, k)

    for _ in range(max_iter):
        labels = assign_clusters(X, centers)
        new_centers = update_centers(X, labels, k)

        # Stop as soon as an update leaves the centers (numerically) unchanged.
        if np.allclose(centers, new_centers):
            break
        centers = new_centers

    # Assign once more against the centers actually returned. This keeps the
    # labels consistent with the centers when the iteration budget runs out
    # (the in-loop labels then refer to the previous centers) and makes sure
    # labels is defined when max_iter is 0.
    labels = assign_clusters(X, centers)

    return centers, labels

In [60]:
# Run k-means with k=3 on the feature matrix.
centers, labels = k_means_clustering(X, 3)

# These metrics are label-invariant and measure how well clustering matches ground truth
print("Adjusted Rand Index:", adjusted_rand_score(y, labels))
print("Normalized Mutual Information:", normalized_mutual_info_score(y, labels))

Adjusted Rand Index: 0.33890518316180995
Normalized Mutual Information: 0.41339346113562453


The Adjusted Rand Index (ARI) has an upper bound of 1, which indicates a perfect match between the clusters and the true labels. A value near 0 suggests that the clusters are effectively random with respect to the true labels, and a negative value (scikit-learn bounds it below by -0.5) suggests agreement that is even worse than random. Normalized Mutual Information (NMI) measures a similar thing, namely the information shared between the true labels and the cluster assignments, on a scale from 0 to 1. With an ARI of about 0.34 and an NMI of about 0.41, the clustering is clearly better than random but agrees only moderately with the true labels. Because the initial centers are chosen at random, these scores can change from run to run.