## K-Means Clustering Experiment

In [1]:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
    

### Importing Libraries

In [2]:

# Fetch the Iris bunch once, then unpack the feature matrix and the class labels
data = load_iris()
X, y = data.data, data.target
    

We start by importing the necessary libraries.

In [3]:

# K-means clustering (Lloyd's algorithm)
def _nearest_centroid(X, centroids):
    """Return, for every row of X, the index of its closest centroid (Euclidean)."""
    # (k, 1, d) broadcast against (n, d) gives a (k, n, d) array of differences.
    distances = np.sqrt(((X - centroids[:, np.newaxis]) ** 2).sum(axis=2))
    return np.argmin(distances, axis=0)


def k_means(X, k, initial_centroids=None, max_iter=100, random_state=None):
    """Cluster the rows of ``X`` with k-means.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Data to cluster. It is converted to a float array internally.
    k : int
        Number of clusters. Only used when ``initial_centroids`` is None;
        otherwise the number of clusters is the number of rows of
        ``initial_centroids``.
    initial_centroids : array-like, shape (n_clusters, n_features), optional
        Starting centroids. When None, ``k`` distinct rows of ``X`` are drawn
        at random. The array passed in is copied, never modified.
    max_iter : int, default 100
        Maximum number of centroid updates.
    random_state : int, optional
        Seed for the random initialisation. When None, NumPy's global random
        state is used (so ``np.random.seed`` still controls the draw).

    Returns
    -------
    centroids : ndarray, shape (n_clusters, n_features)
        Final centroids.
    closest_cluster : ndarray of int, shape (n_samples,)
        Index of the nearest returned centroid for every sample.
    """
    X = np.asarray(X, dtype=float)

    if initial_centroids is None:
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        indices = rng.choice(X.shape[0], k, replace=False)
        centroids = X[indices]
    else:
        # Copy so that the caller's array can never be altered through our result.
        centroids = np.array(initial_centroids, dtype=float)

    # Assign once up front so that labels exist even when max_iter == 0.
    closest_cluster = _nearest_centroid(X, centroids)

    for _ in range(max_iter):
        # Update centroids. A cluster that lost all of its points keeps its
        # previous centroid: the mean of an empty slice is NaN, and a NaN
        # centroid would win every subsequent argmin and absorb all samples.
        new_centroids = centroids.copy()
        for cluster_idx in range(centroids.shape[0]):
            members = X[closest_cluster == cluster_idx]
            if members.shape[0] > 0:
                new_centroids[cluster_idx] = members.mean(axis=0)

        if np.array_equal(centroids, new_centroids):
            break
        centroids = new_centroids

        # Re-assign against the updated centroids so that the returned labels
        # always correspond to the returned centroids.
        closest_cluster = _nearest_centroid(X, centroids)

    return centroids, closest_cluster
    

### Load Data

In [4]:

# Run k-means for every combination of cluster count and initialisation method
def run_experiment(X, ks, initial_methods):
    """Run ``k_means`` for each ``k`` in ``ks`` and each initialisation method.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Data to cluster.
    ks : iterable of int
        Cluster counts to try.
    initial_methods : iterable of str
        Initialisation strategies. Supported values:
        ``"random"`` (k_means picks its own random starting centroids),
        ``"first_k"`` (the first ``k`` rows of ``X``) and
        ``"k_hierarchical"`` (means of the flat clusters cut from a Ward
        linkage of ``X``).

    Returns
    -------
    dict
        ``results[k][method] = (final_centroids, clusters, silhouette_score)``.

    Raises
    ------
    ValueError
        If ``initial_methods`` contains an unsupported method name.
    """
    from scipy.cluster.hierarchy import linkage, fcluster

    initial_methods = list(initial_methods)
    supported = ("random", "first_k", "k_hierarchical")
    unknown = [method for method in initial_methods if method not in supported]
    if unknown:
        # Fail loudly: otherwise an unknown name would reuse the centroids of
        # the previous method (or hit an unbound local on the first pass).
        raise ValueError(
            f"Unknown initialisation method(s) {unknown}; expected one of {supported}"
        )

    # The linkage depends only on X, so build it once instead of once per k.
    Z = linkage(X, 'ward') if "k_hierarchical" in initial_methods else None

    results = {}
    for k in ks:
        results[k] = {}
        for method in initial_methods:
            if method == "random":
                centroids = None
            elif method == "first_k":
                centroids = X[:k]
            else:  # "k_hierarchical"
                initial_clusters = fcluster(Z, k, criterion='maxclust')
                # 'maxclust' yields at most k flat clusters, so average only
                # over the labels that actually occur (no empty-slice NaNs).
                centroids = np.array([
                    X[initial_clusters == label].mean(axis=0)
                    for label in np.unique(initial_clusters)
                ])

            final_centroids, clusters = k_means(X, k, initial_centroids=centroids)
            score = silhouette_score(X, clusters)
            results[k][method] = (final_centroids, clusters, score)
    return results
    

Load the Iris dataset to apply K-Means clustering.

In [5]:

# Settings for the experiment
# Seed NumPy's global RNG: the "random" initialisation draws its starting
# centroids from it, so without a seed every run gives different results.
np.random.seed(42)
ks = [3, 5, 8]
initial_methods = ["random", "first_k", "k_hierarchical"]
results = run_experiment(X, ks, initial_methods)
results
    

{3: {'random': (array([[6.85384615, 3.07692308, 5.71538462, 2.05384615],
          [5.006     , 3.428     , 1.462     , 0.246     ],
          [5.88360656, 2.74098361, 4.38852459, 1.43442623]]),
   array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0,
          0, 0, 0, 2, 2, 0, 0, 0, 0, 2, 0, 2, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0,
          0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 2], dtype=int64),
   0.551191604619592),
  'first_k': (array([[6.85384615, 3.07692308, 5.71538462, 2.05384615],
          [5.88360656, 2.74098361, 4.38852459, 1.43442623],
          [5.006     , 3.428     , 1.462     , 0.246     ]]),
   array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,