In [1]:
import numpy as np

In [2]:
'''
Steps:
1. Initialize the centroids by randomly selecting a subset of the data points
2. Calculate distance of each data point from the centroids and assign the nearest centroid to the data point
3. Update the centroid as the mean of the points associated to that centroid
4. Repeat steps 2 and 3 until convergence (the centroids move by less than a threshold) or until a set number of iterations is reached
'''

'\nSteps:\n1. Initialize the centroids by randomly selecting a subset of the data points\n2. Calculate distance of each data point from the centroids and assign the nearest centroid to the data point\n3. Update the centroid as the mean of the points associated to that centroid\n4. Repeat steps 2 and 3 until convergence (the centroids move by less than a threshold) or until a set number of iterations is reached\n'

In [54]:
# Initialize the centroids

def initializeCentroids(X, k, rng=None) -> np.ndarray:
    '''
    Pick k distinct rows of X, uniformly at random, as the starting centroids.

    Input: X   -> input data points, array-like of shape (n_samples, n_features)
           k   -> number of clusters (must not exceed n_samples)
           rng -> optional np.random.Generator used for the draw; when None the
                  legacy global np.random state is used (original behaviour)

    Output: an array of size (k, X.shape[1]) holding the selected rows of X

    Raises: ValueError if k is larger than the number of data points
    '''
    # Accept plain Python lists as well as ndarrays.
    X = np.asarray(X)
    n_samples = X.shape[0]

    if k > n_samples:
        raise ValueError(
            f"Cannot pick k={k} distinct centroids from only {n_samples} data points"
        )

    # Both np.random and np.random.Generator expose choice(a, size, replace=...).
    sampler = np.random if rng is None else rng
    # replace=False guarantees the same row is never selected twice.
    indices = sampler.choice(n_samples, k, replace=False)
    return X[indices]

In [55]:
# Toy dataset of six 2-D points; X is reused by the later demo cells.
X = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [4, 4], [4, 0]])
# Number of clusters requested for this demo.
k = 2
# Draw k rows of X as starting centroids. The draw is random and unseeded,
# so the printed rows can differ from run to run.
centroids = initializeCentroids(X, k)
print("Initial centroids:\n", centroids)

Initial centroids:
 [[1 2]
 [4 4]]


In [56]:
#Assign clusters

def assignCluster(X,centroids):
    '''
    Assign every data point to its nearest centroid (Euclidean distance).

    Input: X -> input data, array-like of shape (n_samples, n_features)
           centroids -> centroids, array-like of shape (k, n_features)

    Output: integer array of shape (n_samples,); entry i is the row index in
            `centroids` of the centroid closest to X[i]. Ties go to the lowest
            centroid index (np.argmin returns the first minimum). An X of
            shape (0, n_features) yields an empty array rather than None.
    '''
    X = np.asarray(X)
    centroids = np.asarray(centroids)

    # Broadcasting (n, d) against (k, 1, d) yields every point-to-centroid
    # difference at once, as an array of shape (k, n, d). No Python loop over
    # the points is needed.
    diff = X - centroids[:, np.newaxis]
    distance = np.sqrt((diff ** 2).sum(axis=2))  # shape (k, n)

    # For each point (column), pick the centroid (row) with the smallest distance.
    return np.argmin(distance, axis=0)

In [57]:
# Label each point of X with the index of its nearest initial centroid.
assignCluster(X,centroids)

array([0, 0, 0, 1, 1, 0])

In [58]:
# Update each centroid to the mean of the points assigned to its cluster
def updateCentroids(X, labels, k, centroids=None):
    """Update the centroids as the mean of the assigned data points.

    Input: X         -> input data points, array-like of shape (n_samples, n_features)
           labels    -> cluster index (0 .. k-1) of every data point, length n_samples
           k         -> number of clusters
           centroids -> optional previous centroids of shape (k, n_features),
                        used as the fallback for clusters with no points

    Output: an array of shape (k, n_features). Row i is the mean of the points
            labelled i. If no point carries label i, row i is the previous
            centroid when `centroids` is given, otherwise a row of NaN.
    """
    X = np.asarray(X)
    # Needed so that `labels == i` is an element-wise mask even for plain lists.
    labels = np.asarray(labels)

    rows = []
    for i in range(k):
        members = X[labels == i]
        if len(members):
            rows.append(members.mean(axis=0))
        elif centroids is not None:
            # Empty cluster: keep the centroid where it was instead of losing it.
            rows.append(np.asarray(centroids[i], dtype=float))
        else:
            # Empty cluster with no fallback: mark it explicitly with NaN,
            # avoiding the "mean of empty slice" RuntimeWarning.
            rows.append(np.full(X.shape[1:], np.nan))

    new_centroids = np.array(rows)
    return new_centroids

In [61]:
def kmeans(X, k, max_iters=100, tol=1e-4):
    """Perform K-means clustering on the dataset X.

    Input: X         -> input data points, array of shape (n_samples, n_features)
           k         -> number of clusters
           max_iters -> maximum number of assign/update rounds
           tol       -> convergence threshold: iteration stops once no centroid
                        coordinate moves by tol or more in a round

    Output: (centroids, labels)
            centroids -> array of shape (k, n_features)
            labels    -> array of shape (n_samples,), the index of the nearest
                         returned centroid for every data point
    """
    centroids = initializeCentroids(X, k)
    for _ in range(max_iters):
        labels = assignCluster(X, centroids)
        new_centroids = updateCentroids(X, labels, k)

        # A cluster that received no points has a NaN mean. Keep its previous
        # position instead; otherwise the NaN makes the convergence test below
        # fail forever and ends up in the returned centroids.
        empty = np.isnan(new_centroids)
        if empty.any():
            new_centroids = np.where(empty, centroids, new_centroids)

        # Check for convergence (if centroids do not change)
        converged = np.all(np.abs(new_centroids - centroids) < tol)
        centroids = new_centroids
        if converged:
            break

    # Recompute the labels against the centroids actually returned. This keeps
    # the pair consistent after the last update and also defines `labels`
    # when max_iters is 0 and the loop body never runs.
    labels = assignCluster(X, centroids)
    return centroids, labels

In [62]:
# Cluster the toy data into 3 groups; the result is a (centroids, labels) tuple.
# Initialization is random, so the clusters found can differ between runs.
kmeans(X,3, max_iters=100)

(array([[4., 4.],
        [4., 1.],
        [1., 2.]]),
 array([2, 2, 2, 1, 0, 1]))