In [7]:
import numpy as np
from sklearn.cluster import KMeans

In [3]:
# Number of clusters used throughout the notebook.
K = 3

# Synthetic 2-D data set made of three Gaussian blobs stacked row-wise:
#   150 points, std 0.75, centred on ( 1.0,  0.0)
#    50 points, std 0.25, centred on (-0.5,  0.5)
#    50 points, std 0.50, centred on (-0.5, -0.5)
# The samples come from NumPy's global RNG; no seed is set in this cell.
points = np.concatenate(
    [
        0.75 * np.random.randn(150, 2) + [1, 0],
        0.25 * np.random.randn(50, 2) + [-0.5, 0.5],
        0.5 * np.random.randn(50, 2) + [-0.5, -0.5],
    ],
    axis=0,
)

In [4]:
def euclidean_dist(x, y):
    """Euclidean (L2) distance between two points of any dimensionality.

    Parameters
    ----------
    x, y : sequence or numpy.ndarray
        Coordinates of the two points. They are paired index by index along
        their first axis; if one input is longer, its extra trailing
        coordinates are ignored (``zip`` stops at the shorter input).

    Returns
    -------
    The square root of the summed squared coordinate differences, as
    produced by ``np.sqrt``.
    """
    # Walk the coordinates pairwise instead of hard-coding indices 0 and 1, so
    # inputs with three or more coordinates are no longer silently truncated
    # to their first two components. For 2-D points the arithmetic is the
    # same two squared differences added together.
    squared_sum = sum((xi - yi) ** 2 for xi, yi in zip(x, y))
    return np.sqrt(squared_sum)

def cluster_euclidean_dist(cluster_point, centroids):
    """Distance from each point of one cluster to a single reference point.

    Parameters
    ----------
    cluster_point : iterable of points
        The points to measure, e.g. the members of one cluster.
    centroids : point
        Despite the plural name, this value is handed to ``euclidean_dist``
        as one point (the first argument of every call).

    Returns
    -------
    numpy.ndarray
        ``euclidean_dist(centroids, point)`` for every entry of
        ``cluster_point``, in iteration order.
    """
    distances = [euclidean_dist(centroids, member) for member in cluster_point]
    return np.array(distances)

def cost_fun(centroids, closest_cluster, K, data=None):
    """Total distance between every point and the centroid of its cluster.

    Parameters
    ----------
    centroids : indexable
        ``centroids[k]`` is the centroid of cluster ``k``.
    closest_cluster : array-like of int
        Cluster index assigned to each row of the data.
    K : int
        Number of clusters; clusters ``0 .. K-1`` are summed.
    data : numpy.ndarray, optional
        The points that ``closest_cluster`` labels, one per row. When omitted
        the notebook-level ``points`` array is used, which is what this
        function always read before the parameter existed.

    Returns
    -------
    The sum, over all clusters, of ``cluster_euclidean_dist`` between each
    cluster's members and its centroid.

    Raises
    ------
    ValueError
        From ``np.concatenate`` when ``K`` is 0 (nothing to concatenate).
    """
    if data is None:
        # Backward-compatible default: fall back to the notebook global so
        # existing three-argument calls behave exactly as before.
        data = points
    data = np.asarray(data)
    closest_cluster = np.asarray(closest_cluster)

    # One array of member-to-centroid distances per cluster; a cluster with
    # no members contributes an empty array.
    per_cluster = [
        cluster_euclidean_dist(data[closest_cluster == k], centroids[k])
        for k in range(K)
    ]
    return np.sum(np.concatenate(per_cluster, axis=None))

In [43]:
# TODO: Randomly select the seed and save it so that the result is reproducible
def _nearest_centroid(points, centroids):
    """For every point, return the index of and distance to its nearest centroid."""
    # (N, 1, D) - (1, k, D) broadcasts to one difference vector per
    # point/centroid pair.
    diff = points[:, np.newaxis, :] - centroids[np.newaxis, :, :]
    dist = np.sqrt(np.sum(diff ** 2, axis=2))
    labels = np.argmin(dist, axis=1)
    return labels, dist[np.arange(points.shape[0]), labels]


def _centroids_from_labels(points, labels, n_clusters, rng):
    """Average the members of each cluster; an empty cluster is re-seated on a random point."""
    centroids = np.empty((n_clusters, points.shape[1]), dtype=float)
    for k in range(n_clusters):
        members = points[labels == k]
        if len(members):
            centroids[k] = members.mean(axis=0)
        else:
            # The mean of an empty selection is NaN, which would poison every
            # later distance and the trial cost. Place the centroid on a
            # randomly chosen data point instead.
            centroids[k] = points[rng.randint(points.shape[0])]
    return centroids


def _run_trial(points, n_clusters, num_iter, rng):
    """Run one randomly initialised k-means trial; return (cost, labels, centroids)."""
    # Random-partition initialisation: label every point at random, then
    # take the mean of each group as its starting centroid.
    initial_labels = rng.randint(n_clusters, size=points.shape[0])
    centroids = _centroids_from_labels(points, initial_labels, n_clusters, rng)
    for _ in range(num_iter):
        # Assign each point to its nearest centroid ...
        closest_cluster, nearest_dist = _nearest_centroid(points, centroids)
        # ... then move every centroid to the mean of its new members.
        centroids = _centroids_from_labels(points, closest_cluster, n_clusters, rng)
    # Cost of the trial: summed point-to-centroid distance measured in the
    # last assignment step (i.e. before the final centroid update).
    return float(np.sum(nearest_dist)), closest_cluster, centroids


def K_means(points, n_clusters,  num_iter, num_trials):
    """Cluster ``points`` with k-means, keeping the best of several random starts.

    Each trial draws a seed in ``[0, 100)`` from NumPy's global RNG and runs
    on its own ``np.random.RandomState(seed)``, so the global RNG is advanced
    by the seed draws but never reseeded. The trial with the smallest cost
    (sum of distances from each point to its assigned centroid in the last
    iteration) wins; on ties the earliest trial is kept.

    Parameters
    ----------
    points : array-like, shape (N, D)
        Data to cluster, one point per row.
    n_clusters : int
        Number of clusters to form.
    num_iter : int
        Number of assign/update iterations per trial.
    num_trials : int
        Number of independent random initialisations to try.

    Returns
    -------
    closest_cluster : numpy.ndarray, shape (N,)
        Cluster index of every point from the last assignment step of the
        winning trial.
    centroids : numpy.ndarray, shape (n_clusters, D)
        Centroids of the winning trial after its final update step.

    Raises
    ------
    ValueError
        If ``points`` is not a non-empty 2-D array, or if ``n_clusters``,
        ``num_iter`` or ``num_trials`` is smaller than 1.
    """
    points = np.asarray(points, dtype=float)
    if points.ndim != 2 or points.shape[0] == 0:
        raise ValueError("points must be a non-empty 2-D array of shape (N, D)")
    if n_clusters < 1:
        raise ValueError("n_clusters must be at least 1")
    if num_iter < 1 or num_trials < 1:
        raise ValueError("num_iter and num_trials must both be at least 1")

    best_trial = None
    for _ in range(num_trials):
        seed = np.random.randint(100)
        # A private generator keeps the trial reproducible from its seed
        # without overwriting the caller's global random state.
        rng = np.random.RandomState(seed)
        trial = _run_trial(points, n_clusters, num_iter, rng)
        # Keep the winning result directly instead of re-running its seed.
        if best_trial is None or trial[0] < best_trial[0]:
            best_trial = trial

    _, closest_cluster, centroids = best_trial
    return closest_cluster, centroids


In [57]:
# Cluster the synthetic points with the hand-written K_means: K clusters,
# 10 assign/update iterations per trial, best of 10 random initialisations.
# Using the notebook constant K (instead of a repeated literal 3) keeps the
# requested cluster count in one place.
k_mean_result = K_means(points, n_clusters=K, num_iter=10, num_trials=10)

In [58]:
# Per-point cluster labels: the first element of the tuple returned by K_means.
k_mean_result[0]

array([2, 2, 0, 1, 1, 2, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 1, 1, 1, 0, 1, 1,
       1, 1, 2, 1, 1, 1, 2, 1, 1, 2, 2, 2, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1,
       1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, 1,
       2, 2, 2, 2, 2, 0, 2, 0, 1, 1, 1, 2, 2, 2, 2, 1, 2, 1, 1, 1, 2, 1,
       1, 1, 0, 2, 2, 2, 1, 1, 0, 2, 1, 0, 2, 0, 1, 2, 1, 2, 2, 1, 0, 2,
       1, 1, 1, 2, 2, 1, 0, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2,
       1, 2, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 2, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 2, 0, 2, 2, 0, 2, 0,
       0, 2, 0, 0, 2, 2, 0, 0, 2, 2, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 2, 2,
       0, 0, 0, 0, 0, 0, 2, 0])

In [53]:
# Reference clustering from scikit-learn, fitted on the same points with the
# same number of clusters (the notebook constant K rather than a literal 3).
kmeans = KMeans(n_clusters=K).fit(points)

In [56]:
# Per-point cluster labels of the fitted scikit-learn model. Label ids are
# arbitrary per run, so compare groupings rather than the raw numbers.
kmeans.labels_

array([0, 0, 1, 0, 2, 1, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 1, 2, 0,
       2, 2, 0, 2, 0, 2, 0, 2, 2, 0, 0, 0, 2, 2, 0, 2, 2, 0, 2, 0, 2, 2,
       2, 1, 2, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 0, 2, 0, 1, 2, 2, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 2, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 2,
       2, 2, 1, 0, 1, 0, 2, 0, 1, 0, 2, 1, 0, 2, 2, 0, 2, 1, 0, 2, 1, 0,
       2, 0, 2, 0, 0, 2, 1, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 0, 2, 0,
       2, 0, 2, 1, 1, 1, 1, 2, 2, 1, 0, 2, 2, 1, 0, 1, 2, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)