In [9]:
import numpy as np
from scipy.spatial.distance import euclidean

# Seed the global NumPy generator so that the synthetic data (and any later
# draw from np.random, such as random initial centers) is identical on every
# Restart & Run All.
SEED = 0
np.random.seed(SEED)

# Three square blobs of 50 two-dimensional points each, laid out along the
# diagonal: [0, 5)^2, [5, 10)^2 and [10, 15)^2.
cluster1 = np.random.uniform(0, 5, size=(50, 2))
cluster2 = np.random.uniform(5, 10, size=(50, 2))
cluster3 = np.random.uniform(10, 15, size=(50, 2))

# Stack the blobs into a single (150, 2) array of points to cluster.
objs = np.concatenate((cluster1, cluster2, cluster3))


In [10]:
def kmeans(objs, dimension, clusters_number, centers=None, distance_fun=euclidean):
    """Cluster ``objs`` by repeating assign / re-center steps until the centers stop moving.

    Parameters
    ----------
    objs : sequence of points
        The points to cluster; each point must be accepted by ``distance_fun``.
    dimension : int
        Number of coordinates per center; only used to shape the randomly
        drawn initial centers.
    clusters_number : int
        Number of clusters to build.
    centers : array-like, optional
        Initial centers, one per cluster. When omitted, centers are drawn
        uniformly from the per-coordinate bounding box of ``objs`` and redrawn
        until every cluster receives at least one point.
    distance_fun : callable
        ``distance_fun(center, obj)`` returning the distance used for the
        nearest-center assignment.

    Returns
    -------
    (centers, clusters)
        ``centers`` is a NumPy array with one row per cluster; ``clusters`` is
        a list of ``clusters_number`` lists holding the points assigned to the
        matching center.

    Raises
    ------
    ValueError
        If ``centers`` is omitted and ``clusters_number`` is not between 1 and
        the number of distinct points (no random draw could ever fill every
        cluster, so the initialisation loop would never finish).
    """
    if centers is None:
        points = np.asarray(objs)
        # Identical points always land in the same cluster, so the number of
        # distinct points bounds the number of clusters that can be non-empty.
        distinct = len(np.unique(points, axis=0)) if len(points) else 0
        if not 1 <= clusters_number <= distinct:
            raise ValueError(
                "clusters_number must be between 1 and the number of distinct "
                "objects (%d), got %d" % (distinct, clusters_number)
            )
        # Draw inside the bounding box of the data (per coordinate) rather
        # than a cube spanning the global min/max of all coordinates.
        lower = np.min(points, axis=0)
        upper = np.max(points, axis=0)
        # avoid the situation when any cluster is empty
        while True:
            centers = np.random.uniform(lower, upper, size=(clusters_number, dimension))
            clusters = split_clusters(objs, clusters_number, centers, distance_fun=distance_fun)
            if all(len(cluster) > 0 for cluster in clusters):
                break
    else:
        centers = np.asarray(centers, dtype=float)
        clusters = split_clusters(objs, clusters_number, centers, distance_fun=distance_fun)

    while True:
        # clusters and centers are already initialized
        # An empty cluster keeps its previous center: the mean of no points is
        # NaN, which would poison the distances and make the equality check
        # below impossible to satisfy.
        new_centers = np.array([
            np.mean(cluster, axis=0) if len(cluster) > 0 else center
            for center, cluster in zip(centers, clusters)
        ])
        # Exact comparison is intentional: once the assignment stops changing,
        # the recomputed means are bit-for-bit identical.
        if np.array_equal(centers, new_centers):
            return centers, clusters
        centers = new_centers
        clusters = split_clusters(objs, clusters_number, centers, distance_fun=distance_fun)


def split_clusters(objs, clusters_number, centers, distance_fun=euclidean):
    """Group ``objs`` by their nearest center.

    Returns a list of ``clusters_number`` lists; the i-th list holds, in input
    order, every object whose smallest ``distance_fun(center, obj)`` is reached
    at ``centers[i]``. Ties go to the center with the lowest index.
    """
    buckets = [list() for _ in range(clusters_number)]
    for point in objs:
        gaps = []
        for anchor in centers:
            gaps.append(distance_fun(anchor, point))
        nearest = np.argmin(gaps)
        buckets[nearest].append(point)
    return buckets


In [11]:
# Run the hand-written k-means on the sample points (2 coordinates, 3 clusters)
# and score it by the total Euclidean distance of each point to its center.
result_centers, result_clusters = kmeans(objs, 2, 3)
errors = 0
for result_center, members in zip(result_centers, result_clusters):
    print(result_center)
    # Accumulate one distance at a time so the floating-point total is built
    # in point order.
    for obj in members:
        errors = errors + euclidean(obj, result_center)
print()
print("Custom alg: " + str(errors))


[ 12.58903336  12.31265332]
[ 7.52752875  7.17835485]
[ 2.14698725  2.36855705]

Custom alg: 276.5290312149306


In [12]:
from sklearn.cluster import KMeans
# Reference run: fit scikit-learn's KMeans on the same points, then rebuild the
# clusters with split_clusters so the score is computed the same way as for
# the hand-written implementation.
sk_kmeans = KMeans(n_clusters=3, random_state=0).fit(objs)
result_clusters = split_clusters(objs, 3, sk_kmeans.cluster_centers_, distance_fun=euclidean)
errors = 0
for result_center, members in zip(sk_kmeans.cluster_centers_, result_clusters):
    print(result_center)
    # Accumulate one distance at a time so the floating-point total is built
    # in point order.
    for obj in members:
        errors = errors + euclidean(obj, result_center)
print()
print("SKLearn alg: " + str(errors))


[ 2.14698725  2.36855705]
[ 12.58903336  12.31265332]
[ 7.52752875  7.17835485]

SKLearn alg: 276.5290312149307
