In [None]:
import numpy as np
from time import time
from lloyd import update_lloyd
from utils import calc_sq_distances, fill_empty_clusters
from kernels import build_kernel_matrix
from quality import calc_silhouettes
from elkan import update_elkan, start_elkan


In [None]:
from kernel_kmeans import KKMeans

In [None]:
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs, make_circles
from sklearn.cluster import KMeans

In [None]:
def visualize(data, labels):
    """Scatter-plot a 1-, 2- or 3-feature dataset, coloured by ``labels``.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features) or (n_samples,)
        Points to plot. Anything ``np.asarray`` accepts is allowed; a flat
        1-D input is treated as a single feature.
    labels : array-like
        Forwarded unchanged to matplotlib as the ``c`` colour argument.

    Raises
    ------
    ValueError
        If ``data`` is not 1-D or 2-D, has no feature columns, or has more
        than three feature columns. ``ValueError`` is a subclass of
        ``Exception``, so callers catching ``Exception`` keep working.
    """
    # Normalise first so lists and flat vectors work, and an empty dataset
    # no longer fails on ``data[0]``.
    points = np.asarray(data)
    if points.ndim == 1:
        points = points.reshape(-1, 1)
    if points.ndim != 2:
        raise ValueError("Data must be a 1-D or 2-D array for visualization")

    n_features = points.shape[1]
    if n_features > 3:
        raise ValueError("Dimensionality is too high for visualization")
    if n_features == 0:
        # Previously this case fell through every branch and drew nothing.
        raise ValueError("Data has no features to visualize")

    if n_features == 1:
        # One feature: place every point on the horizontal line y = 0.
        plt.scatter(points[:, 0], np.zeros(len(points)), c=labels)
    elif n_features == 2:
        plt.scatter(points[:, 0], points[:, 1], c=labels)
    else:
        # Three features need an explicit 3-D axes object.
        fig = plt.figure()
        ax = fig.add_subplot(projection="3d")
        ax.scatter(points[:, 0], points[:, 1], points[:, 2], c=labels)

In [None]:
# Equivalence check: for each seed, fit the same random-initialised model with
# the "lloyd" and the "elkan" algorithm and require identical labels and
# (numerically) identical quality. `data`/`labels` from the last seed stay in
# the namespace and are reused by later cells.
n_samples = 500
n_features = 3
n_clusters = 300
iters = 20
for seed in range(iters):
    data, labels = make_blobs(n_samples, n_features, centers=n_clusters, random_state=seed)
    kkm = KKMeans(n_clusters=n_clusters, init="random", algorithm="lloyd", rng=seed)
    kkme = KKMeans(n_clusters=n_clusters, init="random", algorithm="elkan", rng=seed)
    kkm.fit(data)
    kkme.fit(data)
    # array_equal also fails cleanly on a shape mismatch; the message names
    # the failing seed so a regression can be reproduced directly.
    assert np.array_equal(kkm.labels_, kkme.labels_), f"labels differ for seed {seed}"
    # The original repeated this quality assertion twice verbatim; once is enough.
    assert np.isclose(kkm.quality_, kkme.quality_), f"quality differs for seed {seed}"



In [None]:
# Lloyd-algorithm model with kmeans++ initialisation, scored by inertia.
kkm = KKMeans(
    n_clusters=100,
    init="kmeans++",
    n_init=10,
    max_iter=100,
    algorithm="lloyd",
    verbose=False,
    rng=0,
    q_metric="inertia",
)

In [None]:
# Fit the model configured in the previous cell on `data`, i.e. the blobs
# left over from the last seed of the Lloyd/Elkan comparison loop.
# NOTE(review): `data` was generated with 300 centers while this model asks
# for 100 clusters - confirm the mismatch is intentional.
kkm.fit(data)

In [None]:
# scikit-learn baseline fitted on the same `data`, for comparison with KKMeans.
from sklearn.cluster import KMeans
# random_state pins the random initialisation so the inertia shown in the next
# cell is reproducible across Restart & Run All, like the seeded KKMeans runs.
# NOTE(review): uses n_clusters (300) from the comparison cell, whereas the
# KKMeans model fitted just above uses 100 clusters - confirm which is intended.
km = KMeans(n_clusters=n_clusters, init="random", random_state=0)
km.fit(data)

In [None]:
# Display the inertia of the fitted scikit-learn baseline (per the scikit-learn
# docs: sum of squared distances of samples to their closest cluster centre).
km.inertia_

In [None]:
# Linear-kernel model with 5 clusters, selecting the best of 3 initialisations
# by the "silhouette" quality metric.
kkm = KKMeans(
    n_clusters=5,
    kernel="linear",
    init="kmeans++",
    n_init=3,
    algorithm="lloyd",
    q_metric="silhouette",
    tol=1e-4,
    rng=0,
    verbose=False,
)

In [None]:
# The original cell used `x` before any cell had defined it (the first
# assignment is in the make_circles cell further down), so a fresh
# Restart & Run All stopped here with a NameError.
# NOTE(review): the dataset originally used here is not recoverable from the
# notebook; five 2-D blobs are assumed to match n_clusters=5 - confirm.
x = make_blobs(n_samples=1000, n_features=2, centers=5, random_state=0)[0]

# Time a single fit, then plot the resulting labels.
start = time()
kkm.fit(x)
end = time()
print(end - start)
visualize(x, kkm.labels_)

In [None]:
# 7000 points from scikit-learn's make_circles generator (seeded); `l` holds
# the labels returned alongside the points.
x, l = make_circles(n_samples=7000, random_state=0, factor=0.4)

In [None]:
# RBF-kernel model with 2 clusters for the circles data, best of 5
# initialisations by inertia, with verbose output enabled.
kkm = KKMeans(
    n_clusters=2,
    kernel="rbf",
    variance=0.4,
    init="kmeans++",
    n_init=5,
    algorithm="lloyd",
    q_metric="inertia",
    tol=1e-4,
    rng=0,
    verbose=True,
)

In [None]:
# Time one fit of the RBF-kernel model on `x`, print the elapsed seconds,
# then plot the labels the fit produced.
start = time()
kkm.fit(x)
end = time()
elapsed_seconds = end - start
print(elapsed_seconds)
visualize(data=x, labels=kkm.labels_)

In [None]:
# Plot the labels that predict() assigns to the same points the model was
# fitted on, for visual comparison with the labels_ plot above.
visualize(x, kkm.predict(x))

In [None]:
# Displays True when predict() on the training data reproduces every entry of
# the labels_ stored by fit().
all(kkm.predict(x) == kkm.labels_)

In [None]:
# Larger 2-D blobs dataset: cluster count, sample count and generator seed.
n_c, size, r_s = 75, 10000, 0
x, labels, centers = make_blobs(
    n_samples=size,
    n_features=2,
    centers=n_c,
    random_state=r_s,
    return_centers=True,
)

In [None]:
# Two models that differ only in the `algorithm` argument, so their fits can be
# compared directly in the cells below.
seed, tol, inits, initmethod = 0, 1e-4, 3, "kmeans++"
shared_config = dict(
    n_clusters=n_c,
    verbose=True,
    init=initmethod,
    kernel="linear",
    rng=seed,
    tol=tol,
    n_init=inits,
    q_metric="inertia",
)
kkml = KKMeans(algorithm="lloyd", **shared_config)
kkme = KKMeans(algorithm="elkan", **shared_config)

In [None]:
# Time one fit of the "lloyd" model on `x`, print the elapsed seconds and plot
# its labels.
start = time()
kkml.fit(x)
end = time()
lloyd_seconds = end - start
print(lloyd_seconds)
visualize(data=x, labels=kkml.labels_)

In [None]:
# Time one fit of the "elkan" model on the same `x`, print the elapsed seconds
# and plot its labels.
start = time()
kkme.fit(x)
end = time()
elkan_seconds = end - start
print(elkan_seconds)
visualize(data=x, labels=kkme.labels_)

In [None]:
# Prints True when the "elkan" and "lloyd" fits assigned the same label to
# every sample.
print(all(kkme.labels_ == kkml.labels_))

In [None]:
# Benchmark: over several seeded datasets, time a "lloyd" fit and an "elkan"
# fit of otherwise identical models, and require that both produce the same
# labels and quality.

# Named n_runs rather than `iter` so the builtin iter() is not shadowed for the
# rest of the kernel session.
n_runs = 20
elkan_store = []
lloyd_store = []

n_c = 200
size = 2000

tol = 1e-4
inits = 3
initmethod = "kmeans++"
verb = False
max_iter = 100

for i in range(n_runs):
    print(i)
    x, labels, centers = make_blobs(size, centers=n_c, return_centers=True, random_state=i, n_features=2)

    # Everything except `algorithm` is shared, so the two fits are comparable.
    shared_config = dict(
        n_clusters=n_c,
        verbose=verb,
        init=initmethod,
        kernel="linear",
        rng=i,
        tol=tol,
        n_init=inits,
        q_metric="inertia",
        max_iter=max_iter,
    )
    kkml = KKMeans(algorithm="lloyd", **shared_config)
    kkme = KKMeans(algorithm="elkan", **shared_config)

    start = time()
    kkml.fit(x)
    end = time()
    lloyd_store.append(end - start)

    start = time()
    kkme.fit(x)
    end = time()
    elkan_store.append(end - start)

    # Name the failing run so a mismatch can be reproduced from its seed.
    assert np.array_equal(kkml.labels_, kkme.labels_), f"labels differ for run {i}"
    assert np.isclose(kkml.quality_, kkme.quality_), f"quality differs for run {i}"

# Mean fit time in seconds: elkan first, then lloyd.
print(sum(elkan_store)/len(elkan_store))
print(sum(lloyd_store)/len(lloyd_store))

In [None]:
# Mean fit time in seconds across the benchmark runs: elkan first, then lloyd.
for timings in (elkan_store, lloyd_store):
    print(sum(timings) / len(timings))

In [None]:
# Plot the "lloyd" labels for `x`; when cells are run in order, both come from
# the final iteration of the benchmark loop.
visualize(x, kkml.labels_)

In [None]:
# Plot the same points coloured by the labels make_blobs returned, for visual
# comparison with the clustering above.
visualize(x, labels)

In [None]:
# Print the built-in help() text for the KKMeans class.
help(KKMeans)