diff --git a/CHANGES b/CHANGES index b0ad6c3c..314acc97 100755 --- a/CHANGES +++ b/CHANGES @@ -5,6 +5,9 @@ CHANGE NOTES FOR 0.8.0 (STARTED Oct 23, 2017) ------------------------------------------------------------------------ GENERAL CHANGES: +- Optimization of python implementation of the K-Means algorithm using numpy (pyclustering.cluster.kmeans). + See: https://github.com/annoviko/pyclustering/issues/403 + - Implemented dynamic visualizer for oscillatory networks (pyclustering.nnet.dynamic_visualizer). See: no reference. diff --git a/pyclustering/cluster/examples/kmeans_examples.py b/pyclustering/cluster/examples/kmeans_examples.py index 9b281851..e5ef870c 100644 --- a/pyclustering/cluster/examples/kmeans_examples.py +++ b/pyclustering/cluster/examples/kmeans_examples.py @@ -34,7 +34,7 @@ from pyclustering.utils import timedcall; -def template_clustering(start_centers, path, tolerance = 0.25, ccore = True): +def template_clustering(start_centers, path, tolerance = 0.25, ccore = False): sample = read_sample(path); kmeans_instance = kmeans(sample, start_centers, tolerance, ccore); diff --git a/pyclustering/cluster/kmeans.py b/pyclustering/cluster/kmeans.py index caae5861..9ebe9d73 100755 --- a/pyclustering/cluster/kmeans.py +++ b/pyclustering/cluster/kmeans.py @@ -26,12 +26,12 @@ """ +import numpy; + import pyclustering.core.kmeans_wrapper as wrapper; from pyclustering.cluster.encoder import type_encoding; -from pyclustering.utils import euclidean_distance_sqrt, list_math_addition, list_math_division_number; - class kmeans: """! @@ -72,7 +72,7 @@ class kmeans: """ - def __init__(self, data, initial_centers, tolerance = 0.25, ccore = False): + def __init__(self, data, initial_centers, tolerance = 0.001, ccore = False): """! @brief Constructor of clustering algorithm K-Means. @details For initial centers initializer can be used, for example, K-Means++ method. @@ -85,9 +85,9 @@ def __init__(self, data, initial_centers, tolerance = 0.25, ccore = False): @see center_initializer """ - self.__pointer_data = data; + self.__pointer_data = numpy.matrix(data); self.__clusters = []; - self.__centers = initial_centers[:]; # initial centers shouldn't be chaged + self.__centers = numpy.matrix(initial_centers); self.__tolerance = tolerance; self.__ccore = ccore; @@ -108,7 +108,7 @@ def process(self): self.__clusters = wrapper.kmeans(self.__pointer_data, self.__centers, self.__tolerance); self.__centers = self.__update_centers(); else: - changes = float('inf'); + maximum_change = float('inf'); stop_condition = self.__tolerance * self.__tolerance; # Fast solution #stop_condition = self.__tolerance; # Slow solution @@ -117,13 +117,17 @@ def process(self): if (len(self.__pointer_data[0]) != len(self.__centers[0])): raise NameError('Dimension of the input data and dimension of the initial cluster centers must be equal.'); - while (changes > stop_condition): + while (maximum_change > stop_condition): self.__clusters = self.__update_clusters(); updated_centers = self.__update_centers(); # changes should be calculated before asignment - - #changes = max([euclidean_distance(self.__centers[index], updated_centers[index]) for index in range(len(self.__centers))]); # Slow solution - changes = max([euclidean_distance_sqrt(self.__centers[index], updated_centers[index]) for index in range(len(updated_centers))]); # Fast solution - + + if (len(self.__centers) != len(updated_centers)): + maximum_change = float('inf'); + + else: + changes = numpy.sum(numpy.square(self.__centers - updated_centers), axis=1); + maximum_change = numpy.max(changes); + self.__centers = updated_centers; @@ -148,7 +152,7 @@ def get_centers(self): """ - return self.__centers; + return self.__centers.tolist(); def get_cluster_encoding(self): @@ -172,22 +176,17 @@ def __update_clusters(self): """ - clusters = [[] for i in range(len(self.__centers))]; - for index_point in range(len(self.__pointer_data)): - index_optim = -1; - dist_optim = 0.0; - - for index in range(len(self.__centers)): - # dist = euclidean_distance(data[index_point], centers[index]); # Slow solution - dist = euclidean_distance_sqrt(self.__pointer_data[index_point], self.__centers[index]); # Fast solution - - if ( (dist < dist_optim) or (index is 0)): - index_optim = index; - dist_optim = dist; - - clusters[index_optim].append(index_point); + clusters = [[] for _ in range(len(self.__centers))]; + + dataset_differences = numpy.zeros((len(clusters), len(self.__pointer_data))); + for index_center in range(len(self.__centers)): + dataset_differences[index_center] = numpy.sum(numpy.square(self.__pointer_data - self.__centers[index_center]), axis=1).T; + + optimum_indexes = numpy.argmin(dataset_differences, axis=0); + for index_point in range(len(optimum_indexes)): + index_cluster = optimum_indexes[index_point]; + clusters[index_cluster].append(index_point); - # If cluster is not able to capture object it should be removed clusters = [cluster for cluster in clusters if len(cluster) > 0]; return clusters; @@ -197,18 +196,15 @@ def __update_centers(self): """! @brief Calculate centers of clusters in line with contained objects. - @return (list) Updated centers as list of centers. + @return (numpy.matrix) Updated centers as list of centers. """ - - centers = [[] for i in range(len(self.__clusters))]; - + + dimension = self.__pointer_data.shape[1]; + centers = numpy.zeros((len(self.__clusters), dimension)); + for index in range(len(self.__clusters)): - point_sum = [0] * len(self.__pointer_data[0]); - - for index_point in self.__clusters[index]: - point_sum = list_math_addition(point_sum, self.__pointer_data[index_point]); - - centers[index] = list_math_division_number(point_sum, len(self.__clusters[index])); - - return centers; + cluster_points = self.__pointer_data[self.__clusters[index], :]; + centers[index] = cluster_points.mean(axis=0); + + return numpy.matrix(centers);