#403: K-Means optimization using numpy.
annoviko committed Jan 19, 2018
1 parent 7f0d858 commit abe1f6f
Showing 3 changed files with 39 additions and 40 deletions.
3 changes: 3 additions & 0 deletions CHANGES
@@ -5,6 +5,9 @@ CHANGE NOTES FOR 0.8.0 (STARTED Oct 23, 2017)
------------------------------------------------------------------------

GENERAL CHANGES:
- Optimization of python implementation of the K-Means algorithm using numpy (pyclustering.cluster.kmeans).
See: https://github.com/annoviko/pyclustering/issues/403

- Implemented dynamic visualizer for oscillatory networks (pyclustering.nnet.dynamic_visualizer).
See: no reference.

2 changes: 1 addition & 1 deletion pyclustering/cluster/examples/kmeans_examples.py
@@ -34,7 +34,7 @@
from pyclustering.utils import timedcall;


def template_clustering(start_centers, path, tolerance = 0.25, ccore = True):
def template_clustering(start_centers, path, tolerance = 0.25, ccore = False):
sample = read_sample(path);

kmeans_instance = kmeans(sample, start_centers, tolerance, ccore);
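The example template above now passes ccore = False by default, so the numpy-based Python implementation changed in this commit is exercised instead of the C++ core. A minimal usage sketch of that path, with an inline toy sample instead of one of the bundled sample files read by read_sample() (data, centers, and expected output below are illustrative only; get_clusters() is part of the existing class, unchanged by this commit):

import numpy  # not required by the caller, only by the library internals

from pyclustering.cluster.kmeans import kmeans

# Toy 2-D sample: two well-separated groups (made-up data for illustration).
sample = [[1.0, 1.0], [1.2, 0.9], [0.8, 1.1],
          [5.0, 5.0], [5.1, 4.9], [4.9, 5.2]]
initial_centers = [[1.0, 1.0], [5.0, 5.0]]

# ccore=False forces the pure-Python (numpy) implementation.
kmeans_instance = kmeans(sample, initial_centers, tolerance=0.001, ccore=False)
kmeans_instance.process()

print(kmeans_instance.get_clusters())   # e.g. [[0, 1, 2], [3, 4, 5]]
print(kmeans_instance.get_centers())    # plain nested lists, see get_centers() below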
74 changes: 35 additions & 39 deletions pyclustering/cluster/kmeans.py
@@ -26,12 +26,12 @@
"""


import numpy;

import pyclustering.core.kmeans_wrapper as wrapper;

from pyclustering.cluster.encoder import type_encoding;

from pyclustering.utils import euclidean_distance_sqrt, list_math_addition, list_math_division_number;


class kmeans:
"""!
@@ -72,7 +72,7 @@ class kmeans:
"""

def __init__(self, data, initial_centers, tolerance = 0.25, ccore = False):
def __init__(self, data, initial_centers, tolerance = 0.001, ccore = False):
"""!
@brief Constructor of clustering algorithm K-Means.
@details For initial centers initializer can be used, for example, K-Means++ method.
@@ -85,9 +85,9 @@ def __init__(self, data, initial_centers, tolerance = 0.25, ccore = False):
@see center_initializer
"""
self.__pointer_data = data;
self.__pointer_data = numpy.matrix(data);
self.__clusters = [];
self.__centers = initial_centers[:]; # initial centers shouldn't be changed
self.__centers = numpy.matrix(initial_centers);
self.__tolerance = tolerance;

self.__ccore = ccore;
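Wrapping the input data and the initial centers in numpy.matrix up front is what makes the later arithmetic vectorizable: subtracting a single center from the whole data matrix broadcasts over rows, and a list of point indexes can slice out a cluster's rows in one call. A small standalone illustration of that behaviour (toy values, not part of the commit; numpy.array supports the same operations):

import numpy

data = numpy.matrix([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
center = numpy.matrix([[1.0, 1.0]])

# Broadcasting: the single center row is subtracted from every data row.
differences = data - center                                        # shape (3, 2)
squared_distances = numpy.sum(numpy.square(differences), axis=1)   # column matrix (3, 1)

# Fancy indexing: pick the rows belonging to one cluster in a single call.
cluster_rows = data[[0, 2], :]                   # rows 0 and 2
print(squared_distances)
print(cluster_rows.mean(axis=0))                 # per-dimension mean of those rows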
@@ -108,7 +108,7 @@ def process(self):
self.__clusters = wrapper.kmeans(self.__pointer_data, self.__centers, self.__tolerance);
self.__centers = self.__update_centers();
else:
changes = float('inf');
maximum_change = float('inf');

stop_condition = self.__tolerance * self.__tolerance; # Fast solution
#stop_condition = self.__tolerance; # Slow solution
@@ -117,13 +117,17 @@
if (len(self.__pointer_data[0]) != len(self.__centers[0])):
raise NameError('Dimension of the input data and dimension of the initial cluster centers must be equal.');

while (changes > stop_condition):
while (maximum_change > stop_condition):
self.__clusters = self.__update_clusters();
updated_centers = self.__update_centers(); # changes should be calculated before assignment

#changes = max([euclidean_distance(self.__centers[index], updated_centers[index]) for index in range(len(self.__centers))]); # Slow solution
changes = max([euclidean_distance_sqrt(self.__centers[index], updated_centers[index]) for index in range(len(updated_centers))]); # Fast solution


if (len(self.__centers) != len(updated_centers)):
maximum_change = float('inf');

else:
changes = numpy.sum(numpy.square(self.__centers - updated_centers), axis=1);
maximum_change = numpy.max(changes);

self.__centers = updated_centers;
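The convergence test above keeps the idea of the old code but vectorizes it: the squared shift of every center is computed in one expression and the largest shift is compared against tolerance squared (the "fast solution" stop_condition), while the separate branch guards against the case where empty clusters were dropped and the two center matrices no longer line up. A minimal standalone sketch of the check, with made-up center values:

import numpy

old_centers = numpy.matrix([[1.0, 1.0], [5.0, 5.0]])
new_centers = numpy.matrix([[1.1, 0.9], [5.0, 5.1]])
tolerance = 0.001

# Row-wise squared Euclidean shift of each center, then the largest one.
shifts = numpy.sum(numpy.square(old_centers - new_centers), axis=1)
maximum_change = numpy.max(shifts)

# Comparing squared shifts against tolerance**2 avoids taking square roots.
converged = maximum_change <= tolerance * tolerance
print(maximum_change, converged)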


@@ -148,7 +152,7 @@ def get_centers(self):
"""

return self.__centers;
return self.__centers.tolist();
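Because the centers are now stored as a numpy.matrix, get_centers() converts them back with tolist(), so callers still receive plain nested Python lists as before. For illustration (toy values only):

import numpy

centers = numpy.matrix([[1.0, 2.0], [3.0, 4.0]])
print(centers.tolist())   # [[1.0, 2.0], [3.0, 4.0]] - regular Python lists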


def get_cluster_encoding(self):
Expand All @@ -172,22 +176,17 @@ def __update_clusters(self):
"""

clusters = [[] for i in range(len(self.__centers))];
for index_point in range(len(self.__pointer_data)):
index_optim = -1;
dist_optim = 0.0;

for index in range(len(self.__centers)):
# dist = euclidean_distance(data[index_point], centers[index]); # Slow solution
dist = euclidean_distance_sqrt(self.__pointer_data[index_point], self.__centers[index]); # Fast solution

if ( (dist < dist_optim) or (index is 0)):
index_optim = index;
dist_optim = dist;

clusters[index_optim].append(index_point);
clusters = [[] for _ in range(len(self.__centers))];

dataset_differences = numpy.zeros((len(clusters), len(self.__pointer_data)));
for index_center in range(len(self.__centers)):
dataset_differences[index_center] = numpy.sum(numpy.square(self.__pointer_data - self.__centers[index_center]), axis=1).T;

optimum_indexes = numpy.argmin(dataset_differences, axis=0);
for index_point in range(len(optimum_indexes)):
index_cluster = optimum_indexes[index_point];
clusters[index_cluster].append(index_point);

# If cluster is not able to capture object it should be removed
clusters = [cluster for cluster in clusters if len(cluster) > 0];

return clusters;
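The rewritten __update_clusters() above replaces the per-point Python loop with one row of squared distances per center: dataset_differences becomes a (number of centers x number of points) array, numpy.argmin over axis 0 picks the nearest center for every point at once, and only the final scatter of point indexes into cluster lists (plus the removal of empty clusters) stays in plain Python. A small standalone sketch of the same technique (toy data, not part of the commit):

import numpy

points = numpy.matrix([[0.0, 0.0], [0.2, 0.1], [4.0, 4.0], [4.1, 3.9]])
centers = numpy.matrix([[0.0, 0.0], [4.0, 4.0]])

# One row of squared distances per center: shape (centers, points).
differences = numpy.zeros((len(centers), len(points)))
for index_center in range(len(centers)):
    differences[index_center] = numpy.sum(
        numpy.square(points - centers[index_center]), axis=1).T

# For every point, the index of its nearest center.
nearest = numpy.argmin(differences, axis=0)
print(nearest)   # e.g. [0 0 1 1]

clusters = [[] for _ in range(len(centers))]
for index_point, index_cluster in enumerate(nearest):
    clusters[index_cluster].append(index_point)
print(clusters)  # [[0, 1], [2, 3]]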
@@ -197,18 +196,15 @@ def __update_centers(self):
"""!
@brief Calculate centers of clusters in line with contained objects.
@return (list) Updated centers as list of centers.
@return (numpy.matrix) Updated centers as list of centers.
"""

centers = [[] for i in range(len(self.__clusters))];


dimension = self.__pointer_data.shape[1];
centers = numpy.zeros((len(self.__clusters), dimension));

for index in range(len(self.__clusters)):
point_sum = [0] * len(self.__pointer_data[0]);

for index_point in self.__clusters[index]:
point_sum = list_math_addition(point_sum, self.__pointer_data[index_point]);

centers[index] = list_math_division_number(point_sum, len(self.__clusters[index]));

return centers;
cluster_points = self.__pointer_data[self.__clusters[index], :];
centers[index] = cluster_points.mean(axis=0);

return numpy.matrix(centers);
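In the same way, __update_centers() above no longer accumulates coordinate sums in Python: the rows of all points belonging to a cluster are selected with one fancy-indexing call and averaged with mean(axis=0). A tiny standalone equivalent (toy values only):

import numpy

points = numpy.matrix([[0.0, 0.0], [0.2, 0.4], [4.0, 4.0]])
cluster = [0, 1]                       # indexes of the points in one cluster

cluster_points = points[cluster, :]    # rows 0 and 1 as a (2, 2) matrix
center = cluster_points.mean(axis=0)   # per-dimension mean -> [[0.1, 0.2]]
print(center)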
