Skip to content

Commit

Permalink
#622: Supported 'distance_matrix' data type for K-Means++.
Browse files Browse the repository at this point in the history
  • Loading branch information
annoviko committed Feb 10, 2021
1 parent 0052ad0 commit a475f11
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 30 deletions.
3 changes: 3 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ CHANGE NOTES FOR 0.11.0 (STARTED Nov 26, 2020), (RELEASED: -)

GENERAL CHANGES:

- Added support for the `distance_matrix` data type in K-Means++ (Python: `pyclustering.cluster.center_initializer`).
See: https://github.com/annoviko/pyclustering/issues/622

- Introduced PAM BUILD algorithm to generate initial medoids (Python: `pyclustering.cluster.kmedoids`, C++: `pyclustering::clst::pam_build`).
See: https://github.com/annoviko/pyclustering/issues/667

Expand Down
2 changes: 1 addition & 1 deletion ccore/include/pyclustering/cluster/pam_build.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ class pam_build {
@param[in] p_amount: amount of medoids that should be initialized.
*/
pam_build(const std::size_t p_amount);
explicit pam_build(const std::size_t p_amount);

/*
Expand Down
17 changes: 14 additions & 3 deletions pyclustering/cluster/center_initializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
import random
import warnings

from pyclustering.utils.metric import distance_metric, type_metric


class random_center_initializer:
"""!
Expand Down Expand Up @@ -171,11 +173,12 @@ def __init__(self, data, amount_centers, amount_candidates=None, **kwargs):
@param[in] amount_candidates (uint): Amount of candidates that are considered as a center. If the farthest points
(with the highest probability) should be considered as centers, then the special constant
'FARTHEST_CENTER_CANDIDATE' should be used. By default the amount of candidates is 3.
@param[in] **kwargs: Arbitrary keyword arguments (available arguments: `random_state`, `data_type`).
@param[in] **kwargs: Arbitrary keyword arguments (available arguments: `random_state`, `data_type`, `metric`).
<b>Keyword Args:</b><br>
- random_state (int): Seed for random state (by default is `None`, current system time is used).
- data_type (str): Data type of input sample `data` (`points`, `distance_matrix`).
- metric (distance_metric): Metric that is used for distance calculation between two points.
@see FARTHEST_CENTER_CANDIDATE
Expand All @@ -192,9 +195,14 @@ def __init__(self, data, amount_centers, amount_candidates=None, **kwargs):
else:
self.__candidates = amount_candidates

random.seed(kwargs.get('random_state', None))
random_seed = kwargs.get('random_state', None)
numpy.random.seed(random_seed)
random.seed(random_seed)

self.__metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
self.__data_type = kwargs.get('data_type', 'points')

self.__metric.enable_numpy_usage()
self.__check_parameters()


Expand Down Expand Up @@ -231,7 +239,10 @@ def __calculate_shortest_distances(self, data, centers):
for index_center in range(len(centers)):
center = data[centers[index_center]]

dataset_differences[index_center] = numpy.sum(numpy.square(data - center), axis=1).T
if self.__data_type == 'points':
dataset_differences[index_center] = self.__metric(data, center)
elif self.__data_type == 'distance_matrix':
dataset_differences[index_center] = numpy.array(self.__data[centers[index_center]])

with warnings.catch_warnings():
numpy.warnings.filterwarnings('ignore', r'All-NaN (slice|axis) encountered')
Expand Down
64 changes: 38 additions & 26 deletions pyclustering/cluster/tests/unit/ut_center_initializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

from pyclustering.samples.definitions import SIMPLE_SAMPLES

from pyclustering.utils import read_sample
from pyclustering.utils import read_sample, calculate_distance_matrix, type_metric, distance_metric

from pyclustering.tests.assertion import assertion

Expand Down Expand Up @@ -271,37 +271,49 @@ def testKmeansPlusPlusUniqueCentersSeveralCandidatesSimple02(self):
self.templateKmeansPlusPlusUnique(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, 23, 10)


def templateKmeansPlusPlusSeveralRuns(self, path_sample, amount, candidates):
sample = read_sample(path_sample)
def template_compare_output(self, path, k, candidates, random_state, metric):
sample = read_sample(path)
matrix = calculate_distance_matrix(sample, metric=metric)

attempts = 10
for _ in range(attempts):
medoids = kmeans_plusplus_initializer(sample, amount, candidates).initialize(return_index=True)
medoids += kmeans_plusplus_initializer(sample, amount, candidates).initialize(return_index=True)
medoids += kmeans_plusplus_initializer(sample, amount, candidates).initialize(return_index=True)
result1 = kmeans_plusplus_initializer(sample, k, candidates, random_state=random_state, data_type='points', metric=metric).initialize(return_index=True)
result2 = kmeans_plusplus_initializer(matrix, k, candidates, random_state=random_state, data_type='distance_matrix', metric=metric).initialize(return_index=True)

unique_medoids = set(medoids)
if len(unique_medoids) != len(medoids):
continue
assertion.eq(result1, result2)

def test_various_data_type_simple1(self):
self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 1, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 3, 1, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 4, 1, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))

self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 3, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 3, 3, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 4, 3, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))

def test_various_data_type_simple1_euclidean(self):
self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 1, 1000, distance_metric(type_metric.EUCLIDEAN))

return
def test_various_data_type_simple1_euclidean_square(self):
self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 1, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))

self.assertTrue(False, "K-Means++ does not return unique medoids during %d attempts." % attempts)
def test_various_data_type_simple1_euclidean_manhattan(self):
self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 1, 1000, distance_metric(type_metric.MANHATTAN))

def templateKmeansPlusPlusVariousCentersSimple01(self):
self.templateKmeansPlusPlusSeveralRuns(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 1)
self.templateKmeansPlusPlusSeveralRuns(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 5, 1)
def test_various_data_type_simple1_euclidean_chebyshev(self):
self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 1, 1000, distance_metric(type_metric.CHEBYSHEV))

def templateKmeansPlusPlusVariousCentersSeveralCandidatesSimple01(self):
self.templateKmeansPlusPlusSeveralRuns(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 3)
def test_various_data_type_simple2(self):
self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, 3, 1, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, 3, 5, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))

def templateKmeansPlusPlusVariousCentersFarthestCandidatesSimple01(self):
self.templateKmeansPlusPlusSeveralRuns(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 'farthest')
def test_various_data_type_simple3(self):
self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE3, 4, 1, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE3, 4, 5, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))

def templateKmeansPlusPlusVariousCentersSimple02(self):
self.templateKmeansPlusPlusSeveralRuns(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, 3, 1)
self.templateKmeansPlusPlusSeveralRuns(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, 6, 1)
def test_various_data_type_simple4(self):
self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE4, 5, 1, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE4, 5, 2, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE4, 5, 5, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))

def templateKmeansPlusPlusVariousCentersSimple03(self):
self.templateKmeansPlusPlusSeveralRuns(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, 4, 1)
self.templateKmeansPlusPlusSeveralRuns(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, 8, 1)
def test_various_data_type_simple5(self):
for i in range(10):
self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE5, i + 2, 1, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))

0 comments on commit a475f11

Please sign in to comment.