#622: Supported 'distance_matrix' data type for K-Means++.

annoviko · Feb 10, 2021 · a475f11 · a475f11
1 parent 0052ad0
commit a475f11
Show file tree

Hide file tree

Showing 4 changed files with 56 additions and 30 deletions.
diff --git a/CHANGES b/CHANGES
@@ -6,6 +6,9 @@ CHANGE NOTES FOR 0.11.0 (STARTED Nov 26, 2020), (RELEASED: -)
 
 GENERAL CHANGES:
 
+- Added support for the `distance_matrix` data type in K-Means++ (Python: `pyclustering.cluster.center_initializer`).
+  See: https://github.com/annoviko/pyclustering/issues/622
+
 - Introduced PAM BUILD algorithm to generate initial medoids (Python: `pyclustering.cluster.kmedoids`, C++: `pyclustering::clst::pam_build`).
   See: https://github.com/annoviko/pyclustering/issues/667
 

diff --git a/ccore/include/pyclustering/cluster/pam_build.hpp b/ccore/include/pyclustering/cluster/pam_build.hpp
@@ -106,7 +106,7 @@ class pam_build {
     @param[in] p_amount: amount of medoids that should be initialized.
 
     */
-    pam_build(const std::size_t p_amount);
+    explicit pam_build(const std::size_t p_amount);
 
     /*
 

diff --git a/pyclustering/cluster/center_initializer.py b/pyclustering/cluster/center_initializer.py
@@ -16,6 +16,8 @@
 import random
 import warnings
 
+from pyclustering.utils.metric import distance_metric, type_metric
+
 
 class random_center_initializer:
     """!
@@ -171,11 +173,12 @@ def __init__(self, data, amount_centers, amount_candidates=None, **kwargs):
         @param[in] amount_candidates (uint): Amount of candidates that is considered as a center, if the farthest points
                     (with the highest probability) should be considered as centers then special constant should be used
                     'FARTHEST_CENTER_CANDIDATE'. By default the amount of candidates is 3.
-        @param[in] **kwargs: Arbitrary keyword arguments (available arguments: `random_state`, `data_type`).
+        @param[in] **kwargs: Arbitrary keyword arguments (available arguments: `random_state`, `data_type`, `metric`).
 
         <b>Keyword Args:</b><br>
             - random_state (int): Seed for random state (by default is `None`, current system time is used).
             - data_type (str): Data type of input sample `data` (`points`, `distance_matrix`).
+            - metric (distance_metric): Metric that is used for distance calculation between two points (by default is `EUCLIDEAN_SQUARE`); it is applied only when `data_type` is `points`.
 
         @see FARTHEST_CENTER_CANDIDATE
 
@@ -192,9 +195,14 @@ def __init__(self, data, amount_centers, amount_candidates=None, **kwargs):
         else:
             self.__candidates = amount_candidates
 
-        random.seed(kwargs.get('random_state', None))
+        random_seed = kwargs.get('random_state', None)
+        numpy.random.seed(random_seed)
+        random.seed(random_seed)
+
+        self.__metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
         self.__data_type = kwargs.get('data_type', 'points')
 
+        self.__metric.enable_numpy_usage()
         self.__check_parameters()
 
 
@@ -231,7 +239,10 @@ def __calculate_shortest_distances(self, data, centers):
         for index_center in range(len(centers)):
             center = data[centers[index_center]]
 
-            dataset_differences[index_center] = numpy.sum(numpy.square(data - center), axis=1).T
+            if self.__data_type == 'points':
+                dataset_differences[index_center] = self.__metric(data, center)
+            elif self.__data_type == 'distance_matrix':
+                dataset_differences[index_center] = numpy.array(self.__data[centers[index_center]])
 
         with warnings.catch_warnings():
             numpy.warnings.filterwarnings('ignore', r'All-NaN (slice|axis) encountered')

diff --git a/pyclustering/cluster/tests/unit/ut_center_initializer.py b/pyclustering/cluster/tests/unit/ut_center_initializer.py
@@ -23,7 +23,7 @@
 
 from pyclustering.samples.definitions import SIMPLE_SAMPLES
 
-from pyclustering.utils import read_sample
+from pyclustering.utils import read_sample, calculate_distance_matrix, type_metric, distance_metric
 
 from pyclustering.tests.assertion import assertion
 
@@ -271,37 +271,49 @@ def testKmeansPlusPlusUniqueCentersSeveralCandidatesSimple02(self):
         self.templateKmeansPlusPlusUnique(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, 23, 10)
 
 
-    def templateKmeansPlusPlusSeveralRuns(self, path_sample, amount, candidates):
-        sample = read_sample(path_sample)
+    def template_compare_output(self, path, k, candidates, random_state, metric):
+        sample = read_sample(path)
+        matrix = calculate_distance_matrix(sample, metric=metric)
 
-        attempts = 10
-        for _ in range(attempts):
-            medoids = kmeans_plusplus_initializer(sample, amount, candidates).initialize(return_index=True)
-            medoids += kmeans_plusplus_initializer(sample, amount, candidates).initialize(return_index=True)
-            medoids += kmeans_plusplus_initializer(sample, amount, candidates).initialize(return_index=True)
+        result1 = kmeans_plusplus_initializer(sample, k, candidates, random_state=random_state, data_type='points', metric=metric).initialize(return_index=True)
+        result2 = kmeans_plusplus_initializer(matrix, k, candidates, random_state=random_state, data_type='distance_matrix', metric=metric).initialize(return_index=True)
 
-            unique_medoids = set(medoids)
-            if len(unique_medoids) != len(medoids):
-                continue
+        assertion.eq(result1, result2)
+
+    def test_various_data_type_simple1(self):
+        self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 1, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
+        self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 3, 1, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
+        self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 4, 1, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
+
+        self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 3, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
+        self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 3, 3, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
+        self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 4, 3, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
+
+    def test_various_data_type_simple1_euclidean(self):
+        self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 1, 1000, distance_metric(type_metric.EUCLIDEAN))
 
-            return
+    def test_various_data_type_simple1_euclidean_square(self):
+        self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 1, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
 
-        self.assertTrue(False, "K-Means++ does not return unique medoids during %d attempts." % attempts)
+    def test_various_data_type_simple1_euclidean_manhattan(self):
+        self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 1, 1000, distance_metric(type_metric.MANHATTAN))
 
-    def templateKmeansPlusPlusVariousCentersSimple01(self):
-        self.templateKmeansPlusPlusSeveralRuns(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 1)
-        self.templateKmeansPlusPlusSeveralRuns(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 5, 1)
+    def test_various_data_type_simple1_euclidean_chebyshev(self):
+        self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 1, 1000, distance_metric(type_metric.CHEBYSHEV))
 
-    def templateKmeansPlusPlusVariousCentersSeveralCandidatesSimple01(self):
-        self.templateKmeansPlusPlusSeveralRuns(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 3)
+    def test_various_data_type_simple2(self):
+        self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, 3, 1, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
+        self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, 3, 5, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
 
-    def templateKmeansPlusPlusVariousCentersFarthestCandidatesSimple01(self):
-        self.templateKmeansPlusPlusSeveralRuns(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, 2, 'farthest')
+    def test_various_data_type_simple3(self):
+        self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE3, 4, 1, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
+        self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE3, 4, 5, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
 
-    def templateKmeansPlusPlusVariousCentersSimple02(self):
-        self.templateKmeansPlusPlusSeveralRuns(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, 3, 1)
-        self.templateKmeansPlusPlusSeveralRuns(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, 6, 1)
+    def test_various_data_type_simple4(self):
+        self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE4, 5, 1, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
+        self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE4, 5, 2, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
+        self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE4, 5, 5, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))
 
-    def templateKmeansPlusPlusVariousCentersSimple03(self):
-        self.templateKmeansPlusPlusSeveralRuns(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, 4, 1)
-        self.templateKmeansPlusPlusSeveralRuns(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, 8, 1)
+    def test_various_data_type_simple5(self):
+        for i in range(10):
+            self.template_compare_output(SIMPLE_SAMPLES.SAMPLE_SIMPLE5, i + 2, 1, 1000, distance_metric(type_metric.EUCLIDEAN_SQUARE))