[pyclustering.cluster.xmeans] Bug correction - MNDL splitting criterion (issue #328).
annoviko · Mar 25, 2017 · 7c264be · 7c264be
1 parent 100be22
commit 7c264be
Show file tree

Hide file tree

Showing 3 changed files with 45 additions and 29 deletions.
diff --git a/CHANGES b/CHANGES
@@ -1,6 +1,9 @@
 Change notes for 0.7.dev0 (started Jun 01, 2016)
 
 GENERAL CHANGES (pyclustering):
+- Improved clustering results when the MNDL splitting criterion is used on small datasets.
+  See: https://github.com/annoviko/pyclustering/issues/328
+
 - Feature to display connectivity radius on cluster-ordering diagram by ordering_visualizer (pyclustering.cluster.optics).
   See: https://github.com/annoviko/pyclustering/issues/314
 
@@ -42,6 +45,8 @@ GENERAL CHANGES (ccore):
 
 
 CORRECTED MAJOR BUGS:
+- Bug in the calculation of the MNDL splitting criterion for the X-Means algorithm (pyclustering.cluster.xmeans).
+  See: https://github.com/annoviko/pyclustering/issues/328
 
 - Bug with loss of CF-nodes in CF-tree during inserting that leads unbalanced CF-tree (pyclustering.container.cftree).
   See: https://github.com/annoviko/pyclustering/issues/304

diff --git a/pyclustering/cluster/tests/xmeans_tests.py b/pyclustering/cluster/tests/xmeans_tests.py
@@ -52,22 +52,28 @@ def templateLengthProcessData(self, path_to_file, start_centers, expected_cluste
 
     def testBicClusterAllocationSampleSimple1(self):
         self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [[3.7, 5.5], [6.7, 7.5]], [5, 5], splitting_type.BAYESIAN_INFORMATION_CRITERION);
- 
+
     def testBicSampleSimple1WithoutInitialCenters(self):
         self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, None, [5, 5], splitting_type.BAYESIAN_INFORMATION_CRITERION);
- 
+
     def testBicClusterAllocationSampleSimple1ByCore(self):
         self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [[3.7, 5.5], [6.7, 7.5]], [5, 5], splitting_type.BAYESIAN_INFORMATION_CRITERION, True);
- 
+
     def testBicSampleSimple1WithoutInitialCentersByCore(self):
         self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, None, [5, 5], splitting_type.BAYESIAN_INFORMATION_CRITERION, True);
 
     def testBicWrongStartClusterAllocationSampleSimple1(self):
         self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [[3.7, 5.5]], [5, 5], splitting_type.BAYESIAN_INFORMATION_CRITERION);
- 
+
     def testBicWrongStartClusterAllocationSampleSimple1ByCore(self):
         self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [[3.7, 5.5]], [5, 5], splitting_type.BAYESIAN_INFORMATION_CRITERION, True);
 
+    def testMndlClusterAllocationSampleSimple1(self):
+        self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [[3.7, 5.5], [6.7, 7.5]], [5, 5], splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH, False);
+
+    def testMndlSampleSimple1WithoutInitialCenters(self):
+        self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, None, [5, 5], splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH, False);
+
     def testMndlClusterAllocationSampleSimple1ByCore(self):
         self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [[3.7, 5.5], [6.7, 7.5]], [5, 5], splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH, True);
 
@@ -79,16 +85,28 @@ def testMndlWrongStartClusterAllocationSampleSimple1ByCore(self):
 
     def testBicClusterAllocationSampleSimple2(self):
         self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, [[3.5, 4.8], [6.9, 7], [7.5, 0.5]], [10, 5, 8], splitting_type.BAYESIAN_INFORMATION_CRITERION);
- 
+
     def testBicClusterAllocationSampleSimple2ByCore(self):
         self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, [[3.5, 4.8], [6.9, 7], [7.5, 0.5]], [10, 5, 8], splitting_type.BAYESIAN_INFORMATION_CRITERION, True);
- 
+
     def testBicWrongStartClusterAllocationSampleSimple2(self):
         self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, [[3.5, 4.8], [6.9, 7]], [10, 5, 8], splitting_type.BAYESIAN_INFORMATION_CRITERION);
- 
+
     def testBicWrongStartClusterAllocationSampleSimple2ByCore(self):
         self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, [[3.5, 4.8], [6.9, 7]], [10, 5, 8], splitting_type.BAYESIAN_INFORMATION_CRITERION, True);
-
+
+    def testMndlClusterAllocationSampleSimple2(self):
+        self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, [[3.5, 4.8], [6.9, 7], [7.5, 0.5]], [10, 5, 8], splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH);
+
+    def testMndlClusterAllocationSampleSimple2ByCore(self):
+        self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, [[3.5, 4.8], [6.9, 7], [7.5, 0.5]], [10, 5, 8], splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH, True);
+
+    def testMndlWrongStartClusterAllocationSampleSimple2(self):
+        self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, [[3.5, 4.8], [6.9, 7]], [10, 5, 8], splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH);
+
+    def testMndlWrongStartClusterAllocationSampleSimple2ByCore(self):
+        self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, [[3.5, 4.8], [6.9, 7]], [10, 5, 8], splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH, True);
+
     def testBicClusterAllocationSampleSimple3(self):
         self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE3, [[0.2, 0.1], [4.0, 1.0], [2.0, 2.0], [2.3, 3.9]], [10, 10, 10, 30], splitting_type.BAYESIAN_INFORMATION_CRITERION);    
 
@@ -107,12 +125,6 @@ def testMndlClusterAllocationSampleSimple3(self):
     def testMndlClusterAllocationSampleSimple3ByCore(self):
         self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE3, [[0.2, 0.1], [4.0, 1.0], [2.0, 2.0], [2.3, 3.9]], [10, 10, 10, 30], splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH, True);
 
-    def testMndlWrongStartClusterAllocationSampleSimple3(self):
-        self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE3, [[0.2, 0.1], [4.0, 1.0], [5.9, 5.9]], [10, 10, 10, 30], splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH);   
-
-    def testMndlWrongStartClusterAllocationSampleSimple3ByCore(self):
-        self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE3, [[0.2, 0.1], [4.0, 1.0], [5.9, 5.9]], [10, 10, 10, 30], splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH, True); 
-
     def testBicClusterAllocationSampleSimple4(self):
         self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE4, [[1.5, 0.0], [1.5, 2.0], [1.5, 4.0], [1.5, 6.0], [1.5, 8.0]], [15, 15, 15, 15, 15], splitting_type.BAYESIAN_INFORMATION_CRITERION);
 
@@ -131,12 +143,6 @@ def testMndlClusterAllocationSampleSimple4(self):
     def testMndlClusterAllocationSampleSimple4ByCore(self):
         self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE4, [[1.5, 0.0], [1.5, 2.0], [1.5, 4.0], [1.5, 6.0], [1.5, 8.0]], [15, 15, 15, 15, 15], splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH, True);
 
-    def testMndlWrongStartClusterAllocationSampleSimple4(self):
-        self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE4, [[1.5, 0.0], [1.5, 2.0], [1.5, 4.0], [1.5, 6.0]], [15, 15, 15, 15, 15], splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH);  
-
-    def testMndlWrongStartClusterAllocationSampleSimple4ByCore(self):
-        self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE4, [[1.5, 0.0], [1.5, 2.0], [1.5, 4.0], [1.5, 6.0]], [15, 15, 15, 15, 15], splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH, True); 
-
     def testBicClusterAllocationSampleSimple5(self):
         self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE5, [[0.0, 1.0], [0.0, 0.0], [1.0, 1.0], [1.0, 0.0]], [15, 15, 15, 15], splitting_type.BAYESIAN_INFORMATION_CRITERION);
 
@@ -179,10 +185,6 @@ def testMndlClusterAllocationSampleTwoDiamonds(self):
     def testMndlClusterAllocationSampleTwoDiamondsByCore(self):
         self.templateLengthProcessData(FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS, [[0.8, 0.2], [3.0, 0.0]], [400, 400], splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH, True);
 
-    def testMndlWrongStartClusterAllocationSampleTwoDiamondsByCore(self):
-        self.templateLengthProcessData(FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS, [[0.8, 0.2]], [400, 400], splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH, True);
-
-
     def templateClusterAllocationOneDimensionData(self, ccore_flag):
         input_data = [ [0.0] for i in range(10) ] + [ [5.0] for i in range(10) ] + [ [10.0] for i in range(10) ] + [ [15.0] for i in range(10) ];
 

diff --git a/pyclustering/cluster/xmeans.py b/pyclustering/cluster/xmeans.py
@@ -37,7 +37,7 @@
 
 import pyclustering.core.wrapper as wrapper;
 
-from pyclustering.utils import euclidean_distance_sqrt;
+from pyclustering.utils import euclidean_distance_sqrt, euclidean_distance;
 from pyclustering.utils import list_math_addition_number, list_math_addition, list_math_division_number;
 
 
@@ -66,6 +66,10 @@ class splitting_type(IntEnum):
     ## \f[Z = \frac{\sigma^2 \sqrt{2K} }{N}(\sqrt{2K} + \beta) + W - \sigma^2 + \frac{2\alpha\sigma}{\sqrt{N}}\sqrt{\frac{\alpha^2\sigma^2}{N} + W - \left(1 - \frac{K}{N}\right)\frac{\sigma^2}{2}} + \frac{2\alpha^2\sigma^2}{N}\f]
     ##
     ## where \f$\alpha\f$ and \f$\beta\f$ represent the parameters for validation probability and confidence probability.
+    ##
+    ## To improve clustering results, a deviation from the paper is introduced:
+    ## \f[W = \frac{1}{n_j}\sum\limits_{i}||x_{ij} - \hat{C}_j||\f]
+    ## \f[\hat{\sigma}^2 = \frac{1}{N - K}\sum\limits_{j}\sum\limits_{i}||x_{ij} - \hat{C}_j||\f]
     MINIMUM_NOISELESS_DESCRIPTION_LENGTH = 1;
 
 
@@ -319,7 +323,7 @@ def __minimum_noiseless_description_length(self, clusters, centers):
         
         """
 
-        scores = 0.0;
+        scores = float('inf');
 
         W = 0.0;
         K = len(clusters);
@@ -332,15 +336,20 @@ def __minimum_noiseless_description_length(self, clusters, centers):
 
         for index_cluster in range(0, len(clusters), 1):
             Ni = len(clusters[index_cluster]);
+            if (Ni == 0): 
+                return float('inf');
+
             Wi = 0.0;
             for index_object in clusters[index_cluster]:
-                Wi += euclidean_distance_sqrt(self.__pointer_data[index_object], centers[index_cluster]);
+                # According to the paper, euclidean_distance_sqrt should be used, but the results are
+                # very poor in that case; therefore the square root is used to improve them.
+                Wi += euclidean_distance(self.__pointer_data[index_object], centers[index_cluster]);
 
             sigma_sqrt += Wi;
             W += Wi / Ni;
             N += Ni;
 
-        if (N - K != 0):
+        if (N - K > 0):
             sigma_sqrt /= (N - K);
             sigma = sigma_sqrt ** 0.5;
 
@@ -380,7 +389,7 @@ def __bayesian_information_criterion(self, clusters, centers):
 
             N += len(clusters[index_cluster]);
 
-        if (N - K != 0):
+        if (N - K > 0):
             sigma_sqrt /= (N - K);
             p = (K - 1) + dimension * K + 1;