Skip to content

Commit

Permalink
[pyclustering.cluster.xmeans] Bug correction - MNDL splitting criteri…
Browse files Browse the repository at this point in the history
…on (issue #328).
  • Loading branch information
annoviko committed Mar 25, 2017
1 parent 100be22 commit 7c264be
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 29 deletions.
5 changes: 5 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
Change notes for 0.7.dev0 (started Jun 01, 2016)

GENERAL CHANGES (pyclustering):
- Improved clustering results when the MNDL splitting criterion is used on small datasets.
See: https://github.com/annoviko/pyclustering/issues/328

- Feature to display connectivity radius on cluster-ordering diagram by ordering_visualizer (pyclustering.cluster.optics).
See: https://github.com/annoviko/pyclustering/issues/314

Expand Down Expand Up @@ -42,6 +45,8 @@ GENERAL CHANGES (ccore):


CORRECTED MAJOR BUGS:
- Bug in the calculation of the MNDL splitting criterion for the X-Means algorithm (pyclustering.cluster.xmeans).
See: https://github.com/annoviko/pyclustering/issues/328

- Bug with loss of CF-nodes in the CF-tree during insertion that leads to an unbalanced CF-tree (pyclustering.container.cftree).
See: https://github.com/annoviko/pyclustering/issues/304
Expand Down
50 changes: 26 additions & 24 deletions pyclustering/cluster/tests/xmeans_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,22 +52,28 @@ def templateLengthProcessData(self, path_to_file, start_centers, expected_cluste

def testBicClusterAllocationSampleSimple1(self):
self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [[3.7, 5.5], [6.7, 7.5]], [5, 5], splitting_type.BAYESIAN_INFORMATION_CRITERION);

def testBicSampleSimple1WithoutInitialCenters(self):
self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, None, [5, 5], splitting_type.BAYESIAN_INFORMATION_CRITERION);

def testBicClusterAllocationSampleSimple1ByCore(self):
self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [[3.7, 5.5], [6.7, 7.5]], [5, 5], splitting_type.BAYESIAN_INFORMATION_CRITERION, True);

def testBicSampleSimple1WithoutInitialCentersByCore(self):
self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, None, [5, 5], splitting_type.BAYESIAN_INFORMATION_CRITERION, True);

def testBicWrongStartClusterAllocationSampleSimple1(self):
self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [[3.7, 5.5]], [5, 5], splitting_type.BAYESIAN_INFORMATION_CRITERION);

def testBicWrongStartClusterAllocationSampleSimple1ByCore(self):
self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [[3.7, 5.5]], [5, 5], splitting_type.BAYESIAN_INFORMATION_CRITERION, True);

def testMndlClusterAllocationSampleSimple1(self):
self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [[3.7, 5.5], [6.7, 7.5]], [5, 5], splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH, False);

def testMndlSampleSimple1WithoutInitialCenters(self):
self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, None, [5, 5], splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH, False);

def testMndlClusterAllocationSampleSimple1ByCore(self):
self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, [[3.7, 5.5], [6.7, 7.5]], [5, 5], splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH, True);

Expand All @@ -79,16 +85,28 @@ def testMndlWrongStartClusterAllocationSampleSimple1ByCore(self):

def testBicClusterAllocationSampleSimple2(self):
self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, [[3.5, 4.8], [6.9, 7], [7.5, 0.5]], [10, 5, 8], splitting_type.BAYESIAN_INFORMATION_CRITERION);

def testBicClusterAllocationSampleSimple2ByCore(self):
self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, [[3.5, 4.8], [6.9, 7], [7.5, 0.5]], [10, 5, 8], splitting_type.BAYESIAN_INFORMATION_CRITERION, True);

def testBicWrongStartClusterAllocationSampleSimple2(self):
self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, [[3.5, 4.8], [6.9, 7]], [10, 5, 8], splitting_type.BAYESIAN_INFORMATION_CRITERION);

def testBicWrongStartClusterAllocationSampleSimple2ByCore(self):
self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, [[3.5, 4.8], [6.9, 7]], [10, 5, 8], splitting_type.BAYESIAN_INFORMATION_CRITERION, True);


def testMndlClusterAllocationSampleSimple2(self):
self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, [[3.5, 4.8], [6.9, 7], [7.5, 0.5]], [10, 5, 8], splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH);

def testMndlClusterAllocationSampleSimple2ByCore(self):
self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, [[3.5, 4.8], [6.9, 7], [7.5, 0.5]], [10, 5, 8], splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH, True);

def testMndlWrongStartClusterAllocationSampleSimple2(self):
self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, [[3.5, 4.8], [6.9, 7]], [10, 5, 8], splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH);

def testMndlWrongStartClusterAllocationSampleSimple2ByCore(self):
self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, [[3.5, 4.8], [6.9, 7]], [10, 5, 8], splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH, True);

def testBicClusterAllocationSampleSimple3(self):
self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE3, [[0.2, 0.1], [4.0, 1.0], [2.0, 2.0], [2.3, 3.9]], [10, 10, 10, 30], splitting_type.BAYESIAN_INFORMATION_CRITERION);

Expand All @@ -107,12 +125,6 @@ def testMndlClusterAllocationSampleSimple3(self):
def testMndlClusterAllocationSampleSimple3ByCore(self):
self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE3, [[0.2, 0.1], [4.0, 1.0], [2.0, 2.0], [2.3, 3.9]], [10, 10, 10, 30], splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH, True);

def testMndlWrongStartClusterAllocationSampleSimple3(self):
self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE3, [[0.2, 0.1], [4.0, 1.0], [5.9, 5.9]], [10, 10, 10, 30], splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH);

def testMndlWrongStartClusterAllocationSampleSimple3ByCore(self):
self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE3, [[0.2, 0.1], [4.0, 1.0], [5.9, 5.9]], [10, 10, 10, 30], splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH, True);

def testBicClusterAllocationSampleSimple4(self):
self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE4, [[1.5, 0.0], [1.5, 2.0], [1.5, 4.0], [1.5, 6.0], [1.5, 8.0]], [15, 15, 15, 15, 15], splitting_type.BAYESIAN_INFORMATION_CRITERION);

Expand All @@ -131,12 +143,6 @@ def testMndlClusterAllocationSampleSimple4(self):
def testMndlClusterAllocationSampleSimple4ByCore(self):
self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE4, [[1.5, 0.0], [1.5, 2.0], [1.5, 4.0], [1.5, 6.0], [1.5, 8.0]], [15, 15, 15, 15, 15], splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH, True);

def testMndlWrongStartClusterAllocationSampleSimple4(self):
self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE4, [[1.5, 0.0], [1.5, 2.0], [1.5, 4.0], [1.5, 6.0]], [15, 15, 15, 15, 15], splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH);

def testMndlWrongStartClusterAllocationSampleSimple4ByCore(self):
self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE4, [[1.5, 0.0], [1.5, 2.0], [1.5, 4.0], [1.5, 6.0]], [15, 15, 15, 15, 15], splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH, True);

def testBicClusterAllocationSampleSimple5(self):
self.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE5, [[0.0, 1.0], [0.0, 0.0], [1.0, 1.0], [1.0, 0.0]], [15, 15, 15, 15], splitting_type.BAYESIAN_INFORMATION_CRITERION);

Expand Down Expand Up @@ -179,10 +185,6 @@ def testMndlClusterAllocationSampleTwoDiamonds(self):
def testMndlClusterAllocationSampleTwoDiamondsByCore(self):
self.templateLengthProcessData(FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS, [[0.8, 0.2], [3.0, 0.0]], [400, 400], splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH, True);

def testMndlWrongStartClusterAllocationSampleTwoDiamondsByCore(self):
self.templateLengthProcessData(FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS, [[0.8, 0.2]], [400, 400], splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH, True);


def templateClusterAllocationOneDimensionData(self, ccore_flag):
input_data = [ [0.0] for i in range(10) ] + [ [5.0] for i in range(10) ] + [ [10.0] for i in range(10) ] + [ [15.0] for i in range(10) ];

Expand Down
19 changes: 14 additions & 5 deletions pyclustering/cluster/xmeans.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@

import pyclustering.core.wrapper as wrapper;

from pyclustering.utils import euclidean_distance_sqrt;
from pyclustering.utils import euclidean_distance_sqrt, euclidean_distance;
from pyclustering.utils import list_math_addition_number, list_math_addition, list_math_division_number;


Expand Down Expand Up @@ -66,6 +66,10 @@ class splitting_type(IntEnum):
## \f[Z = \frac{\sigma^2 \sqrt{2K} }{N}(\sqrt{2K} + \beta) + W - \sigma^2 + \frac{2\alpha\sigma}{\sqrt{N}}\sqrt{\frac{\alpha^2\sigma^2}{N} + W - \left(1 - \frac{K}{N}\right)\frac{\sigma^2}{2}} + \frac{2\alpha^2\sigma^2}{N}\f]
##
## where \f$\alpha\f$ and \f$\beta\f$ represent the parameters for validation probability and confidence probability.
##
## To improve clustering results the implementation intentionally deviates from the paper (distances are not squared):
## \f[W = \frac{1}{n_j}\sum\limits_{i}||x_{ij} - \hat{C}_j||\f]
## \f[\hat{\sigma}^2 = \frac{1}{N - K}\sum\limits_{j}\sum\limits_{i}||x_{ij} - \hat{C}_j||\f]
MINIMUM_NOISELESS_DESCRIPTION_LENGTH = 1;


Expand Down Expand Up @@ -319,7 +323,7 @@ def __minimum_noiseless_description_length(self, clusters, centers):
"""

scores = 0.0;
scores = float('inf');

W = 0.0;
K = len(clusters);
Expand All @@ -332,15 +336,20 @@ def __minimum_noiseless_description_length(self, clusters, centers):

for index_cluster in range(0, len(clusters), 1):
Ni = len(clusters[index_cluster]);
if (Ni == 0):
return float('inf');

Wi = 0.0;
for index_object in clusters[index_cluster]:
Wi += euclidean_distance_sqrt(self.__pointer_data[index_object], centers[index_cluster]);
# According to the paper euclidean_distance_sqrt should be used here, but in that case the results are
# very poor, therefore the square root (euclidean_distance) is used to improve them.
Wi += euclidean_distance(self.__pointer_data[index_object], centers[index_cluster]);

sigma_sqrt += Wi;
W += Wi / Ni;
N += Ni;

if (N - K != 0):
if (N - K > 0):
sigma_sqrt /= (N - K);
sigma = sigma_sqrt ** 0.5;

Expand Down Expand Up @@ -380,7 +389,7 @@ def __bayesian_information_criterion(self, clusters, centers):

N += len(clusters[index_cluster]);

if (N - K != 0):
if (N - K > 0):
sigma_sqrt /= (N - K);
p = (K - 1) + dimension * K + 1;

Expand Down

0 comments on commit 7c264be

Please sign in to comment.