#388, #389: Corrections for X-Means algorithm (Python and C/C++ implementation).
annoviko · Nov 9, 2017 · 56833ff · 56833ff
1 parent 9f8cb7a
commit 56833ff
Show file tree

Hide file tree

Showing 11 changed files with 267 additions and 100 deletions.
diff --git a/CHANGES b/CHANGES
@@ -8,27 +8,33 @@ GENERAL CHANGES:
 - Implemented method 'get_probabilities()' for obtaining belong probability in EM-algorithm (pyclustering.cluster.ema).
   See: https://github.com/annoviko/pyclustering/issues/387
 
-- Python implementation of CURE algorithm method 'get_clusters()' returns list of indexes (pyclustering.cluster.cure)
+- Python implementation of CURE algorithm method 'get_clusters()' returns list of indexes (pyclustering.cluster.cure).
   See: https://github.com/annoviko/pyclustering/issues/384
 
-- Implemented parallel processing for X-Means algorithm (ccore.xmeans)
+- Implemented parallel processing for X-Means algorithm (ccore.xmeans).
   See: https://github.com/annoviko/pyclustering/issues/372
 
-- Implemented pool threads for parallel processing (ccore.parallel)
+- Implemented pool threads for parallel processing (ccore.parallel).
   See: https://github.com/annoviko/pyclustering/issues/383
 
-- Implemented AntMean clustering algorithm (pyclustering.cluster.antmean, ccore.antmean)
+- Implemented AntMean clustering algorithm (pyclustering.cluster.antmean, ccore.antmean).
   See: https://github.com/annoviko/pyclustering/issues/278
 
-- Optimization of OPTICS algorithm using KD-tree for searching nearest neighbors (pyclustering.cluster.optics, ccore.optics)
+- Optimization of OPTICS algorithm using KD-tree for searching nearest neighbors (pyclustering.cluster.optics, ccore.optics).
   See: https://github.com/annoviko/pyclustering/issues/370
 
-- Optimization of DBSCAN algorithm using KD-tree for searching nearest neighbors (pyclustering.cluster.dbscan, ccore.dbscan)
+- Optimization of DBSCAN algorithm using KD-tree for searching nearest neighbors (pyclustering.cluster.dbscan, ccore.dbscan).
   See: https://github.com/annoviko/pyclustering/issues/369
 
 
 CORRECTED MAJOR BUGS:
-- Corrected bug with returned nullptr in method 'kdtree_searcher::find_nearest_node()' (ccore.container.kdtree)
+- Amount of allocated clusters can differ from the amount of centers in X-Means algorithm (ccore.xmeans).
+  See: https://github.com/annoviko/pyclustering/issues/389
+
+- Amount of allocated clusters can be bigger than kmax in X-Means algorithm (pyclustering.cluster.xmeans, ccore.xmeans).
+  See: https://github.com/annoviko/pyclustering/issues/388
+
+- Corrected bug with returned nullptr in method 'kdtree_searcher::find_nearest_node()' (ccore.container.kdtree).
   See: no reference.
 
 
@@ -62,10 +68,10 @@ CHANGE NOTES FOR 0.7.0 (STARTED Jun 01, 2016), (RELEASED: Oct 16, 2017)
 ------------------------------------------------------------------------
 
 GENERAL CHANGES (pyclustering):
-- Implemented Expectation-Maximization clustering algorithm for Gaussian Mixute Model and clustering visualizer for this particular algorithm (pyclustering.cluster.ema)
+- Implemented Expectation-Maximization clustering algorithm for Gaussian Mixture Model and clustering visualizer for this particular algorithm (pyclustering.cluster.ema).
   See: https://github.com/annoviko/pyclustering/issues/16
 
-- Implemented Genetic Clustering Algorithm (GCA) and clustering visualizer for this particular algorithm (pyclustering.cluster.ga)
+- Implemented Genetic Clustering Algorithm (GCA) and clustering visualizer for this particular algorithm (pyclustering.cluster.ga).
   See: https://github.com/annoviko/pyclustering/issues/360
 
 - Implemented feature to obtain and visualize evolution of order parameter and local order parameter for Sync network and Sync-based algorithms (pyclustering.nnet.sync).

diff --git a/ccore/src/cluster/xmeans.cpp b/ccore/src/cluster/xmeans.cpp
@@ -69,7 +69,7 @@ void xmeans::process(const dataset & data, cluster_data & output_result) {
     size_t current_number_clusters = m_ptr_result->centers()->size();
     const index_sequence dummy;
 
-    while (current_number_clusters < m_maximum_clusters) {
+    while (current_number_clusters <= m_maximum_clusters) {
         improve_parameters(*(m_ptr_result->clusters()), m_centers, dummy);
         improve_structure();
 
@@ -79,6 +79,8 @@ void xmeans::process(const dataset & data, cluster_data & output_result) {
 
         current_number_clusters = m_centers.size();
     }
+
+    *(m_ptr_result->centers().get()) = std::move(m_centers);
 }
 
 
@@ -98,10 +100,10 @@ void xmeans::improve_parameters(cluster_sequence & improved_clusters, dataset &
 
 
 void xmeans::improve_structure() {
-    if (m_parallel_processing) {
-        cluster_sequence & clusters = *(m_ptr_result->clusters());
-        std::vector<dataset> region_allocated_centers(m_ptr_result->clusters()->size(), dataset());
+    cluster_sequence & clusters = *(m_ptr_result->clusters());
+    std::vector<dataset> region_allocated_centers(m_ptr_result->clusters()->size(), dataset());
 
+    if (m_parallel_processing) {
         for (std::size_t index = 0; index < m_ptr_result->clusters()->size(); index++) {
             task::proc improve_proc = [this, index, &clusters, &region_allocated_centers](){
                     improve_region_structure(clusters[index], m_centers[index], region_allocated_centers[index]);
@@ -113,28 +115,34 @@ void xmeans::improve_structure() {
         for (std::size_t i = 0; i < m_ptr_result->clusters()->size(); i++) {
             m_pool.pop_complete_task();
         }
-
-        /* update current centers */
-        m_centers.clear();
-        for (auto & centers : region_allocated_centers) {
-            for (auto & center : centers) {
-                m_centers.push_back(center);
-            }
-        }
     }
     else {
         dataset allocated_centers;
 
         for (std::size_t index = 0; index < m_ptr_result->clusters()->size(); index++) {
-            improve_region_structure((*(m_ptr_result->clusters()))[index], m_centers[index], allocated_centers);
+            improve_region_structure((*(m_ptr_result->clusters()))[index], m_centers[index], region_allocated_centers[index]);
         }
+    }
+
+    /* update current centers */
+    dataset allocated_centers = { };
+    std::size_t amount_free_centers = m_maximum_clusters - clusters.size();
+
+    for (std::size_t index_cluster = 0; index_cluster < region_allocated_centers.size(); index_cluster++) {
+        dataset & centers = region_allocated_centers[index_cluster];
+        if ( (centers.size() > 1) && (amount_free_centers > 0) ) {
+            /* separate cluster */
+            allocated_centers.push_back(centers[0]);
+            allocated_centers.push_back(centers[1]);
 
-        /* update current centers */
-        m_centers.clear();
-        for (std::size_t index = 0; index < allocated_centers.size(); index++) {
-            m_centers.push_back(allocated_centers[index]);
+            amount_free_centers--;
+        }
+        else {
+            allocated_centers.push_back(m_centers[index_cluster]);
         }
     }
+
+    m_centers = std::move(allocated_centers);
 }
 
 

diff --git a/ccore/src/interface/dbscan_interface.cpp b/ccore/src/interface/dbscan_interface.cpp
@@ -37,7 +37,7 @@ pyclustering_package * dbscan_algorithm(const pyclustering_package * const sampl
     package->size = output_result.size() + 1;   /* the last for noise */
     package->data = new pyclustering_package * [package->size + 1];
 
-    for (unsigned int i = 0; i < package->size - 1; i++) {
+    for (std::size_t i = 0; i < package->size - 1; i++) {
         ((pyclustering_package **) package->data)[i] = create_package(&output_result[i]);
     }
 

diff --git a/ccore/src/interface/xmeans_interface.cpp b/ccore/src/interface/xmeans_interface.cpp
@@ -32,8 +32,14 @@ pyclustering_package * xmeans_algorithm(const pyclustering_package * const p_sam
     cluster_analysis::xmeans solver(centers, p_kmax, p_tolerance, (cluster_analysis::splitting_type) p_criterion);
 
     cluster_analysis::xmeans_data output_result;
-    solver.process(data, output_result);
-
-    pyclustering_package * package = create_package(output_result.clusters().get());
+    solver.process(data, output_result);
+
+    pyclustering_package * package = new pyclustering_package(pyclustering_type_data::PYCLUSTERING_TYPE_LIST);
+    package->size = 2;   /* cluster package + center package */
+    package->data = new pyclustering_package * [2];
+
+    ((pyclustering_package **) package->data)[0] = create_package(output_result.clusters().get());
+    ((pyclustering_package **) package->data)[1] = create_package(output_result.centers().get());
+
     return package;
 }
diff --git a/ccore/tst/utest-interface-xmeans.cpp b/ccore/tst/utest-interface-xmeans.cpp
@@ -35,5 +35,11 @@ TEST(utest_interface_xmeans, xmeans_algorithm) {
     pyclustering_package * result = xmeans_algorithm(sample.get(), centers.get(), 5, 0.01, 0);
     ASSERT_EQ(2, result->size);
 
+    pyclustering_package * obtained_clusters = ((pyclustering_package **) result->data)[0];
+    ASSERT_EQ(2, obtained_clusters->size);
+
+    pyclustering_package * obtained_centers = ((pyclustering_package **) result->data)[1];
+    ASSERT_EQ(2, obtained_centers->size);
+
     delete result;
 }
diff --git a/ccore/tst/utest-xmeans.cpp b/ccore/tst/utest-xmeans.cpp
@@ -60,6 +60,8 @@ template_length_process_data(const std::shared_ptr<dataset> & data,
     }
 
     ASSERT_EQ(data->size(), total_size);
+    ASSERT_EQ(output_result.centers()->size(), output_result.clusters()->size());
+    ASSERT_GE(kmax, output_result.centers()->size());
 
     if (!expected_cluster_length.empty()) {
         std::sort(obtained_cluster_length.begin(), obtained_cluster_length.end());
@@ -109,6 +111,27 @@ TEST(utest_xmeans, allocation_bic_sample_simple_03) {
 }
 
 
+TEST(utest_xmeans, allocation_wrong_initial_bic_sample_simple_03) {
+    dataset start_centers = { {4.0, 1.0}, {2.0, 2.0}, {2.3, 3.9} };
+    std::vector<unsigned int> expected_clusters_length = {10, 10, 10, 30};
+    template_length_process_data(simple_sample_factory::create_sample(SAMPLE_SIMPLE::SAMPLE_SIMPLE_03), start_centers, 20, expected_clusters_length, splitting_type::BAYESIAN_INFORMATION_CRITERION);
+}
+
+
+TEST(utest_xmeans, allocation_kmax_less_real_bic_sample_simple_03) {
+    dataset start_centers = { {4.0, 1.0}, {2.0, 2.0}, {2.3, 3.9} };
+    std::vector<unsigned int> expected_clusters_length = { };
+    template_length_process_data(simple_sample_factory::create_sample(SAMPLE_SIMPLE::SAMPLE_SIMPLE_03), start_centers, 3, expected_clusters_length, splitting_type::BAYESIAN_INFORMATION_CRITERION);
+}
+
+
+TEST(utest_xmeans, allocation_one_cluster_bic_sample_simple_03) {
+    dataset start_centers = { {2.0, 2.0} };
+    std::vector<unsigned int> expected_clusters_length = { 60 };
+    template_length_process_data(simple_sample_factory::create_sample(SAMPLE_SIMPLE::SAMPLE_SIMPLE_03), start_centers, 1, expected_clusters_length, splitting_type::BAYESIAN_INFORMATION_CRITERION);
+}
+
+
 TEST(utest_xmeans, allocation_mndl_sample_simple_03) {
     dataset start_centers = { {0.2, 0.1}, {4.0, 1.0}, {2.0, 2.0}, {2.3, 3.9} };
     std::vector<unsigned int> expected_clusters_length = {10, 10, 10, 30};