#602: G-Means: Setting maximum number of clusters like for X-Means.

annoviko · May 28, 2020 · ab1cf9c · ab1cf9c
1 parent dcb6dde
commit ab1cf9c
Show file tree

Hide file tree

Showing 17 changed files with 328 additions and 82 deletions.
diff --git a/CHANGES b/CHANGES
@@ -5,6 +5,7 @@ CHANGE NOTES FOR 0.10.0 (STARTED Jan 24, 2020), (RELEASED: Dev -)
 ------------------------------------------------------------------------
 
 GENERAL CHANGES:
+
 - Introduced parameter `k_max` to G-Means algorithm to use it as an optional stop condition for the algorithm (Python: `pyclustering.cluster.gmeans`; C++: `pyclustering::clst::gmeans`).
   See: https://github.com/annoviko/pyclustering/issues/602
 

diff --git a/ccore/include/pyclustering/cluster/gmeans.hpp b/ccore/include/pyclustering/cluster/gmeans.hpp
@@ -106,15 +106,15 @@ class gmeans {
     using projection = std::vector<double>;
 
 public:
-    const static long long          IGNORE_KMAX;            /**< Defines value that indicates to the G-Means algorithm to ignore K maximum value. */
+    const static long long          IGNORE_KMAX;                /**< Value indicating that the maximum number of clusters (K maximum) is ignored. */
 
-    const static std::size_t        DEFAULT_AMOUNT_CENTERS; /**< Defaule value of amount of initial K - the value from that search is started. */
+    const static std::size_t        DEFAULT_AMOUNT_CENTERS;     /**< Default value of amount of initial K - the value from which the search procedure is started. */
 
-    const static double             DEFAULT_TOLERANCE;      /**< Default value of the tolerance stop condition: if maximum value of change of centers of clusters is less than tolerance then algorithm stops processing. */
+    const static double             DEFAULT_TOLERANCE;          /**< Default value of the tolerance (stop condition): if the maximum value of cluster changes is less than tolerance then the algorithm stops processing. */
 
-    const static std::size_t        DEFAULT_REPEAT;         /**< Default value that defines how many times K-Means should be run to improve parameters. */
+    const static std::size_t        DEFAULT_REPEAT;             /**< Default value that defines how many times K-Means should be run to improve parameters. */
 
-    const static std::size_t        DEFAULT_CANDIDATES;     /**< Default value of amount of candidates to consider by K-Means++ to initialize initial centers for K-Means on each iteration. */
+    const static std::size_t        DEFAULT_CANDIDATES;         /**< Default value of amount of candidates to consider by K-Means++ to initialize initial centers for K-Means on each iteration. */
 
 private:
     std::size_t             m_amount                = DEFAULT_AMOUNT_CENTERS;
@@ -125,6 +125,8 @@ class gmeans {
 
     long long               m_kmax                  = IGNORE_KMAX;
 
+    long long               m_random_state          = RANDOM_STATE_CURRENT_TIME;
+
     gmeans_data             * m_ptr_result          = nullptr;      /* temporary pointer to output result */
 
     const dataset           * m_ptr_data            = nullptr;      /* used only during processing */
@@ -149,13 +151,15 @@ class gmeans {
                 with larger 'repeat' values suggesting higher probability of finding global optimum.
     @param[in] p_kmax: maximum amount of cluster that might be allocated. The argument is considered as a stop
                 condition. When the maximum amount is reached then algorithm stops processing. By default the maximum
-                amount of clusters is not restricted (`k_max` is -1).
+                amount of clusters is not restricted (`k_max` is `IGNORE_KMAX`).
+    @param[in] p_random_state: seed for random state (by default is `RANDOM_STATE_CURRENT_TIME`, current system time is used).
     
     */
     gmeans(const std::size_t p_k_initial, 
            const double p_tolerance = DEFAULT_TOLERANCE,
            const std::size_t p_repeat = DEFAULT_REPEAT,
-           const long long p_kmax = IGNORE_KMAX);
+           const long long p_kmax = IGNORE_KMAX,
+           const long long p_random_state = RANDOM_STATE_CURRENT_TIME);
 
     /*!
     

diff --git a/ccore/include/pyclustering/cluster/kmeans_plus_plus.hpp b/ccore/include/pyclustering/cluster/kmeans_plus_plus.hpp
@@ -25,6 +25,7 @@
 #pragma once
 
 
+#include <random>
 #include <unordered_set>
 
 #include <pyclustering/definitions.hpp>
@@ -80,9 +81,11 @@ class kmeans_plus_plus : public center_initializer {
     using store_result = std::function<void(center_description &)>;
 
 private:
-    std::size_t         m_amount        = 0;
-    std::size_t         m_candidates    = 0;
-    metric              m_dist_func;
+    std::size_t             m_amount        = 0;
+    std::size_t             m_candidates    = 0;
+    metric                  m_dist_func;
+    long long               m_random_state  = RANDOM_STATE_CURRENT_TIME;
+    mutable std::mt19937    m_generator;
 
     /* temporal members that are used only during initialization */
     mutable dataset const *           m_data_ptr      = nullptr;
@@ -107,11 +110,12 @@ class kmeans_plus_plus : public center_initializer {
     * @param[in] p_candidates: amount of candidates that are considered to find the best center, if
     *             the farthest candidate is required (with highest probability) than static constant
     *             FARTHEST_CENTER_CANDIDATE can be specified.
+    * @param[in] p_random_state: seed for random state (by default is `RANDOM_STATE_CURRENT_TIME`, current system time is used).
     *
     * @see FARTHEST_CENTER_CANDIDATE
     *
     */
-    kmeans_plus_plus(const std::size_t p_amount, const std::size_t p_candidates = 1) noexcept;
+    kmeans_plus_plus(const std::size_t p_amount, const std::size_t p_candidates = 1, const long long p_random_state = RANDOM_STATE_CURRENT_TIME) noexcept;
 
     /**
     *
@@ -123,11 +127,12 @@ class kmeans_plus_plus : public center_initializer {
     *             the farthest candidate is required (with highest probability) than static constant
     *             FARTHEST_CENTER_CANDIDATE can be specified.
     * @param[in] p_metric: metric for distance calculation between points.
+    * @param[in] p_random_state: seed for random state (by default is `RANDOM_STATE_CURRENT_TIME`, current system time is used).
     *
     * @see FARTHEST_CENTER_CANDIDATE
     *
     */
-    kmeans_plus_plus(const std::size_t p_amount, const std::size_t p_candidates, const metric & p_metric) noexcept;
+    kmeans_plus_plus(const std::size_t p_amount, const std::size_t p_candidates, const metric & p_metric, const long long p_random_state = RANDOM_STATE_CURRENT_TIME) noexcept;
 
     /**
      *
@@ -185,6 +190,13 @@ class kmeans_plus_plus : public center_initializer {
     void initialize(const dataset & p_data, index_sequence & p_center_indexes) const;
 
 private:
+    /**
+    *
+    * @brief    Assigns seed to the random generator that is used by the algorithm.
+    *
+    */
+    void initialize_random_generator();
+
     /**
     *
     * @brief    Performs center initialization process in line algorithm configuration.

diff --git a/ccore/include/pyclustering/definitions.hpp b/ccore/include/pyclustering/definitions.hpp
@@ -49,6 +49,9 @@
 namespace pyclustering {
 
 
+constexpr long long RANDOM_STATE_CURRENT_TIME = -1;     /**< Defines value of the random state that indicates to the algorithm to use current system time as a seed for random functionality. */
+
+
 /*!
 
 @brief   Defines a patten that consists of features that describe this pattern.

diff --git a/ccore/include/pyclustering/interface/gmeans_interface.h b/ccore/include/pyclustering/interface/gmeans_interface.h
@@ -42,25 +42,27 @@ enum gmeans_package_indexer {
 };
 
 
-/**
- *
- * @brief   Clustering algorithm G-Means returns allocated clusters.
- * @details Caller should destroy returned result in 'pyclustering_package'.
- *
- * @param[in] p_sample: input data for clustering.
- * @param[in] p_amount: initial amount of centers.
- * @param[in] p_tolerance: stop condition - when changes of medians are less then tolerance value.
- * @param[in] p_repeat: how many times K-Means should be run to improve parameters, with larger 'repeat' 
- *             values suggesting higher probability of finding global optimum.
- * @param[in] p_kmax: maximum amount of cluster that might be allocated. The argument is considered as a stop
-               condition. When the maximum amount is reached then algorithm stops processing. By default the maximum
-               amount of clusters is not restricted (`k_max` is -1).
- *
- * @return  Returns result of clustering - array of allocated clusters.
- *
- */
+/*!
+
+@brief   Clustering algorithm G-Means returns allocated clusters.
+@details Caller should destroy returned result in 'pyclustering_package'.
+
+@param[in] p_sample: input data for clustering.
+@param[in] p_amount: initial amount of centers.
+@param[in] p_tolerance: stop condition - when changes of medians are less than tolerance value.
+@param[in] p_repeat: how many times K-Means should be run to improve parameters, with larger 'repeat' 
+            values suggesting higher probability of finding global optimum.
+@param[in] p_kmax: maximum amount of cluster that might be allocated. The argument is considered as a stop
+            condition. When the maximum amount is reached then algorithm stops processing. By default the maximum
+            amount of clusters is not restricted (`k_max` is -1).
+@param[in] p_random_state: seed for random state (`RANDOM_STATE_CURRENT_TIME`, i.e. -1, means that current system time is used).
+
+@return  Returns result of clustering - array of allocated clusters.
+
+*/
 extern "C" DECLARATION pyclustering_package * gmeans_algorithm(const pyclustering_package * const p_sample, 
                                                                const std::size_t p_amount, 
                                                                const double p_tolerance,
                                                                const std::size_t p_repeat,
-                                                               const long long p_kmax);
+                                                               const long long p_kmax,
+                                                               const long long p_random_state);
diff --git a/ccore/src/cluster/gmeans.cpp b/ccore/src/cluster/gmeans.cpp
@@ -47,22 +47,23 @@ namespace pyclustering {
 namespace clst {
 
 
-const long long          gmeans::IGNORE_KMAX            = -1;
+const long long          gmeans::IGNORE_KMAX                = -1;
 
-const std::size_t        gmeans::DEFAULT_AMOUNT_CENTERS = 1;
+const std::size_t        gmeans::DEFAULT_AMOUNT_CENTERS     = 1;
 
-const double             gmeans::DEFAULT_TOLERANCE      = 0.001;
+const double             gmeans::DEFAULT_TOLERANCE          = 0.001;
 
-const std::size_t        gmeans::DEFAULT_REPEAT         = 3;
+const std::size_t        gmeans::DEFAULT_REPEAT             = 3;
 
-const std::size_t        gmeans::DEFAULT_CANDIDATES     = 3;
+const std::size_t        gmeans::DEFAULT_CANDIDATES         = 3;
 
 
-gmeans::gmeans(const std::size_t p_k_initial, const double p_tolerance, const std::size_t p_repeat, const long long p_kmax) :
+gmeans::gmeans(const std::size_t p_k_initial, const double p_tolerance, const std::size_t p_repeat, const long long p_kmax, const long long p_random_state) :
     m_amount(p_k_initial),
     m_tolerance(p_tolerance),
     m_repeat(p_repeat),
     m_kmax(p_kmax),
+    m_random_state(p_random_state),
     m_ptr_result(nullptr),
     m_ptr_data(nullptr)
 { }
@@ -107,7 +108,7 @@ void gmeans::search_optimal_parameters(const dataset & p_data, const std::size_t
 
     for (std::size_t i = 0; i < m_repeat; i++) {
         dataset initial_centers;
-        kmeans_plus_plus(p_amount, get_amount_candidates(p_data)).initialize(p_data, initial_centers);
+        kmeans_plus_plus(p_amount, get_amount_candidates(p_data), m_random_state).initialize(p_data, initial_centers);
 
         kmeans_data result;
         kmeans(initial_centers, m_tolerance).process(p_data, result);
@@ -130,16 +131,18 @@ void gmeans::search_optimal_parameters(const dataset & p_data, const std::size_t
 
 void gmeans::statistical_optimization() {
     dataset centers;
+    long long potential_amount_clusters = static_cast<long long>(m_ptr_result->clusters().size());
     for (std::size_t i = 0; i < m_ptr_result->clusters().size(); i++) {
         dataset new_centers;
         split_and_search_optimal(m_ptr_result->clusters().at(i), new_centers);
 
-        if (new_centers.empty()) {
+        if (new_centers.empty() || ((m_kmax != IGNORE_KMAX) && (potential_amount_clusters >= m_kmax))) {
             centers.push_back(std::move(m_ptr_result->centers().at(i)));
         }
         else {
             centers.push_back(std::move(new_centers[0]));
             centers.push_back(std::move(new_centers[1]));
+            potential_amount_clusters++;
         }
     }
 

diff --git a/ccore/src/cluster/kmeans_plus_plus.cpp b/ccore/src/cluster/kmeans_plus_plus.cpp
@@ -28,7 +28,6 @@
 #include <exception>
 #include <limits>
 #include <numeric>
-#include <random>
 #include <string>
 
 
@@ -41,20 +40,38 @@ const std::size_t kmeans_plus_plus::FARTHEST_CENTER_CANDIDATE = std::numeric_lim
 const std::size_t kmeans_plus_plus::INVALID_INDEX = std::numeric_limits<std::size_t>::max();
 
 
-kmeans_plus_plus::kmeans_plus_plus(const std::size_t p_amount, const std::size_t p_candidates) noexcept :
+kmeans_plus_plus::kmeans_plus_plus(const std::size_t p_amount, const std::size_t p_candidates, const long long p_random_state) noexcept :
         m_amount(p_amount),
         m_candidates(p_candidates),
         m_dist_func([](const point &p1, const point &p2) {
             return euclidean_distance_square(p1, p2);
-        })
-{ }
+        }),
+        m_random_state(p_random_state),
+        m_generator(std::random_device()())
+{
+    initialize_random_generator();
+}
 
 
-kmeans_plus_plus::kmeans_plus_plus(const std::size_t p_amount, const std::size_t p_candidates, const metric & p_metric) noexcept :
+kmeans_plus_plus::kmeans_plus_plus(const std::size_t p_amount, const std::size_t p_candidates, const metric & p_metric, const long long p_random_state) noexcept :
         m_amount(p_amount),
         m_candidates(p_candidates),
-        m_dist_func(p_metric)
-{ }
+        m_dist_func(p_metric),
+        m_random_state(p_random_state),
+        m_generator(std::random_device()())
+{
+    initialize_random_generator();
+}
+
+
+void kmeans_plus_plus::initialize_random_generator() {
+    if (m_random_state == RANDOM_STATE_CURRENT_TIME) {
+        m_generator.seed(static_cast<unsigned int>(std::chrono::system_clock::now().time_since_epoch().count()));
+    }
+    else {
+        m_generator.seed(static_cast<unsigned int>(m_random_state));
+    }
+}
 
 
 void kmeans_plus_plus::initialize(const dataset & p_data, dataset & p_centers) const {
@@ -151,14 +168,9 @@ void kmeans_plus_plus::free_temporal_params() const {
 kmeans_plus_plus::center_description kmeans_plus_plus::get_first_center() const {
     std::size_t length = m_indexes_ptr->empty() ? m_data_ptr->size() : m_indexes_ptr->size();
 
-    std::random_device random_device;
-
-    std::mt19937 generator(random_device());
-    generator.seed(static_cast<unsigned int>(std::chrono::system_clock::now().time_since_epoch().count()));
-
     std::uniform_int_distribution<std::size_t> distribution(0, length - 1);
 
-    std::size_t index = distribution(generator);
+    std::size_t index = distribution(m_generator);
     const auto & center = m_indexes_ptr->empty() ? (*m_data_ptr)[index] : (*m_data_ptr)[ (*m_indexes_ptr)[index] ];
 
     return std::make_tuple(center, index);
@@ -238,15 +250,12 @@ void kmeans_plus_plus::calculate_probabilities(const std::vector<double> & p_dis
 
 
 std::size_t kmeans_plus_plus::get_probable_center(const std::vector<double> & p_distances, const std::vector<double> & p_probabilities) const {
-    std::default_random_engine generator;
-    generator.seed(static_cast<unsigned int>(std::chrono::system_clock::now().time_since_epoch().count()));
-
     std::uniform_real_distribution<double> distribution(0.0, 1.0);
 
     std::size_t best_index_candidate = 0;
     for (std::size_t i = 0; i < m_candidates; i++) {
         std::size_t current_index_candidate = kmeans_plus_plus::INVALID_INDEX;
-        double candidate_probability = distribution(generator);
+        double candidate_probability = distribution(m_generator);
         for (std::size_t j = 0; j < p_probabilities.size(); j++) {
             if (candidate_probability < p_probabilities[j]) {
                 current_index_candidate = j;

diff --git a/ccore/src/interface/gmeans_interface.cpp b/ccore/src/interface/gmeans_interface.cpp
@@ -29,12 +29,13 @@ pyclustering_package * gmeans_algorithm(const pyclustering_package * const p_sam
                                         const std::size_t p_amount, 
                                         const double p_tolerance,
                                         const std::size_t p_repeat,
-                                        const long long p_kmax)
+                                        const long long p_kmax,
+                                        const long long p_random_state)
 {
     pyclustering::dataset data;
     p_sample->extract(data);
 
-    pyclustering::clst::gmeans algorithm(p_amount, p_tolerance, p_repeat, p_kmax);
+    pyclustering::clst::gmeans algorithm(p_amount, p_tolerance, p_repeat, p_kmax, p_random_state);
 
     pyclustering::clst::gmeans_data output_result;
     algorithm.process(data, output_result);