#667: PAM BUILD algorithm implementation (C++ version).

annoviko · Feb 9, 2021 · c7e56a6 · c7e56a6
1 parent feb5ce7
commit c7e56a6
Show file tree

Hide file tree

Showing 37 changed files with 812 additions and 86 deletions.
diff --git a/ccore/include/pyclustering/cluster/data_type.hpp b/ccore/include/pyclustering/cluster/data_type.hpp
@@ -0,0 +1,20 @@
+#pragma once
+
+
+namespace pyclustering {
+
+namespace clst {
+
+/*!
+
+@brief    Defines data representation (points, distance matrix) that is used for processing.
+
+*/
+enum class data_t {
+    POINTS,             /**< Data is represented by a container of points. */
+    DISTANCE_MATRIX     /**< Data is represented by a distance matrix between points. */
+};
+
+}
+
+}
diff --git a/ccore/include/pyclustering/cluster/dbscan.hpp b/ccore/include/pyclustering/cluster/dbscan.hpp
@@ -14,6 +14,7 @@
 
 #include <pyclustering/container/kdtree_balanced.hpp>
 
+#include <pyclustering/cluster/data_type.hpp>
 #include <pyclustering/cluster/dbscan_data.hpp>
 
 
@@ -22,17 +23,6 @@ namespace pyclustering {
 namespace clst {
 
 
-/*!
-
-@brief Defines types that are used for input data representation.
-
-*/
-enum class dbscan_data_t {
-    POINTS,             /**< Data is represented by a container of points. */
-    DISTANCE_MATRIX     /**< Data is represented by a distance matrix between points. */
-};
-
-
 /*!
 
 @class    dbscan dbscan.hpp pyclustering/cluster/dbscan.hpp
@@ -57,7 +47,7 @@ class dbscan {
 
     size_t                     m_neighbors       = 0;
 
-    dbscan_data_t              m_type            = dbscan_data_t::POINTS;
+    data_t                     m_type            = data_t::POINTS;
 
     container::kdtree_balanced m_kdtree = container::kdtree_balanced();
 
@@ -108,7 +98,7 @@ class dbscan {
     @param[out] p_result: clustering result of an input data.
     
     */
-    void process(const dataset & p_data, const dbscan_data_t p_type, dbscan_data & p_result);
+    void process(const dataset & p_data, const data_t p_type, dbscan_data & p_result);
 
 private:
     /*!

diff --git a/ccore/include/pyclustering/cluster/kmeans_plus_plus.hpp b/ccore/include/pyclustering/cluster/kmeans_plus_plus.hpp
@@ -55,7 +55,7 @@ class kmeans_plus_plus : public center_initializer {
      * @brief Metric that is used for distance calculation between two points.
      *
      */
-    using metric = distance_functor< std::vector<double> >;
+    using metric = distance_functor<std::vector<double>>;
 
 private:
     using index_set = std::unordered_set<std::size_t>;

diff --git a/ccore/include/pyclustering/cluster/kmedoids.hpp b/ccore/include/pyclustering/cluster/kmedoids.hpp
@@ -11,6 +11,7 @@
 
 #include <memory>
 
+#include <pyclustering/cluster/data_type.hpp>
 #include <pyclustering/cluster/kmedoids_data.hpp>
 
 #include <pyclustering/utils/metric.hpp>
@@ -24,17 +25,6 @@ namespace pyclustering {
 namespace clst {
 
 
-/*!
-
-@brief    Defines data representation (point, distance matrix) that is used for processing by K-Medoids algorithm.
-
-*/
-enum class kmedoids_data_t {
-    POINTS,
-    DISTANCE_MATRIX
-};
-
-
 /*!
 
 @brief    Represents K-Medoids clustering algorithm (PAM algorithm) for cluster analysis.
@@ -145,7 +135,7 @@ class kmedoids {
     @param[out] p_result: clustering result of an input data.
     
     */
-    void process(const dataset & p_data, const kmedoids_data_t p_type, kmedoids_data & p_result);
+    void process(const dataset & p_data, const data_t p_type, kmedoids_data & p_result);
 
 private:
     /*!
@@ -164,7 +154,7 @@ class kmedoids {
     @return   Distance calculator.
 
     */
-    distance_calculator create_distance_calculator(const kmedoids_data_t p_type);
+    distance_calculator create_distance_calculator(const data_t p_type);
 
     /*!
     

diff --git a/ccore/include/pyclustering/cluster/optics.hpp b/ccore/include/pyclustering/cluster/optics.hpp
@@ -15,6 +15,7 @@
 
 #include <pyclustering/container/kdtree_balanced.hpp>
 
+#include <pyclustering/cluster/data_type.hpp>
 #include <pyclustering/cluster/optics_data.hpp>
 #include <pyclustering/cluster/optics_descriptor.hpp>
 
@@ -24,17 +25,6 @@ namespace pyclustering {
 namespace clst {
 
 
-/*!
-
-@brief Enumeration of input data type that are processed by OPTICS algorithm.
-
-*/
-enum class optics_data_t {
-    POINTS,             /**< Data is represented by a container of points. */
-    DISTANCE_MATRIX     /**< Data is represented by a distance matrix between points. */
-};
-
-
 /*!
 
 @brief Class represents clustering algorithm OPTICS (Ordering Points To Identify Clustering Structure).
@@ -83,7 +73,7 @@ class optics {
 
     std::size_t         m_amount_clusters   = 0;
 
-    optics_data_t       m_type              = optics_data_t::POINTS;
+    data_t              m_type              = data_t::POINTS;
 
     container::kdtree_balanced      m_kdtree            = container::kdtree_balanced();
 
@@ -166,7 +156,7 @@ class optics {
                  cluster-ordering, noise and proper connectivity radius).
 
     */
-    void process(const dataset & p_data, const optics_data_t p_type, optics_data & p_result);
+    void process(const dataset & p_data, const data_t p_type, optics_data & p_result);
 
 private:
     void initialize();

diff --git a/ccore/include/pyclustering/cluster/pam_build.hpp b/ccore/include/pyclustering/cluster/pam_build.hpp
@@ -0,0 +1,157 @@
+/*!
+
+@authors Andrei Novikov (pyclustering@yandex.ru)
+@date 2014-2020
+@copyright BSD-3-Clause
+
+*/
+
+
+#pragma once
+
+
+#include <functional>
+#include <vector>
+
+#include <pyclustering/cluster/data_type.hpp>
+
+#include <pyclustering/utils/metric.hpp>
+
+
+using namespace pyclustering::utils::metric;
+
+
+namespace pyclustering {
+
+namespace clst {
+
+
+using medoids = std::vector<std::size_t>;
+
+
+/*!
+
+@brief PAM BUILD algorithm is designed to find optimal initial medoids for K-Medoids algorithms family, like PAM.
+
+@details The initialization procedure chooses `k` times the point which yields the smallest distance sum of total
+          deviation. Complexity of the algorithm is \f$O\left ( n^{2}k \right )\f$, where `n` is the amount
+          of points and `k` is the amount of initial medoids to generate.
+
+Implementation based on paper @cite inproceedings::cluster::kmedoids::1.
+
+There is an example where PAM BUILD algorithm is used to generate two initial medoids for a small two-dimensional sample.
+@code
+    // Input data to find initial medoids.
+    dataset data = { {3.522979, 5.487981}, {3.768699, 5.364477}, {3.423602, 5.4199}, {3.803905, 5.389491}, {3.93669, 5.663041}, {6.968136, 7.755556}, {6.750795, 7.269541}, {6.593196, 7.850364}, {6.978178, 7.60985}, {6.554487, 7.498119} };
+
+    // Amount of medoids to initialize.
+    std::size_t amount_medoids = 2;
+
+    // Initialize medoids using PAM BUILD algorithm.
+    medoids initial_medoids;
+    pam_build(amount_medoids).initialize(data, initial_medoids);
+
+    // Display initial medoids.
+    std::cout << "Initial medoids: [ ";
+    for (auto medoid : initial_medoids) {
+        std::cout << medoid << " ";
+    }
+    std::cout << "]" << std::endl;
+@endcode
+
+The output of the code above:
+@code
+    Initial medoids: [ 4 8 ]
+@endcode
+
+PAM BUILD algorithm provides much more optimal initial medoids on average than the kmeans++ based approach and as a
+result it reduces the amount of iterations that is needed for K-Medoids algorithms family to extract clusters. There is
+an illustration where initial medoids are obtained for various samples (initial medoids are marked by blue stars).
+
+@image html pam_build_initial_medoids.png "Fig. 1. Initial medoids for various samples generated by PAM BUILD algorithm."
+
+*/
+class pam_build {
+private:
+    using metric = distance_functor<std::vector<double>>;
+
+    using distance_calculator = std::function<double(const std::size_t, const std::size_t)>;
+
+private:
+    std::size_t                     m_amount = 0;
+
+    distance_metric<point>          m_metric = distance_metric_factory<point>::euclidean_square();
+    mutable distance_calculator     m_calculator;
+
+    mutable std::vector<double>     m_distance_closest_medoid;
+    mutable medoids *               m_medoids_ptr   = nullptr;  /* NOTE(review): presumably non-owning, set during initialize() - confirm */
+    mutable dataset const *         m_data_ptr      = nullptr;  /* NOTE(review): presumably non-owning, set during initialize() - confirm */
+
+public:
+    /*!
+
+    @brief Default constructor to create PAM BUILD algorithm.
+
+    */
+    pam_build() = default;
+
+    /*!
+
+    @brief    Constructor of PAM BUILD algorithm.
+
+    @param[in] p_amount: amount of medoids that should be initialized.
+
+    */
+    pam_build(const std::size_t p_amount);
+
+    /*!
+
+    @brief    Constructor of PAM BUILD algorithm.
+
+    @param[in] p_amount: amount of medoids that should be initialized.
+    @param[in] p_metric: metric for distance calculation between points.
+
+    */
+    pam_build(const std::size_t p_amount, const metric & p_metric);
+
+    /*!
+
+    @brief    Destructor of PAM BUILD algorithm.
+
+    */
+    ~pam_build() = default;
+
+public:
+    /*!
+
+    @brief    Performs medoid initialization in line with the algorithm configuration.
+
+    @param[in]  p_data: data points for which medoids are calculated.
+    @param[out] p_medoids: initialized medoids for the specified data (output container, therefore non-const).
+
+    */
+    void initialize(const dataset & p_data, medoids & p_medoids) const;
+
+    /*!
+
+    @brief    Performs medoid initialization in line with the algorithm configuration.
+
+    @param[in]  p_data: data for which medoids are calculated.
+    @param[in]  p_type: data type of `p_data` that is processed by this method (`POINTS`, `DISTANCE_MATRIX`).
+    @param[out] p_medoids: initialized medoids for the specified data (output container, therefore non-const).
+
+    */
+    void initialize(const dataset & p_data, const data_t p_type, medoids & p_medoids) const;
+
+private:
+    void calculate_first_medoid() const;
+
+    void calculate_next_medoids() const;
+
+    pam_build::distance_calculator create_distance_calculator(const data_t p_type) const;
+};
+
+
+}
+
+}
diff --git a/ccore/include/pyclustering/cluster/silhouette.hpp b/ccore/include/pyclustering/cluster/silhouette.hpp
@@ -10,6 +10,7 @@
 
 
 #include <pyclustering/cluster/cluster_data.hpp>
+#include <pyclustering/cluster/data_type.hpp>
 #include <pyclustering/cluster/silhouette_data.hpp>
 
 #include <pyclustering/definitions.hpp>
@@ -25,17 +26,6 @@ namespace pyclustering {
 namespace clst {
 
 
-/*!
-
-@brief Defines types that are used for input data representation.
-
-*/
-enum class silhouette_data_t {
-    POINTS,
-    DISTANCE_MATRIX
-};
-
-
 /*!
 
 @class  silhouette silhouette.hpp pyclustering/cluster/silhouette.hpp
@@ -111,7 +101,7 @@ class silhouette {
     const cluster_sequence *  m_clusters  = nullptr;  /* temporary object, exists during processing */
     silhouette_data *         m_result    = nullptr;  /* temporary object, exists during processing */
 
-    silhouette_data_t         m_type      = silhouette_data_t::POINTS;
+    data_t                    m_type      = data_t::POINTS;
 
     distance_metric<point>    m_metric    = distance_metric_factory<point>::euclidean_square();
 
@@ -176,7 +166,7 @@ class silhouette {
     @param[out] p_result: silhouette input data processing result.
 
     */
-    void process(const dataset & p_data, const cluster_sequence & p_clusters, const silhouette_data_t & p_type, silhouette_data & p_result);
+    void process(const dataset & p_data, const cluster_sequence & p_clusters, const data_t & p_type, silhouette_data & p_result);
 
 private:
     double calculate_score(const std::size_t p_index_point, const std::size_t p_index_cluster) const;