Skip to content

Commit

Permalink
#667: PAM BUILD algorithm implementation (C++ version).
Browse files Browse the repository at this point in the history
  • Loading branch information
annoviko committed Feb 9, 2021
1 parent feb5ce7 commit c7e56a6
Show file tree
Hide file tree
Showing 37 changed files with 812 additions and 86 deletions.
20 changes: 20 additions & 0 deletions ccore/include/pyclustering/cluster/data_type.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#pragma once


namespace pyclustering {

namespace clst {

/*!

@brief   Kind of input data that is passed to an algorithm for processing.

*/
enum class data_t {
    POINTS,             /**< Data is represented by a container of points. */
    DISTANCE_MATRIX     /**< Data is represented by a distance matrix between points. */
};

}

}
16 changes: 3 additions & 13 deletions ccore/include/pyclustering/cluster/dbscan.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

#include <pyclustering/container/kdtree_balanced.hpp>

#include <pyclustering/cluster/data_type.hpp>
#include <pyclustering/cluster/dbscan_data.hpp>


Expand All @@ -22,17 +23,6 @@ namespace pyclustering {
namespace clst {


/*!

@brief   Enumerates the representations of input data accepted for processing.

*/
enum class dbscan_data_t {
    /*! Input is a container of points. */
    POINTS,

    /*! Input is a distance matrix between points. */
    DISTANCE_MATRIX
};


/*!
@class dbscan dbscan.hpp pyclustering/cluster/dbscan.hpp
Expand All @@ -57,7 +47,7 @@ class dbscan {

size_t m_neighbors = 0;

dbscan_data_t m_type = dbscan_data_t::POINTS;
data_t m_type = data_t::POINTS;

container::kdtree_balanced m_kdtree = container::kdtree_balanced();

Expand Down Expand Up @@ -108,7 +98,7 @@ class dbscan {
@param[out] p_result: clustering result of an input data.
*/
void process(const dataset & p_data, const dbscan_data_t p_type, dbscan_data & p_result);
void process(const dataset & p_data, const data_t p_type, dbscan_data & p_result);

private:
/*!
Expand Down
2 changes: 1 addition & 1 deletion ccore/include/pyclustering/cluster/kmeans_plus_plus.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ class kmeans_plus_plus : public center_initializer {
* @brief Metric that is used for distance calculation between two points.
*
*/
using metric = distance_functor< std::vector<double> >;
using metric = distance_functor<std::vector<double>>;

private:
using index_set = std::unordered_set<std::size_t>;
Expand Down
16 changes: 3 additions & 13 deletions ccore/include/pyclustering/cluster/kmedoids.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

#include <memory>

#include <pyclustering/cluster/data_type.hpp>
#include <pyclustering/cluster/kmedoids_data.hpp>

#include <pyclustering/utils/metric.hpp>
Expand All @@ -24,17 +25,6 @@ namespace pyclustering {
namespace clst {


/*!

@brief   Representation of the data (points or distance matrix) that the K-Medoids algorithm processes.

*/
enum class kmedoids_data_t {
    POINTS,             /**< Data is given as points. */
    DISTANCE_MATRIX     /**< Data is given as a distance matrix. */
};


/*!
@brief Represents K-Medoids clustering algorithm (PAM algorithm) for cluster analysis.
Expand Down Expand Up @@ -145,7 +135,7 @@ class kmedoids {
@param[out] p_result: clustering result of an input data.
*/
void process(const dataset & p_data, const kmedoids_data_t p_type, kmedoids_data & p_result);
void process(const dataset & p_data, const data_t p_type, kmedoids_data & p_result);

private:
/*!
Expand All @@ -164,7 +154,7 @@ class kmedoids {
@return Distance calculator.
*/
distance_calculator create_distance_calculator(const kmedoids_data_t p_type);
distance_calculator create_distance_calculator(const data_t p_type);

/*!
Expand Down
16 changes: 3 additions & 13 deletions ccore/include/pyclustering/cluster/optics.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

#include <pyclustering/container/kdtree_balanced.hpp>

#include <pyclustering/cluster/data_type.hpp>
#include <pyclustering/cluster/optics_data.hpp>
#include <pyclustering/cluster/optics_descriptor.hpp>

Expand All @@ -24,17 +25,6 @@ namespace pyclustering {
namespace clst {


/*!

@brief   Input data types that the OPTICS algorithm is able to process.

*/
enum class optics_data_t {
    /*! Input is a container of points. */
    POINTS,

    /*! Input is a distance matrix between points. */
    DISTANCE_MATRIX
};


/*!
@brief Class represents clustering algorithm OPTICS (Ordering Points To Identify Clustering Structure).
Expand Down Expand Up @@ -83,7 +73,7 @@ class optics {

std::size_t m_amount_clusters = 0;

optics_data_t m_type = optics_data_t::POINTS;
data_t m_type = data_t::POINTS;

container::kdtree_balanced m_kdtree = container::kdtree_balanced();

Expand Down Expand Up @@ -166,7 +156,7 @@ class optics {
cluster-ordering, noise and proper connectivity radius).
*/
void process(const dataset & p_data, const optics_data_t p_type, optics_data & p_result);
void process(const dataset & p_data, const data_t p_type, optics_data & p_result);

private:
void initialize();
Expand Down
157 changes: 157 additions & 0 deletions ccore/include/pyclustering/cluster/pam_build.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
/*!
@authors Andrei Novikov (pyclustering@yandex.ru)
@date 2014-2020
@copyright BSD-3-Clause
*/


#pragma once


#include <functional>
#include <vector>

#include <pyclustering/cluster/data_type.hpp>

#include <pyclustering/utils/metric.hpp>


using namespace pyclustering::utils::metric;


namespace pyclustering {

namespace clst {


using medoids = std::vector<std::size_t>;


/*!

@brief  PAM BUILD algorithm is designed to find optimal initial medoids for K-Medoids algorithms family, like PAM.
@details The initialization procedure chooses `k` times the point which yields the smallest distance sum of total
          deviation. Complexity of the algorithm is \f$O\left ( n^{2}k \right )\f$, where `n` is the amount
          of points and `k` is the amount of initial medoids to generate.

Implementation based on paper @cite inproceedings::cluster::kmedoids::1.

There is an example where PAM BUILD algorithm is used to generate two initial medoids for a small two-dimensional data set.
@code
    // Input data to find initial medoids.
    dataset data = { {3.522979, 5.487981}, {3.768699, 5.364477}, {3.423602, 5.4199}, {3.803905, 5.389491}, {3.93669, 5.663041}, {6.968136, 7.755556}, {6.750795, 7.269541}, {6.593196, 7.850364}, {6.978178, 7.60985}, {6.554487, 7.498119} };

    // Amount of medoids to initialize.
    std::size_t amount_medoids = 2;

    // Initialize medoids using PAM BUILD algorithm.
    medoids initial_medoids;
    pam_build(amount_medoids).initialize(data, initial_medoids);

    // Display initial medoids.
    std::cout << "Initial medoids: [ ";
    for (auto medoid : initial_medoids) {
        std::cout << medoid << " ";
    }
    std::cout << "]" << std::endl;
@endcode

The output of the code above:
@code
    Initial medoids: [ 4 8 ]
@endcode

PAM BUILD algorithm provides much more optimal initial medoids on average than kmeans++ based approach and as a
result it reduces amount of iterations that is needed for K-Medoids algorithms family to extract clusters. There is
an illustration where initial medoids are obtained for various samples (initial medoids are marked by blue stars).

@image html pam_build_initial_medoids.png "Fig. 1. Initial medoids for various samples generated by PAM BUILD algorithm."

*/
class pam_build {
private:
    /* Functor type accepted by the constructor for distance calculation between two points. */
    using metric = distance_functor<std::vector<double>>;

    /* Callable that takes two `std::size_t` arguments and returns a distance (presumably object indexes - confirm in the implementation). */
    using distance_calculator = std::function<double(const std::size_t, const std::size_t)>;

private:
    std::size_t m_amount = 0;   /* amount of medoids that should be initialized */

    distance_metric<point> m_metric = distance_metric_factory<point>::euclidean_square();   /* squared Euclidean distance by default */

    /* NOTE(review): the members below are `mutable` because `initialize()` is declared `const`; they look like
       per-call scratch state, so concurrent `initialize()` calls on one instance presumably are not safe - confirm. */
    mutable distance_calculator m_calculator;

    mutable std::vector<double> m_distance_closest_medoid;
    mutable medoids * m_medoids_ptr = nullptr;
    mutable dataset const * m_data_ptr = nullptr;

public:
    /*!

    @brief  Default constructor to create PAM BUILD algorithm.

    */
    pam_build() = default;

    /*!

    @brief  Constructor of PAM BUILD algorithm.
    @details Declared `explicit` so that an integer is never silently converted to an initializer object.

    @param[in] p_amount: amount of medoids that should be initialized.

    */
    explicit pam_build(const std::size_t p_amount);

    /*!

    @brief  Constructor of PAM BUILD algorithm.

    @param[in] p_amount: amount of medoids that should be initialized.
    @param[in] p_metric: metric for distance calculation between points.

    */
    pam_build(const std::size_t p_amount, const metric & p_metric);

    /*!

    @brief  Destructor of PAM BUILD algorithm.

    */
    ~pam_build() = default;

public:
    /*!

    @brief  Performs center initialization process in line algorithm configuration.

    @param[in]  p_data: data points for that medoids are calculated.
    @param[out] p_medoids: initialized medoids for the specified data.

    @note   NOTE(review): `p_medoids` is documented as an output but declared as a const reference; the
            signature is kept as is because the definition lives in another translation unit - confirm
            whether it should be `medoids &`.

    */
    void initialize(const dataset & p_data, const medoids & p_medoids) const;

    /*!

    @brief  Performs center initialization process in line algorithm configuration.

    @param[in]  p_data: data for that medoids are calculated.
    @param[in]  p_type: data type of `p_data` that is processed by this method (`POINTS`, `DISTANCE_MATRIX`).
    @param[out] p_medoids: initialized medoids for the specified data.

    @note   NOTE(review): `p_medoids` is documented as an output but declared as a const reference; the
            signature is kept as is because the definition lives in another translation unit - confirm
            whether it should be `medoids &`.

    */
    void initialize(const dataset & p_data, const data_t p_type, const medoids & p_medoids) const;

private:
    /* Presumably selects the first medoid of the BUILD procedure - see the implementation file. */
    void calculate_first_medoid() const;

    /* Presumably selects the remaining medoids after the first one - see the implementation file. */
    void calculate_next_medoids() const;

    /* Returns a distance calculator for the given data representation `p_type`. */
    distance_calculator create_distance_calculator(const data_t p_type) const;
};


}

}
16 changes: 3 additions & 13 deletions ccore/include/pyclustering/cluster/silhouette.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@


#include <pyclustering/cluster/cluster_data.hpp>
#include <pyclustering/cluster/data_type.hpp>
#include <pyclustering/cluster/silhouette_data.hpp>

#include <pyclustering/definitions.hpp>
Expand All @@ -25,17 +26,6 @@ namespace pyclustering {
namespace clst {


/*!

@brief   Enumerates the representations of input data accepted for processing.

*/
enum class silhouette_data_t {
    POINTS,             /**< Input data is represented by points. */
    DISTANCE_MATRIX     /**< Input data is represented by a distance matrix. */
};


/*!
@class silhouette silhouette.hpp pyclustering/cluster/silhouette.hpp
Expand Down Expand Up @@ -111,7 +101,7 @@ class silhouette {
const cluster_sequence * m_clusters = nullptr; /* temporary object, exists during processing */
silhouette_data * m_result = nullptr; /* temporary object, exists during processing */

silhouette_data_t m_type = silhouette_data_t::POINTS;
data_t m_type = data_t::POINTS;

distance_metric<point> m_metric = distance_metric_factory<point>::euclidean_square();

Expand Down Expand Up @@ -176,7 +166,7 @@ class silhouette {
@param[out] p_result: silhouette input data processing result.
*/
void process(const dataset & p_data, const cluster_sequence & p_clusters, const silhouette_data_t & p_type, silhouette_data & p_result);
void process(const dataset & p_data, const cluster_sequence & p_clusters, const data_t & p_type, silhouette_data & p_result);

private:
double calculate_score(const std::size_t p_index_point, const std::size_t p_index_cluster) const;
Expand Down
Loading

0 comments on commit c7e56a6

Please sign in to comment.