-
Notifications
You must be signed in to change notification settings - Fork 250
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
#667: PAM BUILD algorithm implementation (C++ version).
- Loading branch information
Showing
37 changed files
with
812 additions
and
86 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
#pragma once | ||
|
||
|
||
namespace pyclustering { | ||
|
||
namespace clst { | ||
|
||
/*! | ||
@brief Defines data representation (points, distance matrix) that is used for processing. | ||
*/ | ||
enum class data_t { | ||
POINTS, | ||
DISTANCE_MATRIX | ||
}; | ||
|
||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,157 @@ | ||
/*! | ||
@authors Andrei Novikov (pyclustering@yandex.ru) | ||
@date 2014-2020 | ||
@copyright BSD-3-Clause | ||
*/ | ||
|
||
|
||
#pragma once | ||
|
||
|
||
#include <functional> | ||
#include <vector> | ||
|
||
#include <pyclustering/cluster/data_type.hpp> | ||
|
||
#include <pyclustering/utils/metric.hpp> | ||
|
||
|
||
using namespace pyclustering::utils::metric; | ||
|
||
|
||
namespace pyclustering { | ||
|
||
namespace clst { | ||
|
||
|
||
using medoids = std::vector<std::size_t>; | ||
|
||
|
||
/* | ||
@brief PAM BUILD algorithm is designed to find optimal initial medoids for K-Medoids algorithms family, like PAM. | ||
@details The initialization procedure chooses `k` times the point which yields the smallest distance sum of total | ||
deviation. Complexity of the algorithm is \f$O\left ( n^{2}k \right )\f$, where `n` is the amount | ||
of points and `k` is the amount of initial medoids to generate. | ||
Implementation based on paper @cite inproceedings::cluster::kmedoids::1. | ||
There is an example where PAM BUILD algorithm is used to generate initial medoids for sample `Tetra` where four clusters are expected. | ||
@code | ||
// Input data to find initial medoids. | ||
dataset data = { {3.522979, 5.487981}, {3.768699, 5.364477}, {3.423602, 5.4199}, {3.803905, 5.389491}, {3.93669, 5.663041}, {6.968136, 7.755556}, {6.750795, 7.269541}, {6.593196, 7.850364}, {6.978178, 7.60985}, {6.554487, 7.498119} }; | ||
// Amount of medoids to initialize. | ||
std::size_t amount_medoids = 2; | ||
// Initialize medoids using PAM BUILD algorithm. | ||
medoids initial_medoids; | ||
pam_build(amount_medoids).initialize(data, initial_medoids); | ||
// Display initial medoids. | ||
std::cout << "Initial medoids: [ " << std::endl; | ||
for (auto medoid : initial_medoids) { | ||
std::cout << medoid << " "; | ||
} | ||
std::cout << "]" << std::endl; | ||
@endcode | ||
The output of the code above: | ||
@code | ||
Initial medoids: [ 4 8 ] | ||
@endcode | ||
PAM BUILD algorithm provides much more optimal initial medoids in average than kmeans++ based approach and as a | ||
result it reduces amount of iterations that is needed for K-Medoids algorithms family to extract clusters. There is | ||
an illustration where initial medoids are obtained for various samples (initial medoids are marked by blue starts). | ||
@image html pam_build_initial_medoids.png "Fig. 1. Initial medoids for various samples generated by PAM BUILD algorithm." | ||
*/ | ||
class pam_build { | ||
private: | ||
using metric = distance_functor<std::vector<double>>; | ||
|
||
using distance_calculator = std::function<double(const std::size_t, const std::size_t)>; | ||
|
||
private: | ||
std::size_t m_amount = 0; | ||
|
||
distance_metric<point> m_metric = distance_metric_factory<point>::euclidean_square(); | ||
mutable distance_calculator m_calculator; | ||
|
||
mutable std::vector<double> m_distance_closest_medoid; | ||
mutable medoids * m_medoids_ptr = nullptr; | ||
mutable dataset const * m_data_ptr = nullptr; | ||
|
||
public: | ||
/* | ||
@brief Default constructor to create PAM BUILD algorithm. | ||
*/ | ||
pam_build() = default; | ||
|
||
/* | ||
@brief Constructor of PAM BUILD algorithm. | ||
@param[in] p_amount: amount of medoids that should be initialized. | ||
*/ | ||
pam_build(const std::size_t p_amount); | ||
|
||
/* | ||
@brief Constructor of PAM BUILD algorithm. | ||
@param[in] p_amount: amount of medoids that should be initialized. | ||
@param[in] p_metric: metric for distance calculation between points. | ||
*/ | ||
pam_build(const std::size_t p_amount, const metric & p_metric); | ||
|
||
/* | ||
@brief Destructor of PAM BUILD algorithm. | ||
*/ | ||
~pam_build() = default; | ||
|
||
public: | ||
/* | ||
@brief Performs center initialization process in line algorithm configuration. | ||
@param[in] p_data: data points for that medoids are calculated. | ||
@param[out] p_medoids: initialized medoids for the specified data. | ||
*/ | ||
void initialize(const dataset & p_data, const medoids & p_medoids) const; | ||
|
||
/* | ||
@brief Performs center initialization process in line algorithm configuration. | ||
@param[in] p_data: data for that medoids are calculated. | ||
@param[in] p_type: data type of `p_data` that is processed by this method (`POINTS`, `DISTANCE_MATRIX`). | ||
@param[out] p_medoids: initialized medoids for the specified data. | ||
*/ | ||
void initialize(const dataset & p_data, const data_t p_type, const medoids & p_medoids) const; | ||
|
||
private: | ||
void calculate_first_medoid() const; | ||
|
||
void calculate_next_medoids() const; | ||
|
||
pam_build::distance_calculator create_distance_calculator(const data_t p_type) const; | ||
}; | ||
|
||
|
||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.