Skip to content

Commit

Permalink
#602: G-Means: Setting maximum number of clusters like for X-Means.
Browse files Browse the repository at this point in the history
  • Loading branch information
annoviko committed May 28, 2020
1 parent dcb6dde commit ab1cf9c
Show file tree
Hide file tree
Showing 17 changed files with 328 additions and 82 deletions.
1 change: 1 addition & 0 deletions CHANGES
Expand Up @@ -5,6 +5,7 @@ CHANGE NOTES FOR 0.10.0 (STARTED Jan 24, 2020), (RELEASED: Dev -)
------------------------------------------------------------------------

GENERAL CHANGES:

- Introduced parameter `k_max` to G-Means algorithm to use it as an optional stop condition for the algorithm (Python: `pyclustering.cluster.gmeans`; C++: `pyclustering::clst::gmeans`).
See: https://github.com/annoviko/pyclustering/issues/602

Expand Down
18 changes: 11 additions & 7 deletions ccore/include/pyclustering/cluster/gmeans.hpp
Expand Up @@ -106,15 +106,15 @@ class gmeans {
using projection = std::vector<double>;

public:
const static long long IGNORE_KMAX; /**< Defines value that indicates to the G-Means algorithm to ignore K maximum value. */
const static long long IGNORE_KMAX; /**< Defines value that means to ignore K maximum value. */

const static std::size_t DEFAULT_AMOUNT_CENTERS; /**< Defaule value of amount of initial K - the value from that search is started. */
const static std::size_t DEFAULT_AMOUNT_CENTERS; /**< Default value of the initial amount of K - the value from which the search procedure is started. */

const static double DEFAULT_TOLERANCE; /**< Default value of the tolerance stop condition: if maximum value of change of centers of clusters is less than tolerance then algorithm stops processing. */
const static double DEFAULT_TOLERANCE; /**< Default value of the tolerance (stop condition): if the maximum value of cluster changes is less than tolerance then the algorithm stops processing. */

const static std::size_t DEFAULT_REPEAT; /**< Default value that defines how many times K-Means should be run to improve parameters. */
const static std::size_t DEFAULT_REPEAT; /**< Default value that defines how many times K-Means should be run to improve parameters. */

const static std::size_t DEFAULT_CANDIDATES; /**< Default value of amount of candidates to consider by K-Means++ to initialize initial centers for K-Means on each iteration. */
const static std::size_t DEFAULT_CANDIDATES; /**< Default value of amount of candidates to consider by K-Means++ to initialize initial centers for K-Means on each iteration. */

private:
std::size_t m_amount = DEFAULT_AMOUNT_CENTERS;
Expand All @@ -125,6 +125,8 @@ class gmeans {

long long m_kmax = IGNORE_KMAX;

long long m_random_state = RANDOM_STATE_CURRENT_TIME;

gmeans_data * m_ptr_result = nullptr; /* temporary pointer to output result */

const dataset * m_ptr_data = nullptr; /* used only during processing */
Expand All @@ -149,13 +151,15 @@ class gmeans {
with larger 'repeat' values suggesting higher probability of finding global optimum.
@param[in] p_kmax: maximum amount of clusters that might be allocated. The argument is considered as a stop
condition. When the maximum amount is reached then algorithm stops processing. By default the maximum
amount of clusters is not restricted (`k_max` is -1).
amount of clusters is not restricted (`k_max` is `IGNORE_KMAX`).
@param[in] p_random_state: seed for random state (by default is `RANDOM_STATE_CURRENT_TIME`, current system time is used).
*/
gmeans(const std::size_t p_k_initial,
const double p_tolerance = DEFAULT_TOLERANCE,
const std::size_t p_repeat = DEFAULT_REPEAT,
const long long p_kmax = IGNORE_KMAX);
const long long p_kmax = IGNORE_KMAX,
const long long p_random_state = RANDOM_STATE_CURRENT_TIME);

/*!
Expand Down
22 changes: 17 additions & 5 deletions ccore/include/pyclustering/cluster/kmeans_plus_plus.hpp
Expand Up @@ -25,6 +25,7 @@
#pragma once


#include <random>
#include <unordered_set>

#include <pyclustering/definitions.hpp>
Expand Down Expand Up @@ -80,9 +81,11 @@ class kmeans_plus_plus : public center_initializer {
using store_result = std::function<void(center_description &)>;

private:
std::size_t m_amount = 0;
std::size_t m_candidates = 0;
metric m_dist_func;
std::size_t m_amount = 0;
std::size_t m_candidates = 0;
metric m_dist_func;
long long m_random_state = RANDOM_STATE_CURRENT_TIME;
mutable std::mt19937 m_generator;

/* temporal members that are used only during initialization */
mutable dataset const * m_data_ptr = nullptr;
Expand All @@ -107,11 +110,12 @@ class kmeans_plus_plus : public center_initializer {
* @param[in] p_candidates: amount of candidates that are considered to find the best center, if
* the farthest candidate is required (with highest probability) then the static constant
* FARTHEST_CENTER_CANDIDATE can be specified.
* @param[in] p_random_state: seed for random state (by default is `RANDOM_STATE_CURRENT_TIME`, current system time is used).
*
* @see FARTHEST_CENTER_CANDIDATE
*
*/
kmeans_plus_plus(const std::size_t p_amount, const std::size_t p_candidates = 1) noexcept;
kmeans_plus_plus(const std::size_t p_amount, const std::size_t p_candidates = 1, const long long p_random_state = RANDOM_STATE_CURRENT_TIME) noexcept;

/**
*
Expand All @@ -123,11 +127,12 @@ class kmeans_plus_plus : public center_initializer {
* the farthest candidate is required (with highest probability) then the static constant
* FARTHEST_CENTER_CANDIDATE can be specified.
* @param[in] p_metric: metric for distance calculation between points.
* @param[in] p_random_state: seed for random state (by default is `RANDOM_STATE_CURRENT_TIME`, current system time is used).
*
* @see FARTHEST_CENTER_CANDIDATE
*
*/
kmeans_plus_plus(const std::size_t p_amount, const std::size_t p_candidates, const metric & p_metric) noexcept;
kmeans_plus_plus(const std::size_t p_amount, const std::size_t p_candidates, const metric & p_metric, const long long p_random_state = RANDOM_STATE_CURRENT_TIME) noexcept;

/**
*
Expand Down Expand Up @@ -185,6 +190,13 @@ class kmeans_plus_plus : public center_initializer {
void initialize(const dataset & p_data, index_sequence & p_center_indexes) const;

private:
/**
*
* @brief Assigns seed to the random generator that is used by the algorithm.
*
*/
void initialize_random_generator();

/**
*
* @brief Performs center initialization process in line with the algorithm configuration.
Expand Down
3 changes: 3 additions & 0 deletions ccore/include/pyclustering/definitions.hpp
Expand Up @@ -49,6 +49,9 @@
namespace pyclustering {


constexpr long long RANDOM_STATE_CURRENT_TIME = -1; /**< Defines value of the random state that indicates to the algorithm to use current system time as a seed for random functionality. */


/*!
@brief Defines a pattern that consists of features that describe this pattern.
Expand Down
38 changes: 20 additions & 18 deletions ccore/include/pyclustering/interface/gmeans_interface.h
Expand Up @@ -42,25 +42,27 @@ enum gmeans_package_indexer {
};


/**
*
* @brief Clustering algorithm G-Means returns allocated clusters.
* @details Caller should destroy returned result in 'pyclustering_package'.
*
* @param[in] p_sample: input data for clustering.
* @param[in] p_amount: initial amount of centers.
* @param[in] p_tolerance: stop condition - when changes of medians are less then tolerance value.
* @param[in] p_repeat: how many times K-Means should be run to improve parameters, with larger 'repeat'
* values suggesting higher probability of finding global optimum.
* @param[in] p_kmax: maximum amount of cluster that might be allocated. The argument is considered as a stop
condition. When the maximum amount is reached then algorithm stops processing. By default the maximum
amount of clusters is not restricted (`k_max` is -1).
*
* @return Returns result of clustering - array of allocated clusters.
*
*/
/*!
@brief Clustering algorithm G-Means returns allocated clusters.
@details Caller should destroy returned result in 'pyclustering_package'.
@param[in] p_sample: input data for clustering.
@param[in] p_amount: initial amount of centers.
@param[in] p_tolerance: stop condition - when changes of medians are less than the tolerance value.
@param[in] p_repeat: how many times K-Means should be run to improve parameters, with larger 'repeat'
values suggesting higher probability of finding global optimum.
@param[in] p_kmax: maximum amount of clusters that might be allocated. The argument is considered as a stop
condition. When the maximum amount is reached then algorithm stops processing. By default the maximum
amount of clusters is not restricted (`k_max` is -1).
@param[in] p_random_state: seed for random state (if it is `RANDOM_STATE_CURRENT_TIME`, i.e. -1, then current system time is used as a seed).
@return Returns result of clustering - array of allocated clusters.
*/
extern "C" DECLARATION pyclustering_package * gmeans_algorithm(const pyclustering_package * const p_sample,
const std::size_t p_amount,
const double p_tolerance,
const std::size_t p_repeat,
const long long p_kmax);
const long long p_kmax,
const long long p_random_state);
19 changes: 11 additions & 8 deletions ccore/src/cluster/gmeans.cpp
Expand Up @@ -47,22 +47,23 @@ namespace pyclustering {
namespace clst {


const long long gmeans::IGNORE_KMAX = -1;
const long long gmeans::IGNORE_KMAX = -1;

const std::size_t gmeans::DEFAULT_AMOUNT_CENTERS = 1;
const std::size_t gmeans::DEFAULT_AMOUNT_CENTERS = 1;

const double gmeans::DEFAULT_TOLERANCE = 0.001;
const double gmeans::DEFAULT_TOLERANCE = 0.001;

const std::size_t gmeans::DEFAULT_REPEAT = 3;
const std::size_t gmeans::DEFAULT_REPEAT = 3;

const std::size_t gmeans::DEFAULT_CANDIDATES = 3;
const std::size_t gmeans::DEFAULT_CANDIDATES = 3;


gmeans::gmeans(const std::size_t p_k_initial, const double p_tolerance, const std::size_t p_repeat, const long long p_kmax) :
gmeans::gmeans(const std::size_t p_k_initial, const double p_tolerance, const std::size_t p_repeat, const long long p_kmax, const long long p_random_state) :
m_amount(p_k_initial),
m_tolerance(p_tolerance),
m_repeat(p_repeat),
m_kmax(p_kmax),
m_random_state(p_random_state),
m_ptr_result(nullptr),
m_ptr_data(nullptr)
{ }
Expand Down Expand Up @@ -107,7 +108,7 @@ void gmeans::search_optimal_parameters(const dataset & p_data, const std::size_t

for (std::size_t i = 0; i < m_repeat; i++) {
dataset initial_centers;
kmeans_plus_plus(p_amount, get_amount_candidates(p_data)).initialize(p_data, initial_centers);
kmeans_plus_plus(p_amount, get_amount_candidates(p_data), m_random_state).initialize(p_data, initial_centers);

kmeans_data result;
kmeans(initial_centers, m_tolerance).process(p_data, result);
Expand All @@ -130,16 +131,18 @@ void gmeans::search_optimal_parameters(const dataset & p_data, const std::size_t

void gmeans::statistical_optimization() {
dataset centers;
long long potential_amount_clusters = static_cast<long long>(m_ptr_result->clusters().size());
for (std::size_t i = 0; i < m_ptr_result->clusters().size(); i++) {
dataset new_centers;
split_and_search_optimal(m_ptr_result->clusters().at(i), new_centers);

if (new_centers.empty()) {
if (new_centers.empty() || ((m_kmax != IGNORE_KMAX) && (potential_amount_clusters >= m_kmax))) {
centers.push_back(std::move(m_ptr_result->centers().at(i)));
}
else {
centers.push_back(std::move(new_centers[0]));
centers.push_back(std::move(new_centers[1]));
potential_amount_clusters++;
}
}

Expand Down
43 changes: 26 additions & 17 deletions ccore/src/cluster/kmeans_plus_plus.cpp
Expand Up @@ -28,7 +28,6 @@
#include <exception>
#include <limits>
#include <numeric>
#include <random>
#include <string>


Expand All @@ -41,20 +40,38 @@ const std::size_t kmeans_plus_plus::FARTHEST_CENTER_CANDIDATE = std::numeric_lim
const std::size_t kmeans_plus_plus::INVALID_INDEX = std::numeric_limits<std::size_t>::max();


kmeans_plus_plus::kmeans_plus_plus(const std::size_t p_amount, const std::size_t p_candidates) noexcept :
kmeans_plus_plus::kmeans_plus_plus(const std::size_t p_amount, const std::size_t p_candidates, const long long p_random_state) noexcept :
m_amount(p_amount),
m_candidates(p_candidates),
m_dist_func([](const point &p1, const point &p2) {
return euclidean_distance_square(p1, p2);
})
{ }
}),
m_random_state(p_random_state),
m_generator(std::random_device()())
{
initialize_random_generator();
}


kmeans_plus_plus::kmeans_plus_plus(const std::size_t p_amount, const std::size_t p_candidates, const metric & p_metric) noexcept :
kmeans_plus_plus::kmeans_plus_plus(const std::size_t p_amount, const std::size_t p_candidates, const metric & p_metric, const long long p_random_state) noexcept :
m_amount(p_amount),
m_candidates(p_candidates),
m_dist_func(p_metric)
{ }
m_dist_func(p_metric),
m_random_state(p_random_state),
m_generator(std::random_device()())
{
initialize_random_generator();
}


/*!
@brief Seeds the internal random generator in line with the configured random state.
@details When the random state is equal to `RANDOM_STATE_CURRENT_TIME`, the seed is taken from the
          current value of the system clock; otherwise the random state itself is used as the seed.
*/
void kmeans_plus_plus::initialize_random_generator() {
    /* pick the seed first, then apply it to the generator in a single place */
    const unsigned int seed_value = (m_random_state == RANDOM_STATE_CURRENT_TIME)
        ? static_cast<unsigned int>(std::chrono::system_clock::now().time_since_epoch().count())
        : static_cast<unsigned int>(m_random_state);

    m_generator.seed(seed_value);
}


void kmeans_plus_plus::initialize(const dataset & p_data, dataset & p_centers) const {
Expand Down Expand Up @@ -151,14 +168,9 @@ void kmeans_plus_plus::free_temporal_params() const {
kmeans_plus_plus::center_description kmeans_plus_plus::get_first_center() const {
std::size_t length = m_indexes_ptr->empty() ? m_data_ptr->size() : m_indexes_ptr->size();

std::random_device random_device;

std::mt19937 generator(random_device());
generator.seed(static_cast<unsigned int>(std::chrono::system_clock::now().time_since_epoch().count()));

std::uniform_int_distribution<std::size_t> distribution(0, length - 1);

std::size_t index = distribution(generator);
std::size_t index = distribution(m_generator);
const auto & center = m_indexes_ptr->empty() ? (*m_data_ptr)[index] : (*m_data_ptr)[ (*m_indexes_ptr)[index] ];

return std::make_tuple(center, index);
Expand Down Expand Up @@ -238,15 +250,12 @@ void kmeans_plus_plus::calculate_probabilities(const std::vector<double> & p_dis


std::size_t kmeans_plus_plus::get_probable_center(const std::vector<double> & p_distances, const std::vector<double> & p_probabilities) const {
std::default_random_engine generator;
generator.seed(static_cast<unsigned int>(std::chrono::system_clock::now().time_since_epoch().count()));

std::uniform_real_distribution<double> distribution(0.0, 1.0);

std::size_t best_index_candidate = 0;
for (std::size_t i = 0; i < m_candidates; i++) {
std::size_t current_index_candidate = kmeans_plus_plus::INVALID_INDEX;
double candidate_probability = distribution(generator);
double candidate_probability = distribution(m_generator);
for (std::size_t j = 0; j < p_probabilities.size(); j++) {
if (candidate_probability < p_probabilities[j]) {
current_index_candidate = j;
Expand Down
5 changes: 3 additions & 2 deletions ccore/src/interface/gmeans_interface.cpp
Expand Up @@ -29,12 +29,13 @@ pyclustering_package * gmeans_algorithm(const pyclustering_package * const p_sam
const std::size_t p_amount,
const double p_tolerance,
const std::size_t p_repeat,
const long long p_kmax)
const long long p_kmax,
const long long p_random_state)
{
pyclustering::dataset data;
p_sample->extract(data);

pyclustering::clst::gmeans algorithm(p_amount, p_tolerance, p_repeat, p_kmax);
pyclustering::clst::gmeans algorithm(p_amount, p_tolerance, p_repeat, p_kmax, p_random_state);

pyclustering::clst::gmeans_data output_result;
algorithm.process(data, output_result);
Expand Down

0 comments on commit ab1cf9c

Please sign in to comment.