Skip to content

Commit

Permalink
#667: Additional C++ optimizations for PAM BUILD and PAM.
Browse files Browse the repository at this point in the history
  • Loading branch information
annoviko committed Feb 12, 2021
1 parent fe7f37a commit f573361
Show file tree
Hide file tree
Showing 9 changed files with 87 additions and 38 deletions.
8 changes: 5 additions & 3 deletions ccore/include/pyclustering/cluster/kmedoids.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,11 +150,13 @@ class kmedoids {
@brief Creates distance calculator in line with data type and distance metric.

@param[in] p_type: data type (points or distance matrix).

@return Distance calculator.

*/
distance_calculator create_distance_calculator(const data_t p_type);
void create_distance_calculator(const data_t p_type);

double calculate_distance_using_distance_matrix(const std::size_t p_index1, const std::size_t p_index2) const;

double calculate_distance_using_points(const std::size_t p_index1, const std::size_t p_index2) const;

/*!

Expand Down
6 changes: 5 additions & 1 deletion ccore/include/pyclustering/cluster/pam_build.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,11 @@ class pam_build {

void calculate_next_medoids() const;

pam_build::distance_calculator create_distance_calculator(const data_t p_type) const;
double calculate_distance_using_distance_matrix(const std::size_t p_index1, const std::size_t p_index2) const;

double calculate_distance_using_points(const std::size_t p_index1, const std::size_t p_index2) const;

void create_distance_calculator(const data_t p_type) const;
};


Expand Down
2 changes: 2 additions & 0 deletions ccore/include/pyclustering/interface/pyclustering_package.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ struct DECLARATION pyclustering_package {
throw std::invalid_argument("pyclustering_package::extract() [" + std::to_string(__LINE__) + "]: argument is not 'PYCLUSTERING_TYPE_LIST').");
}

container.reserve(size);
for (std::size_t i = 0; i < size; i++) {
std::vector<TypeValue> subcontainer = { };
extract(subcontainer, at<pyclustering_package *>(i));
Expand All @@ -167,6 +168,7 @@ struct DECLARATION pyclustering_package {
*/
template <class TypeValue>
void extract(std::vector<TypeValue> & container, const pyclustering_package * const package) const {
container.reserve(package->size);
for (std::size_t i = 0; i < package->size; i++) {
container.push_back(package->at<TypeValue>(i));
}
Expand Down
4 changes: 2 additions & 2 deletions ccore/include/pyclustering/utils/metric.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ double euclidean_distance_square(const TypeContainer & point1, const TypeContain
double distance = 0.0;
typename TypeContainer::const_iterator iter_point1 = point1.begin();

for (const auto & dim_point2 : point2) {
double difference = (*iter_point1 - dim_point2);
for (const auto dim_point2 : point2) {
const double difference = (*iter_point1 - dim_point2);
distance += difference * difference;

++iter_point1;
Expand Down
24 changes: 15 additions & 9 deletions ccore/src/cluster/kmedoids.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ void kmedoids::process(const dataset & p_data, kmedoids_data & p_result) {
void kmedoids::process(const dataset & p_data, const data_t p_type, kmedoids_data & p_result) {
m_data_ptr = &p_data;
m_result_ptr = (kmedoids_data *) &p_result;
m_calculator = create_distance_calculator(p_type);
create_distance_calculator(p_type);

medoid_sequence & medoids = m_result_ptr->medoids();
medoids.assign(m_initial_medoids.begin(), m_initial_medoids.end());
Expand Down Expand Up @@ -132,19 +132,25 @@ double kmedoids::update_clusters() {
}


kmedoids::distance_calculator kmedoids::create_distance_calculator(const data_t p_type) {
/*!

@brief Calculates distance between two elements of the processed data using the metric `m_metric`.

@param[in] p_index1: index of the first element in the data referenced by `m_data_ptr`.
@param[in] p_index2: index of the second element in the data referenced by `m_data_ptr`.

@return Value produced by `m_metric` for the two selected elements.

*/
double kmedoids::calculate_distance_using_points(const std::size_t p_index1, const std::size_t p_index2) const {
    const auto & first_point = (*m_data_ptr)[p_index1];
    const auto & second_point = (*m_data_ptr)[p_index2];

    return m_metric(first_point, second_point);
}


/*!

@brief Reads distance between two objects directly from the processed data, which is treated as a distance matrix.

@param[in] p_index1: row index in the data referenced by `m_data_ptr`.
@param[in] p_index2: column index inside the selected row.

@return Value stored at position [p_index1][p_index2] of the data.

*/
double kmedoids::calculate_distance_using_distance_matrix(const std::size_t p_index1, const std::size_t p_index2) const {
    const auto & matrix_row = (*m_data_ptr)[p_index1];

    return matrix_row[p_index2];
}


void kmedoids::create_distance_calculator(const data_t p_type) {
if (p_type == data_t::POINTS) {
return [this](const std::size_t index1, const std::size_t index2) {
return m_metric((*m_data_ptr)[index1], (*m_data_ptr)[index2]);
};
m_calculator = std::bind(&kmedoids::calculate_distance_using_points, this, std::placeholders::_1, std::placeholders::_2);
}
else if (p_type == data_t::DISTANCE_MATRIX) {
return [this](const std::size_t index1, const std::size_t index2) {
return (*m_data_ptr)[index1][index2];
};
m_calculator = std::bind(&kmedoids::calculate_distance_using_distance_matrix, this, std::placeholders::_1, std::placeholders::_2);
}
else {
throw std::invalid_argument("Unknown type data is specified (data type code: '" + std::to_string(static_cast<std::size_t>(p_type)) + "') .");
throw std::invalid_argument("Unknown type data is specified (type code: '" + std::to_string(static_cast<std::size_t>(p_type)) + "').");
}
}

Expand Down
40 changes: 23 additions & 17 deletions ccore/src/cluster/pam_build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ void pam_build::initialize(const dataset & p_data, const data_t p_type, const me
m_data_ptr = (dataset *) &p_data;
m_medoids_ptr = (medoids *) &p_medoids;

m_calculator = create_distance_calculator(p_type);
create_distance_calculator(p_type);
m_distance_closest_medoid = std::vector<double>(p_data.size(), 0.0);

calculate_first_medoid();
Expand Down Expand Up @@ -85,27 +85,27 @@ void pam_build::calculate_next_medoids() const {
std::vector<double> optimal_distances(m_data_ptr->size(), 0.0);
std::vector<double> current_distances(m_data_ptr->size(), 0.0);

std::unordered_set<std::size_t> non_available = { m_medoids_ptr->at(0) };
std::vector<bool> medoids(m_data_ptr->size(), false);
medoids[m_medoids_ptr->at(0)] = true;

while (m_medoids_ptr->size() < m_amount) {
std::size_t optimal_medoid = INVALID_MEDOID;
double optimal_deviation = std::numeric_limits<double>::max();

for (std::size_t i = 0; i < m_data_ptr->size(); i++) {
if (non_available.count(i) > 0) {
for (std::size_t i = 0; i < m_data_ptr->size(); ++i) {
if (medoids[i]) {
continue; /* already assigned as a medoid */
}

double total_deviation = 0.0;
for (std::size_t j = 0; j < m_data_ptr->size(); j++) {
if ((i == j) || (non_available.count(j) > 0)) {
for (std::size_t j = 0; j < m_data_ptr->size(); ++j) {
if ((i == j) || (medoids[j])) {
current_distances[j] = 0;
continue;
}

const double distance = std::min(m_calculator(i, j), m_distance_closest_medoid[j]);
total_deviation += distance;
current_distances[j] = distance;
current_distances[j] = std::min(m_calculator(i, j), m_distance_closest_medoid[j]);
total_deviation += current_distances[j];
}

if (total_deviation < optimal_deviation) {
Expand All @@ -120,22 +120,28 @@ void pam_build::calculate_next_medoids() const {
}

m_medoids_ptr->push_back(optimal_medoid);
non_available.insert(optimal_medoid);
medoids[optimal_medoid] = true;
std::swap(m_distance_closest_medoid, optimal_distances);
}
}


pam_build::distance_calculator pam_build::create_distance_calculator(const data_t p_type) const {
/*!

@brief Calculates distance between two elements of the input data using the metric `m_metric`.

@param[in] p_index1: index of the first element in the data referenced by `m_data_ptr`.
@param[in] p_index2: index of the second element in the data referenced by `m_data_ptr`.

@return Value produced by `m_metric` for the two selected elements.

*/
double pam_build::calculate_distance_using_points(const std::size_t p_index1, const std::size_t p_index2) const {
    const auto & first_point = (*m_data_ptr)[p_index1];
    const auto & second_point = (*m_data_ptr)[p_index2];

    return m_metric(first_point, second_point);
}


/*!

@brief Reads distance between two objects directly from the input data, which is treated as a distance matrix.

@param[in] p_index1: row index in the data referenced by `m_data_ptr`.
@param[in] p_index2: column index inside the selected row.

@return Value stored at position [p_index1][p_index2] of the data.

*/
double pam_build::calculate_distance_using_distance_matrix(const std::size_t p_index1, const std::size_t p_index2) const {
    const auto & matrix_row = (*m_data_ptr)[p_index1];

    return matrix_row[p_index2];
}


void pam_build::create_distance_calculator(const data_t p_type) const {
if (p_type == data_t::POINTS) {
return [this](const std::size_t index1, const std::size_t index2) {
return m_metric((*m_data_ptr)[index1], (*m_data_ptr)[index2]);
};
m_calculator = std::bind(&pam_build::calculate_distance_using_points, this, std::placeholders::_1, std::placeholders::_2);
}
else if (p_type == data_t::DISTANCE_MATRIX) {
return [this](const std::size_t index1, const std::size_t index2) {
return (*m_data_ptr)[index1][index2];
};
m_calculator = std::bind(&pam_build::calculate_distance_using_distance_matrix, this, std::placeholders::_1, std::placeholders::_2);
}
else {
throw std::invalid_argument("Unknown type data is specified (type code: '" + std::to_string(static_cast<std::size_t>(p_type)) + "').");
Expand Down
17 changes: 12 additions & 5 deletions ccore/tst/utest-kmedoids.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -422,20 +422,27 @@ TEST(utest_kmedoids, itermax_10_simple02) {
#include <chrono>

TEST(utest_kmedoids, big_data) {
const std::size_t cluster_length = 100;
const std::size_t amount_clusters = 10;
auto p_data = fcps_sample_factory::create_sample(FCPS_SAMPLE::ENGY_TIME);

auto points = simple_sample_factory::create_random_sample(cluster_length, amount_clusters);
dataset data;
auto p_data_type = data_t::DISTANCE_MATRIX;

medoid_sequence start_medoids = { 10, cluster_length, cluster_length * 2, cluster_length * 3, cluster_length * 4, cluster_length * 5 };
if (p_data_type == data_t::POINTS) {
data = *p_data;
}
else {
distance_matrix(*p_data, data);
}

medoid_sequence start_medoids = { 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20 };

auto start = std::chrono::system_clock::now();

const std::size_t repeat = 1;
for (std::size_t i = 0; i < repeat; i++) {
kmedoids_data output_result;
kmedoids solver(start_medoids, 0.0001);
solver.process(*points, output_result);
solver.process(data, p_data_type, output_result);
}

auto end = std::chrono::system_clock::now();
Expand Down
22 changes: 22 additions & 0 deletions ccore/tst/utest-pam_build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,3 +128,25 @@ TEST(utest_pam_build, correct_medoids_three_dimensional_distance_matrix) {
template_pam_build_medoids(simple_sample_factory::create_sample(SAMPLE_SIMPLE::SAMPLE_SIMPLE_11), 1, { 15 }, data_t::DISTANCE_MATRIX);
template_pam_build_medoids(simple_sample_factory::create_sample(SAMPLE_SIMPLE::SAMPLE_SIMPLE_11), 3, { 15, 4, 14 }, data_t::DISTANCE_MATRIX);
}


#if 0
/* Performance scenario: runs PAM BUILD on the FCPS 'ENGY_TIME' sample and checks that 10 medoids are produced. */
TEST(utest_pam_build, test_performance) {
    /* Change the type to data_t::POINTS in order to feed the algorithm with the sample itself. */
    auto input_type = data_t::DISTANCE_MATRIX;

    auto sample = fcps_sample_factory::create_sample(FCPS_SAMPLE::ENGY_TIME);

    dataset input;
    if (input_type != data_t::POINTS) {
        /* The sample is converted to a distance matrix before it is passed to the algorithm. */
        distance_matrix(*sample, input);
    }
    else {
        input = *sample;
    }

    medoids initial_medoids;
    pam_build(10).initialize(input, input_type, initial_medoids);

    ASSERT_EQ(10, initial_medoids.size());
}
#endif
2 changes: 1 addition & 1 deletion pyclustering/core/pam_build_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,13 @@
from pyclustering.core.wrapper import ccore_library
from pyclustering.core.pyclustering_package import pyclustering_package, package_builder, package_extractor


def pam_build(sample, amount, pointer_metric, data_type):
pointer_data = package_builder(sample, c_double).create()
c_data_type = convert_data_type(data_type)

ccore = ccore_library.get()
ccore.pam_build_algorithm.restype = POINTER(pyclustering_package)

package = ccore.pam_build_algorithm(pointer_data, c_size_t(amount), pointer_metric, c_data_type)

results = package_extractor(package).extract()
Expand Down

0 comments on commit f573361

Please sign in to comment.