Skip to content

Commit

Permalink
#667: 'get_iterations()' and 'get_total_deviation()' methods are introduced.
Browse files: browse the repository at this point in the history
  • Loading branch information
annoviko committed Feb 12, 2021
1 parent a475f11 commit 3806e74
Show file tree
Hide file tree
Showing 14 changed files with 123 additions and 24 deletions.
16 changes: 16 additions & 0 deletions ccore/include/pyclustering/cluster/kmedoids_data.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ class kmedoids_data : public cluster_data {

std::size_t m_iterations = 0;

double m_total_deviation = 0.0;

public:
/**
*
Expand Down Expand Up @@ -96,6 +98,20 @@ class kmedoids_data : public cluster_data {
*/
std::size_t iterations() const { return m_iterations; }

/**
*
* @brief Returns reference to the final loss (total deviation) so that it can be assigned in place.
*
* @return Mutable reference to the stored total deviation value.
*
*/
double & total_deviation() { return m_total_deviation; }

/**
*
* @brief Returns the final loss (total deviation).
*
* @return Copy of the stored total deviation value; it is `0.0` until a value is assigned.
*
*/
double total_deviation() const { return m_total_deviation; }
};


Expand Down
1 change: 1 addition & 0 deletions ccore/include/pyclustering/interface/kmedoids_interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ enum kmedoids_package_indexer {
KMEDOIDS_PACKAGE_INDEX_CLUSTERS = 0,
KMEDOIDS_PACKAGE_INDEX_MEDOIDS,
KMEDOIDS_PACKAGE_INDEX_ITERATIONS,
KMEDOIDS_PACKAGE_INDEX_TOTAL_DEVIATION,
KMEDOIDS_PACKAGE_SIZE
};

Expand Down
15 changes: 8 additions & 7 deletions ccore/src/cluster/kmedoids.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,19 +76,20 @@ void kmedoids::process(const dataset & p_data, const data_t p_type, kmedoids_dat

double changes = std::numeric_limits<double>::max();
double previous_deviation = std::numeric_limits<double>::max();
double current_deviation = std::numeric_limits<double>::max();
p_result.total_deviation() = 0;

if (m_itermax > 0) {
current_deviation = update_clusters();
p_result.total_deviation() = update_clusters();
}

for (p_result.iterations() = 0; (p_result.iterations() < m_itermax) && (changes > m_tolerance); p_result.iterations()++) {
for (p_result.iterations() = 0; (p_result.iterations() < m_itermax) && (changes > m_tolerance);) {
p_result.iterations()++;
const double swap_cost = swap_medoids();

if (swap_cost != NOTHING_TO_SWAP) {
previous_deviation = current_deviation;
current_deviation = update_clusters();
changes = previous_deviation - current_deviation;
previous_deviation = p_result.total_deviation();
p_result.total_deviation() = update_clusters();
changes = previous_deviation - p_result.total_deviation();
}
else {
break;
Expand Down Expand Up @@ -143,7 +144,7 @@ kmedoids::distance_calculator kmedoids::create_distance_calculator(const data_t
};
}
else {
throw std::invalid_argument("Unknown type data is specified");
throw std::invalid_argument("Unknown type data is specified (data type code: '" + std::to_string(static_cast<std::size_t>(p_type)) + "') .");
}
}

Expand Down
8 changes: 7 additions & 1 deletion ccore/src/interface/kmedoids_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ pyclustering_package * kmedoids_algorithm(const pyclustering_package * const p_s
const std::size_t p_itermax,
const void * const p_metric,
const std::size_t p_type)
try
{
pyclustering::clst::medoid_sequence medoids;
p_medoids->extract(medoids);
Expand All @@ -48,6 +49,11 @@ pyclustering_package * kmedoids_algorithm(const pyclustering_package * const p_s
std::vector<std::size_t> iteration_storage(1, output_result.iterations());
((pyclustering_package **)package->data)[KMEDOIDS_PACKAGE_INDEX_ITERATIONS] = create_package(&iteration_storage);

std::vector<double> total_deviation_storage(1, output_result.total_deviation());
((pyclustering_package **)package->data)[KMEDOIDS_PACKAGE_INDEX_TOTAL_DEVIATION] = create_package(&total_deviation_storage);

return package;
}

catch (std::exception & p_exception) {
return create_package(p_exception.what());
}
2 changes: 2 additions & 0 deletions ccore/src/pyclustering-shared.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@
<PreprocessorDefinitions>EXPORT_PYCLUSTERING_INTERFACE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<MinimalRebuild>false</MinimalRebuild>
<OpenMPSupport>false</OpenMPSupport>
<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
</ClCompile>
<Link>
<AdditionalDependencies>pyclustering-static.lib</AdditionalDependencies>
Expand All @@ -123,6 +124,7 @@
<PreprocessorDefinitions>EXPORT_PYCLUSTERING_INTERFACE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<MinimalRebuild>false</MinimalRebuild>
<OpenMPSupport>false</OpenMPSupport>
<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
</ClCompile>
<Link>
<AdditionalDependencies>pyclustering-static.lib</AdditionalDependencies>
Expand Down
2 changes: 2 additions & 0 deletions ccore/src/pyclustering-static.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@
<MinimalRebuild>false</MinimalRebuild>
<OpenMPSupport>false</OpenMPSupport>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
</ClCompile>
<Lib>
<TreatLibWarningAsErrors>true</TreatLibWarningAsErrors>
Expand All @@ -115,6 +116,7 @@
<MinimalRebuild>false</MinimalRebuild>
<OpenMPSupport>false</OpenMPSupport>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
</ClCompile>
<Lib>
<TreatLibWarningAsErrors>true</TreatLibWarningAsErrors>
Expand Down
2 changes: 2 additions & 0 deletions ccore/tst/ut-shared.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@
<SDLCheck>true</SDLCheck>
<AdditionalIncludeDirectories>$(SolutionDir)include;$(SolutionDir)external\include</AdditionalIncludeDirectories>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<MinimalRebuild>false</MinimalRebuild>
</ClCompile>
<Link>
<AdditionalLibraryDirectories>$(OutDir)</AdditionalLibraryDirectories>
Expand All @@ -107,6 +108,7 @@ copy "$(OutDir)pyclustering.dll" "..\tst\pyclustering.dll" /y;</Command>
<SDLCheck>true</SDLCheck>
<AdditionalIncludeDirectories>$(SolutionDir)include;$(SolutionDir)external\include</AdditionalIncludeDirectories>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<MinimalRebuild>false</MinimalRebuild>
</ClCompile>
<Link>
<AdditionalLibraryDirectories>$(OutDir)</AdditionalLibraryDirectories>
Expand Down
3 changes: 3 additions & 0 deletions ccore/tst/utest-interface-kmedoids.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,10 @@ TEST(utest_interface_kmedoids, kmedoids_api) {
distance_metric<point> metric = distance_metric_factory<point>::euclidean_square();

pyclustering_package * kmedoids_result = kmedoids_algorithm(sample.get(), medoids.get(), 0.001, 100, &metric, 0);

ASSERT_NE(nullptr, kmedoids_result);
ASSERT_GT(((std::size_t *)((pyclustering_package **)kmedoids_result->data)[KMEDOIDS_PACKAGE_INDEX_ITERATIONS])[0], 0);
ASSERT_GT(((double *)((pyclustering_package **)kmedoids_result->data)[KMEDOIDS_PACKAGE_INDEX_TOTAL_DEVIATION])[0], 0.0);

delete kmedoids_result;
}
Expand Down
38 changes: 37 additions & 1 deletion ccore/tst/utest-kmedoids.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,29 @@ template_kmedoids_length_process_data(const dataset_ptr p_data,
const medoid_sequence & medoids = output_result.medoids();

if (p_itermax == 0) {
ASSERT_EQ(0.0, output_result.total_deviation());
ASSERT_EQ(0, output_result.iterations());
ASSERT_TRUE(actual_clusters.empty());
ASSERT_EQ(p_start_medoids, medoids);
return;
}

double expected_total_deviation = 0.0;
for (std::size_t index_cluster = 0; index_cluster < actual_clusters.size(); index_cluster++) {
const auto index_point_medoid = medoids[index_cluster];

for (const std::size_t index_point : actual_clusters[index_cluster]) {
if (index_point_medoid == index_point) {
continue;
}

expected_total_deviation += p_metric(p_data->at(index_point_medoid), p_data->at(index_point));
}
}

ASSERT_GT(output_result.iterations(), 0);
ASSERT_NEAR(expected_total_deviation, output_result.total_deviation(), 0.000001);

ASSERT_LE(medoids.size(), p_start_medoids.size());
ASSERT_EQ(medoids.size(), actual_clusters.size());
ASSERT_CLUSTER_SIZES(data, actual_clusters, p_expected_cluster_length);
Expand All @@ -55,7 +73,7 @@ template_kmedoids_length_process_distance_matrix(const dataset_ptr p_data,
const distance_metric<point> & p_metric = distance_metric_factory<point>::euclidean_square()) {

dataset matrix;
distance_matrix(*p_data, matrix);
distance_matrix(*p_data, p_metric, matrix);

kmedoids_data output_result;
kmedoids solver(p_start_medoids, kmedoids::DEFAULT_TOLERANCE, p_itermax, p_metric);
Expand All @@ -66,11 +84,29 @@ template_kmedoids_length_process_distance_matrix(const dataset_ptr p_data,
const medoid_sequence & medoids = output_result.medoids();

if (p_itermax == 0) {
ASSERT_EQ(0.0, output_result.total_deviation());
ASSERT_EQ(0, output_result.iterations());
ASSERT_TRUE(actual_clusters.empty());
ASSERT_EQ(p_start_medoids, medoids);
return;
}

double expected_total_deviation = 0.0;
for (std::size_t index_cluster = 0; index_cluster < actual_clusters.size(); index_cluster++) {
const auto index_point_medoid = medoids[index_cluster];

for (const std::size_t index_point : actual_clusters[index_cluster]) {
if (index_point_medoid == index_point) {
continue;
}

expected_total_deviation += p_metric(p_data->at(index_point_medoid), p_data->at(index_point));
}
}

ASSERT_GT(output_result.iterations(), 0);
ASSERT_NEAR(expected_total_deviation, output_result.total_deviation(), 0.000001);

ASSERT_EQ(p_start_medoids.size(), actual_clusters.size());
ASSERT_EQ(p_start_medoids.size(), medoids.size());
ASSERT_CLUSTER_SIZES(data, actual_clusters, p_expected_cluster_length);
Expand Down
6 changes: 1 addition & 5 deletions docs/doxygen_conf_pyclustering
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ INHERIT_DOCS = YES
SEPARATE_MEMBER_PAGES = NO
TAB_SIZE = 4
ALIASES =
TCL_SUBST =
OPTIMIZE_OUTPUT_FOR_C = NO
OPTIMIZE_OUTPUT_JAVA = NO
OPTIMIZE_FOR_FORTRAN = NO
Expand Down Expand Up @@ -186,7 +185,6 @@ VERBATIM_HEADERS = YES
# Configuration options related to the alphabetical class index
#---------------------------------------------------------------------------
ALPHABETICAL_INDEX = YES
COLS_IN_ALPHA_INDEX = 5
IGNORE_PREFIX =
#---------------------------------------------------------------------------
# Configuration options related to the HTML output
Expand Down Expand Up @@ -254,7 +252,7 @@ LATEX_OUTPUT =
LATEX_CMD_NAME = latex
MAKEINDEX_CMD_NAME = makeindex
COMPACT_LATEX = NO
PAPER_TYPE = a4wide
PAPER_TYPE = a4
EXTRA_PACKAGES =
LATEX_HEADER =
LATEX_FOOTER =
Expand Down Expand Up @@ -323,12 +321,10 @@ GENERATE_TAGFILE =
ALLEXTERNALS = NO
EXTERNAL_GROUPS = YES
EXTERNAL_PAGES = YES
PERL_PATH = /usr/bin/perl
#---------------------------------------------------------------------------
# Configuration options related to the dot tool
#---------------------------------------------------------------------------
CLASS_DIAGRAMS = NO
MSCGEN_PATH =
DIA_PATH =
HIDE_UNDOC_RELATIONS = YES
HAVE_DOT = YES
Expand Down
8 changes: 7 additions & 1 deletion pyclustering/cluster/center_initializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,13 @@ def initialize(self, **kwargs):
"""

return_index = kwargs.get('return_index', False)
return_index = kwargs.get('return_index', None)
if return_index is None:
return_index = (self.__data_type == 'distance_matrix')

if (return_index is False) and (self.__data_type == 'distance_matrix'):
raise ValueError("In case of 'distance_matrix' data type, parameter 'return_index' cannot be 'False'. "
"Please, use 'return_index=True' in case of 'distance_matrix' data type.")

index_point = self.__get_initial_center(True)
centers = [index_point]
Expand Down
27 changes: 20 additions & 7 deletions pyclustering/cluster/kmedoids.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,6 +336,7 @@ def __init__(self, data, initial_index_medoids, tolerance=0.0001, ccore=True, **
self.__distance_second_medoid = [float('inf')] * len(data)
self.__tolerance = tolerance
self.__iterations = 0
self.__total_deviation = float('inf')

self.__metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
self.__data_type = kwargs.get('data_type', 'points')
Expand Down Expand Up @@ -366,28 +367,29 @@ def process(self):

if self.__ccore is True:
ccore_metric = metric_wrapper.create_instance(self.__metric)
self.__clusters, self.__medoid_indexes = kmedoids_wrapper.kmedoids(self.__pointer_data, self.__medoid_indexes, self.__tolerance, self.__itermax, ccore_metric.get_pointer(), self.__data_type)
self.__clusters, self.__medoid_indexes, self.__iterations, self.__total_deviation = kmedoids_wrapper.kmedoids(self.__pointer_data, self.__medoid_indexes, self.__tolerance, self.__itermax, ccore_metric.get_pointer(), self.__data_type)

else:
changes = float('inf')
previous_deviation, current_deviation = float('inf'), float('inf')
previous_deviation, self.__total_deviation = float('inf'), 0

self.__iterations = 0

if self.__itermax > 0:
current_deviation = self.__update_clusters()
self.__total_deviation = self.__update_clusters()

while (changes > self.__tolerance) and (self.__iterations < self.__itermax):
self.__iterations += 1
swap_cost = self.__swap_medoids()

if swap_cost != float('inf'):
previous_deviation = current_deviation
current_deviation = self.__update_clusters()
changes = previous_deviation - current_deviation
previous_deviation = self.__total_deviation
self.__total_deviation = self.__update_clusters()
changes = previous_deviation - self.__total_deviation
else:
break

self.__iterations += 1


self.__erase_empty_clusters()

Expand Down Expand Up @@ -491,6 +493,17 @@ def get_iterations(self):
return self.__iterations


def get_total_deviation(self):
"""!
@brief Returns the total deviation - the final loss after optimization, i.e. the sum of distances from each point to the medoid of its cluster.
@return (float) The total deviation - the final loss after optimization.
@see process()
"""
return self.__total_deviation


def get_cluster_encoding(self):
"""!
@brief Returns clustering result representation type that indicate how clusters are encoded.
Expand Down
17 changes: 16 additions & 1 deletion pyclustering/cluster/tests/kmedoids_templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def templateLengthProcessWithMetric(path_to_file, initial_medoids, expected_clus

input_data = sample
if data_type == 'distance_matrix':
input_data = calculate_distance_matrix(sample)
input_data = calculate_distance_matrix(sample, metric)

if input_type == 'numpy':
input_data = numpy.array(input_data)
Expand All @@ -65,6 +65,8 @@ def templateLengthProcessWithMetric(path_to_file, initial_medoids, expected_clus
medoids = kmedoids_instance.get_medoids()

if itermax == 0:
assertion.eq(0, kmedoids_instance.get_iterations())
assertion.eq(0.0, kmedoids_instance.get_total_deviation())
assertion.eq([], clusters)
assertion.eq(medoids, initial_medoids)
return
Expand All @@ -89,6 +91,19 @@ def templateLengthProcessWithMetric(path_to_file, initial_medoids, expected_clus
if obtained_cluster_sizes != expected_cluster_length:
continue

assertion.gt(kmedoids_instance.get_iterations(), 0)

expected_total_deviation = 0.0
for index_cluster in range(len(clusters)):
index_point_medoid = medoids[index_cluster]
for index_point in clusters[index_cluster]:
if index_point == index_point_medoid:
continue

expected_total_deviation += metric(sample[index_point_medoid], sample[index_point])

assertion.eq_float(expected_total_deviation, kmedoids_instance.get_total_deviation(), 0.000001)

testing_result = True

assertion.true(testing_result)
Expand Down
2 changes: 1 addition & 1 deletion pyclustering/core/kmedoids_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,4 @@ def kmedoids(sample, medoids, tolerance, itermax, metric_pointer, data_type):
result = package_extractor(package).extract()
ccore.free_pyclustering_package(package)

return result[0], result[1]
return result[0], result[1], result[2][0], result[3][0]

0 comments on commit 3806e74

Please sign in to comment.