#589, #588, #587, #379: Balanced KD-tree and optimization of DBSCAN, OPTICS, CURE.
annoviko · Feb 21, 2020 · 256f228 · 256f228
1 parent b51dff1
commit 256f228
Show file tree

Hide file tree

Showing 13 changed files with 291 additions and 36 deletions.
diff --git a/CHANGES b/CHANGES
@@ -5,10 +5,22 @@ CHANGE NOTES FOR 0.10.0 (STARTED Jan 24, 2020), (RELEASED: Dev -)
 ------------------------------------------------------------------------
 
 GENERAL CHANGES:
-- Implemented KD-tree graphical visualizer `kdtree_visualizer` for KD-trees with 2-dimensional data (pyclustering.container.kdtree).
+- Optimization of CURE algorithm using balanced KD-tree (Python: `pyclustering.cluster.cure`; C++: `pyclustering::clst::cure`).
+  See: https://github.com/annoviko/pyclustering/issues/589
+
+- Optimization of OPTICS algorithm using balanced KD-tree (Python: `pyclustering.cluster.optics`; C++: `pyclustering::clst::optics`).
+  See: https://github.com/annoviko/pyclustering/issues/588
+
+- Optimization of DBSCAN algorithm using balanced KD-tree (Python: `pyclustering.cluster.dbscan`; C++: `pyclustering::clst::dbscan`).
+  See: https://github.com/annoviko/pyclustering/issues/587
+
+- Implemented new optimized balanced KD-tree `kdtree_balanced` (Python: `pyclustering.container.kdtree`; C++: `pyclustering::container::kdtree_balanced`).
+  See: https://github.com/annoviko/pyclustering/issues/379
+
+- Implemented KD-tree graphical visualizer `kdtree_visualizer` for KD-trees with 2-dimensional data (Python: `pyclustering.container.kdtree`).
   See: https://github.com/annoviko/pyclustering/issues/586
 
-- Updated interface of each clustering algorithm in C/C++ pyclustering `cluster_data` is substituted by concrete classes (ccore.clst).
+- Updated interface of each clustering algorithm in C/C++ pyclustering: `cluster_data` is substituted by concrete classes (C++: `pyclustering::clst`).
   See: https://github.com/annoviko/pyclustering/issues/577
 
 

diff --git a/ccore/include/pyclustering/container/kdtree.hpp b/ccore/include/pyclustering/container/kdtree.hpp
@@ -71,7 +71,7 @@ class kdtree : public kdtree_balanced {
 public:
     kdtree() = default;
 
-    kdtree(const dataset & p_data, const std::vector<void *> p_payloads = {});
+    kdtree(const dataset & p_data, const std::vector<void *> & p_payloads = {});
 
     kdtree(const kdtree & p_other) = default;
 

diff --git a/ccore/include/pyclustering/container/kdtree_balanced.hpp b/ccore/include/pyclustering/container/kdtree_balanced.hpp
@@ -48,14 +48,61 @@ namespace container {
 
 There is an example how to create KD-tree:
 @code
-    TODO:
+    #include <vector>
+    #include <iostream>
+
+    #include <pyclustering/container/kdtree_balanced.hpp>
+    #include <pyclustering/container/kdtree_searcher.hpp>
+
+    using namespace pyclustering;
+    using namespace pyclustering::container;
+
+    int main() {
+        // Points that should be stored in KD-tree.
+        dataset coord = { { 30, 59 },{ 5, 51 },{ 4, 52 },{ 12, 41 },{ 12, 45 } };
+
+        // Let's create a payload that is associated with each point.
+        std::vector<void *> payload = { (void *) "St-Petersburg", (void *) "Eindhoven", (void *) "Amsterdam", (void *) "Rome", (void *) "Venice" };
+
+        // Create balanced KD-tree.
+        kdtree_balanced tree(coord, payload);
+
+        // Check each city in the tree.
+        for (const auto & p : coord) {
+            auto node = tree.find_node(p);
+            std::cout << p[0] << ", " << p[1] << ": "
+                << (char *)node->get_payload() << std::endl;
+        }
+
+        // Find closest cities to Eindhoven in distance 10.
+        kdtree_searcher searcher({ 5, 51 }, tree.get_root(), 10);
+
+        std::cout << "The closest city to Eindhoven is "
+            << (char *)searcher.find_nearest_node()->get_payload() << std::endl;
+
+        // Cities whose distance from Eindhoven is less than or equal to 10.
+        std::vector<double> distances;
+        std::vector<kdnode::ptr> nodes;
+
+        searcher.find_nearest_nodes(distances, nodes);
+
+        std::cout << "Cities to which distance is less or equal to 10:" << std::endl;
+        for (std::size_t i = 0; i < nodes.size(); i++) {
+            std::cout << distances[i] << ": "
+                << (char *)nodes[i]->get_payload() << std::endl;
+        }
+
+        return 0;
+    }
 @endcode
 
 There is an illustration of balanced KD-tree above that has been done by python version of pyclustering library.
 @image html kd_tree_balanced_lsun.png "Fig. 1. Balanced KD-tree for sample 'Lsun'."
 
 Implementation based on paper @cite book::the_design_and_analysis.
 
+@see kdtree
+
 */
 class kdtree_balanced {
 protected:
@@ -81,7 +128,7 @@ class kdtree_balanced {
     @param[in] p_payloads: payload for each point in `p_data`.
 
     */
-    kdtree_balanced(const dataset & p_data, const std::vector<void *> p_payloads = { });
+    kdtree_balanced(const dataset & p_data, const std::vector<void *> & p_payloads = { });
 
     /*!
 

diff --git a/ccore/include/pyclustering/utils/algorithm.hpp b/ccore/include/pyclustering/utils/algorithm.hpp
@@ -0,0 +1,134 @@
+/*!
+
+@authors Andrei Novikov (pyclustering@yandex.ru)
+@date 2014-2020
+@copyright GNU Public License
+
+@cond GNU_PUBLIC_LICENSE
+    pyclustering is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    pyclustering is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+@endcond
+
+*/
+
+#pragma once
+
+
+#include <iterator>
+
+
+namespace pyclustering {
+
+namespace utils {
+
+namespace algorithm {
+
+
+/*!
+
+@brief Returns iterator to the leftmost element that is equivalent to the last element of the
+        sorted range `[p_begin, p_end)`.
+@details The last element of the range is considered as a target to search. `[p_begin, p_end)` must
+         be sorted in line with `p_comparator`. `InputIt` must meet the requirements of
+         `LegacyRandomAccessIterator`. The complexity of the algorithm is `O(log(n))`. The
+         algorithm is based on the binary search algorithm.
+
+@param[in] p_begin: iterator pointing to the first element.
+@param[in] p_end: iterator pointing to the end of the range.
+@param[in] p_comparator: comparison function object which returns `true` if the first argument
+            is less than the second. The signature of the compare function should be equivalent
+            to the following: `bool comparator(const Type & p_val1, const Type & p_val2)`.
+
+@return Iterator to the leftmost element that is equivalent to the last element of the range
+         `[p_begin, p_end)`, or `p_end` if the range is empty.
+
+*/
+template <class InputIt, class Comparator>
+InputIt find_left_element(const InputIt p_begin, const InputIt p_end, Comparator p_comparator) {
+    if (p_begin == p_end) {
+        return p_end;
+    }
+
+    // Closed search window: both `lower` and `upper` point to valid elements of the range.
+    InputIt lower = p_begin;
+    InputIt upper = p_end - 1;
+    auto target = *upper;   // value at the right border is the search target
+
+    while (lower < upper) {
+        const InputIt probe = lower + (std::distance(lower, upper) / 2);
+
+        if (p_comparator(*probe, target)) {
+            lower = probe + 1;
+        }
+        else {
+            upper = probe;
+        }
+    }
+
+    return lower;
+}
+
+
+/*!
+
+@brief Returns iterator to the leftmost element that is equal to the last element of the sorted
+        range `[p_begin, p_end)`; elements are compared using `operator<`.
+@details The last element of the range is considered as a target to search. `[p_begin, p_end)` must
+         be sorted in ascending order. `InputIt` must meet the requirements of
+         `LegacyRandomAccessIterator`. The complexity of the algorithm is `O(log(n))`. The
+         algorithm is based on the binary search algorithm.
+
+@param[in] p_begin: iterator pointing to the first element.
+@param[in] p_end: iterator pointing to the end of the range.
+
+@return Iterator to the leftmost element that is equal to the last element of the range
+         `[p_begin, p_end)`, or `p_end` if the range is empty.
+
+@code
+    #include <iterator>
+    #include <vector>
+    #include <iostream>
+
+    #include <pyclustering/utils/algorithm.hpp>
+
+    using namespace pyclustering::utils::algorithm;
+
+    int main() {
+        std::vector<int> seq = { 1, 2, 2, 3, 3, 3, 6, 6, 6 };
+
+        for (auto iter = seq.begin(); iter != seq.end(); iter++) {
+            auto left = find_left_element(seq.begin(), iter + 1);
+            std::cout << "Index of the left element: " << std::distance(seq.begin(), left)
+                << " for " << *iter << std::endl;
+        }
+
+        return 0;
+    }
+@endcode
+
+*/
+template <class InputIt>
+InputIt find_left_element(const InputIt p_begin, const InputIt p_end) {
+    using value_type = typename std::iterator_traits<InputIt>::value_type;
+
+    return find_left_element(p_begin, p_end, [](const value_type & p_val1, const value_type & p_val2) {
+        return p_val1 < p_val2;
+    });
+}
+
+
+}
+
+}
+
+}
diff --git a/ccore/src/ccore.vcxproj b/ccore/src/ccore.vcxproj
@@ -227,6 +227,7 @@
     <ClInclude Include="..\include\pyclustering\parallel\task.hpp" />
     <ClInclude Include="..\include\pyclustering\parallel\thread_executor.hpp" />
     <ClInclude Include="..\include\pyclustering\parallel\thread_pool.hpp" />
+    <ClInclude Include="..\include\pyclustering\utils\algorithm.hpp" />
     <ClInclude Include="..\include\pyclustering\utils\linalg.hpp" />
     <ClInclude Include="..\include\pyclustering\utils\math.hpp" />
     <ClInclude Include="..\include\pyclustering\utils\metric.hpp" />

diff --git a/ccore/src/ccore.vcxproj.filters b/ccore/src/ccore.vcxproj.filters
@@ -639,5 +639,8 @@
     <ClInclude Include="..\include\pyclustering\container\kdtree_balanced.hpp">
       <Filter>Header Files\container</Filter>
     </ClInclude>
+    <ClInclude Include="..\include\pyclustering\utils\algorithm.hpp">
+      <Filter>Header Files\utils</Filter>
+    </ClInclude>
   </ItemGroup>
 </Project>
diff --git a/ccore/src/container/kdtree.cpp b/ccore/src/container/kdtree.cpp
@@ -32,7 +32,7 @@ namespace pyclustering {
 namespace container {
 
 
-kdtree::kdtree(const dataset & p_data, const std::vector<void *> p_payloads) :
+kdtree::kdtree(const dataset & p_data, const std::vector<void *> & p_payloads) :
     kdtree_balanced(p_data, p_payloads)
 { }
 

diff --git a/ccore/src/container/kdtree_balanced.cpp b/ccore/src/container/kdtree_balanced.cpp
@@ -22,16 +22,20 @@
 */
 
 #include <pyclustering/container/kdtree_balanced.hpp>
+#include <pyclustering/utils/algorithm.hpp>
 
 #include <algorithm>
 
 
+using namespace pyclustering::utils::algorithm;
+
+
 namespace pyclustering {
 
 namespace container {
 
 
-kdtree_balanced::kdtree_balanced(const dataset & p_data, const std::vector<void *> p_payloads) {
+kdtree_balanced::kdtree_balanced(const dataset & p_data, const std::vector<void *> & p_payloads) {
     if (p_data.empty()) { return; }
 
     std::vector<kdnode::ptr> nodes(p_data.size());
@@ -55,19 +59,17 @@ kdnode::ptr kdtree_balanced::create_tree(std::vector<kdnode::ptr>::iterator p_be
     }
 
     const std::size_t discriminator = p_depth % m_dimension;
-    int median = length / 2;
+    const int median = length / 2;
 
     std::sort(p_begin, p_end, [discriminator](const kdnode::ptr & p1, const kdnode::ptr & p2) {
         return p1->get_data()[discriminator] < p2->get_data()[discriminator];
     });
 
-   // TODO: optimize by binary search - no need to use O(n) search algorithm
-    auto median_iter = p_begin + median;
-    while ((median - 1 >= 0) && 
-        ((*(median_iter - 1))->get_data()[discriminator] == (*median_iter)->get_data()[discriminator])) {
-        median_iter--;
-        median--;
-    }
+    auto median_iter = find_left_element(p_begin, p_begin + median + 1, 
+        [discriminator](const kdnode::ptr & p1, const kdnode::ptr & p2) {
+            return p1->get_data()[discriminator] < p2->get_data()[discriminator];
+        }
+    );
 
     kdnode::ptr new_node = *median_iter;
     new_node->set_parent(p_parent);

diff --git a/ccore/tst/utcore.vcxproj b/ccore/tst/utcore.vcxproj
@@ -506,6 +506,7 @@
     <ClCompile Include="utenv_utils.cpp" />
     <ClCompile Include="utest-thread_pool.cpp" />
     <ClCompile Include="utest-ttsas.cpp" />
+    <ClCompile Include="utest-utils-algorithm.cpp" />
     <ClCompile Include="utest-utils-metric.cpp" />
     <ClCompile Include="utest-xmeans.cpp" />
   </ItemGroup>

diff --git a/ccore/tst/utcore.vcxproj.filters b/ccore/tst/utcore.vcxproj.filters
@@ -515,6 +515,9 @@
     <ClCompile Include="..\src\container\kdtree_balanced.cpp">
       <Filter>Tested Code\container</Filter>
     </ClCompile>
+    <ClCompile Include="utest-utils-algorithm.cpp">
+      <Filter>Unit Tests</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="samples.hpp">

diff --git a/ccore/tst/utest-kdtree.cpp b/ccore/tst/utest-kdtree.cpp
@@ -289,7 +289,7 @@ TEST_F(utest_kdtree, insert_remove_node) {
         }
     }
 
-    ASSERT_EQ(tree.get_size(), 0);
+    ASSERT_EQ(tree.get_size(), static_cast<std::size_t>(0));
 }
 
 

diff --git a/ccore/tst/utest-utils-algorithm.cpp b/ccore/tst/utest-utils-algorithm.cpp
@@ -0,0 +1,51 @@
+/*!
+
+@authors Andrei Novikov (pyclustering@yandex.ru)
+@date 2014-2020
+@copyright GNU Public License
+
+@cond GNU_PUBLIC_LICENSE
+    pyclustering is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    pyclustering is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+@endcond
+
+*/
+
+
+#include <gtest/gtest.h>
+
+#include <pyclustering/utils/algorithm.hpp>
+
+
+using namespace pyclustering::utils::algorithm;
+
+
+TEST(utest_algorithm, find_left_element_valid_input) {
+    // `expected[i]` is the index of the first element that is equal to `sequence[i]`.
+    const std::vector<int> expected = { 0, 0, 2, 2, 2, 5, 5, 5, 5, 9, 9, 9, 12 };
+    std::vector<int> sequence = { 0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4 };
+
+    for (std::size_t length = 1; length <= sequence.size(); length++) {
+        // The searched value is the last element of the prefix `[begin, begin + length)`.
+        const auto found = find_left_element(sequence.begin(), sequence.begin() + length);
+        const auto actual_index = std::distance(sequence.begin(), found);
+
+        ASSERT_EQ(expected[length - 1], actual_index);
+    }
+}
+
+TEST(utest_algorithm, find_left_element_invalid_input) {
+    std::vector<int> sequence = { 0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4 };
+    const auto result = find_left_element(sequence.begin(), sequence.begin());  // empty range
+    ASSERT_EQ(sequence.begin(), result);    // the end of the empty range is returned as is
+}