boostorg · kylelutz · Aug 1, 2015 · Jul 25, 2015 · Jul 28, 2015 · Jul 30, 2015
diff --git a/include/boost/compute/algorithm/detail/merge_sort_on_cpu.hpp b/include/boost/compute/algorithm/detail/merge_sort_on_cpu.hpp
@@ -0,0 +1,225 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP
+
+#include <boost/compute/kernel.hpp>
+#include <boost/compute/program.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/detail/merge_with_merge_path.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+template<class Iterator, class Compare>
+inline void merge_blocks(Iterator first,
+                         Iterator result,
+                         Compare compare,
+                         size_t count,
+                         const size_t block_size,
+                         command_queue &queue)
+{
+    meta_kernel k("merge_sort_on_cpu_merge_blocks");
+    size_t count_arg = k.add_arg<const uint_>("count");
+    size_t block_size_arg = k.add_arg<uint_>("block_size");
+
+    k <<
+        k.decl<uint_>("b1_start") << " = get_global_id(0) * block_size * 2;\n" <<
+        k.decl<uint_>("b1_end") << " = min(count, b1_start + block_size);\n" <<
+        k.decl<uint_>("b2_start") << " = min(count, b1_start + block_size);\n" <<
+        k.decl<uint_>("b2_end") << " = min(count, b2_start + block_size);\n" <<
+        k.decl<uint_>("result_idx") << " = b1_start;\n" <<
+
+        // merging block 1 and block 2 (stable)
+        "while(b1_start < b1_end && b2_start < b2_end){\n" <<
+        "    if( " << compare(first[k.var<uint_>("b2_start")],
+                              first[k.var<uint_>("b1_start")]) << "){\n" <<
+        "        " << result[k.var<uint_>("result_idx")] <<  " = " <<
+                      first[k.var<uint_>("b2_start")] << ";\n" <<
+        "        b2_start++;\n" <<
+        "    }\n" <<
+        "    else {\n" <<
+        "        " << result[k.var<uint_>("result_idx")] <<  " = " <<
+                      first[k.var<uint_>("b1_start")] << ";\n" <<
+        "        b1_start++;\n" <<
+        "    }\n" <<
+        "    result_idx++;\n" <<
+        "}\n" <<
+        "while(b1_start < b1_end){\n" <<
+        "   " << result[k.var<uint_>("result_idx")] <<  " = " <<
+                 first[k.var<uint_>("b1_start")] << ";\n" <<
+        "    b1_start++;\n" <<
+        "    result_idx++;\n" <<
+        "}\n" <<
+        "while(b2_start < b2_end){\n" <<
+        "   " << result[k.var<uint_>("result_idx")] <<  " = " <<
+                 first[k.var<uint_>("b2_start")] << ";\n" <<
+        "    b2_start++;\n" <<
+        "    result_idx++;\n" <<
+        "}\n";
+
+    const context &context = queue.get_context();
+    ::boost::compute::kernel kernel = k.compile(context);
+    kernel.set_arg(count_arg, static_cast<const uint_>(count));
+    kernel.set_arg(block_size_arg, static_cast<uint_>(block_size));
+
+    const size_t global_size = static_cast<size_t>(
+        std::ceil(float(count) / (2 * block_size))
+    );
+    queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0);
+}
+
+template<class Iterator, class Compare>
+inline void dispatch_merge_blocks(Iterator first,
+                                  Iterator result,
+                                  Compare compare,
+                                  size_t count,
+                                  const size_t block_size,
+                                  const size_t input_size_threshold,
+                                  const size_t blocks_no_threshold,
+                                  command_queue &queue)
+{
+    const size_t blocks_no = static_cast<size_t>(
+        std::ceil(float(count) / block_size)
+    );
+    // merge with merge path should used only for the large arrays and at the
+    // end of merging part when there are only a few big blocks left to be merged
+    if(blocks_no <= blocks_no_threshold && count >= input_size_threshold){
+        Iterator last = first + count;
+        for(size_t i = 0; i < count; i+= 2*block_size)
+        {
+            Iterator first1 = (std::min)(first + i, last);
+            Iterator last1 = (std::min)(first1 + block_size, last);
+            Iterator first2 = last1;
+            Iterator last2 = (std::min)(first2 + block_size, last);
+            Iterator block_result = (std::min)(result + i, result + count);
+            merge_with_merge_path(first1, last1, first2, last2,
+                                  block_result, compare, queue);
+        }
+    }
+    else {
+        merge_blocks(first, result, compare, count, block_size, queue);
+    }
+}
+
+template<class Iterator, class Compare>
+inline void block_insertion_sort(Iterator first,
+                                 Compare compare,
+                                 const size_t count,
+                                 const size_t block_size,
+                                 command_queue &queue)
+{
+    typedef typename std::iterator_traits<Iterator>::value_type T;
+
+    meta_kernel k("merge_sort_on_cpu_block_insertion_sort");
+    size_t count_arg = k.add_arg<uint_>("count");
+    size_t block_size_arg = k.add_arg<uint_>("block_size");
+
+    k <<
+        k.decl<uint_>("start") << " = get_global_id(0) * block_size;\n" <<
+        k.decl<uint_>("end") << " = min(count, start + block_size);\n" <<
+
+        // block insertion sort (stable)
+        "for(uint i = start+1; i < end; i++){\n" <<
+        "    " << k.decl<const T>("value") << " = " << first[k.var<uint_>("i")] << ";\n" <<
+        "    uint pos = i;\n" <<
+        "    while(pos > start && " <<
+                   compare(k.var<const T>("value"),
+                           first[k.var<uint_>("pos-1")]) << "){\n" <<
+        "        " << first[k.var<uint_>("pos")] << " = " << first[k.var<uint_>("pos-1")] << ";\n" <<
+        "        pos--;\n" <<
+        "    }\n" <<
+        "    " << first[k.var<uint_>("pos")] << " = value;\n" <<
+        "}\n"; // block insertion sort
+
+    const context &context = queue.get_context();
+    ::boost::compute::kernel kernel = k.compile(context);
+    kernel.set_arg(count_arg, static_cast<uint_>(count));
+    kernel.set_arg(block_size_arg, static_cast<uint_>(block_size));
+
+    const size_t global_size = static_cast<size_t>(std::ceil(float(count) / block_size));
+    queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0);
+}
+
+// This sort is stable.
+template<class Iterator, class Compare>
+inline void merge_sort_on_cpu(Iterator first,
+                              Iterator last,
+                              Compare compare,
+                              command_queue &queue)
+{
+    typedef typename std::iterator_traits<Iterator>::value_type value_type;
+
+    size_t count = iterator_range_size(first, last);
+    if(count < 2){
+        return;
+    }
+    // for small input size only insertion sort is performed
+    else if(count <= 512){
+        block_insertion_sort(first, compare, count, count, queue);
+        return;
+    }
+
+    const context &context = queue.get_context();
+    const device &device = queue.get_device();
+
+    // loading parameters
+    std::string cache_key =
+        std::string("__boost_merge_sort_on_cpu_") + type_name<value_type>();
+    boost::shared_ptr<parameter_cache> parameters =
+        detail::parameter_cache::get_global_cache(device);
+
+    // When there is merge_with_path_blocks_no_threshold or less blocks left to
+    // merge AND input size is merge_with_merge_path_input_size_threshold or more
+    // merge_with_merge_path() algorithm is used to merge sorted blocks;
+    // otherwise merge_blocks() is used.
+    const size_t merge_with_path_blocks_no_threshold =
+        parameters->get(cache_key, "merge_with_merge_path_blocks_no_threshold", 8);
+    const size_t merge_with_path_input_size_threshold =
+        parameters->get(cache_key, "merge_with_merge_path_input_size_threshold", 2097152);
+
+    const size_t block_size =
+        parameters->get(cache_key, "insertion_sort_block_size", 64);
+    block_insertion_sort(first, compare, count, block_size, queue);
+
+    // temporary buffer for merge result
+    vector<value_type> temp(count, context);
+    bool result_in_temporary_buffer = false;
+
+    for(size_t i = block_size; i < count; i *= 2){
+        result_in_temporary_buffer = !result_in_temporary_buffer;
+        if(result_in_temporary_buffer) {
+            dispatch_merge_blocks(first, temp.begin(), compare, count, i,
+                                  merge_with_path_input_size_threshold,
+                                  merge_with_path_blocks_no_threshold,
+                                  queue);
+        } else {
+            dispatch_merge_blocks(temp.begin(), first, compare, count, i,
+                                  merge_with_path_input_size_threshold,
+                                  merge_with_path_blocks_no_threshold,
+                                  queue);
+        }
+    }
+
+    if(result_in_temporary_buffer) {
+        copy(temp.begin(), temp.end(), first, queue);
+    }
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP
diff --git a/include/boost/compute/algorithm/sort.hpp b/include/boost/compute/algorithm/sort.hpp
@@ -17,6 +17,7 @@
 
 #include <boost/compute/system.hpp>
 #include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/detail/merge_sort_on_cpu.hpp>
 #include <boost/compute/algorithm/detail/radix_sort.hpp>
 #include <boost/compute/algorithm/detail/insertion_sort.hpp>
 #include <boost/compute/algorithm/reverse.hpp>
@@ -30,13 +31,13 @@ namespace compute {
 namespace detail {
 
 template<class T>
-inline void dispatch_device_sort(buffer_iterator<T> first,
-                                 buffer_iterator<T> last,
-                                 less<T>,
-                                 command_queue &queue,
-                                 typename boost::enable_if_c<
-                                     is_radix_sortable<T>::value
-                                 >::type* = 0)
+inline void dispatch_gpu_sort(buffer_iterator<T> first,
+                              buffer_iterator<T> last,
+                              less<T>,
+                              command_queue &queue,
+                              typename boost::enable_if_c<
+                                  is_radix_sortable<T>::value
+                              >::type* = 0)
 {
     size_t count = detail::iterator_range_size(first, last);
 
@@ -53,13 +54,13 @@ inline void dispatch_device_sort(buffer_iterator<T> first,
 }
 
 template<class T>
-inline void dispatch_device_sort(buffer_iterator<T> first,
-                                 buffer_iterator<T> last,
-                                 greater<T> compare,
-                                 command_queue &queue,
-                                 typename boost::enable_if_c<
-                                     is_radix_sortable<T>::value
-                                 >::type* = 0)
+inline void dispatch_gpu_sort(buffer_iterator<T> first,
+                              buffer_iterator<T> last,
+                              greater<T> compare,
+                              command_queue &queue,
+                              typename boost::enable_if_c<
+                                  is_radix_sortable<T>::value
+                              >::type* = 0)
 {
     size_t count = detail::iterator_range_size(first, last);
 
@@ -82,10 +83,10 @@ inline void dispatch_device_sort(buffer_iterator<T> first,
 }
 
 template<class Iterator, class Compare>
-inline void dispatch_device_sort(Iterator first,
-                                 Iterator last,
-                                 Compare compare,
-                                 command_queue &queue)
+inline void dispatch_gpu_sort(Iterator first,
+                              Iterator last,
+                              Compare compare,
+                              command_queue &queue)
 {
     ::boost::compute::detail::serial_insertion_sort(
         first, last, compare, queue
@@ -102,7 +103,11 @@ inline void dispatch_sort(Iterator first,
                               is_device_iterator<Iterator>
                           >::type* = 0)
 {
-    dispatch_device_sort(first, last, compare, queue);
+    if(queue.get_device().type() & device::gpu) {
+        dispatch_gpu_sort(first, last, compare, queue);
+        return;
+    }
+    ::boost::compute::detail::merge_sort_on_cpu(first, last, compare, queue);
 }
 
 // sort() for host iterators
@@ -125,7 +130,7 @@ inline void dispatch_sort(Iterator first,
     );
 
     // sort mapped buffer
-    dispatch_device_sort(view.begin(), view.end(), compare, queue);
+    dispatch_sort(view.begin(), view.end(), compare, queue);
 
     // return results to host
     view.map(queue);

diff --git a/include/boost/compute/algorithm/stable_sort.hpp b/include/boost/compute/algorithm/stable_sort.hpp
@@ -15,6 +15,7 @@
 
 #include <boost/compute/system.hpp>
 #include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/detail/merge_sort_on_cpu.hpp>
 #include <boost/compute/algorithm/detail/radix_sort.hpp>
 #include <boost/compute/algorithm/detail/insertion_sort.hpp>
 #include <boost/compute/algorithm/reverse.hpp>
@@ -25,10 +26,10 @@ namespace compute {
 namespace detail {
 
 template<class Iterator, class Compare>
-inline void dispatch_stable_sort(Iterator first,
-                                 Iterator last,
-                                 Compare compare,
-                                 command_queue &queue)
+inline void dispatch_gpu_stable_sort(Iterator first,
+                                     Iterator last,
+                                     Compare compare,
+                                     command_queue &queue)
 {
     ::boost::compute::detail::serial_insertion_sort(
         first, last, compare, queue
@@ -37,20 +38,20 @@ inline void dispatch_stable_sort(Iterator first,
 
 template<class T>
 inline typename boost::enable_if_c<is_radix_sortable<T>::value>::type
-dispatch_stable_sort(buffer_iterator<T> first,
-                     buffer_iterator<T> last,
-                     less<T>,
-                     command_queue &queue)
+dispatch_gpu_stable_sort(buffer_iterator<T> first,
+                         buffer_iterator<T> last,
+                         less<T>,
+                         command_queue &queue)
 {
     ::boost::compute::detail::radix_sort(first, last, queue);
 }
 
 template<class T>
 inline typename boost::enable_if_c<is_radix_sortable<T>::value>::type
-dispatch_stable_sort(buffer_iterator<T> first,
-                     buffer_iterator<T> last,
-                     greater<T>,
-                     command_queue &queue)
+dispatch_gpu_stable_sort(buffer_iterator<T> first,
+                         buffer_iterator<T> last,
+                         greater<T>,
+                         command_queue &queue)
 {
     // radix sort in ascending order
     ::boost::compute::detail::radix_sort(first, last, queue);
@@ -71,9 +72,12 @@ inline void stable_sort(Iterator first,
                         Compare compare,
                         command_queue &queue = system::default_queue())
 {
-    ::boost::compute::detail::dispatch_stable_sort(
-        first, last, compare, queue
-    );
+    if(queue.get_device().type() & device::gpu) {
+        ::boost::compute::detail::dispatch_gpu_stable_sort(
+            first, last, compare, queue
+        );
+    }
+    ::boost::compute::detail::merge_sort_on_cpu(first, last, compare, queue);
 }
 
 /// \overload