From 5a9b30176da0167d80f9614fd8034c0d1cf029e8 Mon Sep 17 00:00:00 2001 From: Jakub Szuppe Date: Sat, 25 Jul 2015 13:36:40 +0200 Subject: [PATCH 1/3] Add more tests for sort --- test/test_sort.cpp | 83 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/test/test_sort.cpp b/test/test_sort.cpp index ce76e1d48..0ee58c016 100644 --- a/test/test_sort.cpp +++ b/test/test_sort.cpp @@ -15,6 +15,19 @@ #include #include #include +#include + +struct Particle +{ + Particle(): x(0.f), y(0.f) { } + Particle(float _x, float _y): x(_x), y(_y) { } + + float x; + float y; +}; + +// adapt struct for OpenCL +BOOST_COMPUTE_ADAPT_STRUCT(Particle, Particle, (x, y)) #include "check_macros.hpp" #include "context_setup.hpp" @@ -277,4 +290,74 @@ BOOST_AUTO_TEST_CASE(sort_host_vector) CHECK_RANGE_EQUAL(int, 8, vector, (0, 1, 2, 3, 4, 5, 6, 7)); } +BOOST_AUTO_TEST_CASE(sort_custom_struct) +{ + // function to compare particles by their x-coordinate + BOOST_COMPUTE_FUNCTION(bool, sort_by_x, (Particle a, Particle b), + { + return a.x < b.x; + }); + + std::vector particles; + particles.push_back(Particle(0.1f, 0.f)); + particles.push_back(Particle(-0.4f, 0.f)); + particles.push_back(Particle(10.0f, 0.f)); + particles.push_back(Particle(0.001f, 0.f)); + + boost::compute::vector vector(4, context); + boost::compute::copy(particles.begin(), particles.end(), vector.begin(), queue); + BOOST_CHECK_EQUAL(vector.size(), size_t(4)); + BOOST_CHECK( + boost::compute::is_sorted(vector.begin(), vector.end(), + sort_by_x, queue) == false + ); + + boost::compute::sort(vector.begin(), vector.end(), sort_by_x, queue); + BOOST_CHECK( + boost::compute::is_sorted(vector.begin(), vector.end(), + sort_by_x, queue) == true + ); + boost::compute::copy(vector.begin(), vector.end(), particles.begin(), queue); + BOOST_CHECK_CLOSE(particles[0].x, -0.4f, 0.1); + BOOST_CHECK_CLOSE(particles[1].x, 0.001f, 0.1); + BOOST_CHECK_CLOSE(particles[2].x, 0.1f, 0.1); + BOOST_CHECK_CLOSE(particles[3].x, 10.0f, 0.1); +} + +BOOST_AUTO_TEST_CASE(sort_int2) +{ + using bc::int2_; + + BOOST_COMPUTE_FUNCTION(bool, sort_int2, (int2_ a, int2_ b), + { + return a.x < b.x; + }); + + const size_t size = 100; + std::vector host(size, int2_(0, 0)); + host[0] = int2_(100.f, 0.f); + host[size/4] = int2_(20.f, 0.f); + host[(size*3)/4] = int2_(9.f, 0.f); + host[size-3] = int2_(-10.0f, 0.f); + + boost::compute::vector vector(size, context); + boost::compute::copy(host.begin(), host.end(), vector.begin(), queue); + BOOST_CHECK_EQUAL(vector.size(), size); + BOOST_CHECK( + boost::compute::is_sorted(vector.begin(), vector.end(), + sort_int2, queue) == false + ); + + boost::compute::sort(vector.begin(), vector.end(), sort_int2, queue); + BOOST_CHECK( + boost::compute::is_sorted(vector.begin(), vector.end(), + sort_int2, queue) == true + ); + boost::compute::copy(vector.begin(), vector.end(), host.begin(), queue); + BOOST_CHECK_CLOSE(host[0][0], -10.f, 0.1); + BOOST_CHECK_CLOSE(host[(size - 3)][0], 9.f, 0.1); + BOOST_CHECK_CLOSE(host[(size - 2)][0], 20.f, 0.1); + BOOST_CHECK_CLOSE(host[(size - 1)][0], 100.f, 0.1); +} + BOOST_AUTO_TEST_SUITE_END() From b5bef712cdcb5e2037da3772c66d86ad3ced50c9 Mon Sep 17 00:00:00 2001 From: Jakub Szuppe Date: Tue, 28 Jul 2015 18:37:11 +0200 Subject: [PATCH 2/3] Merge sort for CPU devices --- .../algorithm/detail/merge_sort_on_cpu.hpp | 176 ++++++++++++++++++ include/boost/compute/algorithm/sort.hpp | 45 +++-- .../boost/compute/algorithm/stable_sort.hpp | 34 ++-- 3 files changed, 220 insertions(+), 35 deletions(-) create mode 100644 include/boost/compute/algorithm/detail/merge_sort_on_cpu.hpp diff --git a/include/boost/compute/algorithm/detail/merge_sort_on_cpu.hpp b/include/boost/compute/algorithm/detail/merge_sort_on_cpu.hpp new file mode 100644 index 000000000..0af6eae92 --- /dev/null +++ b/include/boost/compute/algorithm/detail/merge_sort_on_cpu.hpp @@ -0,0 +1,176 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2015 Jakub Szuppe +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP + +#include +#include +#include +#include +#include +#include + +namespace boost { +namespace compute { +namespace detail { + +template +inline void merge_blocks(Iterator first, + Iterator result, + Compare compare, + size_t count, + const size_t block_size, + command_queue &queue) +{ + meta_kernel k("merge_sort_on_cpu_merge_blocks"); + size_t count_arg = k.add_arg("count"); + size_t block_size_arg = k.add_arg("block_size"); + + k << + k.decl("b1_start") << " = get_global_id(0) * block_size * 2;\n" << + k.decl("b1_end") << " = min(count, b1_start + block_size);\n" << + k.decl("b2_start") << " = min(count, b1_start + block_size);\n" << + k.decl("b2_end") << " = min(count, b2_start + block_size);\n" << + k.decl("result_idx") << " = b1_start;\n" << + + // merging block 1 and block 2 (stable) + "while(b1_start < b1_end && b2_start < b2_end){\n" << + " if( " << compare(first[k.var("b2_start")], + first[k.var("b1_start")]) << "){\n" << + " " << result[k.var("result_idx")] << " = " << + first[k.var("b2_start")] << ";\n" << + " b2_start++;\n" << + " }\n" << + " else {\n" << + " " << result[k.var("result_idx")] << " = " << + first[k.var("b1_start")] << ";\n" << + " b1_start++;\n" << + " }\n" << + " result_idx++;\n" << + "}\n" << + "while(b1_start < b1_end){\n" << + " " << result[k.var("result_idx")] << " = " << + first[k.var("b1_start")] << ";\n" << + " b1_start++;\n" << + " result_idx++;\n" << + "}\n" << + "while(b2_start < b2_end){\n" << + " " << result[k.var("result_idx")] << " = " << + first[k.var("b2_start")] << ";\n" << + " b2_start++;\n" << + " result_idx++;\n" << + "}\n"; + + const context &context = queue.get_context(); + ::boost::compute::kernel kernel = k.compile(context); + kernel.set_arg(count_arg, static_cast(count)); + kernel.set_arg(block_size_arg, static_cast(block_size)); + + const size_t global_size = static_cast( + std::ceil(float(count) / (2 * block_size)) + ); + queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0); +} + +template +inline void block_insertion_sort(Iterator first, + Compare compare, + const size_t count, + const size_t block_size, + command_queue &queue) +{ + typedef typename std::iterator_traits::value_type T; + + meta_kernel k("merge_sort_on_cpu_block_insertion_sort"); + size_t count_arg = k.add_arg("count"); + size_t block_size_arg = k.add_arg("block_size"); + + k << + k.decl("start") << " = get_global_id(0) * block_size;\n" << + k.decl("end") << " = min(count, start + block_size);\n" << + + // block insertion sort (stable) + "for(uint i = start+1; i < end; i++){\n" << + " " << k.decl("value") << " = " << first[k.var("i")] << ";\n" << + " uint pos = i;\n" << + " while(pos > start && " << + compare(k.var("value"), + first[k.var("pos-1")]) << "){\n" << + " " << first[k.var("pos")] << " = " << first[k.var("pos-1")] << ";\n" << + " pos--;\n" << + " }\n" << + " " << first[k.var("pos")] << " = value;\n" << + "}\n"; // block insertion sort + + const context &context = queue.get_context(); + ::boost::compute::kernel kernel = k.compile(context); + kernel.set_arg(count_arg, static_cast(count)); + kernel.set_arg(block_size_arg, static_cast(block_size)); + + const size_t global_size = static_cast(std::ceil(float(count) / block_size)); + queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0); +} + +template +inline void merge_sort_on_cpu(Iterator first, + Iterator last, + Compare compare, + command_queue &queue) +{ + typedef typename std::iterator_traits::value_type value_type; + + size_t count = iterator_range_size(first, last); + if(count < 2){ + return; + } + // for small input size only insertion sort is performed + else if(count <= 512){ + block_insertion_sort(first, compare, count, count, queue); + return; + } + + const context &context = queue.get_context(); + const device &device = queue.get_device(); + + // loading parameters + std::string cache_key = + std::string("__boost_merge_sort_on_cpu_") + type_name(); + boost::shared_ptr parameters = + detail::parameter_cache::get_global_cache(device); + + const size_t block_size = + parameters->get(cache_key, "insertion_sort_block_size", 64); + block_insertion_sort(first, compare, count, block_size, queue); + + // temporary buffer for merge result + vector temp(count, context); + bool result_in_temp = false; + + for(size_t i = block_size; i < count; i *= 2){ + result_in_temp = !result_in_temp; + if(result_in_temp) { + merge_blocks(first, temp.begin(), compare, count, i, queue); + } else { + merge_blocks(temp.begin(), first, compare, count, i, queue); + } + } + + // if the result is in temp buffer we need to copy it to input + if(result_in_temp) { + copy(temp.begin(), temp.end(), first, queue); + } +} + +} // end detail namespace +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP diff --git a/include/boost/compute/algorithm/sort.hpp b/include/boost/compute/algorithm/sort.hpp index bed01c8d8..b2730b3e2 100644 --- a/include/boost/compute/algorithm/sort.hpp +++ b/include/boost/compute/algorithm/sort.hpp @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -30,13 +31,13 @@ namespace compute { namespace detail { template -inline void dispatch_device_sort(buffer_iterator first, - buffer_iterator last, - less, - command_queue &queue, - typename boost::enable_if_c< - is_radix_sortable::value - >::type* = 0) +inline void dispatch_gpu_sort(buffer_iterator first, + buffer_iterator last, + less, + command_queue &queue, + typename boost::enable_if_c< + is_radix_sortable::value + >::type* = 0) { size_t count = detail::iterator_range_size(first, last); @@ -53,13 +54,13 @@ inline void dispatch_device_sort(buffer_iterator first, } template -inline void dispatch_device_sort(buffer_iterator first, - buffer_iterator last, - greater compare, - command_queue &queue, - typename boost::enable_if_c< - is_radix_sortable::value - >::type* = 0) +inline void dispatch_gpu_sort(buffer_iterator first, + buffer_iterator last, + greater compare, + command_queue &queue, + typename boost::enable_if_c< + is_radix_sortable::value + >::type* = 0) { size_t count = detail::iterator_range_size(first, last); @@ -82,10 +83,10 @@ inline void dispatch_device_sort(buffer_iterator first, } template -inline void dispatch_device_sort(Iterator first, - Iterator last, - Compare compare, - command_queue &queue) +inline void dispatch_gpu_sort(Iterator first, + Iterator last, + Compare compare, + command_queue &queue) { ::boost::compute::detail::serial_insertion_sort( first, last, compare, queue @@ -102,7 +103,11 @@ inline void dispatch_sort(Iterator first, is_device_iterator >::type* = 0) { - dispatch_device_sort(first, last, compare, queue); + if(queue.get_device().type() & device::gpu) { + dispatch_gpu_sort(first, last, compare, queue); + return; + } + ::boost::compute::detail::merge_sort_on_cpu(first, last, compare, queue); } // sort() for host iterators @@ -125,7 +130,7 @@ inline void dispatch_sort(Iterator first, ); // sort mapped buffer - dispatch_device_sort(view.begin(), view.end(), compare, queue); + dispatch_sort(view.begin(), view.end(), compare, queue); // return results to host view.map(queue); diff --git a/include/boost/compute/algorithm/stable_sort.hpp b/include/boost/compute/algorithm/stable_sort.hpp index 71769c0e0..cd82a0a60 100644 --- a/include/boost/compute/algorithm/stable_sort.hpp +++ b/include/boost/compute/algorithm/stable_sort.hpp @@ -15,6 +15,7 @@ #include #include +#include #include #include #include @@ -25,10 +26,10 @@ namespace compute { namespace detail { template -inline void dispatch_stable_sort(Iterator first, - Iterator last, - Compare compare, - command_queue &queue) +inline void dispatch_gpu_stable_sort(Iterator first, + Iterator last, + Compare compare, + command_queue &queue) { ::boost::compute::detail::serial_insertion_sort( first, last, compare, queue @@ -37,20 +38,20 @@ inline void dispatch_stable_sort(Iterator first, template inline typename boost::enable_if_c::value>::type -dispatch_stable_sort(buffer_iterator first, - buffer_iterator last, - less, - command_queue &queue) +dispatch_gpu_stable_sort(buffer_iterator first, + buffer_iterator last, + less, + command_queue &queue) { ::boost::compute::detail::radix_sort(first, last, queue); } template inline typename boost::enable_if_c::value>::type -dispatch_stable_sort(buffer_iterator first, - buffer_iterator last, - greater, - command_queue &queue) +dispatch_gpu_stable_sort(buffer_iterator first, + buffer_iterator last, + greater, + command_queue &queue) { // radix sort in ascending order ::boost::compute::detail::radix_sort(first, last, queue); @@ -71,9 +72,12 @@ inline void stable_sort(Iterator first, Compare compare, command_queue &queue = system::default_queue()) { - ::boost::compute::detail::dispatch_stable_sort( - first, last, compare, queue - ); + if(queue.get_device().type() & device::gpu) { + ::boost::compute::detail::dispatch_gpu_stable_sort( + first, last, compare, queue + ); + } + ::boost::compute::detail::merge_sort_on_cpu(first, last, compare, queue); } /// \overload From b6c2e71ba3fdccd0658e9d2fd8ff5edfbcec56b0 Mon Sep 17 00:00:00 2001 From: Jakub Szuppe Date: Thu, 30 Jul 2015 18:52:52 +0200 Subject: [PATCH 3/3] Using merge with merge path algorithm in merge sort Merge with merge path is used in merge part of merge sort algorithm as it's seems to be more efficient than block merge when there are a few big sorted blocks left to be merged. --- .../algorithm/detail/merge_sort_on_cpu.hpp | 63 ++++++++++++++++--- 1 file changed, 56 insertions(+), 7 deletions(-) diff --git a/include/boost/compute/algorithm/detail/merge_sort_on_cpu.hpp b/include/boost/compute/algorithm/detail/merge_sort_on_cpu.hpp index 0af6eae92..a58104a54 100644 --- a/include/boost/compute/algorithm/detail/merge_sort_on_cpu.hpp +++ b/include/boost/compute/algorithm/detail/merge_sort_on_cpu.hpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -80,6 +81,39 @@ inline void merge_blocks(Iterator first, queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0); } +template +inline void dispatch_merge_blocks(Iterator first, + Iterator result, + Compare compare, + size_t count, + const size_t block_size, + const size_t input_size_threshold, + const size_t blocks_no_threshold, + command_queue &queue) +{ + const size_t blocks_no = static_cast( + std::ceil(float(count) / block_size) + ); + // merge with merge path should used only for the large arrays and at the + // end of merging part when there are only a few big blocks left to be merged + if(blocks_no <= blocks_no_threshold && count >= input_size_threshold){ + Iterator last = first + count; + for(size_t i = 0; i < count; i+= 2*block_size) + { + Iterator first1 = (std::min)(first + i, last); + Iterator last1 = (std::min)(first1 + block_size, last); + Iterator first2 = last1; + Iterator last2 = (std::min)(first2 + block_size, last); + Iterator block_result = (std::min)(result + i, result + count); + merge_with_merge_path(first1, last1, first2, last2, + block_result, compare, queue); + } + } + else { + merge_blocks(first, result, compare, count, block_size, queue); + } +} + template inline void block_insertion_sort(Iterator first, Compare compare, @@ -119,6 +153,7 @@ inline void block_insertion_sort(Iterator first, queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0); } +// This sort is stable. template inline void merge_sort_on_cpu(Iterator first, Iterator last, @@ -146,25 +181,39 @@ inline void merge_sort_on_cpu(Iterator first, boost::shared_ptr parameters = detail::parameter_cache::get_global_cache(device); + // When there is merge_with_path_blocks_no_threshold or less blocks left to + // merge AND input size is merge_with_merge_path_input_size_threshold or more + // merge_with_merge_path() algorithm is used to merge sorted blocks; + // otherwise merge_blocks() is used. + const size_t merge_with_path_blocks_no_threshold = + parameters->get(cache_key, "merge_with_merge_path_blocks_no_threshold", 8); + const size_t merge_with_path_input_size_threshold = + parameters->get(cache_key, "merge_with_merge_path_input_size_threshold", 2097152); + const size_t block_size = parameters->get(cache_key, "insertion_sort_block_size", 64); block_insertion_sort(first, compare, count, block_size, queue); // temporary buffer for merge result vector temp(count, context); - bool result_in_temp = false; + bool result_in_temporary_buffer = false; for(size_t i = block_size; i < count; i *= 2){ - result_in_temp = !result_in_temp; - if(result_in_temp) { - merge_blocks(first, temp.begin(), compare, count, i, queue); + result_in_temporary_buffer = !result_in_temporary_buffer; + if(result_in_temporary_buffer) { + dispatch_merge_blocks(first, temp.begin(), compare, count, i, + merge_with_path_input_size_threshold, + merge_with_path_blocks_no_threshold, + queue); } else { - merge_blocks(temp.begin(), first, compare, count, i, queue); + dispatch_merge_blocks(temp.begin(), first, compare, count, i, + merge_with_path_input_size_threshold, + merge_with_path_blocks_no_threshold, + queue); } } - // if the result is in temp buffer we need to copy it to input - if(result_in_temp) { + if(result_in_temporary_buffer) { copy(temp.begin(), temp.end(), first, queue); } }