Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
225 changes: 225 additions & 0 deletions include/boost/compute/algorithm/detail/merge_sort_on_cpu.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//

#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP

#include <boost/compute/kernel.hpp>
#include <boost/compute/program.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/detail/merge_with_merge_path.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>

namespace boost {
namespace compute {
namespace detail {

// Merges neighbouring pairs of sorted blocks with a single kernel launch.
//
// The range [first, first + count) is treated as a sequence of consecutive
// blocks of block_size elements (the last one may be shorter). Work-item g
// merges the block starting at g * block_size * 2 with the block that follows
// it and writes the merged sequence to the same offsets in result. When both
// candidates compare as equivalent the element from the first block is taken,
// which keeps the merge stable.
//
// first       - beginning of the input range
// result      - beginning of the output range (at least count elements)
// compare     - comparison function used in the generated kernel
// count       - number of elements in the input range
// block_size  - size of a single sorted block; must not be zero, since it is
//               used as a divisor when the global work size is computed
// queue       - command queue on which the kernel is enqueued
//
// Note: count and block_size are passed to the kernel as uint_ values.
template<class Iterator, class Compare>
inline void merge_blocks(Iterator first,
                         Iterator result,
                         Compare compare,
                         size_t count,
                         const size_t block_size,
                         command_queue &queue)
{
    meta_kernel k("merge_sort_on_cpu_merge_blocks");
    size_t count_arg = k.add_arg<const uint_>("count");
    size_t block_size_arg = k.add_arg<uint_>("block_size");

    k <<
        // boundaries of the two blocks handled by this work-item; every
        // boundary is clamped to count so that a short or missing second
        // block is handled correctly
        k.decl<uint_>("b1_start") << " = get_global_id(0) * block_size * 2;\n" <<
        k.decl<uint_>("b1_end") << " = min(count, b1_start + block_size);\n" <<
        k.decl<uint_>("b2_start") << " = min(count, b1_start + block_size);\n" <<
        k.decl<uint_>("b2_end") << " = min(count, b2_start + block_size);\n" <<
        k.decl<uint_>("result_idx") << " = b1_start;\n" <<

        // merging block 1 and block 2 (stable)
        "while(b1_start < b1_end && b2_start < b2_end){\n" <<
        " if( " << compare(first[k.var<uint_>("b2_start")],
                           first[k.var<uint_>("b1_start")]) << "){\n" <<
        " " << result[k.var<uint_>("result_idx")] << " = " <<
               first[k.var<uint_>("b2_start")] << ";\n" <<
        " b2_start++;\n" <<
        " }\n" <<
        " else {\n" <<
        " " << result[k.var<uint_>("result_idx")] << " = " <<
               first[k.var<uint_>("b1_start")] << ";\n" <<
        " b1_start++;\n" <<
        " }\n" <<
        " result_idx++;\n" <<
        "}\n" <<
        // copy whatever is left in block 1
        "while(b1_start < b1_end){\n" <<
        " " << result[k.var<uint_>("result_idx")] << " = " <<
               first[k.var<uint_>("b1_start")] << ";\n" <<
        " b1_start++;\n" <<
        " result_idx++;\n" <<
        "}\n" <<
        // copy whatever is left in block 2
        "while(b2_start < b2_end){\n" <<
        " " << result[k.var<uint_>("result_idx")] << " = " <<
               first[k.var<uint_>("b2_start")] << ";\n" <<
        " b2_start++;\n" <<
        " result_idx++;\n" <<
        "}\n";

    const context &context = queue.get_context();
    ::boost::compute::kernel kernel = k.compile(context);
    kernel.set_arg(count_arg, static_cast<uint_>(count));
    kernel.set_arg(block_size_arg, static_cast<uint_>(block_size));

    // One work-item per pair of blocks: ceil(count / (2 * block_size)).
    // Computed with integer arithmetic on purpose: a float cannot represent
    // every count above 2^24, so a float-based ceil() could round the number
    // of work-items down and leave the tail of the input unmerged.
    const size_t pair_size = 2 * block_size;
    const size_t global_size =
        count / pair_size + (count % pair_size != 0 ? 1 : 0);
    queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0);
}

// Performs one merging pass over [first, first + count), choosing between
// merge_with_merge_path() and merge_blocks().
//
// The input is treated as consecutive sorted blocks of block_size elements.
// Each neighbouring pair of blocks is merged and written to the same offsets
// in result.
//
// first                 - beginning of the input range
// result                - beginning of the output range
// compare               - comparison function
// count                 - number of elements in the input range
// block_size            - size of a single sorted block; must not be zero
//                         (it is used as a divisor)
// input_size_threshold  - minimal count for which merge_with_merge_path()
//                         may be used
// blocks_no_threshold   - maximal number of blocks for which
//                         merge_with_merge_path() may be used
// queue                 - command queue used for all enqueued work
template<class Iterator, class Compare>
inline void dispatch_merge_blocks(Iterator first,
                                  Iterator result,
                                  Compare compare,
                                  size_t count,
                                  const size_t block_size,
                                  const size_t input_size_threshold,
                                  const size_t blocks_no_threshold,
                                  command_queue &queue)
{
    // Number of blocks left to merge: ceil(count / block_size). Integer
    // arithmetic is used because a float cannot represent every count above
    // 2^24 exactly, which could skew the threshold check below.
    const size_t blocks_no =
        count / block_size + (count % block_size != 0 ? 1 : 0);

    // merge with merge path should be used only for large arrays and at the
    // end of the merging part, when there are only a few big blocks left to
    // be merged
    if(blocks_no <= blocks_no_threshold && count >= input_size_threshold){
        Iterator last = first + count;
        for(size_t i = 0; i < count; i+= 2*block_size)
        {
            // every boundary is clamped to last, so the final pair may
            // consist of a short (or empty) second block
            Iterator first1 = (std::min)(first + i, last);
            Iterator last1 = (std::min)(first1 + block_size, last);
            Iterator first2 = last1;
            Iterator last2 = (std::min)(first2 + block_size, last);
            Iterator block_result = (std::min)(result + i, result + count);
            merge_with_merge_path(first1, last1, first2, last2,
                                  block_result, compare, queue);
        }
    }
    else {
        merge_blocks(first, result, compare, count, block_size, queue);
    }
}

// Sorts every block of block_size consecutive elements of
// [first, first + count) in place with insertion sort.
//
// One work-item is launched per block; work-item g sorts the elements in
// [g * block_size, min(count, (g + 1) * block_size)). An element is moved in
// front of its predecessor only when compare(element, predecessor) holds, so
// equivalent elements keep their relative order (the sort is stable).
//
// first       - beginning of the range to sort
// compare     - comparison function used in the generated kernel
// count       - number of elements in the range
// block_size  - number of elements sorted by a single work-item; must not be
//               zero, since it is used as a divisor when the global work size
//               is computed
// queue       - command queue on which the kernel is enqueued
//
// Note: count and block_size are passed to the kernel as uint_ values.
template<class Iterator, class Compare>
inline void block_insertion_sort(Iterator first,
                                 Compare compare,
                                 const size_t count,
                                 const size_t block_size,
                                 command_queue &queue)
{
    typedef typename std::iterator_traits<Iterator>::value_type T;

    meta_kernel k("merge_sort_on_cpu_block_insertion_sort");
    size_t count_arg = k.add_arg<uint_>("count");
    size_t block_size_arg = k.add_arg<uint_>("block_size");

    k <<
        // boundaries of the block sorted by this work-item
        k.decl<uint_>("start") << " = get_global_id(0) * block_size;\n" <<
        k.decl<uint_>("end") << " = min(count, start + block_size);\n" <<

        // block insertion sort (stable)
        "for(uint i = start+1; i < end; i++){\n" <<
        " " << k.decl<const T>("value") << " = " << first[k.var<uint_>("i")] << ";\n" <<
        " uint pos = i;\n" <<
        // shift greater elements one position to the right
        " while(pos > start && " <<
        compare(k.var<const T>("value"),
                first[k.var<uint_>("pos-1")]) << "){\n" <<
        " " << first[k.var<uint_>("pos")] << " = " << first[k.var<uint_>("pos-1")] << ";\n" <<
        " pos--;\n" <<
        " }\n" <<
        " " << first[k.var<uint_>("pos")] << " = value;\n" <<
        "}\n"; // block insertion sort

    const context &context = queue.get_context();
    ::boost::compute::kernel kernel = k.compile(context);
    kernel.set_arg(count_arg, static_cast<uint_>(count));
    kernel.set_arg(block_size_arg, static_cast<uint_>(block_size));

    // One work-item per block: ceil(count / block_size). Computed with
    // integer arithmetic on purpose: a float cannot represent every count
    // above 2^24, so a float-based ceil() could round the number of
    // work-items down and leave the last block unsorted.
    const size_t global_size =
        count / block_size + (count % block_size != 0 ? 1 : 0);
    queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0);
}

// This sort is stable.
template<class Iterator, class Compare>
inline void merge_sort_on_cpu(Iterator first,
Iterator last,
Compare compare,
command_queue &queue)
{
typedef typename std::iterator_traits<Iterator>::value_type value_type;

size_t count = iterator_range_size(first, last);
if(count < 2){
return;
}
// for small input size only insertion sort is performed
else if(count <= 512){
block_insertion_sort(first, compare, count, count, queue);
return;
}

const context &context = queue.get_context();
const device &device = queue.get_device();

// loading parameters
std::string cache_key =
std::string("__boost_merge_sort_on_cpu_") + type_name<value_type>();
boost::shared_ptr<parameter_cache> parameters =
detail::parameter_cache::get_global_cache(device);

// When there is merge_with_path_blocks_no_threshold or less blocks left to
// merge AND input size is merge_with_merge_path_input_size_threshold or more
// merge_with_merge_path() algorithm is used to merge sorted blocks;
// otherwise merge_blocks() is used.
const size_t merge_with_path_blocks_no_threshold =
parameters->get(cache_key, "merge_with_merge_path_blocks_no_threshold", 8);
const size_t merge_with_path_input_size_threshold =
parameters->get(cache_key, "merge_with_merge_path_input_size_threshold", 2097152);

const size_t block_size =
parameters->get(cache_key, "insertion_sort_block_size", 64);
block_insertion_sort(first, compare, count, block_size, queue);

// temporary buffer for merge result
vector<value_type> temp(count, context);
bool result_in_temporary_buffer = false;

for(size_t i = block_size; i < count; i *= 2){
result_in_temporary_buffer = !result_in_temporary_buffer;
if(result_in_temporary_buffer) {
dispatch_merge_blocks(first, temp.begin(), compare, count, i,
merge_with_path_input_size_threshold,
merge_with_path_blocks_no_threshold,
queue);
} else {
dispatch_merge_blocks(temp.begin(), first, compare, count, i,
merge_with_path_input_size_threshold,
merge_with_path_blocks_no_threshold,
queue);
}
}

if(result_in_temporary_buffer) {
copy(temp.begin(), temp.end(), first, queue);
}
}

} // end detail namespace
} // end compute namespace
} // end boost namespace

#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP
45 changes: 25 additions & 20 deletions include/boost/compute/algorithm/sort.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/detail/merge_sort_on_cpu.hpp>
#include <boost/compute/algorithm/detail/radix_sort.hpp>
#include <boost/compute/algorithm/detail/insertion_sort.hpp>
#include <boost/compute/algorithm/reverse.hpp>
Expand All @@ -30,13 +31,13 @@ namespace compute {
namespace detail {

template<class T>
inline void dispatch_device_sort(buffer_iterator<T> first,
buffer_iterator<T> last,
less<T>,
command_queue &queue,
typename boost::enable_if_c<
is_radix_sortable<T>::value
>::type* = 0)
inline void dispatch_gpu_sort(buffer_iterator<T> first,
buffer_iterator<T> last,
less<T>,
command_queue &queue,
typename boost::enable_if_c<
is_radix_sortable<T>::value
>::type* = 0)
{
size_t count = detail::iterator_range_size(first, last);

Expand All @@ -53,13 +54,13 @@ inline void dispatch_device_sort(buffer_iterator<T> first,
}

template<class T>
inline void dispatch_device_sort(buffer_iterator<T> first,
buffer_iterator<T> last,
greater<T> compare,
command_queue &queue,
typename boost::enable_if_c<
is_radix_sortable<T>::value
>::type* = 0)
inline void dispatch_gpu_sort(buffer_iterator<T> first,
buffer_iterator<T> last,
greater<T> compare,
command_queue &queue,
typename boost::enable_if_c<
is_radix_sortable<T>::value
>::type* = 0)
{
size_t count = detail::iterator_range_size(first, last);

Expand All @@ -82,10 +83,10 @@ inline void dispatch_device_sort(buffer_iterator<T> first,
}

template<class Iterator, class Compare>
inline void dispatch_device_sort(Iterator first,
Iterator last,
Compare compare,
command_queue &queue)
inline void dispatch_gpu_sort(Iterator first,
Iterator last,
Compare compare,
command_queue &queue)
{
::boost::compute::detail::serial_insertion_sort(
first, last, compare, queue
Expand All @@ -102,7 +103,11 @@ inline void dispatch_sort(Iterator first,
is_device_iterator<Iterator>
>::type* = 0)
{
dispatch_device_sort(first, last, compare, queue);
if(queue.get_device().type() & device::gpu) {
dispatch_gpu_sort(first, last, compare, queue);
return;
}
::boost::compute::detail::merge_sort_on_cpu(first, last, compare, queue);
}

// sort() for host iterators
Expand All @@ -125,7 +130,7 @@ inline void dispatch_sort(Iterator first,
);

// sort mapped buffer
dispatch_device_sort(view.begin(), view.end(), compare, queue);
dispatch_sort(view.begin(), view.end(), compare, queue);

// return results to host
view.map(queue);
Expand Down
34 changes: 19 additions & 15 deletions include/boost/compute/algorithm/stable_sort.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/detail/merge_sort_on_cpu.hpp>
#include <boost/compute/algorithm/detail/radix_sort.hpp>
#include <boost/compute/algorithm/detail/insertion_sort.hpp>
#include <boost/compute/algorithm/reverse.hpp>
Expand All @@ -25,10 +26,10 @@ namespace compute {
namespace detail {

template<class Iterator, class Compare>
inline void dispatch_stable_sort(Iterator first,
Iterator last,
Compare compare,
command_queue &queue)
inline void dispatch_gpu_stable_sort(Iterator first,
Iterator last,
Compare compare,
command_queue &queue)
{
::boost::compute::detail::serial_insertion_sort(
first, last, compare, queue
Expand All @@ -37,20 +38,20 @@ inline void dispatch_stable_sort(Iterator first,

template<class T>
inline typename boost::enable_if_c<is_radix_sortable<T>::value>::type
dispatch_stable_sort(buffer_iterator<T> first,
buffer_iterator<T> last,
less<T>,
command_queue &queue)
dispatch_gpu_stable_sort(buffer_iterator<T> first,
buffer_iterator<T> last,
less<T>,
command_queue &queue)
{
::boost::compute::detail::radix_sort(first, last, queue);
}

template<class T>
inline typename boost::enable_if_c<is_radix_sortable<T>::value>::type
dispatch_stable_sort(buffer_iterator<T> first,
buffer_iterator<T> last,
greater<T>,
command_queue &queue)
dispatch_gpu_stable_sort(buffer_iterator<T> first,
buffer_iterator<T> last,
greater<T>,
command_queue &queue)
{
// radix sort in ascending order
::boost::compute::detail::radix_sort(first, last, queue);
Expand All @@ -71,9 +72,12 @@ inline void stable_sort(Iterator first,
Compare compare,
command_queue &queue = system::default_queue())
{
::boost::compute::detail::dispatch_stable_sort(
first, last, compare, queue
);
if(queue.get_device().type() & device::gpu) {
    ::boost::compute::detail::dispatch_gpu_stable_sort(
        first, last, compare, queue
    );
    // the GPU path has already sorted the range; without this return the
    // range would be sorted a second time by merge_sort_on_cpu() below
    return;
}
::boost::compute::detail::merge_sort_on_cpu(first, last, compare, queue);
}

/// \overload
Expand Down
Loading