Skip to content

Commit

Permalink
Improve sort_by_key() performance
Browse files Browse the repository at this point in the history
  • Loading branch information
kylelutz committed Mar 13, 2014
1 parent cf8e972 commit bae7432
Show file tree
Hide file tree
Showing 5 changed files with 168 additions and 11 deletions.
67 changes: 57 additions & 10 deletions include/boost/compute/algorithm/detail/radix_sort.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,13 @@ const char radix_sort_source[] =
" const uint low_bit,\n"
" __global const uint *counts,\n"
" __global const uint *global_offsets,\n"
"#ifndef SORT_BY_KEY\n"
" __global T *output)\n"
"#else\n"
" __global T *keys_output,\n"
" __global T2 *values_input,\n"
" __global T2 *values_output)\n"
"#endif\n"
"{\n"
// work-item parameters
" const uint gid = get_global_id(0);\n"
Expand Down Expand Up @@ -172,21 +178,24 @@ const char radix_sort_source[] =
" local_offset++;\n"
" }\n"

"#ifndef SORT_BY_KEY\n"
// write value to output
" output[offset + local_offset] = value;\n"
"#else\n"
// write key and value if doing sort_by_key
" keys_output[offset + local_offset] = value;\n"
" values_output[offset + local_offset] = values_input[gid];\n"
"#endif\n"
"}\n";

template<class Iterator>
inline void radix_sort(Iterator first,
Iterator last,
command_queue &queue)
template<class T, class T2>
inline void radix_sort_impl(const buffer_iterator<T> first,
const buffer_iterator<T> last,
const buffer_iterator<T2> values_first,
command_queue &queue)
{
typedef typename
std::iterator_traits<Iterator>::value_type
value_type;
typedef typename
radix_sort_value_type<sizeof(value_type)>::type
sort_type;
typedef T value_type;
typedef typename radix_sort_value_type<sizeof(T)>::type sort_type;

const context &context = queue.get_context();

Expand All @@ -205,10 +214,18 @@ inline void radix_sort(Iterator first,
block_count++;
}

// if we have a valid values iterator then we are doing a
// sort by key and have to set up the values buffer
bool sort_by_key = (values_first.get_buffer().get() != 0);

// load (or create) radix sort program
std::string cache_key =
std::string("radix_sort_") + type_name<value_type>();

if(sort_by_key){
cache_key += std::string("_with_") + type_name<T2>();
}

program radix_sort_program = cache->get(cache_key);

if(!radix_sort_program.get()){
Expand All @@ -225,6 +242,11 @@ inline void radix_sort(Iterator first,
options << " -DIS_SIGNED";
}

if(sort_by_key){
options << " -DSORT_BY_KEY";
options << " -DT2=" << type_name<T2>();
}

radix_sort_program =
program::build_with_source(radix_sort_source, context, options.str());

Expand All @@ -237,11 +259,14 @@ inline void radix_sort(Iterator first,

// setup temporary buffers
vector<value_type> output(count, context);
vector<T2> values_output(sort_by_key ? count : 0, context);
vector<uint_> offsets(k2, context);
vector<uint_> counts(block_count * k2, context);

const buffer *input_buffer = &first.get_buffer();
const buffer *output_buffer = &output.get_buffer();
const buffer *values_input_buffer = &values_first.get_buffer();
const buffer *values_output_buffer = &values_output.get_buffer();

for(uint_ i = 0; i < sizeof(sort_type) * CHAR_BIT / k; i++){
// write counts
Expand Down Expand Up @@ -302,16 +327,38 @@ inline void radix_sort(Iterator first,
scatter_kernel.set_arg(3, counts);
scatter_kernel.set_arg(4, offsets);
scatter_kernel.set_arg(5, *output_buffer);
if(sort_by_key){
scatter_kernel.set_arg(6, *values_input_buffer);
scatter_kernel.set_arg(7, *values_output_buffer);
}
queue.enqueue_1d_range_kernel(scatter_kernel,
0,
block_count * block_size,
block_size);

// swap buffers
std::swap(input_buffer, output_buffer);
std::swap(values_input_buffer, values_output_buffer);
}
}

template<class Iterator>
inline void radix_sort(Iterator first,
Iterator last,
command_queue &queue)
{
radix_sort_impl(first, last, buffer_iterator<int>(), queue);
}

// Radix sorts the keys in [keys_first, keys_last) using the given command
// queue, passing values_first along as the start of the associated values
// range.
//
// All of the work is done by radix_sort_impl(); this wrapper only exists to
// give the sort-by-key variant its own name alongside radix_sort().
template<class KeyIterator, class ValueIterator>
inline void radix_sort_by_key(KeyIterator keys_first,
                              KeyIterator keys_last,
                              ValueIterator values_first,
                              command_queue &queue)
{
    radix_sort_impl(
        keys_first,
        keys_last,
        values_first,
        queue
    );
}

} // end detail namespace
} // end compute namespace
} // end boost namespace
Expand Down
14 changes: 13 additions & 1 deletion include/boost/compute/algorithm/sort_by_key.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/detail/insertion_sort.hpp>
#include <boost/compute/algorithm/detail/radix_sort.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>

namespace boost {
Expand Down Expand Up @@ -53,7 +54,18 @@ inline void sort_by_key(KeyIterator keys_first,
{
typedef typename std::iterator_traits<KeyIterator>::value_type key_type;

sort_by_key(keys_first, keys_last, values_first, less<key_type>(), queue);
size_t count = detail::iterator_range_size(keys_first, keys_last);

if(count < 32){
detail::serial_insertion_sort_by_key(
keys_first, keys_last, values_first, less<key_type>(), queue
);
}
else {
detail::radix_sort_by_key(
keys_first, keys_last, values_first, queue
);
}
}

} // end compute namespace
Expand Down
1 change: 1 addition & 0 deletions perf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ set(BENCHMARKS
host_sort
saxpy
sort
sort_by_key
sort_float
)

Expand Down
75 changes: 75 additions & 0 deletions perf/perf_sort_by_key.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://kylelutz.github.com/compute for more information.
//---------------------------------------------------------------------------//

#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <vector>

#include <boost/compute/system.hpp>
#include <boost/compute/algorithm/sort_by_key.hpp>
#include <boost/compute/algorithm/is_sorted.hpp>
#include <boost/compute/container/vector.hpp>

#include "perf.hpp"

// Benchmark driver for boost::compute::sort_by_key().
//
// Fills a key vector with random ints and a value vector with the same
// numbers, sorts them by key on the default device PERF_TRIALS times, and
// prints the result of t.min_time() scaled by 1e6. Afterwards both the keys
// and the values are checked with is_sorted(); since every value equals its
// key, the values must come out sorted as well.
//
// Returns 0 on success and -1 if either range is not sorted.
int main(int argc, char *argv[])
{
    perf_parse_args(argc, argv);

    std::cout << "size: " << PERF_N << std::endl;

    // setup context and queue for the default device
    boost::compute::device device = boost::compute::system::default_device();
    boost::compute::context context(device);
    boost::compute::command_queue queue(context, device);
    std::cout << "device: " << device.name() << std::endl;

    // create vector of random keys on the host; the values mirror the keys
    // NOTE(review): host `long` can be 32 bits on some platforms while an
    // OpenCL `long` is 64 bits -- confirm the value type is portable.
    std::vector<int> host_keys(PERF_N);
    std::generate(host_keys.begin(), host_keys.end(), std::rand);
    std::vector<long> host_values(PERF_N);
    std::copy(host_keys.begin(), host_keys.end(), host_values.begin());

    // create the vectors on the device (filled at the start of each trial)
    boost::compute::vector<int> device_keys(PERF_N, context);
    boost::compute::vector<long> device_values(PERF_N, context);

    perf_timer t;
    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
        // re-copy the host data so each trial starts from the same input;
        // these copies are issued before the timer is started
        boost::compute::copy(
            host_keys.begin(), host_keys.end(), device_keys.begin(), queue
        );
        boost::compute::copy(
            host_values.begin(), host_values.end(), device_values.begin(), queue
        );

        t.start();
        // sort vector; finish() is called before stopping the timer so the
        // measurement covers the queued sort work
        boost::compute::sort_by_key(
            device_keys.begin(), device_keys.end(), device_values.begin(), queue
        );
        queue.finish();
        t.stop();
    }
    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;

    // verify keys are sorted
    if(!boost::compute::is_sorted(device_keys.begin(), device_keys.end(), queue)){
        std::cout << "ERROR: is_sorted() returned false for the keys" << std::endl;
        return -1;
    }
    // verify values are sorted
    if(!boost::compute::is_sorted(device_values.begin(), device_values.end(), queue)){
        std::cout << "ERROR: is_sorted() returned false for the values" << std::endl;
        return -1;
    }

    return 0;
}
22 changes: 22 additions & 0 deletions test/test_sort_by_key.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,4 +80,26 @@ BOOST_AUTO_TEST_CASE(sort_char_by_int)
CHECK_RANGE_EQUAL(char, 8, values, ('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'));
}

// Sorts 1024 int keys together with float values and checks that both
// ranges go from "not sorted" before sort_by_key() to "sorted" after it.
BOOST_AUTO_TEST_CASE(sort_int_and_float)
{
    // host input: keys run count, count-1, ..., 1 and every value is half
    // of its key, so both sequences start out in descending order
    const int count = 1024;
    std::vector<int> descending_keys(count);
    std::vector<float> halved_values(count);
    for(int idx = 0; idx < count; ++idx){
        const int key = count - idx;
        descending_keys[idx] = key;
        halved_values[idx] = key / 2.f;
    }

    // construct the compute vectors from the host data
    compute::vector<int> keys(
        descending_keys.begin(), descending_keys.end(), queue
    );
    compute::vector<float> values(
        halved_values.begin(), halved_values.end(), queue
    );

    // precondition: neither range is sorted yet
    BOOST_CHECK(compute::is_sorted(keys.begin(), keys.end(), queue) == false);
    BOOST_CHECK(compute::is_sorted(values.begin(), values.end(), queue) == false);

    compute::sort_by_key(keys.begin(), keys.end(), values.begin(), queue);

    // postcondition: keys are sorted and the values followed their keys
    BOOST_CHECK(compute::is_sorted(keys.begin(), keys.end(), queue) == true);
    BOOST_CHECK(compute::is_sorted(values.begin(), values.end(), queue) == true);
}

BOOST_AUTO_TEST_SUITE_END()

0 comments on commit bae7432

Please sign in to comment.