diff --git a/perf/CMakeLists.txt b/perf/CMakeLists.txt index 9e604db0c..99dddfedf 100644 --- a/perf/CMakeLists.txt +++ b/perf/CMakeLists.txt @@ -117,11 +117,17 @@ if(${BOOST_COMPUTE_HAVE_CUDA}) set(CUDA_BENCHMARKS thrust_accumulate thrust_count + thrust_exclusive_scan thrust_inner_product + thrust_merge thrust_partial_sum + thrust_partition + thrust_reverse + thrust_rotate thrust_saxpy + thrust_set_difference thrust_sort - thrust_exclusive_scan + thrust_unique ) foreach(BENCHMARK ${CUDA_BENCHMARKS}) diff --git a/perf/perf.py b/perf/perf.py index 94bf0d4f9..dafd49277 100755 --- a/perf/perf.py +++ b/perf/perf.py @@ -114,43 +114,55 @@ def run_benchmark(name, sizes, vs=[]): report.add_sample("compute", size, time) competitors = { - "thrust" : ["accumulate", - "count", - "inner_product", - "partial_sum", - "sort", - "saxpy"], - "tbb": ["accumulate", - "merge", - "sort"], - "stl": ["accumulate", - "count", - "find_end", - "includes", - "inner_product", - "is_permutation", - "max_element", - "merge", - "next_permutation", - "nth_element", - "partial_sum", - "partition", - "partition_point", - "prev_permutation", - "reverse", - "rotate", - "rotate_copy", - "saxpy", - "search", - "search_n", - "set_difference", - "set_intersection", - "set_symmetric_difference", - "set_union", - "sort", - "stable_partition", - "unique", - "unique_copy"] + "thrust" : [ + "accumulate", + "count", + "exclusive_scan", + "inner_product", + "merge", + "partial_sum", + "partition", + "reverse", + "rotate", + "saxpy", + "sort", + "unique" + ], + "tbb": [ + "accumulate", + "merge", + "sort" + ], + "stl": [ + "accumulate", + "count", + "find_end", + "includes", + "inner_product", + "is_permutation", + "max_element", + "merge", + "next_permutation", + "nth_element", + "partial_sum", + "partition", + "partition_point", + "prev_permutation", + "reverse", + "rotate", + "rotate_copy", + "saxpy", + "search", + "search_n", + "set_difference", + "set_intersection", + "set_symmetric_difference", + 
"set_union", + "sort", + "stable_partition", + "unique", + "unique_copy" + ] } for other in vs: diff --git a/perf/perf_partition.cpp b/perf/perf_partition.cpp index 311b473f7..68488d6b9 100644 --- a/perf/perf_partition.cpp +++ b/perf/perf_partition.cpp @@ -49,6 +49,10 @@ int main(int argc, char *argv[]) perf_timer t; for(size_t trial = 0; trial < PERF_TRIALS; trial++){ + boost::compute::copy( + host_vector.begin(), host_vector.end(), device_vector.begin(), queue + ); + t.start(); boost::compute::partition( device_vector.begin(), device_vector.end(), _1 < 10, queue diff --git a/perf/perf_set_difference.cpp b/perf/perf_set_difference.cpp index c113e5a7f..8bb1cdc5d 100644 --- a/perf/perf_set_difference.cpp +++ b/perf/perf_set_difference.cpp @@ -36,39 +36,40 @@ int main(int argc, char *argv[]) std::cout << "device: " << device.name() << std::endl; // create vectors of random numbers on the host - std::vector host_vector(PERF_N); - std::generate(host_vector.begin(), host_vector.end(), rand_int); - std::sort(host_vector.begin(), host_vector.end()); - - std::vector host_vector2(PERF_N); - std::generate(host_vector2.begin(), host_vector2.end(), rand_int); - std::sort(host_vector2.begin(), host_vector2.end()); + std::vector v1(std::floor(PERF_N / 2.0)); + std::vector v2(std::ceil(PERF_N / 2.0)); + std::generate(v1.begin(), v1.end(), rand_int); + std::generate(v2.begin(), v2.end(), rand_int); + std::sort(v1.begin(), v1.end()); + std::sort(v2.begin(), v2.end()); // create vectors on the device and copy the data - boost::compute::vector device_vector(PERF_N, context); + boost::compute::vector gpu_v1(v1.size(), context); + boost::compute::vector gpu_v2(v2.size(), context); + boost::compute::copy( - host_vector.begin(), host_vector.end(), device_vector.begin(), queue + v1.begin(), v1.end(), gpu_v1.begin(), queue ); - - boost::compute::vector device_vector2(PERF_N, context); boost::compute::copy( - host_vector2.begin(), host_vector2.end(), device_vector2.begin(), queue + v2.begin(), 
v2.end(), gpu_v2.begin(), queue ); - boost::compute::vector result(PERF_N, context); + boost::compute::vector gpu_v3(PERF_N, context); + boost::compute::vector::iterator gpu_v3_end; perf_timer t; for(size_t trial = 0; trial < PERF_TRIALS; trial++){ t.start(); - boost::compute::set_difference( - device_vector.begin(), device_vector.begin(), - device_vector2.begin(), device_vector2.end(), - result.begin(), queue + gpu_v3_end = boost::compute::set_difference( + gpu_v1.begin(), gpu_v1.end(), + gpu_v2.begin(), gpu_v2.end(), + gpu_v3.begin(), queue ); queue.finish(); t.stop(); } std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl; + std::cout << "size: " << std::distance(gpu_v3.begin(), gpu_v3_end) << std::endl; return 0; } diff --git a/perf/perf_stl_set_difference.cpp b/perf/perf_stl_set_difference.cpp index 7172f243c..bf436b48a 100644 --- a/perf/perf_stl_set_difference.cpp +++ b/perf/perf_stl_set_difference.cpp @@ -25,21 +25,22 @@ int main(int argc, char *argv[]) std::cout << "size: " << PERF_N << std::endl; - std::vector v1(PERF_N); - std::generate(v1.begin(), v1.end(), rand_int); + std::vector v1(std::floor(PERF_N / 2.0)); + std::vector v2(std::ceil(PERF_N / 2.0)); - std::vector v2(PERF_N); + std::generate(v1.begin(), v1.end(), rand_int); std::generate(v2.begin(), v2.end(), rand_int); - std::vector v3(PERF_N); - std::sort(v1.begin(), v1.end()); std::sort(v2.begin(), v2.end()); + std::vector v3(PERF_N); + std::vector::iterator v3_end; + perf_timer t; for(size_t trial = 0; trial < PERF_TRIALS; trial++){ t.start(); - std::set_difference( + v3_end = std::set_difference( v1.begin(), v1.end(), v2.begin(), v2.end(), v3.begin() @@ -47,6 +48,7 @@ int main(int argc, char *argv[]) t.stop(); } std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl; + std::cout << "size: " << std::distance(v3.begin(), v3_end) << std::endl; return 0; } diff --git a/perf/perf_thrust_merge.cu b/perf/perf_thrust_merge.cu new file mode 100644 index 000000000..4bded4674 --- 
/dev/null +++ b/perf/perf_thrust_merge.cu @@ -0,0 +1,63 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013-2014 Kyle Lutz +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://kylelutz.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#include +#include +#include + +#include +#include +#include +#include + +#include "perf.hpp" + +int main(int argc, char *argv[]) +{ + perf_parse_args(argc, argv); + + std::cout << "size: " << PERF_N << std::endl; + thrust::host_vector v1(std::floor(PERF_N / 2.0)); + thrust::host_vector v2(std::ceil(PERF_N / 2.0)); + std::generate(v1.begin(), v1.end(), rand); + std::generate(v2.begin(), v2.end(), rand); + std::sort(v1.begin(), v1.end()); + std::sort(v2.begin(), v2.end()); + + // transfer data to the device + thrust::device_vector gpu_v1 = v1; + thrust::device_vector gpu_v2 = v2; + thrust::device_vector gpu_v3(PERF_N); + + perf_timer t; + for(size_t trial = 0; trial < PERF_TRIALS; trial++){ + t.start(); + thrust::merge( + gpu_v1.begin(), gpu_v1.end(), + gpu_v2.begin(), gpu_v2.end(), + gpu_v3.begin() + ); + cudaDeviceSynchronize(); + t.stop(); + } + std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl; + + thrust::host_vector check_v3 = gpu_v3; + + thrust::host_vector v3(PERF_N); + std::merge(v1.begin(), v1.end(), v2.begin(), v2.end(), v3.begin()); + bool ok = std::equal(check_v3.begin(), check_v3.end(), v3.begin()); + if(!ok){ + std::cerr << "ERROR: merged ranges different" << std::endl; + return -1; + } + + return 0; +} diff --git a/perf/perf_thrust_partition.cu b/perf/perf_thrust_partition.cu new file mode 100644 index 000000000..d259ec52e --- /dev/null +++ b/perf/perf_thrust_partition.cu @@ -0,0 +1,59 @@ 
+//---------------------------------------------------------------------------// +// Copyright (c) 2013-2014 Kyle Lutz +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://kylelutz.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#include +#include + +#include +#include +#include +#include +#include + +#include "perf.hpp" + +int rand_int() +{ + return static_cast((rand() / double(RAND_MAX)) * 25.0); +} + +struct less_than_ten : public thrust::unary_function +{ + __device__ bool operator()(int x) const + { + return x < 10; + } +}; + +int main(int argc, char *argv[]) +{ + perf_parse_args(argc, argv); + + std::cout << "size: " << PERF_N << std::endl; + thrust::host_vector h_vec(PERF_N); + std::generate(h_vec.begin(), h_vec.end(), rand_int); + + thrust::device_vector d_vec(PERF_N); + + perf_timer t; + for(size_t trial = 0; trial < PERF_TRIALS; trial++){ + d_vec = h_vec; + + t.start(); + thrust::partition( + d_vec.begin(), d_vec.end(), less_than_ten() + ); + cudaDeviceSynchronize(); + t.stop(); + } + std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl; + + return 0; +} diff --git a/perf/perf_thrust_reverse.cu b/perf/perf_thrust_reverse.cu new file mode 100644 index 000000000..bf5f9053d --- /dev/null +++ b/perf/perf_thrust_reverse.cu @@ -0,0 +1,47 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013-2014 Kyle Lutz +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://kylelutz.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#include +#include + +#include +#include +#include +#include +#include + +#include "perf.hpp" + +int main(int argc, char *argv[]) +{ + perf_parse_args(argc, argv); + + std::cout << "size: " << PERF_N << std::endl; + thrust::host_vector h_vec = generate_random_vector(PERF_N); + + // transfer data to the device + thrust::device_vector d_vec; + + perf_timer t; + for(size_t trial = 0; trial < PERF_TRIALS; trial++){ + d_vec = h_vec; + + t.start(); + thrust::reverse(d_vec.begin(), d_vec.end()); + cudaDeviceSynchronize(); + t.stop(); + } + std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl; + + // transfer data back to host + thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin()); + + return 0; +} diff --git a/perf/perf_thrust_rotate.cu b/perf/perf_thrust_rotate.cu new file mode 100644 index 000000000..b9ee36dd2 --- /dev/null +++ b/perf/perf_thrust_rotate.cu @@ -0,0 +1,50 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013-2014 Kyle Lutz +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://kylelutz.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#include +#include + +#include +#include +#include + +#include "perf.hpp" + +int main(int argc, char *argv[]) +{ + perf_parse_args(argc, argv); + + std::cout << "size: " << PERF_N << std::endl; + thrust::host_vector h_vec = generate_random_vector(PERF_N); + + // transfer data to the device + thrust::device_vector d_vec; + + size_t rotate_distance = PERF_N / 2; + + perf_timer t; + for(size_t trial = 0; trial < PERF_TRIALS; trial++){ + d_vec = h_vec; + + t.start(); + // there is no thrust::rotate() so we implement it manually with copy() + thrust::device_vector tmp(d_vec.begin(), d_vec.begin() + rotate_distance); + thrust::copy(d_vec.begin() + rotate_distance, d_vec.end(), d_vec.begin()); + thrust::copy(tmp.begin(), tmp.end(), d_vec.begin() + rotate_distance); + cudaDeviceSynchronize(); + t.stop(); + } + std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl; + + // transfer data back to host + thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin()); + + return 0; +} diff --git a/perf/perf_thrust_set_difference.cu b/perf/perf_thrust_set_difference.cu new file mode 100644 index 000000000..6b23b61a2 --- /dev/null +++ b/perf/perf_thrust_set_difference.cu @@ -0,0 +1,61 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013-2014 Kyle Lutz +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://kylelutz.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#include +#include +#include + +#include +#include +#include +#include + +#include "perf.hpp" + +int rand_int() +{ + return static_cast((rand() / double(RAND_MAX)) * 25.0); +} + +int main(int argc, char *argv[]) +{ + perf_parse_args(argc, argv); + + std::cout << "size: " << PERF_N << std::endl; + thrust::host_vector v1(std::floor(PERF_N / 2.0)); + thrust::host_vector v2(std::ceil(PERF_N / 2.0)); + std::generate(v1.begin(), v1.end(), rand_int); + std::generate(v2.begin(), v2.end(), rand_int); + std::sort(v1.begin(), v1.end()); + std::sort(v2.begin(), v2.end()); + + // transfer data to the device + thrust::device_vector gpu_v1 = v1; + thrust::device_vector gpu_v2 = v2; + thrust::device_vector gpu_v3(PERF_N); + + thrust::device_vector::iterator gpu_v3_end; + + perf_timer t; + for(size_t trial = 0; trial < PERF_TRIALS; trial++){ + t.start(); + gpu_v3_end = thrust::set_difference( + gpu_v1.begin(), gpu_v1.end(), + gpu_v2.begin(), gpu_v2.end(), + gpu_v3.begin() + ); + cudaDeviceSynchronize(); + t.stop(); + } + std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl; + std::cout << "size: " << thrust::distance(gpu_v3.begin(), gpu_v3_end) << std::endl; + + return 0; +} diff --git a/perf/perf_thrust_unique.cu b/perf/perf_thrust_unique.cu new file mode 100644 index 000000000..2fd7f6e95 --- /dev/null +++ b/perf/perf_thrust_unique.cu @@ -0,0 +1,49 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013-2014 Kyle Lutz +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://kylelutz.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#include +#include + +#include +#include +#include +#include +#include + +#include "perf.hpp" + +int rand_int() +{ + return static_cast((rand() / double(RAND_MAX)) * 25.0); +} + +int main(int argc, char *argv[]) +{ + perf_parse_args(argc, argv); + + std::cout << "size: " << PERF_N << std::endl; + thrust::host_vector h_vec(PERF_N); + std::generate(h_vec.begin(), h_vec.end(), rand_int); + + thrust::device_vector d_vec(PERF_N); + + perf_timer t; + for(size_t trial = 0; trial < PERF_TRIALS; trial++){ + d_vec = h_vec; + + t.start(); + thrust::unique(d_vec.begin(), d_vec.end()); + cudaDeviceSynchronize(); + t.stop(); + } + std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl; + + return 0; +} diff --git a/perf/perfdoc.py b/perf/perfdoc.py index a3c9cd14d..2b53092b6 100755 --- a/perf/perfdoc.py +++ b/perf/perfdoc.py @@ -46,7 +46,6 @@ def plot_to_file(report, filename): "reverse", "rotate", "saxpy", - "set_difference", "sort", "unique", ]