From b1acf0ae74a87867525958235085ff829f56ae28 Mon Sep 17 00:00:00 2001 From: Jakub Szuppe Date: Thu, 26 Mar 2015 12:27:47 +0100 Subject: [PATCH 1/2] Benchmarks for find() algorithm Boost.Compute, STL and Thrust benchmarks for find() algorithm. --- perf/CMakeLists.txt | 3 ++ perf/perf.py | 2 + perf/perf_find.cpp | 87 ++++++++++++++++++++++++++++++++++++++++ perf/perf_stl_find.cpp | 45 +++++++++++++++++++++ perf/perf_thrust_find.cu | 53 ++++++++++++++++++++++++ 5 files changed, 190 insertions(+) create mode 100644 perf/perf_find.cpp create mode 100644 perf/perf_stl_find.cpp create mode 100644 perf/perf_thrust_find.cu diff --git a/perf/CMakeLists.txt b/perf/CMakeLists.txt index 99dddfedf..57fa6333c 100644 --- a/perf/CMakeLists.txt +++ b/perf/CMakeLists.txt @@ -25,6 +25,7 @@ set(BENCHMARKS erase_remove exclusive_scan fill + find find_end includes inner_product @@ -70,6 +71,7 @@ endforeach() set(STL_BENCHMARKS stl_accumulate stl_count + stl_find stl_find_end stl_includes stl_inner_product @@ -118,6 +120,7 @@ if(${BOOST_COMPUTE_HAVE_CUDA}) thrust_accumulate thrust_count thrust_exclusive_scan + thrust_find thrust_inner_product thrust_merge thrust_partial_sum diff --git a/perf/perf.py b/perf/perf.py index dafd49277..2d117db49 100755 --- a/perf/perf.py +++ b/perf/perf.py @@ -118,6 +118,7 @@ def run_benchmark(name, sizes, vs=[]): "accumulate", "count", "exclusive_scan", + "find", "inner_product", "merge", "partial_sum", @@ -136,6 +137,7 @@ def run_benchmark(name, sizes, vs=[]): "stl": [ "accumulate", "count", + "find", "find_end", "includes", "inner_product", diff --git a/perf/perf_find.cpp b/perf/perf_find.cpp new file mode 100644 index 000000000..95adfe438 --- /dev/null +++ b/perf/perf_find.cpp @@ -0,0 +1,87 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2015 Jakub Szuppe +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// 
http://www.boost.org/LICENSE_1_0.txt +// +// See http://kylelutz.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#include +#include +#include + +#include +#include +#include + +#include "perf.hpp" + +// Max integer that can be generated by rand_int() function. +int rand_int_max = 25; + +int rand_int() +{ + return static_cast((rand() / double(RAND_MAX)) * rand_int_max); +} + +int main(int argc, char *argv[]) +{ + perf_parse_args(argc, argv); + std::cout << "size: " << PERF_N << std::endl; + + // setup context and queue for the default device + boost::compute::device device = boost::compute::system::default_device(); + boost::compute::context context(device); + boost::compute::command_queue queue(context, device); + std::cout << "device: " << device.name() << std::endl; + + // create vector of random numbers on the host + std::vector host_vector(PERF_N); + std::generate(host_vector.begin(), host_vector.end(), rand_int); + + // create vector on the device and copy the data + boost::compute::vector device_vector(PERF_N, context); + boost::compute::copy( + host_vector.begin(), + host_vector.end(), + device_vector.begin(), + queue + ); + + // trying to find element that isn't in vector (worst-case scenario) + int wanted = rand_int_max + 1; + + // device iterator + boost::compute::vector::iterator device_it; + + perf_timer t; + for(size_t trial = 0; trial < PERF_TRIALS; trial++){ + t.start(); + device_it = boost::compute::find( + device_vector.begin(), device_vector.end(), wanted, queue + ); + queue.finish(); + t.stop(); + } + std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl; + + // verify if found index is correct by comparing it with std::find() result + size_t host_index = std::distance(host_vector.begin(), + std::find(host_vector.begin(), + host_vector.end(), + wanted)); + size_t device_index = device_it.get_index(); + + if(device_index != host_index){ + std::cout << "ERROR: 
" + << "device_index (" << device_index << ") " + << "!= " + << "host_index (" << host_index << ")" + << std::endl; + return -1; + } + + return 0; +} diff --git a/perf/perf_stl_find.cpp b/perf/perf_stl_find.cpp new file mode 100644 index 000000000..20d948f5e --- /dev/null +++ b/perf/perf_stl_find.cpp @@ -0,0 +1,45 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2015 Jakub Szuppe +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://kylelutz.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#include +#include +#include + +#include "perf.hpp" + +// Max integer that can be generated by rand_int() function. +int rand_int_max = 25; + +int rand_int() +{ + return static_cast((rand() / double(RAND_MAX)) * rand_int_max); +} + +int main(int argc, char *argv[]) +{ + perf_parse_args(argc, argv); + std::cout << "size: " << PERF_N << std::endl; + + // create vector of random numbers on the host + std::vector host_vector(PERF_N); + std::generate(host_vector.begin(), host_vector.end(), rand_int); + + // trying to find element that isn't in vector (worst-case scenario) + int wanted = rand_int_max + 1; + perf_timer t; + for(size_t trial = 0; trial < PERF_TRIALS; trial++){ + t.start(); + std::find(host_vector.begin(), host_vector.end(), wanted); + t.stop(); + } + std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl; + + return 0; +} diff --git a/perf/perf_thrust_find.cu b/perf/perf_thrust_find.cu new file mode 100644 index 000000000..0c8b2c7a8 --- /dev/null +++ b/perf/perf_thrust_find.cu @@ -0,0 +1,53 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2015 Jakub Szuppe +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file 
LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://kylelutz.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#include +#include +#include + +#include +#include +#include + +#include "perf.hpp" + +// Max integer that can be generated by rand_int() function. +int rand_int_max = 25; + +int rand_int() +{ + return static_cast((rand() / double(RAND_MAX)) * rand_int_max); +} + +int main(int argc, char *argv[]) +{ + perf_parse_args(argc, argv); + std::cout << "size: " << PERF_N << std::endl; + + // create vector of random numbers on the host + thrust::host_vector host_vector(PERF_N); + thrust::generate(host_vector.begin(), host_vector.end(), rand_int); + + thrust::device_vector v = host_vector; + + // trying to find element that isn't in vector (worst-case scenario) + int wanted = rand_int_max + 1; + + perf_timer t; + for(size_t trial = 0; trial < PERF_TRIALS; trial++){ + t.start(); + thrust::find(v.begin(), v.end(), wanted); + cudaDeviceSynchronize(); + t.stop(); + } + std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl; + + return 0; +} From a65de68e0dc21f20aad5280e21349c31434ac065 Mon Sep 17 00:00:00 2001 From: Jakub Szuppe Date: Fri, 27 Mar 2015 09:09:45 +0100 Subject: [PATCH 2/2] Making sure compiler won't cut out find() calls Making sure that compiler won't cut out std::find() and thrust::find() calls while optimizing the code. 
--- perf/perf_find.cpp | 25 +++++++++++++------------ perf/perf_stl_find.cpp | 15 ++++++++++++++- perf/perf_thrust_find.cu | 14 +++++++++++++- 3 files changed, 40 insertions(+), 14 deletions(-) diff --git a/perf/perf_find.cpp b/perf/perf_find.cpp index 95adfe438..55f5e5896 100644 --- a/perf/perf_find.cpp +++ b/perf/perf_find.cpp @@ -54,31 +54,32 @@ int main(int argc, char *argv[]) int wanted = rand_int_max + 1; // device iterator - boost::compute::vector::iterator device_it; + boost::compute::vector::iterator device_result_it; perf_timer t; for(size_t trial = 0; trial < PERF_TRIALS; trial++){ t.start(); - device_it = boost::compute::find( - device_vector.begin(), device_vector.end(), wanted, queue - ); + device_result_it = boost::compute::find(device_vector.begin(), + device_vector.end(), + wanted, + queue); queue.finish(); t.stop(); } std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl; // verify if found index is correct by comparing it with std::find() result - size_t host_index = std::distance(host_vector.begin(), - std::find(host_vector.begin(), - host_vector.end(), - wanted)); - size_t device_index = device_it.get_index(); + size_t host_result_index = std::distance(host_vector.begin(), + std::find(host_vector.begin(), + host_vector.end(), + wanted)); + size_t device_result_index = device_result_it.get_index(); - if(device_index != host_index){ + if(device_result_index != host_result_index){ std::cout << "ERROR: " - << "device_index (" << device_index << ") " + << "device_result_index (" << device_result_index << ") " << "!= " - << "host_index (" << host_index << ")" + << "host_result_index (" << host_result_index << ")" << std::endl; return -1; } diff --git a/perf/perf_stl_find.cpp b/perf/perf_stl_find.cpp index 20d948f5e..b2945d9c7 100644 --- a/perf/perf_stl_find.cpp +++ b/perf/perf_stl_find.cpp @@ -33,13 +33,26 @@ int main(int argc, char *argv[]) // trying to find element that isn't in vector (worst-case scenario) int wanted = rand_int_max + 
1; + + // result + std::vector::iterator host_result_it; + perf_timer t; for(size_t trial = 0; trial < PERF_TRIALS; trial++){ t.start(); - std::find(host_vector.begin(), host_vector.end(), wanted); + host_result_it = std::find(host_vector.begin(), host_vector.end(), wanted); t.stop(); } std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl; + // verify + if(host_result_it != host_vector.end()){ + std::cout << "ERROR: " + << "host_result_iterator != " + << "host_vector.end()" + << std::endl; + return -1; + } + return 0; } diff --git a/perf/perf_thrust_find.cu b/perf/perf_thrust_find.cu index 0c8b2c7a8..a0e89c9a6 100644 --- a/perf/perf_thrust_find.cu +++ b/perf/perf_thrust_find.cu @@ -40,14 +40,26 @@ int main(int argc, char *argv[]) // trying to find element that isn't in vector (worst-case scenario) int wanted = rand_int_max + 1; + // result + thrust::device_vector::iterator device_result_it; + perf_timer t; for(size_t trial = 0; trial < PERF_TRIALS; trial++){ t.start(); - thrust::find(v.begin(), v.end(), wanted); + device_result_it = thrust::find(v.begin(), v.end(), wanted); cudaDeviceSynchronize(); t.stop(); } std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl; + // verify + if(device_result_it != v.end()){ + std::cout << "ERROR: " + << "device_result_iterator != " + << "v.end()" + << std::endl; + return -1; + } + return 0; }