From a830531445f9a991b25bc71613192741f503ab1c Mon Sep 17 00:00:00 2001 From: Jakub Szuppe Date: Thu, 19 May 2016 23:30:58 +0200 Subject: [PATCH 01/21] Fix type-safety issues when copying host->device --- include/boost/compute/algorithm/copy.hpp | 116 +++++++++++++++++- .../algorithm/detail/copy_to_device.hpp | 43 +++++++ 2 files changed, 155 insertions(+), 4 deletions(-) diff --git a/include/boost/compute/algorithm/copy.hpp b/include/boost/compute/algorithm/copy.hpp index c391c97cd..52fe5de56 100644 --- a/include/boost/compute/algorithm/copy.hpp +++ b/include/boost/compute/algorithm/copy.hpp @@ -27,10 +27,13 @@ #include #include #include +#include #include #include #include +#include #include +#include #include namespace boost { @@ -70,6 +73,19 @@ struct can_copy_with_copy_buffer : > >::type {}; +// meta-function returning true if value_types of InputIterator and +// OutputIterator are same +template +struct is_same_value_type : + boost::is_same< + typename boost::remove_cv< + typename std::iterator_traits::value_type + >::type, + typename boost::remove_cv< + typename OutputIterator::value_type + >::type + >::type {}; + // host -> device template inline OutputIterator @@ -77,9 +93,14 @@ dispatch_copy(InputIterator first, InputIterator last, OutputIterator result, command_queue &queue, - typename boost::enable_if_c< - !is_device_iterator::value && - is_device_iterator::value + typename boost::enable_if< + mpl::and_< + mpl::not_< + is_device_iterator + >, + is_device_iterator, + is_same_value_type + > >::type* = 0) { if(is_contiguous_iterator::value){ @@ -94,6 +115,93 @@ dispatch_copy(InputIterator first, } } +// host -> device +// Type mismatch between InputIterator and OutputIterator value_types +template +inline OutputIterator +dispatch_copy(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue, + typename boost::enable_if< + mpl::and_< + mpl::not_< + is_device_iterator + >, + is_device_iterator, + mpl::not_< + 
is_same_value_type + > + > + >::type* = 0) +{ + typedef typename OutputIterator::value_type output_type; + typedef typename std::iterator_traits::value_type input_type; + + const context &context = queue.get_context(); + const device &device = queue.get_device(); + + // loading parameters + std::string cache_key = + std::string("__boost_compute_copy_to_device_") + + type_name() + "_" + type_name(); + boost::shared_ptr parameters = + detail::parameter_cache::get_global_cache(device); + + size_t map_copy_threshold; + size_t direct_copy_threshold; + + // calculate default values of thresholds + if (device.type() & device::gpu) { + // GPUs + map_copy_threshold = 524288; // 0.5 MB + direct_copy_threshold = 52428800; // 50 MB + } + else { + // CPUs and other devices + map_copy_threshold = 134217728; // 128 MB + direct_copy_threshold = 0; // it's never efficient for CPUs + } + + // load thresholds + map_copy_threshold = + parameters->get( + cache_key, "map_copy_threshold", map_copy_threshold + ); + direct_copy_threshold = + parameters->get( + cache_key, "direct_copy_threshold", direct_copy_threshold + ); + + // select copy method based on thresholds & input_size_bytes + size_t input_size = iterator_range_size(first, last); + size_t input_size_bytes = input_size * sizeof(input_type); + + // [0; map_copy_threshold) -> copy_to_device_map() + if(input_size_bytes < map_copy_threshold) { + return copy_to_device_map(first, last, result, queue); + } + // [map_copy_threshold; direct_copy_threshold) -> convert [first; last) + // on host and then perform copy_to_device() + else if(input_size_bytes < direct_copy_threshold) { + std::vector vector(first, last); + return copy_to_device(vector.begin(), vector.end(), result, queue); + } + + // [direct_copy_threshold; inf) -> map [first; last) to device and + // run copy kernel on device for copying & casting + ::boost::compute::mapped_view mapped_host( + // make sure it's a pointer to constant data + // to force read only mapping + 
const_cast( + ::boost::addressof(*first) + ), + input_size, + context + ); + return copy_on_device(mapped_host.begin(), mapped_host.end(), result, queue); +} + // host -> device (async) template inline future @@ -130,7 +238,7 @@ dispatch_copy(InputIterator first, return copy_to_host(first, last, result, queue); } else { - // for non-contiguous input we first copy the values to + // for non-contiguous output we first copy the values to // a temporary std::vector and then copy from there typedef typename std::iterator_traits::value_type T; std::vector vector(iterator_range_size(first, last)); diff --git a/include/boost/compute/algorithm/detail/copy_to_device.hpp b/include/boost/compute/algorithm/detail/copy_to_device.hpp index 90545fb4e..c62d04fd3 100644 --- a/include/boost/compute/algorithm/detail/copy_to_device.hpp +++ b/include/boost/compute/algorithm/detail/copy_to_device.hpp @@ -52,6 +52,49 @@ inline DeviceIterator copy_to_device(HostIterator first, return result + static_cast(count); } +template +inline DeviceIterator copy_to_device_map(HostIterator first, + HostIterator last, + DeviceIterator result, + command_queue &queue) +{ + typedef typename + std::iterator_traits::value_type + value_type; + typedef typename + std::iterator_traits::difference_type + difference_type; + + size_t count = iterator_range_size(first, last); + if(count == 0){ + return result; + } + + size_t offset = result.get_index(); + + // map result buffer to host + value_type *pointer = static_cast( + queue.enqueue_map_buffer( + result.get_buffer(), + CL_MAP_WRITE, + offset * sizeof(value_type), + count * sizeof(value_type) + ) + ); + + // copy [first; last) to result buffer + std::copy(first, last, pointer); + + // unmap result buffer + boost::compute::event unmapEvent = queue.enqueue_unmap_buffer( + result.get_buffer(), + static_cast(pointer) + ); + unmapEvent.wait(); + + return result + static_cast(count); +} + template inline future copy_to_device_async(HostIterator first, HostIterator 
last, From bba6c61ed4a5257b7f0e8d566b895c9a539e94a5 Mon Sep 17 00:00:00 2001 From: Jakub Szuppe Date: Thu, 19 May 2016 23:31:54 +0200 Subject: [PATCH 02/21] Add tests for copying host->device with mismatched types --- test/CMakeLists.txt | 1 + test/test_copy_type_mismatch.cpp | 241 +++++++++++++++++++++++++++++++ 2 files changed, 242 insertions(+) create mode 100644 test/test_copy_type_mismatch.cpp diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 99e41c1d4..25e6d4de6 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -92,6 +92,7 @@ add_compute_test("algorithm.adjacent_find" test_adjacent_find.cpp) add_compute_test("algorithm.any_all_none_of" test_any_all_none_of.cpp) add_compute_test("algorithm.binary_search" test_binary_search.cpp) add_compute_test("algorithm.copy" test_copy.cpp) +add_compute_test("algorithm.copy_type_mismatch" test_copy_type_mismatch.cpp) add_compute_test("algorithm.copy_if" test_copy_if.cpp) add_compute_test("algorithm.count" test_count.cpp) add_compute_test("algorithm.equal" test_equal.cpp) diff --git a/test/test_copy_type_mismatch.cpp b/test/test_copy_type_mismatch.cpp new file mode 100644 index 000000000..885f969fa --- /dev/null +++ b/test/test_copy_type_mismatch.cpp @@ -0,0 +1,241 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2016 Jakub Szuppe +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +// Undefining BOOST_COMPUTE_USE_OFFLINE_CACHE macro as we want to modify cached +// parameters for copy algorithm without any undesirable consequences (like +// saving modified values of those parameters). 
+#ifdef BOOST_COMPUTE_USE_OFFLINE_CACHE + #undef BOOST_COMPUTE_USE_OFFLINE_CACHE +#endif + +#define BOOST_TEST_MODULE TestCopyTypeMismatch +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "quirks.hpp" +#include "check_macros.hpp" +#include "context_setup.hpp" + +namespace bc = boost::compute; +namespace compute = boost::compute; + +BOOST_AUTO_TEST_CASE(is_same_ignore_const) +{ + BOOST_STATIC_ASSERT(( + boost::compute::detail::is_same_value_type< + std::vector::iterator, + compute::buffer_iterator + >::value + )); + BOOST_STATIC_ASSERT(( + boost::compute::detail::is_same_value_type< + std::vector::const_iterator, + compute::buffer_iterator + >::value + )); + BOOST_STATIC_ASSERT(( + boost::compute::detail::is_same_value_type< + std::vector::iterator, + compute::buffer_iterator + >::value + )); + BOOST_STATIC_ASSERT(( + boost::compute::detail::is_same_value_type< + std::vector::const_iterator, + compute::buffer_iterator + >::value + )); +} + +BOOST_AUTO_TEST_CASE(copy_host_float_to_device_double) +{ + if(!device.supports_extension("cl_khr_fp64")) { + std::cout << "skipping test: device does not support double" << std::endl; + return; + } + + using compute::double_; + using compute::float_; + + float_ host[] = { 6.1f, 10.2f, 19.3f, 25.4f }; + bc::vector device_vector(4, context); + + // copy host float data to double device vector + bc::copy(host, host + 4, device_vector.begin(), queue); + CHECK_RANGE_EQUAL(double_, 4, device_vector, (6.1f, 10.2f, 19.3f, 25.4f)); +} + +BOOST_AUTO_TEST_CASE(copy_host_float_to_device_int) +{ + using compute::int_; + using compute::float_; + + float_ host[] = { 6.1f, -10.2f, 19.3f, 25.4f }; + bc::vector device_vector(4, context); + + // copy host float data to int device vector + bc::copy(host, host + 4, device_vector.begin(), queue); + CHECK_RANGE_EQUAL( + int_, + 4, + device_vector, + ( + static_cast(6.1f), + 
static_cast(-10.2f), + static_cast(19.3f), + static_cast(25.4f) + ) + ); +} + +BOOST_AUTO_TEST_CASE(copy_host_float_to_device_int_mapping_device_vector) +{ + using compute::int_; + using compute::uint_; + using compute::float_; + + float_ host[] = { 6.1f, -10.2f, 19.3f, 25.4f }; + bc::vector device_vector(4, context); + + std::string cache_key = + std::string("__boost_compute_copy_to_device_float_int"); + boost::shared_ptr parameters = + bc::detail::parameter_cache::get_global_cache(device); + + // save + uint_ map_copy_threshold = + parameters->get(cache_key, "map_copy_threshold", 0); + + // force copy_to_device_map (mapping device vector to the host) + parameters->set(cache_key, "map_copy_threshold", 1024); + + // copy host float data to int device vector + bc::copy(host, host + 4, device_vector.begin(), queue); + CHECK_RANGE_EQUAL( + int_, + 4, + device_vector, + ( + static_cast(6.1f), + static_cast(-10.2f), + static_cast(19.3f), + static_cast(25.4f) + ) + ); + + // restore + parameters->set(cache_key, "map_copy_threshold", map_copy_threshold); +} + +BOOST_AUTO_TEST_CASE(copy_host_float_to_device_int_convert_on_host) +{ + using compute::int_; + using compute::uint_; + using compute::float_; + + std::string cache_key = + std::string("__boost_compute_copy_to_device_float_int"); + boost::shared_ptr parameters = + bc::detail::parameter_cache::get_global_cache(device); + + // save + uint_ map_copy_threshold = + parameters->get(cache_key, "map_copy_threshold", 0); + uint_ direct_copy_threshold = + parameters->get(cache_key, "direct_copy_threshold", 0); + + // force copying by casting input data on host and performing + // normal copy host->device (since types match now) + parameters->set(cache_key, "map_copy_threshold", 0); + parameters->set(cache_key, "direct_copy_threshold", 1024); + + float_ host[] = { 6.1f, -10.2f, 19.3f, 25.4f }; + bc::vector device_vector(4, context); + + // copy host float data to int device vector + bc::copy(host, host + 4, 
device_vector.begin(), queue); + CHECK_RANGE_EQUAL( + int_, + 4, + device_vector, + ( + static_cast(6.1f), + static_cast(-10.2f), + static_cast(19.3f), + static_cast(25.4f) + ) + ); + + // restore + parameters->set(cache_key, "map_copy_threshold", map_copy_threshold); + parameters->set(cache_key, "direct_copy_threshold", direct_copy_threshold); +} + +BOOST_AUTO_TEST_CASE(copy_host_float_to_device_int_with_transform) +{ + using compute::int_; + using compute::uint_; + using compute::float_; + + std::string cache_key = + std::string("__boost_compute_copy_to_device_float_int"); + boost::shared_ptr parameters = + bc::detail::parameter_cache::get_global_cache(device); + + // save + uint_ map_copy_threshold = + parameters->get(cache_key, "map_copy_threshold", 0); + uint_ direct_copy_threshold = + parameters->get(cache_key, "direct_copy_threshold", 0); + + // force copying by mapping input data to the device memory + // and using transform operation for casting & copying + parameters->set(cache_key, "map_copy_threshold", 0); + parameters->set(cache_key, "direct_copy_threshold", 0); + + float_ host[] = { 6.1f, -10.2f, 19.3f, 25.4f }; + bc::vector device_vector(4, context); + + // copy host float data to int device vector + bc::copy(host, host + 4, device_vector.begin(), queue); + CHECK_RANGE_EQUAL( + int_, + 4, + device_vector, + ( + static_cast(6.1f), + static_cast(-10.2f), + static_cast(19.3f), + static_cast(25.4f) + ) + ); + + // restore + parameters->set(cache_key, "map_copy_threshold", map_copy_threshold); + parameters->set(cache_key, "direct_copy_threshold", direct_copy_threshold); +} + +BOOST_AUTO_TEST_SUITE_END() From 3493b750e828182b4c6bcc69af884a5ff4d48b55 Mon Sep 17 00:00:00 2001 From: Jakub Szuppe Date: Sat, 21 May 2016 16:57:45 +0200 Subject: [PATCH 03/21] Fix type, long_ and long might be different types --- perf/perf_sort_by_key.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/perf/perf_sort_by_key.cpp b/perf/perf_sort_by_key.cpp 
index 3eb35c893..57c3fc837 100644 --- a/perf/perf_sort_by_key.cpp +++ b/perf/perf_sort_by_key.cpp @@ -36,9 +36,9 @@ int main(int argc, char *argv[]) std::cout << "device: " << device.name() << std::endl; // create vector of random numbers on the host - std::vector host_keys(PERF_N); + std::vector host_keys(PERF_N); std::generate(host_keys.begin(), host_keys.end(), rand); - std::vector host_values(PERF_N); + std::vector host_values(PERF_N); std::copy(host_keys.begin(), host_keys.end(), host_values.begin()); // create vector on the device and copy the data From d12e07c0e9f96aca2f54652f76743c5e488c8db5 Mon Sep 17 00:00:00 2001 From: Jakub Szuppe Date: Sat, 21 May 2016 16:58:15 +0200 Subject: [PATCH 04/21] Fix type-safety issues for async copying host->device --- include/boost/compute/algorithm/copy.hpp | 57 ++++++++++++++++++++++-- test/test_copy_type_mismatch.cpp | 26 +++++++++++ 2 files changed, 80 insertions(+), 3 deletions(-) diff --git a/include/boost/compute/algorithm/copy.hpp b/include/boost/compute/algorithm/copy.hpp index 52fe5de56..655485a6b 100644 --- a/include/boost/compute/algorithm/copy.hpp +++ b/include/boost/compute/algorithm/copy.hpp @@ -209,9 +209,14 @@ dispatch_copy_async(InputIterator first, InputIterator last, OutputIterator result, command_queue &queue, - typename boost::enable_if_c< - !is_device_iterator::value && - is_device_iterator::value + typename boost::enable_if< + mpl::and_< + mpl::not_< + is_device_iterator + >, + is_device_iterator, + is_same_value_type + > >::type* = 0) { BOOST_STATIC_ASSERT_MSG( @@ -222,6 +227,52 @@ dispatch_copy_async(InputIterator first, return copy_to_device_async(first, last, result, queue); } +// host -> device (async) +// Type mismatch between InputIterator and OutputIterator value_types +template +inline future +dispatch_copy_async(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue, + typename boost::enable_if< + mpl::and_< + mpl::not_< + is_device_iterator + >, + 
is_device_iterator, + mpl::not_< + is_same_value_type + > + > + >::type* = 0) +{ + BOOST_STATIC_ASSERT_MSG( + is_contiguous_iterator::value, + "copy_async() is only supported for contiguous host iterators" + ); + + typedef typename std::iterator_traits::value_type input_type; + + const context &context = queue.get_context(); + size_t input_size = iterator_range_size(first, last); + + // map [first; last) to device and run copy kernel + // on device for copying & casting + ::boost::compute::mapped_view mapped_host( + // make sure it's a pointer to constant data + // to force read only mapping + const_cast( + ::boost::addressof(*first) + ), + input_size, + context + ); + return copy_on_device_async( + mapped_host.begin(), mapped_host.end(), result, queue + ); +} + // device -> host template inline OutputIterator diff --git a/test/test_copy_type_mismatch.cpp b/test/test_copy_type_mismatch.cpp index 885f969fa..f84b0cccf 100644 --- a/test/test_copy_type_mismatch.cpp +++ b/test/test_copy_type_mismatch.cpp @@ -238,4 +238,30 @@ BOOST_AUTO_TEST_CASE(copy_host_float_to_device_int_with_transform) parameters->set(cache_key, "direct_copy_threshold", direct_copy_threshold); } +BOOST_AUTO_TEST_CASE(copy_async_host_float_to_device_int) +{ + using compute::int_; + using compute::float_; + + float_ host[] = { 6.1f, -10.2f, 19.3f, 25.4f }; + bc::vector device_vector(4, context); + + // copy host float data to int device vector + compute::future future = + bc::copy_async(host, host + 4, device_vector.begin(), queue); + future.wait(); + + CHECK_RANGE_EQUAL( + int_, + 4, + device_vector, + ( + static_cast(6.1f), + static_cast(-10.2f), + static_cast(19.3f), + static_cast(25.4f) + ) + ); +} + BOOST_AUTO_TEST_SUITE_END() From 7c3d0c20e9bd59a67c646fad59825507e81c34e6 Mon Sep 17 00:00:00 2001 From: Jakub Szuppe Date: Sat, 21 May 2016 15:10:22 +0200 Subject: [PATCH 05/21] Add tests for copying on device with mismatched types --- test/test_copy_type_mismatch.cpp | 62
++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/test/test_copy_type_mismatch.cpp b/test/test_copy_type_mismatch.cpp index f84b0cccf..aedf8e6a2 100644 --- a/test/test_copy_type_mismatch.cpp +++ b/test/test_copy_type_mismatch.cpp @@ -264,4 +264,66 @@ BOOST_AUTO_TEST_CASE(copy_async_host_float_to_device_int) ); } +BOOST_AUTO_TEST_CASE(copy_device_float_to_device_int) +{ + using compute::int_; + using compute::float_; + + float_ data[] = { 6.1f, -10.2f, 19.3f, 25.4f }; + bc::vector device_fvector(data, data + 4, queue); + bc::vector device_ivector(4, context); + + // copy device float vector to device int vector + bc::copy( + device_fvector.begin(), + device_fvector.end(), + device_ivector.begin(), + queue + ); + + CHECK_RANGE_EQUAL( + int_, + 4, + device_ivector, + ( + static_cast(6.1f), + static_cast(-10.2f), + static_cast(19.3f), + static_cast(25.4f) + ) + ); +} + +BOOST_AUTO_TEST_CASE(copy_async_device_float_to_device_int) +{ + using compute::int_; + using compute::float_; + + float_ data[] = { 6.1f, -10.2f, 19.3f, 25.4f }; + bc::vector device_fvector(data, data + 4, queue); + bc::vector device_ivector(4, context); + + // copy device float vector to device int vector + compute::future future = + bc::copy_async( + device_fvector.begin(), + device_fvector.end(), + device_ivector.begin(), + queue + ); + future.wait(); + + CHECK_RANGE_EQUAL( + int_, + 4, + device_ivector, + ( + static_cast(6.1f), + static_cast(-10.2f), + static_cast(19.3f), + static_cast(25.4f) + ) + ); +} + BOOST_AUTO_TEST_SUITE_END() From ce8230b6f51060c0b8834c5604127e4398b70ff6 Mon Sep 17 00:00:00 2001 From: Jakub Szuppe Date: Sat, 21 May 2016 17:16:13 +0200 Subject: [PATCH 06/21] Copying from host for non-contiguous iterators (host) When InputIterator (host) is a non-contiguous iterator we don't need a separate algorithm for cases when value_types of InputIterator and OutputIterator (device) do not match and cases when they do match.
--- include/boost/compute/algorithm/copy.hpp | 94 +++++++++++++++++++++--- test/test_copy_type_mismatch.cpp | 86 ++++++++++++++++++++++ 2 files changed, 168 insertions(+), 12 deletions(-) diff --git a/include/boost/compute/algorithm/copy.hpp b/include/boost/compute/algorithm/copy.hpp index 655485a6b..7e8108652 100644 --- a/include/boost/compute/algorithm/copy.hpp +++ b/include/boost/compute/algorithm/copy.hpp @@ -87,6 +87,7 @@ struct is_same_value_type : >::type {}; // host -> device +// InputIterator is a contiguous iterator template inline OutputIterator dispatch_copy(InputIterator first, @@ -99,24 +100,17 @@ dispatch_copy(InputIterator first, is_device_iterator >, is_device_iterator, - is_same_value_type + is_same_value_type, + is_contiguous_iterator > >::type* = 0) { - if(is_contiguous_iterator::value){ - return copy_to_device(first, last, result, queue); - } - else { - // for non-contiguous input we first copy the values to - // a temporary std::vector and then copy from there - typedef typename std::iterator_traits::value_type T; - std::vector vector(first, last); - return copy_to_device(vector.begin(), vector.end(), result, queue); - } + return copy_to_device(first, last, result, queue); } // host -> device // Type mismatch between InputIterator and OutputIterator value_types +// InputIterator is a contiguous iterator template inline OutputIterator dispatch_copy(InputIterator first, @@ -131,7 +125,8 @@ dispatch_copy(InputIterator first, is_device_iterator, mpl::not_< is_same_value_type - > + >, + is_contiguous_iterator > >::type* = 0) { @@ -202,6 +197,81 @@ dispatch_copy(InputIterator first, return copy_on_device(mapped_host.begin(), mapped_host.end(), result, queue); } +// host -> device +// InputIterator is NOT a contiguous iterator +template +inline OutputIterator +dispatch_copy(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue, + typename boost::enable_if< + mpl::and_< + mpl::not_< + is_device_iterator + >, + 
is_device_iterator, + mpl::not_< + is_contiguous_iterator + > + > + >::type* = 0) +{ + typedef typename OutputIterator::value_type output_type; + typedef typename std::iterator_traits::value_type input_type; + + const device &device = queue.get_device(); + + // loading parameters + std::string cache_key = + std::string("__boost_compute_copy_to_device_") + + type_name() + "_" + type_name(); + boost::shared_ptr parameters = + detail::parameter_cache::get_global_cache(device); + + size_t map_copy_threshold; + size_t direct_copy_threshold; + + // calculate default values of thresholds + if (device.type() & device::gpu) { + // GPUs + map_copy_threshold = 524288; // 0.5 MB + direct_copy_threshold = 52428800; // 50 MB + } + else { + // CPUs and other devices + map_copy_threshold = 134217728; // 128 MB + direct_copy_threshold = 0; // it's never efficient for CPUs + } + + // load thresholds + map_copy_threshold = + parameters->get( + cache_key, "map_copy_threshold", map_copy_threshold + ); + direct_copy_threshold = + parameters->get( + cache_key, "direct_copy_threshold", direct_copy_threshold + ); + + // select copy method based on thresholds & input_size_bytes + size_t input_size = iterator_range_size(first, last); + size_t input_size_bytes = input_size * sizeof(input_type); + + // [0; map_copy_threshold) -> copy_to_device_map() + // + // if direct_copy_threshold is less than map_copy_threshold + // copy_to_device_map() is used for every input + if(input_size_bytes < map_copy_threshold + || direct_copy_threshold <= map_copy_threshold) { + return copy_to_device_map(first, last, result, queue); + } + // [map_copy_threshold; inf) -> convert [first; last) + // on host and then perform copy_to_device() + std::vector vector(first, last); + return copy_to_device(vector.begin(), vector.end(), result, queue); +} + // host -> device (async) template inline future diff --git a/test/test_copy_type_mismatch.cpp b/test/test_copy_type_mismatch.cpp index aedf8e6a2..1965f1439 100644 --- 
a/test/test_copy_type_mismatch.cpp +++ b/test/test_copy_type_mismatch.cpp @@ -326,4 +326,90 @@ BOOST_AUTO_TEST_CASE(copy_async_device_float_to_device_int) ); } +// Test copying from a std::list to a bc::vector. This differs from +// the test copying from std::vector because std::list has non-contigous +// storage for its data values. +BOOST_AUTO_TEST_CASE(copy_from_host_float_list_to_int_device_map) +{ + using compute::int_; + using compute::uint_; + using compute::float_; + + std::string cache_key = + std::string("__boost_compute_copy_to_device_float_int"); + boost::shared_ptr parameters = + bc::detail::parameter_cache::get_global_cache(device); + + // save + uint_ map_copy_threshold = + parameters->get(cache_key, "map_copy_threshold", 0); + + // force copy_to_device_map (mapping device vector to the host) + parameters->set(cache_key, "map_copy_threshold", 1024); + + float_ data[] = { 6.1f, -10.2f, 19.3f, 25.4f }; + std::list host(data, data + 4); + bc::vector device_vector(4, context); + + // copy host float data to int device vector + bc::copy(host.begin(), host.end(), device_vector.begin(), queue); + CHECK_RANGE_EQUAL( + int_, + 4, + device_vector, + ( + static_cast(6.1f), + static_cast(-10.2f), + static_cast(19.3f), + static_cast(25.4f) + ) + ); + + // restore + parameters->set(cache_key, "map_copy_threshold", map_copy_threshold); +} + +// Test copying from a std::list to a bc::vector. This differs from +// the test copying from std::vector because std::list has non-contigous +// storage for its data values. 
+BOOST_AUTO_TEST_CASE(copy_from_host_float_list_to_int_device_convert_on_host) +{ + using compute::int_; + using compute::uint_; + using compute::float_; + + std::string cache_key = + std::string("__boost_compute_copy_to_device_float_int"); + boost::shared_ptr parameters = + bc::detail::parameter_cache::get_global_cache(device); + + // save + uint_ map_copy_threshold = + parameters->get(cache_key, "map_copy_threshold", 0); + + // force copy_to_device_map (mapping device vector to the host) + parameters->set(cache_key, "map_copy_threshold", 0); + + float_ data[] = { 6.1f, -10.2f, 19.3f, 25.4f }; + std::list host(data, data + 4); + bc::vector device_vector(4, context); + + // copy host float data to int device vector + bc::copy(host.begin(), host.end(), device_vector.begin(), queue); + CHECK_RANGE_EQUAL( + int_, + 4, + device_vector, + ( + static_cast(6.1f), + static_cast(-10.2f), + static_cast(19.3f), + static_cast(25.4f) + ) + ); + + // restore + parameters->set(cache_key, "map_copy_threshold", map_copy_threshold); +} + BOOST_AUTO_TEST_SUITE_END() From 1419b7d36f415b2e326624a2d8f0dd1f175087e5 Mon Sep 17 00:00:00 2001 From: Jakub Szuppe Date: Sun, 22 May 2016 14:58:31 +0200 Subject: [PATCH 07/21] Type-safe copying from device to host Type-safe copying from device to host. Separate copying algorithm device -> host for non-contiguous OutputIterator (host).
--- include/boost/compute/algorithm/copy.hpp | 229 ++++++++++++++- .../compute/algorithm/detail/copy_to_host.hpp | 54 +++- test/test_copy.cpp | 11 +- test/test_copy_type_mismatch.cpp | 261 +++++++++++++++++- 4 files changed, 525 insertions(+), 30 deletions(-) diff --git a/include/boost/compute/algorithm/copy.hpp b/include/boost/compute/algorithm/copy.hpp index 7e8108652..a25aff469 100644 --- a/include/boost/compute/algorithm/copy.hpp +++ b/include/boost/compute/algorithm/copy.hpp @@ -73,19 +73,29 @@ struct can_copy_with_copy_buffer : > >::type {}; -// meta-function returning true if value_types of InputIterator and -// OutputIterator are same -template +// meta-function returning true if value_types of HostIterator and +// DeviceIterator are same +template struct is_same_value_type : boost::is_same< typename boost::remove_cv< - typename std::iterator_traits::value_type + typename std::iterator_traits::value_type >::type, typename boost::remove_cv< - typename OutputIterator::value_type + typename DeviceIterator::value_type >::type >::type {}; +// meta-function returning true if value_type of HostIterator is bool +template +struct is_bool_value_type : + boost::is_same< + typename boost::remove_cv< + typename std::iterator_traits::value_type + >::type, + bool + >::type {}; + // host -> device // InputIterator is a contiguous iterator template @@ -344,28 +354,219 @@ dispatch_copy_async(InputIterator first, } // device -> host +// OutputIterator is a contiguous iterator template inline OutputIterator dispatch_copy(InputIterator first, InputIterator last, OutputIterator result, command_queue &queue, - typename boost::enable_if_c< - is_device_iterator::value && - !is_device_iterator::value + typename boost::enable_if< + mpl::and_< + is_device_iterator, + mpl::not_< + is_device_iterator + >, + is_same_value_type, + is_contiguous_iterator, + mpl::not_< + is_bool_value_type + > + > + >::type* = 0) +{ + return copy_to_host(first, last, result, queue); +} + +// device -> host 
+// Type mismatch between InputIterator and OutputIterator value_types +// OutputIterator is NOT a contiguous iterator or value_type of OutputIterator +// is a boolean type. +template +inline OutputIterator +dispatch_copy(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue, + typename boost::enable_if< + mpl::and_< + is_device_iterator, + mpl::not_< + is_device_iterator + >, + mpl::or_< + mpl::not_< + is_contiguous_iterator + >, + is_bool_value_type + > + > >::type* = 0) { - if(is_contiguous_iterator::value){ - return copy_to_host(first, last, result, queue); + typedef typename std::iterator_traits::value_type output_type; + typedef typename InputIterator::value_type input_type; + + const device &device = queue.get_device(); + + // loading parameters + std::string cache_key = + std::string("__boost_compute_copy_to_host_") + + type_name() + "_" + type_name(); + boost::shared_ptr parameters = + detail::parameter_cache::get_global_cache(device); + + size_t map_copy_threshold; + size_t direct_copy_threshold; + + // calculate default values of thresholds + if (device.type() & device::gpu) { + // GPUs + map_copy_threshold = 33554432; // 30 MB + direct_copy_threshold = 0; // it's never efficient for GPUs } else { - // for non-contiguous output we first copy the values to - // a temporary std::vector and then copy from there - typedef typename std::iterator_traits::value_type T; - std::vector vector(iterator_range_size(first, last)); + // CPUs and other devices + map_copy_threshold = 134217728; // 128 MB + direct_copy_threshold = 0; // it's never efficient for CPUs + } + + // load thresholds + map_copy_threshold = + parameters->get( + cache_key, "map_copy_threshold", map_copy_threshold + ); + direct_copy_threshold = + parameters->get( + cache_key, "direct_copy_threshold", direct_copy_threshold + ); + + // select copy method based on thresholds & input_size_bytes + size_t count = iterator_range_size(first, last); + size_t 
input_size_bytes = count * sizeof(input_type); + + // [0; map_copy_threshold) -> copy_to_host_map() + // + // if direct_copy_threshold is less than map_copy_threshold + // copy_to_host_map() is used for every input + if(input_size_bytes < map_copy_threshold + || direct_copy_threshold <= map_copy_threshold) { + return copy_to_host_map(first, last, result, queue); + } + // [map_copy_threshold; inf) -> copy [first;last) to temporary vector + // then copy (and convert) to result using std::copy() + std::vector vector(count); + copy_to_host(first, last, vector.begin(), queue); + return std::copy(vector.begin(), vector.end(), result); +} + +// device -> host +// Type mismatch between InputIterator and OutputIterator value_types +// OutputIterator is a contiguous iterator +// value_type of OutputIterator is NOT a boolean type +template +inline OutputIterator +dispatch_copy(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue, + typename boost::enable_if< + mpl::and_< + is_device_iterator, + mpl::not_< + is_device_iterator + >, + mpl::not_< + is_same_value_type + >, + is_contiguous_iterator, + mpl::not_< + is_bool_value_type + > + > + >::type* = 0) +{ + typedef typename std::iterator_traits::value_type output_type; + typedef typename InputIterator::value_type input_type; + + const context &context = queue.get_context(); + const device &device = queue.get_device(); + + // loading parameters + std::string cache_key = + std::string("__boost_compute_copy_to_host_") + + type_name() + "_" + type_name(); + boost::shared_ptr parameters = + detail::parameter_cache::get_global_cache(device); + + size_t map_copy_threshold; + size_t direct_copy_threshold; + + // calculate default values of thresholds + if (device.type() & device::gpu) { + // GPUs + map_copy_threshold = 524288; // 0.5 MB + direct_copy_threshold = 52428800; // 50 MB + } + else { + // CPUs and other devices + map_copy_threshold = 134217728; // 128 MB + direct_copy_threshold = 0; // 
it's never efficient for CPUs + } + + // load thresholds + map_copy_threshold = + parameters->get( + cache_key, "map_copy_threshold", map_copy_threshold + ); + direct_copy_threshold = + parameters->get( + cache_key, "direct_copy_threshold", direct_copy_threshold + ); + + // select copy method based on thresholds & input_size_bytes + size_t count = iterator_range_size(first, last); + size_t input_size_bytes = count * sizeof(input_type); + + // [0; map_copy_threshold) -> copy_to_host_map() + if(input_size_bytes < map_copy_threshold) { + return copy_to_host_map(first, last, result, queue); + } + // [map_copy_threshold; direct_copy_threshold) -> copy [first;last) to + // temporary vector then copy (and convert) to result using std::copy() + else if(input_size_bytes < direct_copy_threshold) { + std::vector vector(count); copy_to_host(first, last, vector.begin(), queue); return std::copy(vector.begin(), vector.end(), result); } + + // [direct_copy_threshold; inf) -> map [result; result + input_size) to + // device and run copy kernel on device for copying & casting + // map host memory to device + buffer mapped_host( + context, + count * sizeof(output_type), + buffer::write_only | buffer::use_host_ptr, + static_cast( + ::boost::addressof(*result) + ) + ); + copy_on_device( + first, + last, + make_buffer_iterator(mapped_host), + queue + ); + // update host memory asynchronously by maping and unmaping memory + event map_event; + void* ptr = queue.enqueue_map_buffer_async( + mapped_host, + CL_MAP_READ, + 0, + count * sizeof(output_type), + map_event + ); + queue.enqueue_unmap_buffer(mapped_host, ptr, map_event).wait(); + return iterator_plus_distance(result, count); } // device -> host (async) diff --git a/include/boost/compute/algorithm/detail/copy_to_host.hpp b/include/boost/compute/algorithm/detail/copy_to_host.hpp index b889e0c87..abd4abf1d 100644 --- a/include/boost/compute/algorithm/detail/copy_to_host.hpp +++ b/include/boost/compute/algorithm/detail/copy_to_host.hpp 
@@ -51,17 +51,51 @@ inline HostIterator copy_to_host(DeviceIterator first, return iterator_plus_distance(result, count); } -// copy_to_host() specialization for std::vector -template -inline std::vector::iterator -copy_to_host(DeviceIterator first, - DeviceIterator last, - std::vector::iterator result, - command_queue &queue) +template +inline HostIterator copy_to_host_map(DeviceIterator first, + DeviceIterator last, + HostIterator result, + command_queue &queue) { - std::vector temp(std::distance(first, last)); - copy_to_host(first, last, temp.begin(), queue); - return std::copy(temp.begin(), temp.end(), result); + typedef typename + std::iterator_traits::value_type + value_type; + typedef typename + std::iterator_traits::difference_type + difference_type; + + size_t count = iterator_range_size(first, last); + if(count == 0){ + return result; + } + + size_t offset = first.get_index(); + + // map [first; last) buffer to host + value_type *pointer = static_cast( + queue.enqueue_map_buffer( + first.get_buffer(), + CL_MAP_READ, + offset * sizeof(value_type), + count * sizeof(value_type) + ) + ); + + // copy [first; last) to result buffer + std::copy( + pointer, + pointer + static_cast(count), + result + ); + + // unmap [first; last) + boost::compute::event unmapEvent = queue.enqueue_unmap_buffer( + first.get_buffer(), + static_cast(pointer) + ); + unmapEvent.wait(); + + return iterator_plus_distance(result, count); } template diff --git a/test/test_copy.cpp b/test/test_copy.cpp index e292f6497..be417bdd0 100644 --- a/test/test_copy.cpp +++ b/test/test_copy.cpp @@ -337,10 +337,15 @@ BOOST_AUTO_TEST_CASE(copy_to_vector_bool) { using compute::uchar_; - compute::vector vec(context); - vec.push_back(true, queue); - vec.push_back(false, queue); + compute::vector vec(2, context); + // copy to device + bool data[] = {true, false}; + compute::copy(data, data + 2, vec.begin(), queue); + BOOST_CHECK(static_cast(vec[0]) == true); + BOOST_CHECK(static_cast(vec[1]) == false); + + 
// copy to host std::vector host_vec(vec.size()); compute::copy(vec.begin(), vec.end(), host_vec.begin(), queue); BOOST_CHECK(host_vec[0] == true); diff --git a/test/test_copy_type_mismatch.cpp b/test/test_copy_type_mismatch.cpp index 1965f1439..a180869fd 100644 --- a/test/test_copy_type_mismatch.cpp +++ b/test/test_copy_type_mismatch.cpp @@ -327,7 +327,7 @@ BOOST_AUTO_TEST_CASE(copy_async_device_float_to_device_int) } // Test copying from a std::list to a bc::vector. This differs from -// the test copying from std::vector because std::list has non-contigous +// the test copying from std::vector because std::list has non-contiguous // storage for its data values. BOOST_AUTO_TEST_CASE(copy_from_host_float_list_to_int_device_map) { @@ -370,7 +370,7 @@ BOOST_AUTO_TEST_CASE(copy_from_host_float_list_to_int_device_map) } // Test copying from a std::list to a bc::vector. This differs from -// the test copying from std::vector because std::list has non-contigous +// the test copying from std::vector because std::list has non-contiguous // storage for its data values. 
BOOST_AUTO_TEST_CASE(copy_from_host_float_list_to_int_device_convert_on_host) { @@ -386,9 +386,13 @@ BOOST_AUTO_TEST_CASE(copy_from_host_float_list_to_int_device_convert_on_host) // save uint_ map_copy_threshold = parameters->get(cache_key, "map_copy_threshold", 0); + uint_ direct_copy_threshold = + parameters->get(cache_key, "direct_copy_threshold", 0); - // force copy_to_device_map (mapping device vector to the host) + // force copying by casting input data on host and performing + // normal copy host->device (since types match now) parameters->set(cache_key, "map_copy_threshold", 0); + parameters->set(cache_key, "direct_copy_threshold", 1024); float_ data[] = { 6.1f, -10.2f, 19.3f, 25.4f }; std::list host(data, data + 4); @@ -408,8 +412,259 @@ BOOST_AUTO_TEST_CASE(copy_from_host_float_list_to_int_device_convert_on_host) ) ); + // restore + parameters->set(cache_key, "map_copy_threshold", map_copy_threshold); + parameters->set(cache_key, "direct_copy_threshold", direct_copy_threshold); +} + + +// DEVICE -> HOST + +BOOST_AUTO_TEST_CASE(copy_device_float_to_host_int) +{ + using compute::int_; + using compute::float_; + + float_ data[] = { 6.1f, -10.2f, 19.3f, 25.4f }; + bc::vector device_vector(data, data + 4, queue); + + std::vector host_vector(4); + // copy device float vector to int host vector + bc::copy(device_vector.begin(), device_vector.end(), host_vector.begin(), queue); + CHECK_HOST_RANGE_EQUAL( + int_, + 4, + host_vector.begin(), + ( + static_cast(6.1f), + static_cast(-10.2f), + static_cast(19.3f), + static_cast(25.4f) + ) + ); +} + +BOOST_AUTO_TEST_CASE(copy_to_host_float_to_int_map) +{ + using compute::int_; + using compute::uint_; + using compute::float_; + + std::string cache_key = + std::string("__boost_compute_copy_to_host_float_int"); + boost::shared_ptr parameters = + bc::detail::parameter_cache::get_global_cache(device); + + // save + uint_ map_copy_threshold = + parameters->get(cache_key, "map_copy_threshold", 0); + + // force copy_to_host_map 
(mapping device vector to the host) + parameters->set(cache_key, "map_copy_threshold", 1024); + + float_ data[] = { 6.1f, -10.2f, 19.3f, 25.4f }; + bc::vector device_vector(data, data + 4, queue); + + std::vector host_vector(4); + // copy device float vector to int host vector + bc::copy(device_vector.begin(), device_vector.end(), host_vector.begin(), queue); + CHECK_HOST_RANGE_EQUAL( + int_, + 4, + host_vector.begin(), + ( + static_cast(6.1f), + static_cast(-10.2f), + static_cast(19.3f), + static_cast(25.4f) + ) + ); + // restore parameters->set(cache_key, "map_copy_threshold", map_copy_threshold); } +BOOST_AUTO_TEST_CASE(copy_to_host_float_to_int_convert_on_host) +{ + using compute::int_; + using compute::uint_; + using compute::float_; + + std::string cache_key = + std::string("__boost_compute_copy_to_host_float_int"); + boost::shared_ptr parameters = + bc::detail::parameter_cache::get_global_cache(device); + + // save + uint_ map_copy_threshold = + parameters->get(cache_key, "map_copy_threshold", 0); + uint_ direct_copy_threshold = + parameters->get(cache_key, "direct_copy_threshold", 0); + + // force copying by copying input device vector to temporary + // host vector of the same type and then copying from that temporary + // vector to result using std::copy() + parameters->set(cache_key, "map_copy_threshold", 0); + parameters->set(cache_key, "direct_copy_threshold", 1024); + + float_ data[] = { 6.1f, -10.2f, 19.3f, 25.4f }; + bc::vector device_vector(data, data + 4, queue); + + std::vector host_vector(4); + // copy device float vector to int host vector + bc::copy(device_vector.begin(), device_vector.end(), host_vector.begin(), queue); + CHECK_HOST_RANGE_EQUAL( + int_, + 4, + host_vector.begin(), + ( + static_cast(6.1f), + static_cast(-10.2f), + static_cast(19.3f), + static_cast(25.4f) + ) + ); + + // restore + parameters->set(cache_key, "map_copy_threshold", map_copy_threshold); + parameters->set(cache_key, "direct_copy_threshold", direct_copy_threshold); +} 
+ +BOOST_AUTO_TEST_CASE(copy_to_host_float_to_int_convert_on_device) +{ + using compute::int_; + using compute::uint_; + using compute::float_; + + std::string cache_key = + std::string("__boost_compute_copy_to_host_float_int"); + boost::shared_ptr parameters = + bc::detail::parameter_cache::get_global_cache(device); + + // save + uint_ map_copy_threshold = + parameters->get(cache_key, "map_copy_threshold", 0); + uint_ direct_copy_threshold = + parameters->get(cache_key, "direct_copy_threshold", 0); + + // force copying by mapping output data to the device memory + // and using transform operation for casting & copying + parameters->set(cache_key, "map_copy_threshold", 0); + parameters->set(cache_key, "direct_copy_threshold", 0); + + float_ data[] = { 6.1f, -10.2f, 19.3f, 25.4f }; + bc::vector device_vector(data, data + 4, queue); + + std::vector host_vector(4); + // copy device float vector to int host vector + bc::copy(device_vector.begin(), device_vector.end(), host_vector.begin(), queue); + CHECK_HOST_RANGE_EQUAL( + int_, + 4, + host_vector.begin(), + ( + static_cast(6.1f), + static_cast(-10.2f), + static_cast(19.3f), + static_cast(25.4f) + ) + ); + + // restore + parameters->set(cache_key, "map_copy_threshold", map_copy_threshold); + parameters->set(cache_key, "direct_copy_threshold", direct_copy_threshold); +} + +// Test copying from a bc::vector to a std::list . This differs from +// the test copying to std::vector because std::list has non-contiguous +// storage for its data values. 
+BOOST_AUTO_TEST_CASE(copy_to_host_list_float_to_int_map) +{ + using compute::int_; + using compute::uint_; + using compute::float_; + + std::string cache_key = + std::string("__boost_compute_copy_to_host_float_int"); + boost::shared_ptr parameters = + bc::detail::parameter_cache::get_global_cache(device); + + // save + uint_ map_copy_threshold = + parameters->get(cache_key, "map_copy_threshold", 0); + + // force copy_to_host_map (mapping device vector to the host) + parameters->set(cache_key, "map_copy_threshold", 1024); + + float_ data[] = { 6.1f, -10.2f, 19.3f, 25.4f }; + bc::vector device_vector(data, data + 4, queue); + + std::list host_list(4); + // copy device float vector to int host vector + bc::copy(device_vector.begin(), device_vector.end(), host_list.begin(), queue); + + int_ expected[4] = { + static_cast(6.1f), + static_cast(-10.2f), + static_cast(19.3f), + static_cast(25.4f) + }; + BOOST_CHECK_EQUAL_COLLECTIONS( + host_list.begin(), host_list.end(), + expected, expected + 4 + ); + + // restore + parameters->set(cache_key, "map_copy_threshold", map_copy_threshold); +} + +// Test copying from a bc::vector to a std::list . This differs from +// the test copying to std::vector because std::list has non-contiguous +// storage for its data values. 
+BOOST_AUTO_TEST_CASE(copy_to_host_list_float_to_int_covert_on_host) +{ + using compute::int_; + using compute::uint_; + using compute::float_; + + std::string cache_key = + std::string("__boost_compute_copy_to_host_float_int"); + boost::shared_ptr parameters = + bc::detail::parameter_cache::get_global_cache(device); + + // save + uint_ map_copy_threshold = + parameters->get(cache_key, "map_copy_threshold", 0); + uint_ direct_copy_threshold = + parameters->get(cache_key, "direct_copy_threshold", 0); + + // force copying by copying input device vector to temporary + // host vector of the same type and then copying from that temporary + // vector to result using std::copy() + parameters->set(cache_key, "map_copy_threshold", 0); + parameters->set(cache_key, "direct_copy_threshold", 1024); + + float_ data[] = { 6.1f, -10.2f, 19.3f, 25.4f }; + bc::vector device_vector(data, data + 4, queue); + + std::list host_list(4); + // copy device float vector to int host vector + bc::copy(device_vector.begin(), device_vector.end(), host_list.begin(), queue); + int_ expected[4] = { + static_cast(6.1f), + static_cast(-10.2f), + static_cast(19.3f), + static_cast(25.4f) + }; + BOOST_CHECK_EQUAL_COLLECTIONS( + host_list.begin(), host_list.end(), + expected, expected + 4 + ); + + // restore + parameters->set(cache_key, "map_copy_threshold", map_copy_threshold); + parameters->set(cache_key, "direct_copy_threshold", direct_copy_threshold); +} + + BOOST_AUTO_TEST_SUITE_END() From 21878219758cdcbb1d8e49610b71e27930359335 Mon Sep 17 00:00:00 2001 From: Jakub Szuppe Date: Sun, 22 May 2016 23:04:44 +0200 Subject: [PATCH 08/21] Make sure there's no macro redefinition --- example/opencv_histogram.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/example/opencv_histogram.cpp b/example/opencv_histogram.cpp index a36a37348..788098e24 100644 --- a/example/opencv_histogram.cpp +++ b/example/opencv_histogram.cpp @@ -11,7 +11,9 @@ //Code sample for calculating histogram using 
OpenCL and //displaying image histogram in OpenCV. -#define BOOST_COMPUTE_DEBUG_KERNEL_COMPILATION +#ifndef BOOST_COMPUTE_DEBUG_KERNEL_COMPILATION + #define BOOST_COMPUTE_DEBUG_KERNEL_COMPILATION +#endif #include #include From 55ac7da9d881f4b2fc4620d57ed5d09a3d334f5f Mon Sep 17 00:00:00 2001 From: Jakub Szuppe Date: Sun, 22 May 2016 23:05:41 +0200 Subject: [PATCH 09/21] Fix test in test_functional_as.cpp At the end of test we should read from input vector (not output) in order to check if transform() with as() was performed correctly. --- test/test_functional_as.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_functional_as.cpp b/test/test_functional_as.cpp index 4e41e0221..f1318b0ab 100644 --- a/test/test_functional_as.cpp +++ b/test/test_functional_as.cpp @@ -52,7 +52,7 @@ BOOST_AUTO_TEST_CASE(roundtrip_int_float) // check values CHECK_RANGE_EQUAL( - int, 8, output, (1, 2, 3, 4, 5, 6, 7, 8) + int, 8, input, (1, 2, 3, 4, 5, 6, 7, 8) ); } From 7461c92fdc93f55d00225cb5c35b5cbbe9f1dc4a Mon Sep 17 00:00:00 2001 From: Jakub Szuppe Date: Tue, 24 May 2016 00:23:51 +0200 Subject: [PATCH 10/21] Add test for async copying with svm_ptr<> --- test/test_copy.cpp | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/test/test_copy.cpp b/test/test_copy.cpp index be417bdd0..c3370b9d2 100644 --- a/test/test_copy.cpp +++ b/test/test_copy.cpp @@ -315,19 +315,48 @@ BOOST_AUTO_TEST_CASE(copy_svm_ptr) { REQUIRES_OPENCL_VERSION(2, 0); + using boost::compute::int_; + if(bug_in_svmmemcpy(device)){ std::cerr << "skipping copy_svm_ptr test case" << std::endl; return; } - int data[] = { 1, 3, 2, 4 }; + int_ data[] = { 1, 3, 2, 4 }; - compute::svm_ptr ptr = compute::svm_alloc(context, 4); + compute::svm_ptr ptr = compute::svm_alloc(context, 4); compute::copy(data, data + 4, ptr, queue); - int output[] = { 0, 0, 0, 0 }; + int_ output[] = { 0, 0, 0, 0 }; compute::copy(ptr, ptr + 4, output, queue); - 
CHECK_HOST_RANGE_EQUAL(int, 4, output, (1, 3, 2, 4)); + CHECK_HOST_RANGE_EQUAL(int_, 4, output, (1, 3, 2, 4)); + + compute::svm_free(context, ptr); +} + +BOOST_AUTO_TEST_CASE(copy_async_svm_ptr) +{ + REQUIRES_OPENCL_VERSION(2, 0); + + using boost::compute::int_; + + if(bug_in_svmmemcpy(device)){ + std::cerr << "skipping copy_svm_ptr test case" << std::endl; + return; + } + + int_ data[] = { 1, 3, 2, 4 }; + + compute::svm_ptr ptr = compute::svm_alloc(context, 4); + boost::compute::future future = + compute::copy_async(data, data + 4, ptr, queue); + future.wait(); + + int_ output[] = { 0, 0, 0, 0 }; + future = + compute::copy_async(ptr, ptr + 4, output, queue); + future.wait(); + CHECK_HOST_RANGE_EQUAL(int_, 4, output, (1, 3, 2, 4)); compute::svm_free(context, ptr); } From 3dcbd4c001a5463288297ee70eb27e195162b613 Mon Sep 17 00:00:00 2001 From: Jakub Szuppe Date: Tue, 24 May 2016 00:56:31 +0200 Subject: [PATCH 11/21] Fix async copying svm_ptr<> from/to/on device --- include/boost/compute/algorithm/detail/copy_on_device.hpp | 2 +- include/boost/compute/algorithm/detail/copy_to_device.hpp | 2 +- include/boost/compute/algorithm/detail/copy_to_host.hpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/boost/compute/algorithm/detail/copy_on_device.hpp b/include/boost/compute/algorithm/detail/copy_on_device.hpp index 0bcee27ed..59c166e1b 100644 --- a/include/boost/compute/algorithm/detail/copy_on_device.hpp +++ b/include/boost/compute/algorithm/detail/copy_on_device.hpp @@ -172,7 +172,7 @@ inline future > copy_on_device_async(svm_ptr first, { size_t count = iterator_range_size(first, last); if(count == 0){ - return result; + return future >(); } event event_ = queue.enqueue_svm_memcpy_async( diff --git a/include/boost/compute/algorithm/detail/copy_to_device.hpp b/include/boost/compute/algorithm/detail/copy_to_device.hpp index c62d04fd3..ec058c18d 100644 --- a/include/boost/compute/algorithm/detail/copy_to_device.hpp +++ 
b/include/boost/compute/algorithm/detail/copy_to_device.hpp @@ -152,7 +152,7 @@ inline future > copy_to_device_async(HostIterator first, { size_t count = iterator_range_size(first, last); if(count == 0){ - return result; + return future >(); } event event_ = queue.enqueue_svm_memcpy_async( diff --git a/include/boost/compute/algorithm/detail/copy_to_host.hpp b/include/boost/compute/algorithm/detail/copy_to_host.hpp index abd4abf1d..ea4145ca3 100644 --- a/include/boost/compute/algorithm/detail/copy_to_host.hpp +++ b/include/boost/compute/algorithm/detail/copy_to_host.hpp @@ -153,7 +153,7 @@ inline future copy_to_host_async(svm_ptr first, { size_t count = iterator_range_size(first, last); if(count == 0){ - return result; + return future(); } event event_ = queue.enqueue_svm_memcpy_async( From aaaaca4ed23c06705268ed69a6ff5107a933543a Mon Sep 17 00:00:00 2001 From: Jakub Szuppe Date: Tue, 24 May 2016 20:53:40 +0200 Subject: [PATCH 12/21] Type-safe async copying from device to host --- include/boost/compute/algorithm/copy.hpp | 72 +++++++++++++++++++++++- test/test_copy_type_mismatch.cpp | 32 +++++++++++ 2 files changed, 101 insertions(+), 3 deletions(-) diff --git a/include/boost/compute/algorithm/copy.hpp b/include/boost/compute/algorithm/copy.hpp index a25aff469..799d630f3 100644 --- a/include/boost/compute/algorithm/copy.hpp +++ b/include/boost/compute/algorithm/copy.hpp @@ -576,9 +576,14 @@ dispatch_copy_async(InputIterator first, InputIterator last, OutputIterator result, command_queue &queue, - typename boost::enable_if_c< - is_device_iterator::value && - !is_device_iterator::value + typename boost::enable_if< + mpl::and_< + is_device_iterator, + mpl::not_< + is_device_iterator + >, + is_same_value_type + > >::type* = 0) { BOOST_STATIC_ASSERT_MSG( @@ -589,6 +594,67 @@ dispatch_copy_async(InputIterator first, return copy_to_host_async(first, last, result, queue); } +// device -> host (async) +// Type mismatch between InputIterator and OutputIterator value_types 
+template +inline future +dispatch_copy_async(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue, + typename boost::enable_if< + mpl::and_< + is_device_iterator, + mpl::not_< + is_device_iterator + >, + mpl::not_< + is_same_value_type + > + > + >::type* = 0) +{ + BOOST_STATIC_ASSERT_MSG( + is_contiguous_iterator::value, + "copy_async() is only supported for contiguous host iterators" + ); + + typedef typename std::iterator_traits::value_type output_type; + const context &context = queue.get_context(); + size_t count = iterator_range_size(first, last); + + // map host memory to device + buffer mapped_host( + context, + count * sizeof(output_type), + buffer::write_only | buffer::use_host_ptr, + static_cast( + ::boost::addressof(*result) + ) + ); + // copy async on device + ::boost::compute::future > future = + copy_on_device_async( + first, + last, + make_buffer_iterator(mapped_host), + queue + ); + // update host memory asynchronously by mapping and unmapping memory + event map_event; + void* ptr = queue.enqueue_map_buffer_async( + mapped_host, + CL_MAP_READ, + 0, + count * sizeof(output_type), + map_event, + future.get_event() + ); + event unmap_event = + queue.enqueue_unmap_buffer(mapped_host, ptr, map_event); + return make_future(result + count, unmap_event); +} + // device -> device template inline OutputIterator diff --git a/test/test_copy_type_mismatch.cpp b/test/test_copy_type_mismatch.cpp index a180869fd..f56e8c030 100644 --- a/test/test_copy_type_mismatch.cpp +++ b/test/test_copy_type_mismatch.cpp @@ -666,5 +666,37 @@ BOOST_AUTO_TEST_CASE(copy_to_host_list_float_to_int_covert_on_host) parameters->set(cache_key, "direct_copy_threshold", direct_copy_threshold); } +BOOST_AUTO_TEST_CASE(copy_async_to_host_float_to_int) +{ + using compute::int_; + using compute::float_; + + float_ data[] = { 6.1f, -10.2f, 19.3f, 25.4f }; + bc::vector device_vector(data, data + 4, queue); + std::vector host_vector(device_vector.size()); + + 
// copy device float vector to host int vector + compute::future future = + bc::copy_async( + device_vector.begin(), + device_vector.end(), + host_vector.begin(), + queue + ); + future.wait(); + + CHECK_HOST_RANGE_EQUAL( + int_, + 4, + host_vector.begin(), + ( + static_cast(6.1f), + static_cast(-10.2f), + static_cast(19.3f), + static_cast(25.4f) + ) + ); +} + BOOST_AUTO_TEST_SUITE_END() From 933ff15ce55be17d263ea22f8692a1e00c8a2ba4 Mon Sep 17 00:00:00 2001 From: Jakub Szuppe Date: Tue, 24 May 2016 21:46:38 +0200 Subject: [PATCH 13/21] Better names for tests --- test/test_copy_type_mismatch.cpp | 146 ++++++++++++++++--------------- 1 file changed, 75 insertions(+), 71 deletions(-) diff --git a/test/test_copy_type_mismatch.cpp b/test/test_copy_type_mismatch.cpp index f56e8c030..34749940d 100644 --- a/test/test_copy_type_mismatch.cpp +++ b/test/test_copy_type_mismatch.cpp @@ -70,7 +70,7 @@ BOOST_AUTO_TEST_CASE(is_same_ignore_const) )); } -BOOST_AUTO_TEST_CASE(copy_host_float_to_device_double) +BOOST_AUTO_TEST_CASE(copy_to_device_float_to_double) { if(!device.supports_extension("cl_khr_fp64")) { std::cout << "skipping test: device does not support double" << std::endl; @@ -88,7 +88,7 @@ BOOST_AUTO_TEST_CASE(copy_host_float_to_device_double) CHECK_RANGE_EQUAL(double_, 4, device_vector, (6.1f, 10.2f, 19.3f, 25.4f)); } -BOOST_AUTO_TEST_CASE(copy_host_float_to_device_int) +BOOST_AUTO_TEST_CASE(copy_to_device_float_to_int) { using compute::int_; using compute::float_; @@ -111,7 +111,9 @@ BOOST_AUTO_TEST_CASE(copy_host_float_to_device_int) ); } -BOOST_AUTO_TEST_CASE(copy_host_float_to_device_int_mapping_device_vector) +// HOST -> DEVICE + +BOOST_AUTO_TEST_CASE(copy_to_device_float_to_int_mapping_device_vector) { using compute::int_; using compute::uint_; @@ -150,7 +152,7 @@ BOOST_AUTO_TEST_CASE(copy_host_float_to_device_int_mapping_device_vector) parameters->set(cache_key, "map_copy_threshold", map_copy_threshold); } 
-BOOST_AUTO_TEST_CASE(copy_host_float_to_device_int_convert_on_host) +BOOST_AUTO_TEST_CASE(copy_to_device_float_to_int_convert_on_host) { using compute::int_; using compute::uint_; @@ -194,7 +196,7 @@ BOOST_AUTO_TEST_CASE(copy_host_float_to_device_int_convert_on_host) parameters->set(cache_key, "direct_copy_threshold", direct_copy_threshold); } -BOOST_AUTO_TEST_CASE(copy_host_float_to_device_int_with_transform) +BOOST_AUTO_TEST_CASE(copy_to_device_float_to_int_with_transform) { using compute::int_; using compute::uint_; @@ -238,7 +240,7 @@ BOOST_AUTO_TEST_CASE(copy_host_float_to_device_int_with_transform) parameters->set(cache_key, "direct_copy_threshold", direct_copy_threshold); } -BOOST_AUTO_TEST_CASE(copy_async_host_float_to_device_int) +BOOST_AUTO_TEST_CASE(copy_async_to_device_float_to_int) { using compute::int_; using compute::float_; @@ -264,72 +266,10 @@ BOOST_AUTO_TEST_CASE(copy_async_host_float_to_device_int) ); } -BOOST_AUTO_TEST_CASE(copy_device_float_to_device_int) -{ - using compute::int_; - using compute::float_; - - float_ data[] = { 6.1f, -10.2f, 19.3f, 25.4f }; - bc::vector device_fvector(data, data + 4, queue); - bc::vector device_ivector(4, context); - - // copy device float vector to device int vector - bc::copy( - device_fvector.begin(), - device_fvector.end(), - device_ivector.begin(), - queue - ); - - CHECK_RANGE_EQUAL( - int_, - 4, - device_ivector, - ( - static_cast(6.1f), - static_cast(-10.2f), - static_cast(19.3f), - static_cast(25.4f) - ) - ); -} - -BOOST_AUTO_TEST_CASE(copy_async_device_float_to_device_int) -{ - using compute::int_; - using compute::float_; - - float_ data[] = { 6.1f, -10.2f, 19.3f, 25.4f }; - bc::vector device_fvector(data, data + 4, queue); - bc::vector device_ivector(4, context); - - // copy device float vector to device int vector - compute::future future = - bc::copy_async( - device_fvector.begin(), - device_fvector.end(), - device_ivector.begin(), - queue - ); - future.wait(); - - CHECK_RANGE_EQUAL( - int_, - 4, 
- device_ivector, - ( - static_cast(6.1f), - static_cast(-10.2f), - static_cast(19.3f), - static_cast(25.4f) - ) - ); -} - // Test copying from a std::list to a bc::vector. This differs from // the test copying from std::vector because std::list has non-contiguous // storage for its data values. -BOOST_AUTO_TEST_CASE(copy_from_host_float_list_to_int_device_map) +BOOST_AUTO_TEST_CASE(copy_to_device_float_to_int_list_device_map) { using compute::int_; using compute::uint_; @@ -372,7 +312,7 @@ BOOST_AUTO_TEST_CASE(copy_from_host_float_list_to_int_device_map) // Test copying from a std::list to a bc::vector. This differs from // the test copying from std::vector because std::list has non-contiguous // storage for its data values. -BOOST_AUTO_TEST_CASE(copy_from_host_float_list_to_int_device_convert_on_host) +BOOST_AUTO_TEST_CASE(copy_to_device_float_to_int_list_convert_on_host) { using compute::int_; using compute::uint_; @@ -418,9 +358,73 @@ BOOST_AUTO_TEST_CASE(copy_from_host_float_list_to_int_device_convert_on_host) } +// DEVICE -> DEVICE + +BOOST_AUTO_TEST_CASE(copy_on_device_float_to_int) +{ + using compute::int_; + using compute::float_; + + float_ data[] = { 6.1f, -10.2f, 19.3f, 25.4f }; + bc::vector device_fvector(data, data + 4, queue); + bc::vector device_ivector(4, context); + + // copy device float vector to device int vector + bc::copy( + device_fvector.begin(), + device_fvector.end(), + device_ivector.begin(), + queue + ); + + CHECK_RANGE_EQUAL( + int_, + 4, + device_ivector, + ( + static_cast(6.1f), + static_cast(-10.2f), + static_cast(19.3f), + static_cast(25.4f) + ) + ); +} + +BOOST_AUTO_TEST_CASE(copy_async_on_device_float_to_int) +{ + using compute::int_; + using compute::float_; + + float_ data[] = { 6.1f, -10.2f, 19.3f, 25.4f }; + bc::vector device_fvector(data, data + 4, queue); + bc::vector device_ivector(4, context); + + // copy device float vector to device int vector + compute::future future = + bc::copy_async( + device_fvector.begin(), + 
device_fvector.end(), + device_ivector.begin(), + queue + ); + future.wait(); + + CHECK_RANGE_EQUAL( + int_, + 4, + device_ivector, + ( + static_cast(6.1f), + static_cast(-10.2f), + static_cast(19.3f), + static_cast(25.4f) + ) + ); +} + // DEVICE -> HOST -BOOST_AUTO_TEST_CASE(copy_device_float_to_host_int) +BOOST_AUTO_TEST_CASE(copy_to_host_float_to_int) { using compute::int_; using compute::float_; From 4280024f2693458b45f5f5c0652413a102d5084d Mon Sep 17 00:00:00 2001 From: Jakub Szuppe Date: Thu, 26 May 2016 15:53:20 +0200 Subject: [PATCH 14/21] Now svm_ptr keeps its context This commit modifies svm_ptr to keep its context. It is convenient for the users and enables creating svm_ptr_index_expr class. --- include/boost/compute/memory/svm_ptr.hpp | 103 +++++++++++++++++++++-- include/boost/compute/svm.hpp | 14 ++- 2 files changed, 110 insertions(+), 7 deletions(-) diff --git a/include/boost/compute/memory/svm_ptr.hpp b/include/boost/compute/memory/svm_ptr.hpp index 2dbcb8f63..0c9d88035 100644 --- a/include/boost/compute/memory/svm_ptr.hpp +++ b/include/boost/compute/memory/svm_ptr.hpp @@ -11,12 +11,67 @@ #ifndef BOOST_COMPUTE_MEMORY_SVM_PTR_HPP #define BOOST_COMPUTE_MEMORY_SVM_PTR_HPP +#include +#include +#include + #include +#include +#include +#include #include namespace boost { namespace compute { +// forward declaration for svm_ptr +template +class svm_ptr; + +// svm functions require OpenCL 2.0 +#if defined(CL_VERSION_2_0) || defined(BOOST_COMPUTE_DOXYGEN_INVOKED) +namespace detail { + +template +struct svm_ptr_index_expr +{ + typedef T result_type; + + svm_ptr_index_expr(const svm_ptr &svm_ptr, + const IndexExpr &expr) + : m_svm_ptr(svm_ptr), + m_expr(expr) + { + } + + operator T() const + { + BOOST_STATIC_ASSERT_MSG(boost::is_integral::value, + "Index expression must be integral"); + + BOOST_ASSERT(m_svm_ptr.get()); + + const context &context = m_svm_ptr.get_context(); + const device &device = context.get_device(); + command_queue queue(context, device); + + 
T value; + T* ptr = + static_cast(m_svm_ptr.get()) + static_cast(m_expr); + queue.enqueue_svm_map(static_cast(ptr), sizeof(T), CL_MAP_READ); + value = *(ptr); + queue.enqueue_svm_unmap(static_cast(ptr)).wait(); + + return value; + } + + const svm_ptr &m_svm_ptr; + IndexExpr m_expr; +}; + +} // end detail namespace +#endif + template class svm_ptr { @@ -32,19 +87,22 @@ class svm_ptr { } - explicit svm_ptr(void *ptr) - : m_ptr(static_cast(ptr)) + svm_ptr(void *ptr, const context &context) + : m_ptr(static_cast(ptr)), + m_context(context) { } svm_ptr(const svm_ptr &other) - : m_ptr(other.m_ptr) + : m_ptr(other.m_ptr), + m_context(other.m_context) { } - svm_ptr& operator=(const svm_ptr &other) + svm_ptr& operator=(const svm_ptr &other) { m_ptr = other.m_ptr; + m_context = other.m_context; return *this; } @@ -59,18 +117,53 @@ class svm_ptr svm_ptr operator+(difference_type n) { - return svm_ptr(m_ptr + n); + return svm_ptr(m_ptr + n, m_context); } difference_type operator-(svm_ptr other) { + BOOST_ASSERT(other.m_context == m_context); return m_ptr - other.m_ptr; } + context& get_context() const + { + return m_context; + } + + // svm functions require OpenCL 2.0 + #if defined(CL_VERSION_2_0) || defined(BOOST_COMPUTE_DOXYGEN_INVOKED) + /// \internal_ + template + detail::svm_ptr_index_expr + operator[](const Expr &expr) const + { + BOOST_ASSERT(m_ptr); + + return detail::svm_ptr_index_expr(*this, + expr); + } + #endif + private: T *m_ptr; + context m_context; }; +namespace detail { + +/// \internal_ +template +struct set_kernel_arg > +{ + void operator()(kernel &kernel_, size_t index, const svm_ptr &ptr) + { + kernel_.set_arg_svm_ptr(index, ptr.get()); + } +}; + +} // end detail namespace + /// \internal_ (is_device_iterator specialization for svm_ptr) template struct is_device_iterator > : boost::true_type {}; diff --git a/include/boost/compute/svm.hpp b/include/boost/compute/svm.hpp index d03c8d907..4bc3a7423 100644 --- a/include/boost/compute/svm.hpp +++ 
b/include/boost/compute/svm.hpp @@ -15,7 +15,7 @@ #include #include -// svm functions require opencl 2.0 +// svm functions require OpenCL 2.0 #if defined(CL_VERSION_2_0) || defined(BOOST_COMPUTE_DOXYGEN_INVOKED) namespace boost { @@ -34,7 +34,10 @@ inline svm_ptr svm_alloc(const context &context, cl_svm_mem_flags flags = CL_MEM_READ_WRITE, unsigned int alignment = 0) { - svm_ptr ptr(clSVMAlloc(context.get(), flags, size * sizeof(T), alignment)); + svm_ptr ptr( + clSVMAlloc(context.get(), flags, size * sizeof(T), alignment), + context + ); if(!ptr.get()){ BOOST_THROW_EXCEPTION(opencl_error(CL_MEM_OBJECT_ALLOCATION_FAILURE)); } @@ -49,6 +52,13 @@ inline svm_ptr svm_alloc(const context &context, /// /// \see svm_alloc(), command_queue::enqueue_svm_free() template +inline void svm_free(svm_ptr ptr) +{ + clSVMFree(ptr.get_context(), ptr.get()); +} + +/// \overload +template inline void svm_free(const context &context, svm_ptr ptr) { clSVMFree(context.get(), ptr.get()); From 753f88332100faea1eb6ded2af16125d452812b8 Mon Sep 17 00:00:00 2001 From: Jakub Szuppe Date: Fri, 27 May 2016 14:33:49 +0200 Subject: [PATCH 15/21] Support for svm_ptr in meta_kernel --- include/boost/compute/detail/meta_kernel.hpp | 89 +++++++++++++++++++- include/boost/compute/kernel.hpp | 6 +- 2 files changed, 90 insertions(+), 5 deletions(-) diff --git a/include/boost/compute/detail/meta_kernel.hpp b/include/boost/compute/detail/meta_kernel.hpp index 7be778b02..5e6d6e033 100644 --- a/include/boost/compute/detail/meta_kernel.hpp +++ b/include/boost/compute/detail/meta_kernel.hpp @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -203,6 +204,28 @@ struct meta_kernel_buffer_info size_t index; }; +struct meta_kernel_svm_info +{ + template + meta_kernel_svm_info(const svm_ptr ptr, + const std::string &id, + memory_object::address_space addr_space, + size_t i) + : ptr(ptr.get()), + identifier(id), + address_space(addr_space), + index(i) + { + + } + + void* ptr; + 
std::string identifier; + memory_object::address_space address_space; + size_t index; +}; + + class meta_kernel; template @@ -280,12 +303,14 @@ class meta_kernel meta_kernel(const meta_kernel &other) { m_source.str(other.m_source.str()); + m_options = other.m_options; } meta_kernel& operator=(const meta_kernel &other) { if(this != &other){ m_source.str(other.m_source.str()); + m_options = other.m_options; } return *this; @@ -342,9 +367,11 @@ class meta_kernel boost::shared_ptr cache = program_cache::get_global_cache(context); + std::string compile_options = m_options + options; + // load (or build) program from cache ::boost::compute::program program = - cache->get_or_build(cache_key, options, source, context); + cache->get_or_build(cache_key, compile_options, source, context); // create kernel ::boost::compute::kernel kernel = program.create_kernel(name()); @@ -365,6 +392,13 @@ class meta_kernel kernel.set_arg(bi.index, bi.m_mem); } + // bind svm args + for(size_t i = 0; i < m_stored_svm_ptrs.size(); i++){ + const detail::meta_kernel_svm_info &spi = m_stored_svm_ptrs[i]; + + kernel.set_arg_svm_ptr(spi.index, spi.ptr); + } + return kernel; } @@ -689,6 +723,45 @@ class meta_kernel return identifier; } + template + std::string get_svm_identifier(const svm_ptr &svm_ptr, + const memory_object::address_space address_space = + memory_object::global_memory) + { + BOOST_ASSERT( + (address_space == memory_object::global_memory) + || (address_space == memory_object::constant_memory) + ); + + // check if we've already seen this pointer + for(size_t i = 0; i < m_stored_svm_ptrs.size(); i++){ + const detail::meta_kernel_svm_info &spi = m_stored_svm_ptrs[i]; + + if(spi.ptr == svm_ptr.get() && + spi.address_space == address_space){ + return spi.identifier; + } + } + + // create a new binding + std::string identifier = + "_svm_ptr" + lexical_cast(m_stored_svm_ptrs.size()); + size_t index = add_arg(address_space, identifier); + + if(m_stored_svm_ptrs.empty()) { + m_options += 
std::string(" -cl-std=CL2.0"); + } + + // store new svm pointer info + m_stored_svm_ptrs.push_back( + detail::meta_kernel_svm_info( + svm_ptr, identifier, address_space, index + ) + ); + + return identifier; + } + std::string get_image_identifier(const char *qualifiers, const image2d &image) { size_t index = add_arg_with_qualifiers(qualifiers, "image"); @@ -880,8 +953,10 @@ class meta_kernel std::set m_external_function_names; std::vector m_args; std::string m_pragmas; + std::string m_options; std::vector m_stored_args; std::vector m_stored_buffers; + std::vector m_stored_svm_ptrs; }; template @@ -960,6 +1035,18 @@ inline meta_kernel& operator<<(meta_kernel &kernel, } } +// SVM requires OpenCL 2.0 +#if defined(CL_VERSION_2_0) || defined(BOOST_COMPUTE_DOXYGEN_INVOKED) +template +inline meta_kernel& operator<<(meta_kernel &kernel, + const svm_ptr_index_expr &expr) +{ + return kernel << + kernel.get_svm_identifier(expr.m_svm_ptr) << + '[' << expr.m_expr << ']'; +} +#endif + template inline meta_kernel& operator<<(meta_kernel &kernel, const invoked_unary_negate_function #include #include -#include namespace boost { namespace compute { @@ -263,11 +262,10 @@ class kernel } /// \internal_ - template - void set_arg(size_t index, const svm_ptr ptr) + void set_arg_svm_ptr(size_t index, void* ptr) { #ifdef CL_VERSION_2_0 - cl_int ret = clSetKernelArgSVMPointer(m_kernel, index, ptr.get()); + cl_int ret = clSetKernelArgSVMPointer(m_kernel, index, ptr); if(ret != CL_SUCCESS){ BOOST_THROW_EXCEPTION(opencl_error(ret)); } From 5c29b50823f9e37b098f6a7ad14b7b1149e38045 Mon Sep 17 00:00:00 2001 From: Jakub Szuppe Date: Thu, 26 May 2016 13:05:58 +0200 Subject: [PATCH 16/21] Coping SVM to/from host by mapping SVM memory --- .../algorithm/detail/copy_to_device.hpp | 23 ++++++++++++++++ .../compute/algorithm/detail/copy_to_host.hpp | 27 +++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/include/boost/compute/algorithm/detail/copy_to_device.hpp 
b/include/boost/compute/algorithm/detail/copy_to_device.hpp index ec058c18d..2fded6131 100644 --- a/include/boost/compute/algorithm/detail/copy_to_device.hpp +++ b/include/boost/compute/algorithm/detail/copy_to_device.hpp @@ -161,6 +161,29 @@ inline future > copy_to_device_async(HostIterator first, return make_future(result + count, event_); } + +template +inline svm_ptr copy_to_device_map(HostIterator first, + HostIterator last, + svm_ptr result, + command_queue &queue) +{ + size_t count = iterator_range_size(first, last); + if(count == 0){ + return result; + } + + // map + queue.enqueue_svm_map(result.get(), count * sizeof(T), CL_MAP_WRITE); + + // copy [first; last) to result buffer + std::copy(first, last, static_cast(result.get())); + + // unmap result + queue.enqueue_svm_unmap(result.get()).wait(); + + return result + count; +} #endif // CL_VERSION_2_0 } // end detail namespace diff --git a/include/boost/compute/algorithm/detail/copy_to_host.hpp b/include/boost/compute/algorithm/detail/copy_to_host.hpp index ea4145ca3..0e4099f34 100644 --- a/include/boost/compute/algorithm/detail/copy_to_host.hpp +++ b/include/boost/compute/algorithm/detail/copy_to_host.hpp @@ -162,6 +162,33 @@ inline future copy_to_host_async(svm_ptr first, return make_future(iterator_plus_distance(result, count), event_); } + +template +inline HostIterator copy_to_host_map(svm_ptr first, + svm_ptr last, + HostIterator result, + command_queue &queue) +{ + size_t count = iterator_range_size(first, last); + if(count == 0){ + return result; + } + + // map + queue.enqueue_svm_map(first.get(), count * sizeof(T), CL_MAP_READ); + + // copy [first; last) to result + std::copy( + static_cast(first.get()), + static_cast(last.get()), + result + ); + + // unmap [first; last) + queue.enqueue_svm_unmap(first.get()).wait(); + + return iterator_plus_distance(result, count); +} #endif // CL_VERSION_2_0 } // end detail namespace From ba0da3f3f47aa950bd31b9e0b9303acf275d8e3e Mon Sep 17 00:00:00 2001 From: 
Jakub Szuppe Date: Fri, 27 May 2016 14:41:10 +0200 Subject: [PATCH 17/21] Tests for copying SVM memory when types mismatch Tests for copying SVM memory to/from/on device when value_types of InputIterator and OutputIterator mismatch. --- test/test_copy_type_mismatch.cpp | 578 +++++++++++++++++++++++++++++++ 1 file changed, 578 insertions(+) diff --git a/test/test_copy_type_mismatch.cpp b/test/test_copy_type_mismatch.cpp index 34749940d..7b622d617 100644 --- a/test/test_copy_type_mismatch.cpp +++ b/test/test_copy_type_mismatch.cpp @@ -33,6 +33,7 @@ #include #include #include +#include #include #include "quirks.hpp" @@ -357,6 +358,194 @@ BOOST_AUTO_TEST_CASE(copy_to_device_float_to_int_list_convert_on_host) parameters->set(cache_key, "direct_copy_threshold", direct_copy_threshold); } +// SVM requires OpenCL 2.0 +#if defined(CL_VERSION_2_0) || defined(BOOST_COMPUTE_DOXYGEN_INVOKED) +BOOST_AUTO_TEST_CASE(copy_to_device_svm_float_to_int_map) +{ + REQUIRES_OPENCL_VERSION(2, 0); + + using compute::int_; + using compute::uint_; + using compute::float_; + + std::string cache_key = + std::string("__boost_compute_copy_to_device_float_int"); + boost::shared_ptr parameters = + bc::detail::parameter_cache::get_global_cache(device); + + // save + uint_ map_copy_threshold = + parameters->get(cache_key, "map_copy_threshold", 0); + + // force copy_to_device_map (mapping device vector to the host) + parameters->set(cache_key, "map_copy_threshold", 1024); + + float_ host[] = { 5.1f, -10.3f, 19.4f, 26.7f }; + compute::svm_ptr ptr = compute::svm_alloc(context, 4); + + // copy host float data to int device vector + bc::copy(host, host + 4, ptr, queue); + + queue.enqueue_svm_map(ptr.get(), 4 * sizeof(cl_int), CL_MAP_READ); + CHECK_HOST_RANGE_EQUAL( + int_, + 4, + static_cast(ptr.get()), + ( + static_cast(5.1f), + static_cast(-10.3f), + static_cast(19.4f), + static_cast(26.7f) + ) + ); + queue.enqueue_svm_unmap(ptr.get()).wait(); + + compute::svm_free(context, ptr); + + // restore + 
parameters->set(cache_key, "map_copy_threshold", map_copy_threshold); +} + +BOOST_AUTO_TEST_CASE(copy_to_device_svm_float_to_int_convert_on_host) +{ + REQUIRES_OPENCL_VERSION(2, 0); + + if(bug_in_svmmemcpy(device)){ + std::cerr << "skipping svmmemcpy test case" << std::endl; + return; + } + + using compute::int_; + using compute::uint_; + using compute::float_; + + std::string cache_key = + std::string("__boost_compute_copy_to_device_float_int"); + boost::shared_ptr parameters = + bc::detail::parameter_cache::get_global_cache(device); + + // save + uint_ map_copy_threshold = + parameters->get(cache_key, "map_copy_threshold", 0); + uint_ direct_copy_threshold = + parameters->get(cache_key, "direct_copy_threshold", 0); + + // force copying by casting input data on host and performing + // normal copy host->device (since types match now) + parameters->set(cache_key, "map_copy_threshold", 0); + parameters->set(cache_key, "direct_copy_threshold", 1024); + + float_ host[] = { 0.1f, 10.3f, 9.4f, -26.7f }; + compute::svm_ptr ptr = compute::svm_alloc(context, 4); + + // copy host float data to int device vector + bc::copy(host, host + 4, ptr, queue); + + queue.enqueue_svm_map(ptr.get(), 4 * sizeof(cl_int), CL_MAP_READ); + CHECK_HOST_RANGE_EQUAL( + int_, + 4, + static_cast(ptr.get()), + ( + static_cast(0.1f), + static_cast(10.3f), + static_cast(9.4f), + static_cast(-26.7f) + ) + ); + queue.enqueue_svm_unmap(ptr.get()).wait(); + + compute::svm_free(context, ptr); + + // restore + parameters->set(cache_key, "map_copy_threshold", map_copy_threshold); + parameters->set(cache_key, "direct_copy_threshold", direct_copy_threshold); +} + +BOOST_AUTO_TEST_CASE(copy_to_device_svm_float_to_int_with_transform) +{ + REQUIRES_OPENCL_VERSION(2, 0); + + using compute::int_; + using compute::uint_; + using compute::float_; + + std::string cache_key = + std::string("__boost_compute_copy_to_device_float_int"); + boost::shared_ptr parameters = + 
bc::detail::parameter_cache::get_global_cache(device); + + // save + uint_ map_copy_threshold = + parameters->get(cache_key, "map_copy_threshold", 0); + uint_ direct_copy_threshold = + parameters->get(cache_key, "direct_copy_threshold", 0); + + // force copying by mapping input data to the device memory + // and using transform operation (copy kernel) for casting & copying + parameters->set(cache_key, "map_copy_threshold", 0); + parameters->set(cache_key, "direct_copy_threshold", 0); + + float_ host[] = { 4.1f, -11.3f, 219.4f, -26.7f }; + compute::svm_ptr ptr = compute::svm_alloc(context, 4); + + // copy host float data to int device vector + bc::copy(host, host + 4, ptr, queue); + + queue.enqueue_svm_map(ptr.get(), 4 * sizeof(cl_int), CL_MAP_READ); + CHECK_HOST_RANGE_EQUAL( + int_, + 4, + static_cast(ptr.get()), + ( + static_cast(4.1f), + static_cast(-11.3f), + static_cast(219.4f), + static_cast(-26.7f) + ) + ); + queue.enqueue_svm_unmap(ptr.get()).wait(); + + compute::svm_free(context, ptr); + + // restore + parameters->set(cache_key, "map_copy_threshold", map_copy_threshold); + parameters->set(cache_key, "direct_copy_threshold", direct_copy_threshold); +} + +BOOST_AUTO_TEST_CASE(copy_async_to_device_svm_float_to_int) +{ + REQUIRES_OPENCL_VERSION(2, 0); + + using compute::int_; + using compute::uint_; + using compute::float_; + + float_ host[] = { 44.1f, -14.3f, 319.4f, -26.7f }; + compute::svm_ptr ptr = compute::svm_alloc(context, 4); + + // copy host float data to int device vector + compute::future future = + bc::copy_async(host, host + 4, ptr, queue); + future.wait(); + + queue.enqueue_svm_map(ptr.get(), 4 * sizeof(cl_int), CL_MAP_READ); + CHECK_HOST_RANGE_EQUAL( + int_, + 4, + static_cast(ptr.get()), + ( + static_cast(44.1f), + static_cast(-14.3f), + static_cast(319.4f), + static_cast(-26.7f) + ) + ); + queue.enqueue_svm_unmap(ptr.get()).wait(); + + compute::svm_free(context, ptr); +} +#endif // DEVICE -> DEVICE @@ -422,6 +611,223 @@ 
BOOST_AUTO_TEST_CASE(copy_async_on_device_float_to_int) ); } +// SVM requires OpenCL 2.0 +#if defined(CL_VERSION_2_0) || defined(BOOST_COMPUTE_DOXYGEN_INVOKED) +BOOST_AUTO_TEST_CASE(copy_on_device_buffer_to_svm_float_to_int) +{ + REQUIRES_OPENCL_VERSION(2, 0); + + using compute::int_; + using compute::float_; + + float_ data[] = { 65.1f, -110.2f, -19.3f, 26.7f }; + bc::vector device_vector(data, data + 4, queue); + compute::svm_ptr ptr = compute::svm_alloc(context, 4); + + // copy host float data to int svm memory + bc::copy(device_vector.begin(), device_vector.end(), ptr, queue); + + queue.enqueue_svm_map(ptr.get(), 4 * sizeof(cl_int), CL_MAP_READ); + CHECK_HOST_RANGE_EQUAL( + int_, + 4, + static_cast(ptr.get()), + ( + static_cast(65.1f), + static_cast(-110.2f), + static_cast(-19.3f), + static_cast(26.7f) + ) + ); + queue.enqueue_svm_unmap(ptr.get()).wait(); + + compute::svm_free(context, ptr); +} + +BOOST_AUTO_TEST_CASE(copy_on_device_svm_to_buffer_float_to_int) +{ + REQUIRES_OPENCL_VERSION(2, 0); + + using compute::int_; + using compute::float_; + + float_ data[] = { 6.1f, 11.2f, 19.3f, 6.7f }; + bc::vector device_vector(4, context); + compute::svm_ptr ptr = compute::svm_alloc(context, 4); + + queue.enqueue_svm_map(ptr.get(), 4 * sizeof(cl_int), CL_MAP_WRITE); + for(size_t i = 0; i < 4; i++) { + static_cast(ptr.get())[i] = data[i]; + } + queue.enqueue_svm_unmap(ptr.get()).wait(); + + // copy host float svm data to int device vector + bc::copy(ptr, ptr + 4, device_vector.begin(), queue); + + CHECK_RANGE_EQUAL( + int_, + 4, + device_vector, + ( + static_cast(6.1f), + static_cast(11.2f), + static_cast(19.3f), + static_cast(6.7f) + ) + ); + + compute::svm_free(context, ptr); +} + +BOOST_AUTO_TEST_CASE(copy_on_device_svm_to_svm_float_to_int) +{ + REQUIRES_OPENCL_VERSION(2, 0); + + using compute::int_; + using compute::float_; + + float_ data[] = { 0.1f, -10.2f, -1.3f, 2.7f }; + compute::svm_ptr ptr = compute::svm_alloc(context, 4); + compute::svm_ptr ptr2 = 
compute::svm_alloc(context, 4); + + queue.enqueue_svm_map(ptr.get(), 4 * sizeof(cl_int), CL_MAP_WRITE); + for(size_t i = 0; i < 4; i++) { + static_cast(ptr.get())[i] = data[i]; + } + queue.enqueue_svm_unmap(ptr.get()).wait(); + + // copy host float svm to int svm + bc::copy(ptr, ptr + 4, ptr2, queue); + + queue.enqueue_svm_map(ptr2.get(), 4 * sizeof(cl_int), CL_MAP_READ); + CHECK_HOST_RANGE_EQUAL( + int_, + 4, + static_cast(ptr2.get()), + ( + static_cast(0.1f), + static_cast(-10.2f), + static_cast(-1.3f), + static_cast(2.7f) + ) + ); + queue.enqueue_svm_unmap(ptr2.get()).wait(); + + compute::svm_free(context, ptr); + compute::svm_free(context, ptr2); +} + +BOOST_AUTO_TEST_CASE(copy_async_on_device_buffer_to_svm_float_to_int) +{ + REQUIRES_OPENCL_VERSION(2, 0); + + using compute::int_; + using compute::float_; + + float_ data[] = { 65.1f, -110.2f, -19.3f, 26.7f }; + bc::vector device_vector(data, data + 4, queue); + compute::svm_ptr ptr = compute::svm_alloc(context, 4); + + // copy host float data to int svm memory + compute::future > future = + bc::copy_async(device_vector.begin(), device_vector.end(), ptr, queue); + future.wait(); + + queue.enqueue_svm_map(ptr.get(), 4 * sizeof(cl_int), CL_MAP_READ); + CHECK_HOST_RANGE_EQUAL( + int_, + 4, + static_cast(ptr.get()), + ( + static_cast(65.1f), + static_cast(-110.2f), + static_cast(-19.3f), + static_cast(26.7f) + ) + ); + queue.enqueue_svm_unmap(ptr.get()).wait(); + + compute::svm_free(context, ptr); +} + +BOOST_AUTO_TEST_CASE(copy_async_on_device_svm_to_buffer_float_to_int) +{ + REQUIRES_OPENCL_VERSION(2, 0); + + using compute::int_; + using compute::float_; + + float_ data[] = { 65.1f, -110.2f, -19.3f, 26.7f }; + bc::vector device_vector(4, context); + compute::svm_ptr ptr = compute::svm_alloc(context, 4); + + queue.enqueue_svm_map(ptr.get(), 4 * sizeof(cl_int), CL_MAP_WRITE); + for(size_t i = 0; i < 4; i++) { + static_cast(ptr.get())[i] = data[i]; + } + queue.enqueue_svm_unmap(ptr.get()).wait(); + + // copy host 
float svm data to int device vector + compute::future::iterator > future = + bc::copy_async(ptr, ptr + 4, device_vector.begin(), queue); + future.wait(); + + CHECK_RANGE_EQUAL( + int_, + 4, + device_vector, + ( + static_cast(65.1f), + static_cast(-110.2f), + static_cast(-19.3f), + static_cast(26.7f) + ) + ); + + compute::svm_free(context, ptr); +} + +BOOST_AUTO_TEST_CASE(copy_async_on_device_svm_to_svm_float_to_int) +{ + REQUIRES_OPENCL_VERSION(2, 0); + + using compute::int_; + using compute::float_; + + float_ data[] = { 0.1f, -10.2f, -1.3f, 2.7f }; + compute::svm_ptr ptr = compute::svm_alloc(context, 4); + compute::svm_ptr ptr2 = compute::svm_alloc(context, 4); + + queue.enqueue_svm_map(ptr.get(), 4 * sizeof(cl_int), CL_MAP_WRITE); + for(size_t i = 0; i < 4; i++) { + static_cast(ptr.get())[i] = data[i]; + } + queue.enqueue_svm_unmap(ptr.get()).wait(); + + // copy host float svm to int svm + compute::future > future = + bc::copy_async(ptr, ptr + 4, ptr2, queue); + future.wait(); + + queue.enqueue_svm_map(ptr2.get(), 4 * sizeof(cl_int), CL_MAP_READ); + CHECK_HOST_RANGE_EQUAL( + int_, + 4, + static_cast(ptr2.get()), + ( + static_cast(0.1f), + static_cast(-10.2f), + static_cast(-1.3f), + static_cast(2.7f) + ) + ); + queue.enqueue_svm_unmap(ptr2.get()).wait(); + + compute::svm_free(context, ptr); + compute::svm_free(context, ptr2); +} +#endif + // DEVICE -> HOST BOOST_AUTO_TEST_CASE(copy_to_host_float_to_int) @@ -702,5 +1108,177 @@ BOOST_AUTO_TEST_CASE(copy_async_to_host_float_to_int) ); } +// SVM requires OpenCL 2.0 +#if defined(CL_VERSION_2_0) || defined(BOOST_COMPUTE_DOXYGEN_INVOKED) +BOOST_AUTO_TEST_CASE(copy_to_host_svm_float_to_int_map) +{ + REQUIRES_OPENCL_VERSION(2, 0); + + using compute::int_; + using compute::uint_; + using compute::float_; + + std::string cache_key = + std::string("__boost_compute_copy_to_host_float_int"); + boost::shared_ptr parameters = + bc::detail::parameter_cache::get_global_cache(device); + + // save + uint_ map_copy_threshold = + 
parameters->get(cache_key, "map_copy_threshold", 0); + + // force copy_to_host_map (mapping device vector to the host) + parameters->set(cache_key, "map_copy_threshold", 1024); + + float_ data[] = { 6.1f, 1.2f, 1.3f, -66.7f }; + std::vector host_vector(4, 0); + compute::svm_ptr ptr = compute::svm_alloc(context, 4); + + queue.enqueue_svm_map(ptr.get(), 4 * sizeof(cl_int), CL_MAP_WRITE); + for(size_t i = 0; i < 4; i++) { + static_cast(ptr.get())[i] = data[i]; + } + queue.enqueue_svm_unmap(ptr.get()).wait(); + + // copy host float svm data to int host vector + bc::copy(ptr, ptr + 4, host_vector.begin(), queue); + + CHECK_HOST_RANGE_EQUAL( + int_, + 4, + host_vector.begin(), + ( + static_cast(6.1f), + static_cast(1.2f), + static_cast(1.3f), + static_cast(-66.7f) + ) + ); + + compute::svm_free(context, ptr); + + // restore + parameters->set(cache_key, "map_copy_threshold", map_copy_threshold); +} + +BOOST_AUTO_TEST_CASE(copy_to_host_svm_float_to_int_convert_on_host) +{ + REQUIRES_OPENCL_VERSION(2, 0); + + if(bug_in_svmmemcpy(device)){ + std::cerr << "skipping svmmemcpy test case" << std::endl; + return; + } + + using compute::int_; + using compute::uint_; + using compute::float_; + + std::string cache_key = + std::string("__boost_compute_copy_to_host_float_int"); + boost::shared_ptr parameters = + bc::detail::parameter_cache::get_global_cache(device); + + // save + uint_ map_copy_threshold = + parameters->get(cache_key, "map_copy_threshold", 0); + uint_ direct_copy_threshold = + parameters->get(cache_key, "direct_copy_threshold", 0); + + // force copying by copying input device vector to temporary + // host vector of the same type and then copying from that temporary + // vector to result using std::copy() + parameters->set(cache_key, "map_copy_threshold", 0); + parameters->set(cache_key, "direct_copy_threshold", 1024); + + float_ data[] = { 6.1f, 1.2f, 1.3f, 766.7f }; + std::vector host_vector(4, 0); + compute::svm_ptr ptr = compute::svm_alloc(context, 4); + + 
queue.enqueue_svm_map(ptr.get(), 4 * sizeof(cl_int), CL_MAP_WRITE); + for(size_t i = 0; i < 4; i++) { + static_cast(ptr.get())[i] = data[i]; + } + queue.enqueue_svm_unmap(ptr.get()).wait(); + + // copy host float svm data to int host vector + bc::copy(ptr, ptr + 4, host_vector.begin(), queue); + + CHECK_HOST_RANGE_EQUAL( + int_, + 4, + host_vector.begin(), + ( + static_cast(6.1f), + static_cast(1.2f), + static_cast(1.3f), + static_cast(766.7f) + ) + ); + + compute::svm_free(context, ptr); + + // restore + parameters->set(cache_key, "map_copy_threshold", map_copy_threshold); + parameters->set(cache_key, "direct_copy_threshold", direct_copy_threshold); +} + +BOOST_AUTO_TEST_CASE(copy_to_host_svm_float_to_int_transform) +{ + REQUIRES_OPENCL_VERSION(2, 0); + + using compute::int_; + using compute::uint_; + using compute::float_; + + std::string cache_key = + std::string("__boost_compute_copy_to_host_float_int"); + boost::shared_ptr parameters = + bc::detail::parameter_cache::get_global_cache(device); + + // save + uint_ map_copy_threshold = + parameters->get(cache_key, "map_copy_threshold", 0); + uint_ direct_copy_threshold = + parameters->get(cache_key, "direct_copy_threshold", 0); + + // force copying by copying input device vector to temporary + // host vector of the same type and then copying from that temporary + // vector to result using std::copy() + parameters->set(cache_key, "map_copy_threshold", 0); + parameters->set(cache_key, "direct_copy_threshold", 0); + + float_ data[] = { 0.1f, 11.2f, 1.3f, -66.7f }; + std::vector host_vector(4, 0); + compute::svm_ptr ptr = compute::svm_alloc(context, 4); + + queue.enqueue_svm_map(ptr.get(), 4 * sizeof(cl_int), CL_MAP_WRITE); + for(size_t i = 0; i < 4; i++) { + static_cast(ptr.get())[i] = data[i]; + } + queue.enqueue_svm_unmap(ptr.get()).wait(); + + // copy host float svm data to int host vector + bc::copy(ptr, ptr + 4, host_vector.begin(), queue); + + CHECK_HOST_RANGE_EQUAL( + int_, + 4, + host_vector.begin(), + ( + 
static_cast(0.1f), + static_cast(11.2f), + static_cast(1.3f), + static_cast(-66.7f) + ) + ); + + compute::svm_free(context, ptr); + + // restore + parameters->set(cache_key, "map_copy_threshold", map_copy_threshold); + parameters->set(cache_key, "direct_copy_threshold", direct_copy_threshold); +} +#endif BOOST_AUTO_TEST_SUITE_END() From 69e09f27d8337ae5f5ec26742efcc318ca42e90c Mon Sep 17 00:00:00 2001 From: Jakub Szuppe Date: Sat, 28 May 2016 13:14:53 +0200 Subject: [PATCH 18/21] Minor fixes in test_copy.cpp --- test/test_copy.cpp | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/test/test_copy.cpp b/test/test_copy.cpp index c3370b9d2..1279602af 100644 --- a/test/test_copy.cpp +++ b/test/test_copy.cpp @@ -41,33 +41,37 @@ namespace compute = boost::compute; BOOST_AUTO_TEST_CASE(copy_on_device) { float data[] = { 6.1f, 10.2f, 19.3f, 25.4f }; - bc::vector a(4); + bc::vector a(4, context); bc::copy(data, data + 4, a.begin(), queue); CHECK_RANGE_EQUAL(float, 4, a, (6.1f, 10.2f, 19.3f, 25.4f)); - bc::vector b(4); + bc::vector b(4, context); bc::fill(b.begin(), b.end(), 0, queue); CHECK_RANGE_EQUAL(float, 4, b, (0.0f, 0.0f, 0.0f, 0.0f)); bc::copy(a.begin(), a.end(), b.begin(), queue); CHECK_RANGE_EQUAL(float, 4, b, (6.1f, 10.2f, 19.3f, 25.4f)); + + bc::vector c(context); + bc::copy(c.begin(), c.end(), b.begin(), queue); + CHECK_RANGE_EQUAL(float, 4, b, (6.1f, 10.2f, 19.3f, 25.4f)); } BOOST_AUTO_TEST_CASE(copy_on_device_device_ptr) { float data[] = { 6.1f, 10.2f, 19.3f, 25.4f }; - bc::vector a(4); + bc::vector a(4, context); bc::copy(data, data + 4, a.begin(), queue); CHECK_RANGE_EQUAL(float, 4, a, (6.1f, 10.2f, 19.3f, 25.4f)); - bc::vector b(4); + bc::vector b(4, context); bc::detail::device_ptr b_ptr(b.get_buffer(), size_t(0)); // buffer_iterator -> device_ptr bc::copy(a.begin(), a.end(), b_ptr, queue); CHECK_RANGE_EQUAL(float, 4, b, (6.1f, 10.2f, 19.3f, 25.4f)); - bc::vector c(4); + bc::vector c(4, context); 
bc::fill(c.begin(), c.end(), 0.0f, queue); bc::detail::device_ptr c_ptr(c.get_buffer(), size_t(2)); @@ -103,6 +107,29 @@ BOOST_AUTO_TEST_CASE(copy) BOOST_CHECK_EQUAL(host_vector[3], 6); } +BOOST_AUTO_TEST_CASE(empty_copy) +{ + int data[] = { 1, 2, 5, 6 }; + bc::vector a(4, context); + bc::vector b(context); + std::vector c; + + bc::copy(data, data + 4, a.begin(), queue); + CHECK_RANGE_EQUAL(int, 4, a, (1, 2, 5, 6)); + + bc::copy(b.begin(), b.end(), a.begin(), queue); + CHECK_RANGE_EQUAL(int, 4, a, (1, 2, 5, 6)); + + bc::copy(c.begin(), c.end(), a.begin(), queue); + CHECK_RANGE_EQUAL(int, 4, a, (1, 2, 5, 6)); + + bc::future::iterator> future = + bc::copy_async(c.begin(), c.end(), a.begin(), queue); + if(future.valid()) + future.wait(); + CHECK_RANGE_EQUAL(int, 4, a, (1, 2, 5, 6)); +} + // Test copying from a std::list to a bc::vector. This differs from // the test copying from std::vector because std::list has non-contigous // storage for its data values. From c9a0aba172c022cdf0b1d6c4783f70df0aea999e Mon Sep 17 00:00:00 2001 From: Jakub Szuppe Date: Sat, 28 May 2016 22:00:30 +0200 Subject: [PATCH 19/21] Remove debug macro, fix names of event variables --- example/opencv_histogram.cpp | 4 ---- include/boost/compute/algorithm/copy.hpp | 2 +- include/boost/compute/algorithm/detail/copy_to_device.hpp | 4 ++-- include/boost/compute/algorithm/detail/copy_to_host.hpp | 4 ++-- 4 files changed, 5 insertions(+), 9 deletions(-) diff --git a/example/opencv_histogram.cpp b/example/opencv_histogram.cpp index 788098e24..e339030ba 100644 --- a/example/opencv_histogram.cpp +++ b/example/opencv_histogram.cpp @@ -11,10 +11,6 @@ //Code sample for calculating histogram using OpenCL and //displaying image histogram in OpenCV. 
-#ifndef BOOST_COMPUTE_DEBUG_KERNEL_COMPILATION - #define BOOST_COMPUTE_DEBUG_KERNEL_COMPILATION -#endif - #include #include diff --git a/include/boost/compute/algorithm/copy.hpp b/include/boost/compute/algorithm/copy.hpp index 799d630f3..89829bf1a 100644 --- a/include/boost/compute/algorithm/copy.hpp +++ b/include/boost/compute/algorithm/copy.hpp @@ -556,7 +556,7 @@ dispatch_copy(InputIterator first, make_buffer_iterator(mapped_host), queue ); - // update host memory asynchronously by maping and unmaping memory + // synchronously update host memory by mapping and unmapping memory event map_event; void* ptr = queue.enqueue_map_buffer_async( mapped_host, diff --git a/include/boost/compute/algorithm/detail/copy_to_device.hpp b/include/boost/compute/algorithm/detail/copy_to_device.hpp index 2fded6131..bce5975f5 100644 --- a/include/boost/compute/algorithm/detail/copy_to_device.hpp +++ b/include/boost/compute/algorithm/detail/copy_to_device.hpp @@ -86,11 +86,11 @@ inline DeviceIterator copy_to_device_map(HostIterator first, std::copy(first, last, pointer); // unmap result buffer - boost::compute::event unmapEvent = queue.enqueue_unmap_buffer( + boost::compute::event unmap_event = queue.enqueue_unmap_buffer( result.get_buffer(), static_cast(pointer) ); - unmapEvent.wait(); + unmap_event.wait(); return result + static_cast(count); } diff --git a/include/boost/compute/algorithm/detail/copy_to_host.hpp b/include/boost/compute/algorithm/detail/copy_to_host.hpp index 0e4099f34..d770a996e 100644 --- a/include/boost/compute/algorithm/detail/copy_to_host.hpp +++ b/include/boost/compute/algorithm/detail/copy_to_host.hpp @@ -89,11 +89,11 @@ inline HostIterator copy_to_host_map(DeviceIterator first, ); // unmap [first; last) - boost::compute::event unmapEvent = queue.enqueue_unmap_buffer( + boost::compute::event unmap_event = queue.enqueue_unmap_buffer( first.get_buffer(), static_cast(pointer) ); - unmapEvent.wait(); + unmap_event.wait(); return iterator_plus_distance(result, 
count); } From bffc9f74fba86119ea45c46420c19f37be46a4ba Mon Sep 17 00:00:00 2001 From: Jakub Szuppe Date: Sat, 28 May 2016 22:46:35 +0200 Subject: [PATCH 20/21] Fix async copying when first == last (input is empty) --- include/boost/compute/algorithm/copy.hpp | 22 ++++-- test/test_copy_type_mismatch.cpp | 96 ++++++++++++++++++++++++ 2 files changed, 112 insertions(+), 6 deletions(-) diff --git a/include/boost/compute/algorithm/copy.hpp b/include/boost/compute/algorithm/copy.hpp index 89829bf1a..ba929c6dd 100644 --- a/include/boost/compute/algorithm/copy.hpp +++ b/include/boost/compute/algorithm/copy.hpp @@ -179,8 +179,8 @@ dispatch_copy(InputIterator first, ); // select copy method based on thresholds & input_size_bytes - size_t input_size = iterator_range_size(first, last); - size_t input_size_bytes = input_size * sizeof(input_type); + size_t count = iterator_range_size(first, last); + size_t input_size_bytes = count * sizeof(input_type); // [0; map_copy_threshold) -> copy_to_device_map() if(input_size_bytes < map_copy_threshold) { @@ -195,13 +195,14 @@ dispatch_copy(InputIterator first, // [direct_copy_threshold; inf) -> map [first; last) to device and // run copy kernel on device for copying & casting + // At this point we are sure that count > 1 (first != last). 
::boost::compute::mapped_view mapped_host( // make sure it's a pointer to constant data // to force read only mapping const_cast( ::boost::addressof(*first) ), - input_size, + count, context ); return copy_on_device(mapped_host.begin(), mapped_host.end(), result, queue); @@ -335,7 +336,11 @@ dispatch_copy_async(InputIterator first, typedef typename std::iterator_traits::value_type input_type; const context &context = queue.get_context(); - size_t input_size = iterator_range_size(first, last); + size_t count = iterator_range_size(first, last); + + if(count < size_t(1)) { + return future(); + } // map [first; last) to device and run copy kernel // on device for copying & casting @@ -345,7 +350,7 @@ dispatch_copy_async(InputIterator first, const_cast( ::boost::addressof(*first) ), - input_size, + count, context ); return copy_on_device_async( @@ -541,7 +546,8 @@ dispatch_copy(InputIterator first, // [direct_copy_threshold; inf) -> map [result; result + input_size) to // device and run copy kernel on device for copying & casting - // map host memory to device + // map host memory to device. + // At this point we are sure that count > 1 (first != last). 
buffer mapped_host( context, count * sizeof(output_type), @@ -623,6 +629,10 @@ dispatch_copy_async(InputIterator first, const context &context = queue.get_context(); size_t count = iterator_range_size(first, last); + if(count < size_t(1)) { + return future(); + } + // map host memory to device buffer mapped_host( context, diff --git a/test/test_copy_type_mismatch.cpp b/test/test_copy_type_mismatch.cpp index 7b622d617..8e97fcd6b 100644 --- a/test/test_copy_type_mismatch.cpp +++ b/test/test_copy_type_mismatch.cpp @@ -267,6 +267,34 @@ BOOST_AUTO_TEST_CASE(copy_async_to_device_float_to_int) ); } +BOOST_AUTO_TEST_CASE(copy_async_to_device_float_to_int_empty) +{ + using compute::int_; + using compute::float_; + + float_ host[] = { 6.1f, -10.2f, 19.3f, 25.4f }; + bc::vector device_vector(size_t(4), int_(1), queue); + + // copy nothing to int device vector + compute::future::iterator > future = + bc::copy_async(host, host, device_vector.begin(), queue); + if(future.valid()) { + future.wait(); + } + + CHECK_RANGE_EQUAL( + int_, + 4, + device_vector, + ( + int_(1), + int_(1), + int_(1), + int_(1) + ) + ); +} + // Test copying from a std::list to a bc::vector. This differs from // the test copying from std::vector because std::list has non-contiguous // storage for its data values. 
@@ -611,6 +639,40 @@ BOOST_AUTO_TEST_CASE(copy_async_on_device_float_to_int) ); } +BOOST_AUTO_TEST_CASE(copy_async_on_device_float_to_int_empty) +{ + using compute::int_; + using compute::float_; + + float_ data[] = { 6.1f, -10.2f, 19.3f, 25.4f }; + bc::vector device_fvector(data, data + 4, queue); + bc::vector device_ivector(size_t(4), int_(1), queue); + + // copy device float vector to device int vector + compute::future future = + bc::copy_async( + device_fvector.begin(), + device_fvector.begin(), + device_ivector.begin(), + queue + ); + if(future.valid()) { + future.wait(); + } + + CHECK_RANGE_EQUAL( + int_, + 4, + device_ivector, + ( + int_(1), + int_(1), + int_(1), + int_(1) + ) + ); +} + // SVM requires OpenCL 2.0 #if defined(CL_VERSION_2_0) || defined(BOOST_COMPUTE_DOXYGEN_INVOKED) BOOST_AUTO_TEST_CASE(copy_on_device_buffer_to_svm_float_to_int) @@ -1108,6 +1170,40 @@ BOOST_AUTO_TEST_CASE(copy_async_to_host_float_to_int) ); } +BOOST_AUTO_TEST_CASE(copy_async_to_host_float_to_int_empty) +{ + using compute::int_; + using compute::float_; + + float_ data[] = { 6.1f, -10.2f, 19.3f, 25.4f }; + bc::vector device_vector(data, data + 4, queue); + std::vector host_vector(device_vector.size(), int_(1)); + + // copy device float vector to host int vector + compute::future future = + bc::copy_async( + device_vector.begin(), + device_vector.begin(), + host_vector.begin(), + queue + ); + if(future.valid()) { + future.wait(); + } + + CHECK_HOST_RANGE_EQUAL( + int_, + 4, + host_vector.begin(), + ( + int_(1), + int_(1), + int_(1), + int_(1) + ) + ); +} + // SVM requires OpenCL 2.0 #if defined(CL_VERSION_2_0) || defined(BOOST_COMPUTE_DOXYGEN_INVOKED) BOOST_AUTO_TEST_CASE(copy_to_host_svm_float_to_int_map) From 8f4db3d7b7d86f4e6894d74dce5f40d6944c1277 Mon Sep 17 00:00:00 2001 From: Jakub Szuppe Date: Sat, 28 May 2016 22:54:51 +0200 Subject: [PATCH 21/21] Reuse dispatch_copy_async() in dispatch_copy() --- include/boost/compute/algorithm/copy.hpp | 278 ++++++++++------------- 1 
file changed, 126 insertions(+), 152 deletions(-) diff --git a/include/boost/compute/algorithm/copy.hpp b/include/boost/compute/algorithm/copy.hpp index ba929c6dd..7779277b8 100644 --- a/include/boost/compute/algorithm/copy.hpp +++ b/include/boost/compute/algorithm/copy.hpp @@ -96,6 +96,81 @@ struct is_bool_value_type : bool >::type {}; +// host -> device (async) +template +inline future +dispatch_copy_async(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue, + typename boost::enable_if< + mpl::and_< + mpl::not_< + is_device_iterator + >, + is_device_iterator, + is_same_value_type + > + >::type* = 0) +{ + BOOST_STATIC_ASSERT_MSG( + is_contiguous_iterator::value, + "copy_async() is only supported for contiguous host iterators" + ); + + return copy_to_device_async(first, last, result, queue); +} + +// host -> device (async) +// Type mismatch between InputIterator and OutputIterator value_types +template +inline future +dispatch_copy_async(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue, + typename boost::enable_if< + mpl::and_< + mpl::not_< + is_device_iterator + >, + is_device_iterator, + mpl::not_< + is_same_value_type + > + > + >::type* = 0) +{ + BOOST_STATIC_ASSERT_MSG( + is_contiguous_iterator::value, + "copy_async() is only supported for contiguous host iterators" + ); + + typedef typename std::iterator_traits::value_type input_type; + + const context &context = queue.get_context(); + size_t count = iterator_range_size(first, last); + + if(count < size_t(1)) { + return future(); + } + + // map [first; last) to device and run copy kernel + // on device for copying & casting + ::boost::compute::mapped_view mapped_host( + // make sure it's a pointer to constant data + // to force read only mapping + const_cast( + ::boost::addressof(*first) + ), + count, + context + ); + return copy_on_device_async( + mapped_host.begin(), mapped_host.end(), result, queue + ); +} + // host -> device 
// InputIterator is a contiguous iterator template @@ -143,7 +218,6 @@ dispatch_copy(InputIterator first, typedef typename OutputIterator::value_type output_type; typedef typename std::iterator_traits::value_type input_type; - const context &context = queue.get_context(); const device &device = queue.get_device(); // loading parameters @@ -196,16 +270,12 @@ dispatch_copy(InputIterator first, // [direct_copy_threshold; inf) -> map [first; last) to device and // run copy kernel on device for copying & casting // At this point we are sure that count > 1 (first != last). - ::boost::compute::mapped_view mapped_host( - // make sure it's a pointer to constant data - // to force read only mapping - const_cast( - ::boost::addressof(*first) - ), - count, - context - ); - return copy_on_device(mapped_host.begin(), mapped_host.end(), result, queue); + + // Perform async copy to device, wait for it to be finished and + // return the result. + // At this point we are sure that count > 1 (first != last), so event + // returned by dispatch_copy_async() must be valid. 
+ return dispatch_copy_async(first, last, result, queue).get(); } // host -> device @@ -283,7 +353,7 @@ dispatch_copy(InputIterator first, return copy_to_device(vector.begin(), vector.end(), result, queue); } -// host -> device (async) +// device -> host (async) template inline future dispatch_copy_async(InputIterator first, @@ -292,23 +362,23 @@ dispatch_copy_async(InputIterator first, command_queue &queue, typename boost::enable_if< mpl::and_< + is_device_iterator, mpl::not_< - is_device_iterator + is_device_iterator >, - is_device_iterator, - is_same_value_type + is_same_value_type > >::type* = 0) { BOOST_STATIC_ASSERT_MSG( - is_contiguous_iterator::value, + is_contiguous_iterator::value, "copy_async() is only supported for contiguous host iterators" ); - return copy_to_device_async(first, last, result, queue); + return copy_to_host_async(first, last, result, queue); } -// host -> device (async) +// device -> host (async) // Type mismatch between InputIterator and OutputIterator value_types template inline future @@ -318,23 +388,22 @@ dispatch_copy_async(InputIterator first, command_queue &queue, typename boost::enable_if< mpl::and_< + is_device_iterator, mpl::not_< - is_device_iterator + is_device_iterator >, - is_device_iterator, mpl::not_< - is_same_value_type + is_same_value_type > > >::type* = 0) { BOOST_STATIC_ASSERT_MSG( - is_contiguous_iterator::value, + is_contiguous_iterator::value, "copy_async() is only supported for contiguous host iterators" ); - typedef typename std::iterator_traits::value_type input_type; - + typedef typename std::iterator_traits::value_type output_type; const context &context = queue.get_context(); size_t count = iterator_range_size(first, last); @@ -342,20 +411,36 @@ dispatch_copy_async(InputIterator first, return future(); } - // map [first; last) to device and run copy kernel - // on device for copying & casting - ::boost::compute::mapped_view mapped_host( - // make sure it's a pointer to constant data - // to force read only 
mapping - const_cast( - ::boost::addressof(*first) - ), - count, - context + // map host memory to device + buffer mapped_host( + context, + count * sizeof(output_type), + buffer::write_only | buffer::use_host_ptr, + static_cast( + ::boost::addressof(*result) + ) ); - return copy_on_device_async( - mapped_host.begin(), mapped_host.end(), result, queue + // copy async on device + ::boost::compute::future > future = + copy_on_device_async( + first, + last, + make_buffer_iterator(mapped_host), + queue + ); + // update host memory asynchronously by maping and unmaping memory + event map_event; + void* ptr = queue.enqueue_map_buffer_async( + mapped_host, + CL_MAP_READ, + 0, + count * sizeof(output_type), + map_event, + future.get_event() ); + event unmap_event = + queue.enqueue_unmap_buffer(mapped_host, ptr, map_event); + return make_future(result + count, unmap_event); } // device -> host @@ -493,7 +578,6 @@ dispatch_copy(InputIterator first, typedef typename std::iterator_traits::value_type output_type; typedef typename InputIterator::value_type input_type; - const context &context = queue.get_context(); const device &device = queue.get_device(); // loading parameters @@ -547,122 +631,12 @@ dispatch_copy(InputIterator first, // [direct_copy_threshold; inf) -> map [result; result + input_size) to // device and run copy kernel on device for copying & casting // map host memory to device. - // At this point we are sure that count > 1 (first != last). 
- buffer mapped_host( - context, - count * sizeof(output_type), - buffer::write_only | buffer::use_host_ptr, - static_cast( - ::boost::addressof(*result) - ) - ); - copy_on_device( - first, - last, - make_buffer_iterator(mapped_host), - queue - ); - // synchronously update host memory by mapping and unmapping memory - event map_event; - void* ptr = queue.enqueue_map_buffer_async( - mapped_host, - CL_MAP_READ, - 0, - count * sizeof(output_type), - map_event - ); - queue.enqueue_unmap_buffer(mapped_host, ptr, map_event).wait(); - return iterator_plus_distance(result, count); -} - -// device -> host (async) -template -inline future -dispatch_copy_async(InputIterator first, - InputIterator last, - OutputIterator result, - command_queue &queue, - typename boost::enable_if< - mpl::and_< - is_device_iterator, - mpl::not_< - is_device_iterator - >, - is_same_value_type - > - >::type* = 0) -{ - BOOST_STATIC_ASSERT_MSG( - is_contiguous_iterator::value, - "copy_async() is only supported for contiguous host iterators" - ); - return copy_to_host_async(first, last, result, queue); -} - -// device -> host (async) -// Type mismatch between InputIterator and OutputIterator value_types -template -inline future -dispatch_copy_async(InputIterator first, - InputIterator last, - OutputIterator result, - command_queue &queue, - typename boost::enable_if< - mpl::and_< - is_device_iterator, - mpl::not_< - is_device_iterator - >, - mpl::not_< - is_same_value_type - > - > - >::type* = 0) -{ - BOOST_STATIC_ASSERT_MSG( - is_contiguous_iterator::value, - "copy_async() is only supported for contiguous host iterators" - ); - - typedef typename std::iterator_traits::value_type output_type; - const context &context = queue.get_context(); - size_t count = iterator_range_size(first, last); - - if(count < size_t(1)) { - return future(); - } - - // map host memory to device - buffer mapped_host( - context, - count * sizeof(output_type), - buffer::write_only | buffer::use_host_ptr, - static_cast( - 
::boost::addressof(*result) - ) - ); - // copy async on device - ::boost::compute::future > future = - copy_on_device_async( - first, - last, - make_buffer_iterator(mapped_host), - queue - ); - // update host memory asynchronously by maping and unmaping memory - event map_event; - void* ptr = queue.enqueue_map_buffer_async( - mapped_host, - CL_MAP_READ, - 0, - count * sizeof(output_type), - map_event, - future.get_event() - ); - event unmap_event = - queue.enqueue_unmap_buffer(mapped_host, ptr, map_event); - return make_future(result + count, unmap_event); + // Perform async copy to host, wait for it to be finished and + // return the result. + // At this point we are sure that count > 1 (first != last), so event + // returned by dispatch_copy_async() must be valid. + return dispatch_copy_async(first, last, result, queue).get(); } // device -> device