From 0e0d03bac5d8b9d2432612b5ced0b61400f1f9f0 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Mon, 1 Jun 2020 18:21:54 -0400 Subject: [PATCH 01/17] Add new BVH::query() overload that takes only predicates and callback --- src/ArborX_LinearBVH.hpp | 15 ++-------- ...orX_DetailsBoundingVolumeHierarchyImpl.hpp | 28 ++++++++++++++++++- 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/src/ArborX_LinearBVH.hpp b/src/ArborX_LinearBVH.hpp index d9a9ba33f..62b783d00 100644 --- a/src/ArborX_LinearBVH.hpp +++ b/src/ArborX_LinearBVH.hpp @@ -55,10 +55,8 @@ class BoundingVolumeHierarchy KOKKOS_FUNCTION bounding_volume_type bounds() const noexcept { return _bounds; } - template + template void query(ExecutionSpace const &space, Predicates const &predicates, - CallbackOrView &&callback_or_view, View &&view, Args &&... args) const { Details::check_valid_access_traits(PredicatesTag{}, predicates); @@ -67,15 +65,8 @@ class BoundingVolumeHierarchy ExecutionSpace>::value, "Predicates must be accessible from the execution space"); - Details::check_valid_callback_if_first_argument_is_not_a_view( - callback_or_view, predicates, view); - - using Tag = typename Details::AccessTraitsHelper::tag; - - Details::BoundingVolumeHierarchyImpl::queryDispatch( - Tag{}, *this, space, predicates, - std::forward(callback_or_view), - std::forward(view), std::forward(args)...); + Details::BoundingVolumeHierarchyImpl::query(space, *this, predicates, + std::forward(args)...); } private: diff --git a/src/details/ArborX_DetailsBoundingVolumeHierarchyImpl.hpp b/src/details/ArborX_DetailsBoundingVolumeHierarchyImpl.hpp index 66526858c..097b3608d 100644 --- a/src/details/ArborX_DetailsBoundingVolumeHierarchyImpl.hpp +++ b/src/details/ArborX_DetailsBoundingVolumeHierarchyImpl.hpp @@ -322,7 +322,6 @@ queryDispatch(NearestPredicateTag, BVH const &bvh, ExecutionSpace const &space, distances(i) = out(i).second; }); } -} // namespace BoundingVolumeHierarchyImpl template std::enable_if_t{} && @@ -353,6 +352,33 @@ check_valid_callback_if_first_argument_is_not_a_view(View const &, // do nothing } +template +inline void +query(ExecutionSpace const &space, BVH const &bvh, Predicates const &predicates, + CallbackOrView &&callback_or_view, View &&view, Args &&... args) +{ + check_valid_callback_if_first_argument_is_not_a_view(callback_or_view, + predicates, view); + + using Access = AccessTraits; + using Tag = typename AccessTraitsHelper::tag; + + queryDispatch(Tag{}, bvh, space, predicates, + std::forward(callback_or_view), + std::forward(view), std::forward(args)...); +} + +template +inline void query(ExecutionSpace const &space, BVH const &bvh, + Predicates const &predicates, Callback const &callback) +{ + // TODO check signature of the callback + traverse(space, bvh, predicates, callback); +} + +} // namespace BoundingVolumeHierarchyImpl } // namespace Details } // namespace ArborX From 125b9c5e51e9e67881ae4aef2290f940d588f7d9 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Mon, 1 Jun 2020 18:22:55 -0400 Subject: [PATCH 02/17] Use new overload in callback example --- examples/callback/example_callback.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/examples/callback/example_callback.cpp b/examples/callback/example_callback.cpp index 4c964cd94..e8726dc90 100644 --- a/examples/callback/example_callback.cpp +++ b/examples/callback/example_callback.cpp @@ -132,14 +132,12 @@ int main(int argc, char *argv[]) { // EXPERIMENTAL - // TODO replace with BVH::query(ExecutionSpace, Predicates, Callback) when - // new overload is added Kokkos::View> c( "counter"); - ArborX::Details::traverse( - ExecutionSpace{}, bvh, FirstOctant{}, - KOKKOS_LAMBDA(int i, int j) { printf("%d %d %d\n", ++c(), i, j); }); + bvh.query(ExecutionSpace{}, FirstOctant{}, KOKKOS_LAMBDA(int i, int j) { + printf("%d %d %d\n", ++c(), i, j); + }); } return 0; From f96904411a641acb3f3686915030634febb37fe0 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Mon, 1 Jun 2020 19:05:10 -0400 Subject: [PATCH 03/17] Enable optional permutation of the predicates in the new overload --- ...orX_DetailsBoundingVolumeHierarchyImpl.hpp | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/src/details/ArborX_DetailsBoundingVolumeHierarchyImpl.hpp b/src/details/ArborX_DetailsBoundingVolumeHierarchyImpl.hpp index 097b3608d..a61649258 100644 --- a/src/details/ArborX_DetailsBoundingVolumeHierarchyImpl.hpp +++ b/src/details/ArborX_DetailsBoundingVolumeHierarchyImpl.hpp @@ -372,10 +372,30 @@ query(ExecutionSpace const &space, BVH const &bvh, Predicates const &predicates, template inline void query(ExecutionSpace const &space, BVH const &bvh, - Predicates const &predicates, Callback const &callback) + Predicates const &predicates, Callback const &callback, + Experimental::TraversalPolicy const &policy = + Experimental::TraversalPolicy()) { // TODO check signature of the callback - traverse(space, bvh, predicates, callback); + if (policy._sort_predicates) + { + Kokkos::Profiling::pushRegion("ArborX:BVH:compute_permutation"); + using MemorySpace = typename BVH::memory_space; + using DeviceType = Kokkos::Device; + auto permute = + Details::BatchedQueries::sortQueriesAlongZOrderCurve( + space, bvh.bounds(), predicates); + Kokkos::Profiling::popRegion(); + + traverse( + space, bvh, + PermutedPredicates{predicates, permute}, + callback); + } + else + { + traverse(space, bvh, predicates, callback); + } } } // namespace BoundingVolumeHierarchyImpl From 1c3729ec5e70befa88acc0309ee0d5339e9176cf Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 2 Jun 2020 09:32:37 -0400 Subject: [PATCH 04/17] Fixup resolve ambiguity between Details::BVHImplquery() overloads --- src/details/ArborX_DetailsBoundingVolumeHierarchyImpl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/details/ArborX_DetailsBoundingVolumeHierarchyImpl.hpp b/src/details/ArborX_DetailsBoundingVolumeHierarchyImpl.hpp index a61649258..eab30c4c6 100644 --- a/src/details/ArborX_DetailsBoundingVolumeHierarchyImpl.hpp +++ b/src/details/ArborX_DetailsBoundingVolumeHierarchyImpl.hpp @@ -354,7 +354,7 @@ check_valid_callback_if_first_argument_is_not_a_view(View const &, template -inline void +inline std::enable_if_t>{}> query(ExecutionSpace const &space, BVH const &bvh, Predicates const &predicates, CallbackOrView &&callback_or_view, View &&view, Args &&... args) { From f06c58cf329d4f4a870dbfb9fc452e243f8bae21 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 3 Jun 2020 09:15:37 -0400 Subject: [PATCH 05/17] Mark getData(predicate_with_attachment) noexcept --- src/details/ArborX_Predicates.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/details/ArborX_Predicates.hpp b/src/details/ArborX_Predicates.hpp index df7b06310..8796c0c4a 100644 --- a/src/details/ArborX_Predicates.hpp +++ b/src/details/ArborX_Predicates.hpp @@ -119,7 +119,7 @@ struct PredicateWithAttachment : Predicate template KOKKOS_INLINE_FUNCTION Data const & -getData(PredicateWithAttachment const &pred) +getData(PredicateWithAttachment const &pred) noexcept { return pred._data; } From 7ea4e816571ab5e4b7985a33aedc1c6d278d0a24 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 3 Jun 2020 09:16:24 -0400 Subject: [PATCH 06/17] Add getPredicate(predicate_with_attachment) --- src/details/ArborX_Predicates.hpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/details/ArborX_Predicates.hpp b/src/details/ArborX_Predicates.hpp index 8796c0c4a..559132903 100644 --- a/src/details/ArborX_Predicates.hpp +++ b/src/details/ArborX_Predicates.hpp @@ -124,6 +124,13 @@ getData(PredicateWithAttachment const &pred) noexcept return pred._data; } +template +KOKKOS_INLINE_FUNCTION Predicate const & +getPredicate(PredicateWithAttachment const &pred) noexcept +{ + return static_cast(pred); // slicing +} + template KOKKOS_INLINE_FUNCTION constexpr auto attach(Predicate &&pred, Data &&data) { From 31141f1292f6c8f6f229fc3e1ca4c77eab7c1509 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 3 Jun 2020 09:18:15 -0400 Subject: [PATCH 07/17] Temp fix in callback example (pretty sure format is wrong) --- examples/callback/example_callback.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/callback/example_callback.cpp b/examples/callback/example_callback.cpp index e8726dc90..1e9b7345f 100644 --- a/examples/callback/example_callback.cpp +++ b/examples/callback/example_callback.cpp @@ -135,9 +135,11 @@ int main(int argc, char *argv[]) Kokkos::View> c( "counter"); - bvh.query(ExecutionSpace{}, FirstOctant{}, KOKKOS_LAMBDA(int i, int j) { - printf("%d %d %d\n", ++c(), i, j); - }); + bvh.query( + ExecutionSpace{}, FirstOctant{}, + KOKKOS_LAMBDA(auto /*predicate*/, int j) { + printf("%d %d %d\n", ++c(), -1, j); + }); } return 0; From 66a84ac1fbde4652b24da906f67cff733db9a184 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 3 Jun 2020 09:19:08 -0400 Subject: [PATCH 08/17] Update InsertGenerator::operator() to take predicate as first argument --- .../ArborX_DetailsBufferOptimization.hpp | 78 ++++++++++++------- 1 file changed, 52 insertions(+), 26 deletions(-) diff --git a/src/details/ArborX_DetailsBufferOptimization.hpp b/src/details/ArborX_DetailsBufferOptimization.hpp index b430d39ce..533e9dd9e 100644 --- a/src/details/ArborX_DetailsBufferOptimization.hpp +++ b/src/details/ArborX_DetailsBufferOptimization.hpp @@ -64,13 +64,17 @@ struct InsertGenerator using ValueType = typename OutputView::value_type; using Access = AccessTraits; using Tag = typename AccessTraitsHelper::tag; + using PredicateType = typename AccessTraitsHelper::type; template KOKKOS_FUNCTION std::enable_if_t{} && std::is_same{}> - operator()(int predicate_index, int primitive_index) const + operator()(PredicateType const &predicate, int primitive_index) const { - auto const permuted_predicate_index = _permute(predicate_index); + auto const &data = getData(predicate); + auto const predicate_index = data.original; + auto const permuted_predicate_index = data.permuted; + auto const &raw_predicate = getPredicate(predicate); // With permutation, we access offset in random manner, and // _offset(permutated_predicate_index+1) may be in a completely different // place. Instead, use pointers to get the correct value for the buffer @@ -79,19 +83,22 @@ struct InsertGenerator auto const buffer_size = *(&offset + 1) - offset; auto &count = _counts(predicate_index); - _callback(Access::get(_permuted_predicates, predicate_index), - primitive_index, [&](ValueType const &value) { - int count_old = Kokkos::atomic_fetch_add(&count, 1); - if (count_old < buffer_size) - _out(offset + count_old) = value; - }); + _callback(raw_predicate, primitive_index, [&](ValueType const &value) { + int count_old = Kokkos::atomic_fetch_add(&count, 1); + if (count_old < buffer_size) + _out(offset + count_old) = value; + }); } template KOKKOS_FUNCTION std::enable_if_t{} && std::is_same{}> - operator()(int predicate_index, int primitive_index, float distance) const + operator()(PredicateType const &predicate, int primitive_index, + float distance) const { - auto const permuted_predicate_index = _permute(predicate_index); + auto const &data = getData(predicate); + auto const predicate_index = data.original; + auto const permuted_predicate_index = data.permuted; + auto const &raw_predicate = getPredicate(predicate); // With permutation, we access offset in random manner, and // _offset(permutated_predicate_index+1) may be in a completely different // place. Instead, use pointers to get the correct value for the buffer @@ -100,8 +107,8 @@ struct InsertGenerator auto const buffer_size = *(&offset + 1) - offset; auto &count = _counts(predicate_index); - _callback(Access::get(_permuted_predicates, predicate_index), - primitive_index, distance, [&](ValueType const &value) { + _callback(raw_predicate, primitive_index, distance, + [&](ValueType const &value) { int count_old = Kokkos::atomic_fetch_add(&count, 1); if (count_old < buffer_size) _out(offset + count_old) = value; @@ -112,12 +119,16 @@ struct InsertGenerator KOKKOS_FUNCTION std::enable_if_t{} && std::is_same{}> - operator()(int predicate_index, int primitive_index) const + operator()(PredicateType const &predicate, int primitive_index) const { + auto const &data = getData(predicate); + auto const predicate_index = data.original; + auto const permuted_predicate_index = data.permuted; + auto const &raw_predicate = getPredicate(predicate); + auto &count = _counts(predicate_index); - _callback(Access::get(_permuted_predicates, predicate_index), - primitive_index, + _callback(raw_predicate, primitive_index, [&](ValueType const &) { Kokkos::atomic_fetch_add(&count, 1); }); } @@ -125,20 +136,30 @@ struct InsertGenerator KOKKOS_FUNCTION std::enable_if_t{} && std::is_same{}> - operator()(int predicate_index, int primitive_index, float distance) const + operator()(PredicateType const &predicate, int primitive_index, + float distance) const { + auto const &data = getData(predicate); + auto const predicate_index = data.original; + auto const permuted_predicate_index = data.permuted; + auto const &raw_predicate = getPredicate(predicate); + auto &count = _counts(predicate_index); - _callback(Access::get(_permuted_predicates, predicate_index), - primitive_index, distance, + _callback(raw_predicate, primitive_index, distance, [&](ValueType const &) { Kokkos::atomic_fetch_add(&count, 1); }); } template KOKKOS_FUNCTION std::enable_if_t{} && std::is_same{}> - operator()(int predicate_index, int primitive_index) const + operator()(PredicateType const &predicate, int primitive_index) const { + auto const &data = getData(predicate); + auto const predicate_index = data.original; + auto const permuted_predicate_index = data.permuted; + auto const &raw_predicate = getPredicate(predicate); + // we store offsets in counts, and offset(permute(i)) = counts(i) auto &offset = _counts(predicate_index); @@ -146,17 +167,22 @@ struct InsertGenerator // count, and atomic increment of count. I think atomically incrementing // offset is problematic for OpenMP as you potentially constantly steal // cache lines. - _callback(Access::get(_permuted_predicates, predicate_index), - primitive_index, [&](ValueType const &value) { - _out(Kokkos::atomic_fetch_add(&offset, 1)) = value; - }); + _callback(raw_predicate, primitive_index, [&](ValueType const &value) { + _out(Kokkos::atomic_fetch_add(&offset, 1)) = value; + }); } template KOKKOS_FUNCTION std::enable_if_t{} && std::is_same{}> - operator()(int predicate_index, int primitive_index, float distance) const + operator()(PredicateType const &predicate, int primitive_index, + float distance) const { + auto const &data = getData(predicate); + auto const predicate_index = data.original; + auto const permuted_predicate_index = data.permuted; + auto const &raw_predicate = getPredicate(predicate); + // we store offsets in counts, and offset(permute(i)) = counts(i) auto &offset = _counts(predicate_index); @@ -164,8 +190,8 @@ struct InsertGenerator // count, and atomic increment of count. I think atomically incrementing // offset is problematic for OpenMP as you potentially constantly steal // cache lines. - _callback(Access::get(_permuted_predicates, predicate_index), - primitive_index, distance, [&](ValueType const &value) { + _callback(raw_predicate, primitive_index, distance, + [&](ValueType const &value) { _out(Kokkos::atomic_fetch_add(&offset, 1)) = value; }); } From 480771b9592b788141003ac6095a952373c0aa23 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 3 Jun 2020 09:20:08 -0400 Subject: [PATCH 09/17] Update Traits::Access attach the indices --- .../ArborX_DetailsBufferOptimization.hpp | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/src/details/ArborX_DetailsBufferOptimization.hpp b/src/details/ArborX_DetailsBufferOptimization.hpp index 533e9dd9e..d7215acf9 100644 --- a/src/details/ArborX_DetailsBufferOptimization.hpp +++ b/src/details/ArborX_DetailsBufferOptimization.hpp @@ -208,6 +208,13 @@ struct PermutedPredicates } }; +// TODO find a better name and think harder about int versus size_t +struct PermutedIndices +{ + int original; + int permuted; +}; + } // namespace Details template @@ -217,16 +224,18 @@ struct AccessTraits, using PermutedPredicates = Details::PermutedPredicates; using NativeAccess = AccessTraits; - inline static std::size_t size(PermutedPredicates const &permuted_predicates) + static std::size_t size(PermutedPredicates const &permuted_predicates) { return NativeAccess::size(permuted_predicates._predicates); } - KOKKOS_INLINE_FUNCTION static auto - get(PermutedPredicates const &permuted_predicates, std::size_t i) + KOKKOS_FUNCTION static auto get(PermutedPredicates const &permuted_predicates, + std::size_t index) { - return NativeAccess::get(permuted_predicates._predicates, - permuted_predicates._permute(i)); + auto const permuted_index = permuted_predicates._permute(index); + return attach( + NativeAccess::get(permuted_predicates._predicates, permuted_index), + Details::PermutedIndices{(int)index, (int)permuted_index}); } using memory_space = typename NativeAccess::memory_space; }; From d7d28efb745ed635fa70f43d8638ba87cf02698b Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 3 Jun 2020 09:20:56 -0400 Subject: [PATCH 10/17] Fixup clang-format gone wrong (rogue end namespace) --- src/details/ArborX_DetailsBufferOptimization.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/details/ArborX_DetailsBufferOptimization.hpp b/src/details/ArborX_DetailsBufferOptimization.hpp index d7215acf9..f076980b9 100644 --- a/src/details/ArborX_DetailsBufferOptimization.hpp +++ b/src/details/ArborX_DetailsBufferOptimization.hpp @@ -401,7 +401,7 @@ void queryImpl(ExecutionSpace const &space, TreeTraversal const &tree_traversal, // The allocated storage was exactly enough for results, do nothing } Kokkos::Profiling::popRegion(); -} // namespace Details +} } // namespace Details } // namespace ArborX From f88e929d995ab98b8cb00341cfd7be10e4dc2072 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 3 Jun 2020 09:21:31 -0400 Subject: [PATCH 11/17] Update TreeTraversal callback takes predicate as first argument --- src/details/ArborX_DetailsTreeTraversal.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/details/ArborX_DetailsTreeTraversal.hpp b/src/details/ArborX_DetailsTreeTraversal.hpp index 7bda2fb72..c26766021 100644 --- a/src/details/ArborX_DetailsTreeTraversal.hpp +++ b/src/details/ArborX_DetailsTreeTraversal.hpp @@ -77,7 +77,7 @@ struct TreeTraversal if (predicate(bvh_.getBoundingVolume(bvh_.getRoot()))) { - callback_(queryIndex, 0); + callback_(predicate, 0); } } @@ -99,11 +99,11 @@ struct TreeTraversal if (overlap_left && child_left->isLeaf()) { - callback_(queryIndex, child_left->getLeafPermutationIndex()); + callback_(predicate, child_left->getLeafPermutationIndex()); } if (overlap_right && child_right->isLeaf()) { - callback_(queryIndex, child_right->getLeafPermutationIndex()); + callback_(predicate, child_right->getLeafPermutationIndex()); } bool traverse_left = (overlap_left && !child_left->isLeaf()); @@ -226,7 +226,7 @@ struct TreeTraversal if (k < 1) return 0; - callback_(queryIndex, 0, distance(bvh_.getRoot())); + callback_(predicate, 0, distance(bvh_.getRoot())); return 1; } @@ -353,7 +353,7 @@ struct TreeTraversal { int const leaf_index = (heap.data() + i)->first; auto const leaf_distance = (heap.data() + i)->second; - callback_(queryIndex, leaf_index, leaf_distance); + callback_(predicate, leaf_index, leaf_distance); } return heap.size(); } @@ -408,7 +408,7 @@ struct TreeTraversal if (node->isLeaf()) { queue.pop(); - callback_(queryIndex, node->getLeafPermutationIndex(), node_distance); + callback_(predicate, node->getLeafPermutationIndex(), node_distance); ++count; } else From 4458873576e5887b3b0dcf2e104c09583d2e1cba Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 3 Jun 2020 09:22:16 -0400 Subject: [PATCH 12/17] Update buffer optimization unit test to get it to compile --- test/tstDetailsBufferOptimization.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/tstDetailsBufferOptimization.cpp b/test/tstDetailsBufferOptimization.cpp index 26e3a3e3e..e54f7a879 100644 --- a/test/tstDetailsBufferOptimization.cpp +++ b/test/tstDetailsBufferOptimization.cpp @@ -36,7 +36,10 @@ struct Test1 KOKKOS_LAMBDA(int predicate_index) { for (int primitive_index = 0; primitive_index < predicate_index; ++primitive_index) - insert_generator(predicate_index, primitive_index); + insert_generator(attach(Access::get(predicates, predicate_index), + ArborX::Details::PermutedIndices{ + predicate_index, predicate_index}), + primitive_index); }); } }; @@ -51,7 +54,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(query_impl, DeviceType, ARBORX_DEVICE_TYPES) int const n = 10; // Build a view of predicates. Won't actually call any of them. All is // required is a valid access traits assocated to it. 'get()' nevers get - // called, only 'size()'. + // called, only 'size()'. FIXME using Predicate = decltype(ArborX::intersects(std::declval())); Kokkos::View predicates( Kokkos::view_alloc("predicates", Kokkos::WithoutInitializing), n); From daa5c116523a40256bfc73ce8549aea4e3546b05 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 3 Jun 2020 09:28:20 -0400 Subject: [PATCH 13/17] Drop vector with permutation indices in InsertGenerator --- .../ArborX_DetailsBufferOptimization.hpp | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/src/details/ArborX_DetailsBufferOptimization.hpp b/src/details/ArborX_DetailsBufferOptimization.hpp index f076980b9..2835a0c82 100644 --- a/src/details/ArborX_DetailsBufferOptimization.hpp +++ b/src/details/ArborX_DetailsBufferOptimization.hpp @@ -50,8 +50,7 @@ struct SecondPassTag }; template + typename OutputView, typename CountView, typename OffsetView> struct InsertGenerator { Predicates _permuted_predicates; @@ -59,7 +58,6 @@ struct InsertGenerator OutputView _out; CountView _counts; OffsetView _offset; - PermuteType _permute; using ValueType = typename OutputView::value_type; using Access = AccessTraits; @@ -275,8 +273,8 @@ void queryImpl(ExecutionSpace const &space, TreeTraversal const &tree_traversal, tree_traversal.launch( space, permuted_predicates, InsertGenerator{ - permuted_predicates, callback, out, counts, offset, permute}); + CountView, OffsetView>{permuted_predicates, callback, + out, counts, offset}); // Detecting overflow is a local operation that needs to be done for every // index. We allow individual buffer sizes to differ, so it's not as easy @@ -309,9 +307,8 @@ void queryImpl(ExecutionSpace const &space, TreeTraversal const &tree_traversal, tree_traversal.launch( space, permuted_predicates, InsertGenerator{permuted_predicates, callback, out, counts, - offset, permute}); + Callback, OutputView, CountView, OffsetView>{ + permuted_predicates, callback, out, counts, offset}); // This may not be true, but it does not matter. As long as we have // (n_results == 0) check before second pass, this value is not used. // Otherwise, we know it's overflowed as there is no allocation. @@ -369,8 +366,8 @@ void queryImpl(ExecutionSpace const &space, TreeTraversal const &tree_traversal, tree_traversal.launch( space, permuted_predicates, InsertGenerator{ - permuted_predicates, callback, out, counts, offset, permute}); + CountView, OffsetView>{permuted_predicates, callback, + out, counts, offset}); Kokkos::Profiling::popRegion(); } From 19734949e872d4991808de748745686f3618c67b Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 3 Jun 2020 09:42:42 -0400 Subject: [PATCH 14/17] Fixup do not attemt to use __host__ __device__ generic lambdas with NVCC --- examples/callback/example_callback.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/callback/example_callback.cpp b/examples/callback/example_callback.cpp index 1e9b7345f..e34b4de3a 100644 --- a/examples/callback/example_callback.cpp +++ b/examples/callback/example_callback.cpp @@ -135,11 +135,13 @@ int main(int argc, char *argv[]) Kokkos::View> c( "counter"); +#ifndef __NVCC__ bvh.query( ExecutionSpace{}, FirstOctant{}, KOKKOS_LAMBDA(auto /*predicate*/, int j) { printf("%d %d %d\n", ++c(), -1, j); }); +#endif } return 0; From 6468d0764f25daab61fc156dc57afd43165ccdfe Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 3 Jun 2020 18:10:41 -0400 Subject: [PATCH 15/17] Cheap fix so indices are not attached to the predicates before being passed to the callback in the new overload --- .../ArborX_DetailsBufferOptimization.hpp | 25 +++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/src/details/ArborX_DetailsBufferOptimization.hpp b/src/details/ArborX_DetailsBufferOptimization.hpp index 2835a0c82..c8126f486 100644 --- a/src/details/ArborX_DetailsBufferOptimization.hpp +++ b/src/details/ArborX_DetailsBufferOptimization.hpp @@ -195,7 +195,7 @@ struct InsertGenerator } }; -template +template struct PermutedPredicates { Predicates _predicates; @@ -215,11 +215,13 @@ struct PermutedIndices } // namespace Details -template -struct AccessTraits, - PredicatesTag> +template +struct AccessTraits< + Details::PermutedPredicates, + PredicatesTag> { - using PermutedPredicates = Details::PermutedPredicates; + using PermutedPredicates = + Details::PermutedPredicates; using NativeAccess = AccessTraits; static std::size_t size(PermutedPredicates const &permuted_predicates) @@ -227,14 +229,23 @@ struct AccessTraits, return NativeAccess::size(permuted_predicates._predicates); } + template KOKKOS_FUNCTION static auto get(PermutedPredicates const &permuted_predicates, - std::size_t index) + std::enable_if_t<_Attach, std::size_t> index) { auto const permuted_index = permuted_predicates._permute(index); return attach( NativeAccess::get(permuted_predicates._predicates, permuted_index), Details::PermutedIndices{(int)index, (int)permuted_index}); } + + template + KOKKOS_FUNCTION static auto get(PermutedPredicates const &permuted_predicates, + std::enable_if_t index) + { + auto const permuted_index = permuted_predicates._permute(index); + return NativeAccess::get(permuted_predicates._predicates, permuted_index); + } using memory_space = typename NativeAccess::memory_space; }; @@ -262,7 +273,7 @@ void queryImpl(ExecutionSpace const &space, TreeTraversal const &tree_traversal, using CountView = OffsetView; CountView counts(Kokkos::view_alloc("counts", space), n_queries); - using PermutedPredicates = PermutedPredicates; + using PermutedPredicates = PermutedPredicates; PermutedPredicates permuted_predicates = {predicates, permute}; Kokkos::Profiling::pushRegion("ArborX:BVH:two_pass:first_pass"); From 7d4ebbe2a09784ae6e5b40814e99eb31110b36ed Mon Sep 17 00:00:00 2001 From: Andrey Prokopenko Date: Fri, 7 Aug 2020 14:56:00 -0400 Subject: [PATCH 16/17] Minor change to improve performance --- ...orX_DetailsBoundingVolumeHierarchyImpl.hpp | 6 +- .../ArborX_DetailsBufferOptimization.hpp | 91 +++++++------------ test/tstDetailsBufferOptimization.cpp | 3 +- 3 files changed, 38 insertions(+), 62 deletions(-) diff --git a/src/details/ArborX_DetailsBoundingVolumeHierarchyImpl.hpp b/src/details/ArborX_DetailsBoundingVolumeHierarchyImpl.hpp index eab30c4c6..74ba20f04 100644 --- a/src/details/ArborX_DetailsBoundingVolumeHierarchyImpl.hpp +++ b/src/details/ArborX_DetailsBoundingVolumeHierarchyImpl.hpp @@ -387,10 +387,8 @@ inline void query(ExecutionSpace const &space, BVH const &bvh, space, bvh.bounds(), predicates); Kokkos::Profiling::popRegion(); - traverse( - space, bvh, - PermutedPredicates{predicates, permute}, - callback); + using PermutedPredicates = PermutedData; + traverse(space, bvh, PermutedPredicates{predicates, permute}, callback); } else { diff --git a/src/details/ArborX_DetailsBufferOptimization.hpp b/src/details/ArborX_DetailsBufferOptimization.hpp index c8126f486..3b9643a30 100644 --- a/src/details/ArborX_DetailsBufferOptimization.hpp +++ b/src/details/ArborX_DetailsBufferOptimization.hpp @@ -50,14 +50,13 @@ struct SecondPassTag }; template + typename OutputView, typename CountView, typename PermutedOffset> struct InsertGenerator { - Predicates _permuted_predicates; Callback _callback; OutputView _out; CountView _counts; - OffsetView _offset; + PermutedOffset _permuted_offset; using ValueType = typename OutputView::value_type; using Access = AccessTraits; @@ -69,15 +68,13 @@ struct InsertGenerator std::is_same{}> operator()(PredicateType const &predicate, int primitive_index) const { - auto const &data = getData(predicate); - auto const predicate_index = data.original; - auto const permuted_predicate_index = data.permuted; + auto const predicate_index = getData(predicate); auto const &raw_predicate = getPredicate(predicate); // With permutation, we access offset in random manner, and // _offset(permutated_predicate_index+1) may be in a completely different // place. Instead, use pointers to get the correct value for the buffer // size. For this reason, also take a reference for offset. - auto const &offset = _offset(permuted_predicate_index); + auto const &offset = _permuted_offset(predicate_index); auto const buffer_size = *(&offset + 1) - offset; auto &count = _counts(predicate_index); @@ -93,15 +90,13 @@ struct InsertGenerator operator()(PredicateType const &predicate, int primitive_index, float distance) const { - auto const &data = getData(predicate); - auto const predicate_index = data.original; - auto const permuted_predicate_index = data.permuted; + auto const predicate_index = getData(predicate); auto const &raw_predicate = getPredicate(predicate); // With permutation, we access offset in random manner, and // _offset(permutated_predicate_index+1) may be in a completely different // place. Instead, use pointers to get the correct value for the buffer // size. For this reason, also take a reference for offset. - auto const &offset = _offset(permuted_predicate_index); + auto const &offset = _permuted_offset(predicate_index); auto const buffer_size = *(&offset + 1) - offset; auto &count = _counts(predicate_index); @@ -119,9 +114,7 @@ struct InsertGenerator std::is_same{}> operator()(PredicateType const &predicate, int primitive_index) const { - auto const &data = getData(predicate); - auto const predicate_index = data.original; - auto const permuted_predicate_index = data.permuted; + auto const predicate_index = getData(predicate); auto const &raw_predicate = getPredicate(predicate); auto &count = _counts(predicate_index); @@ -137,9 +130,7 @@ struct InsertGenerator operator()(PredicateType const &predicate, int primitive_index, float distance) const { - auto const &data = getData(predicate); - auto const predicate_index = data.original; - auto const permuted_predicate_index = data.permuted; + auto const predicate_index = getData(predicate); auto const &raw_predicate = getPredicate(predicate); auto &count = _counts(predicate_index); @@ -153,9 +144,7 @@ struct InsertGenerator std::is_same{}> operator()(PredicateType const &predicate, int primitive_index) const { - auto const &data = getData(predicate); - auto const predicate_index = data.original; - auto const permuted_predicate_index = data.permuted; + auto const predicate_index = getData(predicate); auto const &raw_predicate = getPredicate(predicate); // we store offsets in counts, and offset(permute(i)) = counts(i) @@ -176,9 +165,7 @@ struct InsertGenerator operator()(PredicateType const &predicate, int primitive_index, float distance) const { - auto const &data = getData(predicate); - auto const predicate_index = data.original; - auto const permuted_predicate_index = data.permuted; + auto const predicate_index = getData(predicate); auto const &raw_predicate = getPredicate(predicate); // we store offsets in counts, and offset(permute(i)) = counts(i) @@ -195,38 +182,27 @@ struct InsertGenerator } }; -template -struct PermutedPredicates +template +struct PermutedData { - Predicates _predicates; + Data _data; Permute _permute; - KOKKOS_FUNCTION auto operator()(int i) const - { - return _predicates(_permute(i)); - } -}; - -// TODO find a better name and think harder about int versus size_t -struct PermutedIndices -{ - int original; - int permuted; + KOKKOS_FUNCTION auto &operator()(int i) const { return _data(_permute(i)); } }; } // namespace Details template -struct AccessTraits< - Details::PermutedPredicates, - PredicatesTag> +struct AccessTraits, + PredicatesTag> { using PermutedPredicates = - Details::PermutedPredicates; + Details::PermutedData; using NativeAccess = AccessTraits; static std::size_t size(PermutedPredicates const &permuted_predicates) { - return NativeAccess::size(permuted_predicates._predicates); + return NativeAccess::size(permuted_predicates._data); } template @@ -234,9 +210,8 @@ struct AccessTraits< std::enable_if_t<_Attach, std::size_t> index) { auto const permuted_index = permuted_predicates._permute(index); - return attach( - NativeAccess::get(permuted_predicates._predicates, permuted_index), - Details::PermutedIndices{(int)index, (int)permuted_index}); + return attach(NativeAccess::get(permuted_predicates._data, permuted_index), + (int)index); } template @@ -244,7 +219,7 @@ struct AccessTraits< std::enable_if_t index) { auto const permuted_index = permuted_predicates._permute(index); - return NativeAccess::get(permuted_predicates._predicates, permuted_index); + return NativeAccess::get(permuted_predicates._data, permuted_index); } using memory_space = typename NativeAccess::memory_space; }; @@ -273,9 +248,13 @@ void queryImpl(ExecutionSpace const &space, TreeTraversal const &tree_traversal, using CountView = OffsetView; CountView counts(Kokkos::view_alloc("counts", space), n_queries); - using PermutedPredicates = PermutedPredicates; + using PermutedPredicates = + PermutedData; PermutedPredicates permuted_predicates = {predicates, permute}; + using PermutedOffset = PermutedData; + PermutedOffset permuted_offset = {offset, permute}; + Kokkos::Profiling::pushRegion("ArborX:BVH:two_pass:first_pass"); bool underflow = false; bool overflow = false; @@ -284,8 +263,8 @@ void queryImpl(ExecutionSpace const &space, TreeTraversal const &tree_traversal, tree_traversal.launch( space, permuted_predicates, InsertGenerator{permuted_predicates, callback, - out, counts, offset}); + CountView, PermutedOffset>{callback, out, counts, + permuted_offset}); // Detecting overflow is a local operation that needs to be done for every // index. We allow individual buffer sizes to differ, so it's not as easy @@ -295,7 +274,7 @@ void queryImpl(ExecutionSpace const &space, TreeTraversal const &tree_traversal, ARBORX_MARK_REGION("compute_overflow"), Kokkos::RangePolicy(space, 0, n_queries), KOKKOS_LAMBDA(int i, int &update) { - auto const *const offset_ptr = &offset(permute(i)); + auto const *const offset_ptr = &permuted_offset(i); if (counts(i) > *(offset_ptr + 1) - *offset_ptr) update = 1; }, @@ -318,8 +297,8 @@ void queryImpl(ExecutionSpace const &space, TreeTraversal const &tree_traversal, tree_traversal.launch( space, permuted_predicates, InsertGenerator{ - permuted_predicates, callback, out, counts, offset}); + Callback, OutputView, CountView, PermutedOffset>{ + callback, out, counts, permuted_offset}); // This may not be true, but it does not matter. As long as we have // (n_results == 0) check before second pass, this value is not used. // Otherwise, we know it's overflowed as there is no allocation. @@ -339,7 +318,7 @@ void queryImpl(ExecutionSpace const &space, TreeTraversal const &tree_traversal, Kokkos::parallel_for( ARBORX_MARK_REGION("copy_counts_to_offsets"), Kokkos::RangePolicy(space, 0, n_queries), - KOKKOS_LAMBDA(int const i) { offset(permute(i)) = counts(i); }); + KOKKOS_LAMBDA(int const i) { permuted_offset(i) = counts(i); }); exclusivePrefixSum(space, offset); int const n_results = lastElement(offset); @@ -370,15 +349,15 @@ void queryImpl(ExecutionSpace const &space, TreeTraversal const &tree_traversal, Kokkos::parallel_for( ARBORX_MARK_REGION("copy_offsets_to_counts"), Kokkos::RangePolicy(space, 0, n_queries), - KOKKOS_LAMBDA(int const i) { counts(i) = offset(permute(i)); }); + KOKKOS_LAMBDA(int const i) { counts(i) = permuted_offset(i); }); reallocWithoutInitializing(out, n_results); tree_traversal.launch( space, permuted_predicates, InsertGenerator{permuted_predicates, callback, - out, counts, offset}); + CountView, PermutedOffset>{callback, out, counts, + permuted_offset}); Kokkos::Profiling::popRegion(); } diff --git a/test/tstDetailsBufferOptimization.cpp b/test/tstDetailsBufferOptimization.cpp index e54f7a879..298260866 100644 --- a/test/tstDetailsBufferOptimization.cpp +++ b/test/tstDetailsBufferOptimization.cpp @@ -37,8 +37,7 @@ struct Test1 for (int primitive_index = 0; primitive_index < predicate_index; ++primitive_index) insert_generator(attach(Access::get(predicates, predicate_index), - ArborX::Details::PermutedIndices{ - predicate_index, predicate_index}), + predicate_index), primitive_index); }); } From 6f3dc47be7be6cb8576f388971a83063df1f13e8 Mon Sep 17 00:00:00 2001 From: Andrey Prokopenko Date: Fri, 7 Aug 2020 17:47:45 -0400 Subject: [PATCH 17/17] Fix style --- examples/callback/example_callback.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/examples/callback/example_callback.cpp b/examples/callback/example_callback.cpp index e34b4de3a..832e8fab8 100644 --- a/examples/callback/example_callback.cpp +++ b/examples/callback/example_callback.cpp @@ -136,11 +136,10 @@ int main(int argc, char *argv[]) "counter"); #ifndef __NVCC__ - bvh.query( - ExecutionSpace{}, FirstOctant{}, - KOKKOS_LAMBDA(auto /*predicate*/, int j) { - printf("%d %d %d\n", ++c(), -1, j); - }); + bvh.query(ExecutionSpace{}, FirstOctant{}, + KOKKOS_LAMBDA(auto /*predicate*/, int j) { + printf("%d %d %d\n", ++c(), -1, j); + }); #endif }