kokkos#5333: CUDA: Use scratch space appropriate to small reduction e…

…lements in Team reductions (kokkos#5334) * kokkos#5333: Add maybe failing test case? * Revise test to try to target the right code * NOMERGE kokkos#5333: Partial implementation of fix * kokkos#5333: Update tests * kokkos#5333: Fix comparision types in reducers test * kokkos#5333: Re-enable tests * kokkos#5333: Add test for Scalar with size of 1 * Fix formatting * Remove unrelated changes * Extract TeamPolicy tests into a separate method * Disable new tests (OpenACC, OpenMPTarget, Serial, SYCL, point_t) * Use smaller number of teams for bhalf_t * Use pointer_type for m_scratch_space parameter * Skip failing tests (SYCL) * Fix tests for OpenMPTarget * Fix num_teams_done variable in SYCL TeamPolicy reduction * Restore guards for array_reduce<float, 7> * Also make array_reduce<float, 7> work for SYCL * tests: change point_t default constructor --------- Co-authored-by: Arkadiusz Szczepkowicz <arek.szczepkowicz@ng-analytics.com> Co-authored-by: Cezary Skrzyński <cezary.skrzynski@ng-analytics.com> Co-authored-by: Daniel Arndt <arndtd@ornl.gov>
aprokop · Jan 3, 2024 · 02b46c0 · 02b46c0
1 parent f02539e
commit 02b46c0
Show file tree

Hide file tree

Showing 7 changed files with 153 additions and 32 deletions.
diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
@@ -625,6 +625,22 @@ class ParallelReduce<CombinedFunctorReducerType,
 
  public:
   using functor_type = FunctorType;
+  // Conditionally set word_size_type to int16_t or int8_t if value_type is
+  // smaller than int32_t (Kokkos::Cuda::size_type)
+  // word_size_type is used to determine the word count, shared memory buffer
+  // size, and global memory buffer size before the reduction is performed.
+  // Within the reduction, the word count is recomputed based on word_size_type
+  // and when calculating indexes into the shared/global memory buffers for
+  // performing the reduction, word_size_type is used again.
+  // For scalars > 4 bytes in size, indexing into shared/global memory relies
+  // on the block and grid dimensions to ensure that we index at the correct
+  // offset rather than at every 4 byte word; such that, when the join is
+  // performed, we have the correct data that was copied over in chunks of 4
+  // bytes.
+  using word_size_type = std::conditional_t<
+      sizeof(value_type) < sizeof(Kokkos::Cuda::size_type),
+      std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>,
+      Kokkos::Cuda::size_type>;
   using size_type    = Cuda::size_type;
   using reducer_type = ReducerType;
 
@@ -648,9 +664,11 @@ class ParallelReduce<CombinedFunctorReducerType,
   const pointer_type m_result_ptr;
   const bool m_result_ptr_device_accessible;
   const bool m_result_ptr_host_accessible;
-  size_type* m_scratch_space;
-  size_type* m_scratch_flags;
-  size_type* m_unified_space;
+  word_size_type* m_scratch_space;
+  // m_scratch_flags must be of type Cuda::size_type due to use of atomics
+  // for tracking metadata in Kokkos_Cuda_ReduceScan.hpp
+  Cuda::size_type* m_scratch_flags;
+  word_size_type* m_unified_space;
   size_type m_team_begin;
   size_type m_shmem_begin;
   size_type m_shmem_size;
@@ -694,13 +712,14 @@ class ParallelReduce<CombinedFunctorReducerType,
   }
 
   __device__ inline void run(SHMEMReductionTag&, const int& threadid) const {
-    const integral_nonzero_constant<
-        size_type, ReducerType::static_value_size() / sizeof(size_type)>
+    const integral_nonzero_constant<word_size_type,
+                                    ReducerType::static_value_size() /
+                                        sizeof(word_size_type)>
         word_count(m_functor_reducer.get_reducer().value_size() /
-                   sizeof(size_type));
+                   sizeof(word_size_type));
 
     reference_type value = m_functor_reducer.get_reducer().init(
-        kokkos_impl_cuda_shared_memory<size_type>() +
+        kokkos_impl_cuda_shared_memory<word_size_type>() +
         threadIdx.y * word_count.value);
 
     // Iterate this block through the league
@@ -723,18 +742,19 @@ class ParallelReduce<CombinedFunctorReducerType,
     if (!zero_length)
       do_final_reduction = cuda_single_inter_block_reduce_scan<false>(
           m_functor_reducer.get_reducer(), blockIdx.x, gridDim.x,
-          kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space,
+          kokkos_impl_cuda_shared_memory<word_size_type>(), m_scratch_space,
           m_scratch_flags);
 
     if (do_final_reduction) {
       // This is the final block with the final result at the final threads'
       // location
 
-      size_type* const shared = kokkos_impl_cuda_shared_memory<size_type>() +
-                                (blockDim.y - 1) * word_count.value;
+      word_size_type* const shared =
+          kokkos_impl_cuda_shared_memory<word_size_type>() +
+          (blockDim.y - 1) * word_count.value;
       size_type* const global =
           m_result_ptr_device_accessible
-              ? reinterpret_cast<size_type*>(m_result_ptr)
+              ? reinterpret_cast<word_size_type*>(m_result_ptr)
               : (m_unified_space ? m_unified_space : m_scratch_space);
 
       if (threadIdx.y == 0) {
@@ -784,7 +804,8 @@ class ParallelReduce<CombinedFunctorReducerType,
       *result = value;
     } else if (Impl::cuda_inter_block_reduction(
                    value, init, m_functor_reducer.get_reducer(),
-                   m_scratch_space, result, m_scratch_flags, blockDim.y)) {
+                   reinterpret_cast<pointer_type>(m_scratch_space), result,
+                   m_scratch_flags, blockDim.y)) {
       const unsigned id = threadIdx.y * blockDim.x + threadIdx.x;
       if (id == 0) {
         m_functor_reducer.get_reducer().final(&value);
@@ -805,13 +826,15 @@ class ParallelReduce<CombinedFunctorReducerType,
           1u, UseShflReduction ? std::min(m_league_size, size_type(1024 * 32))
                                : std::min(int(m_league_size), m_team_size));
 
-      m_scratch_space = cuda_internal_scratch_space(
-          m_policy.space(),
-          m_functor_reducer.get_reducer().value_size() * block_count);
+      m_scratch_space =
+          reinterpret_cast<word_size_type*>(cuda_internal_scratch_space(
+              m_policy.space(),
+              m_functor_reducer.get_reducer().value_size() * block_count));
       m_scratch_flags =
           cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type));
-      m_unified_space = cuda_internal_scratch_unified(
-          m_policy.space(), m_functor_reducer.get_reducer().value_size());
+      m_unified_space =
+          reinterpret_cast<word_size_type*>(cuda_internal_scratch_unified(
+              m_policy.space(), m_functor_reducer.get_reducer().value_size()));
 
       dim3 block(m_vector_size, m_team_size, 1);
       dim3 grid(block_count, 1, 1);

diff --git a/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
@@ -103,7 +103,7 @@ template <class FunctorType>
 __device__ bool cuda_inter_block_reduction(
     typename FunctorType::reference_type value,
     typename FunctorType::reference_type neutral, const FunctorType& reducer,
-    Cuda::size_type* const m_scratch_space,
+    typename FunctorType::pointer_type const m_scratch_space,
     typename FunctorType::pointer_type const /*result*/,
     Cuda::size_type* const m_scratch_flags,
     const int max_active_thread = blockDim.y) {
@@ -117,7 +117,7 @@ __device__ bool cuda_inter_block_reduction(
 
   // One thread in the block writes block result to global scratch_memory
   if (id == 0) {
-    pointer_type global = ((pointer_type)m_scratch_space) + blockIdx.x;
+    pointer_type global = m_scratch_space + blockIdx.x;
     *global             = value;
   }
 
@@ -140,7 +140,7 @@ __device__ bool cuda_inter_block_reduction(
       last_block = true;
       value      = neutral;
 
-      pointer_type const volatile global = (pointer_type)m_scratch_space;
+      pointer_type const volatile global = m_scratch_space;
 
       // Reduce all global values with splitting work over threads in one warp
       const int step_size =

diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp
@@ -170,6 +170,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
         const auto league_size       = m_league_size;
         const size_t scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]};
         sycl::device_ptr<char> const global_scratch_ptr = m_global_scratch_ptr;
+        sycl::local_accessor<unsigned int> num_teams_done(1, cgh);
 
         auto team_reduction_factory =
             [&](sycl::local_accessor<value_type, 1> local_mem,
@@ -186,8 +187,6 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
                 auto group_id = item.get_group_linear_id();
                 auto size     = n_wgroups * wgroup_size;
 
-                auto& num_teams_done = reinterpret_cast<unsigned int&>(
-                    local_mem[wgroup_size * std::max(value_count, 1u)]);
                 const auto local_id = item.get_local_linear_id();
                 const CombinedFunctorReducerType& functor_reducer =
                     functor_reducer_wrapper.get_functor();
@@ -225,10 +224,10 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
                                      sycl::memory_scope::device,
                                      sycl::access::address_space::global_space>
                         scratch_flags_ref(*scratch_flags);
-                    num_teams_done = ++scratch_flags_ref;
+                    num_teams_done[0] = ++scratch_flags_ref;
                   }
                   sycl::group_barrier(item.get_group());
-                  if (num_teams_done == n_wgroups) {
+                  if (num_teams_done[0] == n_wgroups) {
                     if (local_id >= n_wgroups)
                       reducer.init(&local_mem[local_id * value_count]);
                     else {
@@ -277,10 +276,10 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
                                      sycl::memory_scope::device,
                                      sycl::access::address_space::global_space>
                         scratch_flags_ref(*scratch_flags);
-                    num_teams_done = ++scratch_flags_ref;
+                    num_teams_done[0] = ++scratch_flags_ref;
                   }
                   item.barrier(sycl::access::fence_space::local_space);
-                  if (num_teams_done == n_wgroups) {
+                  if (num_teams_done[0] == n_wgroups) {
                     if (local_id >= n_wgroups)
                       reducer.init(&local_value);
                     else {
@@ -324,10 +323,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
         auto wgroup_size = m_team_size * final_vector_size;
         std::size_t size = std::size_t(m_league_size) * wgroup_size;
         sycl::local_accessor<value_type, 1> local_mem(
-            sycl::range<1>(wgroup_size) * std::max(value_count, 1u) +
-                (sizeof(unsigned int) + sizeof(value_type) - 1) /
-                    sizeof(value_type),
-            cgh);
+            sycl::range<1>(wgroup_size) * std::max(value_count, 1u), cgh);
 
         const auto init_size =
             std::max<std::size_t>((size + wgroup_size - 1) / wgroup_size, 1);

diff --git a/core/src/SYCL/Kokkos_SYCL_Team.hpp b/core/src/SYCL/Kokkos_SYCL_Team.hpp
@@ -140,9 +140,14 @@ class SYCLTeamMember {
     }
     value = sg.shuffle(value, 0);
 
+    const auto n_subgroups = sg.get_group_range()[0];
+    if (n_subgroups == 1) {
+      reducer.reference() = value;
+      return;
+    }
+
     // We need to chunk up the whole reduction because we might not have
     // allocated enough memory.
-    const auto n_subgroups = sg.get_group_range()[0];
     const unsigned int maximum_work_range =
         std::min<int>(m_team_reduce_size / sizeof(value_type), n_subgroups);
 

diff --git a/core/unit_test/TestNonTrivialScalarTypes.hpp b/core/unit_test/TestNonTrivialScalarTypes.hpp
@@ -214,7 +214,7 @@ struct point_t {
   uint8_t x, y, z;
 
   KOKKOS_FUNCTION
-  point_t() : x(1), y(1), z(1){};
+  point_t() : x(0), y(0), z(0){};
 
   KOKKOS_FUNCTION
   point_t(const point_t &val) : x(val.x), y(val.y), z(val.z){};

diff --git a/core/unit_test/TestReducers.hpp b/core/unit_test/TestReducers.hpp
@@ -19,6 +19,7 @@
 #include <limits>
 
 #include <Kokkos_Core.hpp>
+#include <TestNonTrivialScalarTypes.hpp>
 
 //--------------------------------------------------------------------------
 
@@ -46,6 +47,15 @@ struct TestReducers {
     void operator()(const int& i, Scalar& value) const { value += values(i); }
   };
 
+  struct TeamSumFunctor {
+    using member_type = typename Kokkos::TeamPolicy<ExecSpace>::member_type;
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const member_type& m, Scalar& value) const {
+      if (m.team_rank() == m.team_size() - 1) value += Scalar(1);
+    }
+  };
+
   struct ProdFunctor {
     Kokkos::View<const Scalar*, ExecSpace> values;
 
@@ -319,6 +329,77 @@ struct TestReducers {
       value = value || values(i);
     }
   };
+
+  // get number of teams for TeamPolicy depending on the tested type
+  constexpr static int get_num_teams() {
+    if constexpr (sizeof(Scalar) == 1) {
+      return 126;
+    } else if constexpr (std::is_same_v<Scalar,
+                                        Kokkos::Experimental::bhalf_t>) {
+      return 256;
+    }
+
+    return 1024;
+  }
+
+  static void test_sum_team_policy(int N, SumFunctor f, Scalar reference_sum) {
+#ifdef KOKKOS_ENABLE_OPENACC
+    if constexpr (std::is_same_v<ExecSpace, Kokkos::Experimental::OpenACC> &&
+                  (std::is_same_v<Scalar, size_t> ||
+                   std::is_same_v<Scalar, double>)) {
+      return;  // FIXME_OPENACC
+    }
+#endif
+
+    using member_type = typename Kokkos::TeamPolicy<ExecSpace>::member_type;
+
+    Scalar sum_scalar;
+    Kokkos::View<Scalar, ExecSpace> sum_view("result");
+    Kokkos::deep_copy(sum_view, Scalar(1));
+
+    constexpr int num_teams = get_num_teams();
+    TeamSumFunctor tf;
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+    auto team_pol = Kokkos::TeamPolicy<ExecSpace>(num_teams, Kokkos::AUTO);
+#else
+    auto team_pol = Kokkos::TeamPolicy<ExecSpace>(num_teams, 1);
+#endif
+    Kokkos::parallel_reduce(team_pol, tf, sum_view);
+    Kokkos::deep_copy(sum_scalar, sum_view);
+    ASSERT_EQ(sum_scalar, Scalar{num_teams}) << "num_teams: " << num_teams;
+
+    Kokkos::parallel_for(
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+        Kokkos::TeamPolicy<ExecSpace>(1, Kokkos::AUTO),
+#else
+        Kokkos::TeamPolicy<ExecSpace>(1, 1),
+#endif
+        KOKKOS_LAMBDA(member_type team_member) {
+          Scalar local_scalar;
+          Kokkos::Sum<Scalar, typename ExecSpace::memory_space> reducer_scalar(
+              local_scalar);
+          Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team_member, 0), f,
+                                  reducer_scalar);
+          sum_view() = local_scalar;
+        });
+    Kokkos::deep_copy(sum_scalar, sum_view);
+    ASSERT_EQ(sum_scalar, Scalar{0}) << "N: " << N;
+
+    auto team_size = std::min(128, TEST_EXECSPACE().concurrency());
+    Kokkos::parallel_for(
+        Kokkos::TeamPolicy<ExecSpace>(10, team_size),
+        KOKKOS_LAMBDA(member_type team_member) {
+          Scalar local_scalar;
+          Kokkos::Sum<Scalar, typename ExecSpace::memory_space> reducer_scalar(
+              local_scalar);
+          Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team_member, N), f,
+                                  reducer_scalar);
+          sum_view() = local_scalar;
+        });
+    Kokkos::deep_copy(sum_scalar, sum_view);
+    ASSERT_EQ(sum_scalar, reference_sum) << "N: " << N;
+  }
+
   static void test_sum(int N) {
     Kokkos::View<Scalar*, ExecSpace> values("Values", N);
     auto h_values        = Kokkos::create_mirror_view(values);
@@ -374,6 +455,8 @@ struct TestReducers {
       ASSERT_EQ(sum_scalar_view, reference_sum) << "N: " << N;
     }
 
+    test_sum_team_policy(N, f, reference_sum);
+
     {
       Kokkos::View<Scalar, Kokkos::HostSpace> sum_view("View");
       sum_view() = Scalar(1);

diff --git a/core/unit_test/TestReducers_d.hpp b/core/unit_test/TestReducers_d.hpp
@@ -80,6 +80,20 @@ TEST(TEST_CATEGORY, reducers_int8_t) {
   TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(4);
 }
 
+TEST(TEST_CATEGORY, reducers_int16_t) {
+  using ThisTestType = int16_t;
+
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(1);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(2);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(3);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(4);
+
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(1);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(2);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(3);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(4);
+}
+
 #if !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_OPENMPTARGET)
 // TODO - resolve: "Kokkos_HIP_Vectorization.hpp:80:15: error: call to
 //                 implicitly-deleted default constructor of 'conv_type'