Skip to content

Commit

Permalink
kokkos#5333: CUDA: Use scratch space appropriate to small reduction e…
Browse files Browse the repository at this point in the history
…lements in Team reductions (kokkos#5334)

* kokkos#5333: Add maybe failing test case?

* Revise test to try to target the right code

* NOMERGE kokkos#5333: Partial implementation of fix

* kokkos#5333: Update tests

* kokkos#5333: Fix comparision types in reducers test

* kokkos#5333: Re-enable tests

* kokkos#5333: Add test for Scalar with size of 1

* Fix formatting

* Remove unrelated changes

* Extract TeamPolicy tests into a separate method

* Disable new tests (OpenACC, OpenMPTarget, Serial, SYCL, point_t)

* Use smaller number of teams for bhalf_t

* Use pointer_type for m_scratch_space parameter

* Skip failing tests (SYCL)

* Fix tests for OpenMPTarget

* Fix num_teams_done variable in SYCL TeamPolicy reduction

* Restore guards for array_reduce<float, 7>

* Also make array_reduce<float, 7> work for SYCL

* tests: change point_t default constructor

---------

Co-authored-by: Arkadiusz Szczepkowicz <arek.szczepkowicz@ng-analytics.com>
Co-authored-by: Cezary Skrzyński <cezary.skrzynski@ng-analytics.com>
Co-authored-by: Daniel Arndt <arndtd@ornl.gov>
  • Loading branch information
4 people committed Jan 3, 2024
1 parent f02539e commit 02b46c0
Show file tree
Hide file tree
Showing 7 changed files with 153 additions and 32 deletions.
57 changes: 40 additions & 17 deletions core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -625,6 +625,22 @@ class ParallelReduce<CombinedFunctorReducerType,

public:
using functor_type = FunctorType;
// Conditionally set word_size_type to int16_t or int8_t if value_type is
// smaller than int32_t (Kokkos::Cuda::size_type)
// word_size_type is used to determine the word count, shared memory buffer
// size, and global memory buffer size before the reduction is performed.
// Within the reduction, the word count is recomputed based on word_size_type
// and when calculating indexes into the shared/global memory buffers for
// performing the reduction, word_size_type is used again.
// For scalars > 4 bytes in size, indexing into shared/global memory relies
// on the block and grid dimensions to ensure that we index at the correct
// offset rather than at every 4 byte word; such that, when the join is
// performed, we have the correct data that was copied over in chunks of 4
// bytes.
using word_size_type = std::conditional_t<
sizeof(value_type) < sizeof(Kokkos::Cuda::size_type),
std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>,
Kokkos::Cuda::size_type>;
using size_type = Cuda::size_type;
using reducer_type = ReducerType;

Expand All @@ -648,9 +664,11 @@ class ParallelReduce<CombinedFunctorReducerType,
const pointer_type m_result_ptr;
const bool m_result_ptr_device_accessible;
const bool m_result_ptr_host_accessible;
size_type* m_scratch_space;
size_type* m_scratch_flags;
size_type* m_unified_space;
word_size_type* m_scratch_space;
// m_scratch_flags must be of type Cuda::size_type due to use of atomics
// for tracking metadata in Kokkos_Cuda_ReduceScan.hpp
Cuda::size_type* m_scratch_flags;
word_size_type* m_unified_space;
size_type m_team_begin;
size_type m_shmem_begin;
size_type m_shmem_size;
Expand Down Expand Up @@ -694,13 +712,14 @@ class ParallelReduce<CombinedFunctorReducerType,
}

__device__ inline void run(SHMEMReductionTag&, const int& threadid) const {
const integral_nonzero_constant<
size_type, ReducerType::static_value_size() / sizeof(size_type)>
const integral_nonzero_constant<word_size_type,
ReducerType::static_value_size() /
sizeof(word_size_type)>
word_count(m_functor_reducer.get_reducer().value_size() /
sizeof(size_type));
sizeof(word_size_type));

reference_type value = m_functor_reducer.get_reducer().init(
kokkos_impl_cuda_shared_memory<size_type>() +
kokkos_impl_cuda_shared_memory<word_size_type>() +
threadIdx.y * word_count.value);

// Iterate this block through the league
Expand All @@ -723,18 +742,19 @@ class ParallelReduce<CombinedFunctorReducerType,
if (!zero_length)
do_final_reduction = cuda_single_inter_block_reduce_scan<false>(
m_functor_reducer.get_reducer(), blockIdx.x, gridDim.x,
kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space,
kokkos_impl_cuda_shared_memory<word_size_type>(), m_scratch_space,
m_scratch_flags);

if (do_final_reduction) {
// This is the final block with the final result at the final threads'
// location

size_type* const shared = kokkos_impl_cuda_shared_memory<size_type>() +
(blockDim.y - 1) * word_count.value;
word_size_type* const shared =
kokkos_impl_cuda_shared_memory<word_size_type>() +
(blockDim.y - 1) * word_count.value;
size_type* const global =
m_result_ptr_device_accessible
? reinterpret_cast<size_type*>(m_result_ptr)
? reinterpret_cast<word_size_type*>(m_result_ptr)
: (m_unified_space ? m_unified_space : m_scratch_space);

if (threadIdx.y == 0) {
Expand Down Expand Up @@ -784,7 +804,8 @@ class ParallelReduce<CombinedFunctorReducerType,
*result = value;
} else if (Impl::cuda_inter_block_reduction(
value, init, m_functor_reducer.get_reducer(),
m_scratch_space, result, m_scratch_flags, blockDim.y)) {
reinterpret_cast<pointer_type>(m_scratch_space), result,
m_scratch_flags, blockDim.y)) {
const unsigned id = threadIdx.y * blockDim.x + threadIdx.x;
if (id == 0) {
m_functor_reducer.get_reducer().final(&value);
Expand All @@ -805,13 +826,15 @@ class ParallelReduce<CombinedFunctorReducerType,
1u, UseShflReduction ? std::min(m_league_size, size_type(1024 * 32))
: std::min(int(m_league_size), m_team_size));

m_scratch_space = cuda_internal_scratch_space(
m_policy.space(),
m_functor_reducer.get_reducer().value_size() * block_count);
m_scratch_space =
reinterpret_cast<word_size_type*>(cuda_internal_scratch_space(
m_policy.space(),
m_functor_reducer.get_reducer().value_size() * block_count));
m_scratch_flags =
cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type));
m_unified_space = cuda_internal_scratch_unified(
m_policy.space(), m_functor_reducer.get_reducer().value_size());
m_unified_space =
reinterpret_cast<word_size_type*>(cuda_internal_scratch_unified(
m_policy.space(), m_functor_reducer.get_reducer().value_size()));

dim3 block(m_vector_size, m_team_size, 1);
dim3 grid(block_count, 1, 1);
Expand Down
6 changes: 3 additions & 3 deletions core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ template <class FunctorType>
__device__ bool cuda_inter_block_reduction(
typename FunctorType::reference_type value,
typename FunctorType::reference_type neutral, const FunctorType& reducer,
Cuda::size_type* const m_scratch_space,
typename FunctorType::pointer_type const m_scratch_space,
typename FunctorType::pointer_type const /*result*/,
Cuda::size_type* const m_scratch_flags,
const int max_active_thread = blockDim.y) {
Expand All @@ -117,7 +117,7 @@ __device__ bool cuda_inter_block_reduction(

// One thread in the block writes block result to global scratch_memory
if (id == 0) {
pointer_type global = ((pointer_type)m_scratch_space) + blockIdx.x;
pointer_type global = m_scratch_space + blockIdx.x;
*global = value;
}

Expand All @@ -140,7 +140,7 @@ __device__ bool cuda_inter_block_reduction(
last_block = true;
value = neutral;

pointer_type const volatile global = (pointer_type)m_scratch_space;
pointer_type const volatile global = m_scratch_space;

// Reduce all global values with splitting work over threads in one warp
const int step_size =
Expand Down
16 changes: 6 additions & 10 deletions core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
const auto league_size = m_league_size;
const size_t scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]};
sycl::device_ptr<char> const global_scratch_ptr = m_global_scratch_ptr;
sycl::local_accessor<unsigned int> num_teams_done(1, cgh);

auto team_reduction_factory =
[&](sycl::local_accessor<value_type, 1> local_mem,
Expand All @@ -186,8 +187,6 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
auto group_id = item.get_group_linear_id();
auto size = n_wgroups * wgroup_size;

auto& num_teams_done = reinterpret_cast<unsigned int&>(
local_mem[wgroup_size * std::max(value_count, 1u)]);
const auto local_id = item.get_local_linear_id();
const CombinedFunctorReducerType& functor_reducer =
functor_reducer_wrapper.get_functor();
Expand Down Expand Up @@ -225,10 +224,10 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
sycl::memory_scope::device,
sycl::access::address_space::global_space>
scratch_flags_ref(*scratch_flags);
num_teams_done = ++scratch_flags_ref;
num_teams_done[0] = ++scratch_flags_ref;
}
sycl::group_barrier(item.get_group());
if (num_teams_done == n_wgroups) {
if (num_teams_done[0] == n_wgroups) {
if (local_id >= n_wgroups)
reducer.init(&local_mem[local_id * value_count]);
else {
Expand Down Expand Up @@ -277,10 +276,10 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
sycl::memory_scope::device,
sycl::access::address_space::global_space>
scratch_flags_ref(*scratch_flags);
num_teams_done = ++scratch_flags_ref;
num_teams_done[0] = ++scratch_flags_ref;
}
item.barrier(sycl::access::fence_space::local_space);
if (num_teams_done == n_wgroups) {
if (num_teams_done[0] == n_wgroups) {
if (local_id >= n_wgroups)
reducer.init(&local_value);
else {
Expand Down Expand Up @@ -324,10 +323,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
auto wgroup_size = m_team_size * final_vector_size;
std::size_t size = std::size_t(m_league_size) * wgroup_size;
sycl::local_accessor<value_type, 1> local_mem(
sycl::range<1>(wgroup_size) * std::max(value_count, 1u) +
(sizeof(unsigned int) + sizeof(value_type) - 1) /
sizeof(value_type),
cgh);
sycl::range<1>(wgroup_size) * std::max(value_count, 1u), cgh);

const auto init_size =
std::max<std::size_t>((size + wgroup_size - 1) / wgroup_size, 1);
Expand Down
7 changes: 6 additions & 1 deletion core/src/SYCL/Kokkos_SYCL_Team.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -140,9 +140,14 @@ class SYCLTeamMember {
}
value = sg.shuffle(value, 0);

const auto n_subgroups = sg.get_group_range()[0];
if (n_subgroups == 1) {
reducer.reference() = value;
return;
}

// We need to chunk up the whole reduction because we might not have
// allocated enough memory.
const auto n_subgroups = sg.get_group_range()[0];
const unsigned int maximum_work_range =
std::min<int>(m_team_reduce_size / sizeof(value_type), n_subgroups);

Expand Down
2 changes: 1 addition & 1 deletion core/unit_test/TestNonTrivialScalarTypes.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ struct point_t {
uint8_t x, y, z;

KOKKOS_FUNCTION
point_t() : x(1), y(1), z(1){};
point_t() : x(0), y(0), z(0){};

KOKKOS_FUNCTION
point_t(const point_t &val) : x(val.x), y(val.y), z(val.z){};
Expand Down
83 changes: 83 additions & 0 deletions core/unit_test/TestReducers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <limits>

#include <Kokkos_Core.hpp>
#include <TestNonTrivialScalarTypes.hpp>

//--------------------------------------------------------------------------

Expand Down Expand Up @@ -46,6 +47,15 @@ struct TestReducers {
void operator()(const int& i, Scalar& value) const { value += values(i); }
};

struct TeamSumFunctor {
using member_type = typename Kokkos::TeamPolicy<ExecSpace>::member_type;

KOKKOS_INLINE_FUNCTION
void operator()(const member_type& m, Scalar& value) const {
if (m.team_rank() == m.team_size() - 1) value += Scalar(1);
}
};

struct ProdFunctor {
Kokkos::View<const Scalar*, ExecSpace> values;

Expand Down Expand Up @@ -319,6 +329,77 @@ struct TestReducers {
value = value || values(i);
}
};

// get number of teams for TeamPolicy depending on the tested type
constexpr static int get_num_teams() {
if constexpr (sizeof(Scalar) == 1) {
return 126;
} else if constexpr (std::is_same_v<Scalar,
Kokkos::Experimental::bhalf_t>) {
return 256;
}

return 1024;
}

static void test_sum_team_policy(int N, SumFunctor f, Scalar reference_sum) {
#ifdef KOKKOS_ENABLE_OPENACC
if constexpr (std::is_same_v<ExecSpace, Kokkos::Experimental::OpenACC> &&
(std::is_same_v<Scalar, size_t> ||
std::is_same_v<Scalar, double>)) {
return; // FIXME_OPENACC
}
#endif

using member_type = typename Kokkos::TeamPolicy<ExecSpace>::member_type;

Scalar sum_scalar;
Kokkos::View<Scalar, ExecSpace> sum_view("result");
Kokkos::deep_copy(sum_view, Scalar(1));

constexpr int num_teams = get_num_teams();
TeamSumFunctor tf;
#ifdef KOKKOS_ENABLE_OPENMPTARGET
auto team_pol = Kokkos::TeamPolicy<ExecSpace>(num_teams, Kokkos::AUTO);
#else
auto team_pol = Kokkos::TeamPolicy<ExecSpace>(num_teams, 1);
#endif
Kokkos::parallel_reduce(team_pol, tf, sum_view);
Kokkos::deep_copy(sum_scalar, sum_view);
ASSERT_EQ(sum_scalar, Scalar{num_teams}) << "num_teams: " << num_teams;

Kokkos::parallel_for(
#ifdef KOKKOS_ENABLE_OPENMPTARGET
Kokkos::TeamPolicy<ExecSpace>(1, Kokkos::AUTO),
#else
Kokkos::TeamPolicy<ExecSpace>(1, 1),
#endif
KOKKOS_LAMBDA(member_type team_member) {
Scalar local_scalar;
Kokkos::Sum<Scalar, typename ExecSpace::memory_space> reducer_scalar(
local_scalar);
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team_member, 0), f,
reducer_scalar);
sum_view() = local_scalar;
});
Kokkos::deep_copy(sum_scalar, sum_view);
ASSERT_EQ(sum_scalar, Scalar{0}) << "N: " << N;

auto team_size = std::min(128, TEST_EXECSPACE().concurrency());
Kokkos::parallel_for(
Kokkos::TeamPolicy<ExecSpace>(10, team_size),
KOKKOS_LAMBDA(member_type team_member) {
Scalar local_scalar;
Kokkos::Sum<Scalar, typename ExecSpace::memory_space> reducer_scalar(
local_scalar);
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team_member, N), f,
reducer_scalar);
sum_view() = local_scalar;
});
Kokkos::deep_copy(sum_scalar, sum_view);
ASSERT_EQ(sum_scalar, reference_sum) << "N: " << N;
}

static void test_sum(int N) {
Kokkos::View<Scalar*, ExecSpace> values("Values", N);
auto h_values = Kokkos::create_mirror_view(values);
Expand Down Expand Up @@ -374,6 +455,8 @@ struct TestReducers {
ASSERT_EQ(sum_scalar_view, reference_sum) << "N: " << N;
}

test_sum_team_policy(N, f, reference_sum);

{
Kokkos::View<Scalar, Kokkos::HostSpace> sum_view("View");
sum_view() = Scalar(1);
Expand Down
14 changes: 14 additions & 0 deletions core/unit_test/TestReducers_d.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,20 @@ TEST(TEST_CATEGORY, reducers_int8_t) {
TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(4);
}

TEST(TEST_CATEGORY, reducers_int16_t) {
using ThisTestType = int16_t;

TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(1);
TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(2);
TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(3);
TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(4);

TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(1);
TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(2);
TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(3);
TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(4);
}

#if !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_OPENMPTARGET)
// TODO - resolve: "Kokkos_HIP_Vectorization.hpp:80:15: error: call to
// implicitly-deleted default constructor of 'conv_type'
Expand Down

0 comments on commit 02b46c0

Please sign in to comment.