Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 69 additions & 39 deletions arcane/src/arcane/accelerator/CommonCudaHipReduceImpl.h
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
//-----------------------------------------------------------------------------
// Copyright 2000-2024 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
// Copyright 2000-2025 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
// See the top-level COPYRIGHT file for details.
// SPDX-License-Identifier: Apache-2.0
//-----------------------------------------------------------------------------
/*---------------------------------------------------------------------------*/
/* CommonCudaHipReduceImpl.h (C) 2000-2024 */
/* CommonCudaHipReduceImpl.h (C) 2000-2025 */
/* */
/* Implémentation CUDA et HIP des réductions. */
/*---------------------------------------------------------------------------*/
Expand Down Expand Up @@ -33,7 +33,7 @@ namespace Arcane::Accelerator::impl
__device__ __forceinline__ unsigned int getThreadId()
{
int threadId = threadIdx.x + blockDim.x * threadIdx.y +
(blockDim.x * blockDim.y) * threadIdx.z;
(blockDim.x * blockDim.y) * threadIdx.z;
return threadId;
}

Expand All @@ -43,13 +43,7 @@ __device__ __forceinline__ unsigned int getBlockId()
return blockId;
}

#if defined(__HIP__)
constexpr const Int32 WARP_SIZE = warpSize;
#else
constexpr const Int32 WARP_SIZE = 32;
#endif
constexpr const Int32 MAX_BLOCK_SIZE = 1024;
constexpr const Int32 MAX_WARPS = MAX_BLOCK_SIZE / WARP_SIZE;

template <typename T, enum eAtomicOperation>
class SimpleReduceOperator;
Expand Down Expand Up @@ -154,9 +148,11 @@ ARCCORE_DEVICE inline Int64 shfl_sync(Int64 var, int laneMask)
/*---------------------------------------------------------------------------*/
// Cette implémentation est celle de RAJA
//! reduce values in block into thread 0
template <typename ReduceOperator,typename T>
template <typename ReduceOperator, Int32 WarpSize, typename T>
ARCCORE_DEVICE inline T block_reduce(T val, T identity)
{
constexpr Int32 WARP_SIZE = WarpSize;
constexpr const Int32 MAX_WARPS = MAX_BLOCK_SIZE / WARP_SIZE;
int numThreads = blockDim.x * blockDim.y * blockDim.z;

int threadId = getThreadId();
Expand All @@ -173,8 +169,8 @@ ARCCORE_DEVICE inline T block_reduce(T val, T identity)
T rhs = impl::shfl_xor_sync(temp, i);
ReduceOperator::apply(temp, rhs);
}

} else {
}
else {

// reduce each warp
for (int i = 1; i < WARP_SIZE; i *= 2) {
Expand Down Expand Up @@ -206,7 +202,8 @@ ARCCORE_DEVICE inline T block_reduce(T val, T identity)
// read per warp values
if (warpId * WARP_SIZE < numThreads) {
temp = sd[warpId];
} else {
}
else {
temp = identity;
}
for (int i = 1; i < WARP_SIZE; i *= 2) {
Expand All @@ -224,21 +221,21 @@ ARCCORE_DEVICE inline T block_reduce(T val, T identity)
/*---------------------------------------------------------------------------*/
//! reduce values in grid into thread 0 of last running block
// returns true if put reduced value in val
template <typename ReduceOperator, typename T>
template <typename ReduceOperator, Int32 WarpSize, typename T>
ARCCORE_DEVICE inline bool
grid_reduce(T& val,T identity,SmallSpan<T> device_mem,unsigned int* device_count)
grid_reduce(T& val, T identity, SmallSpan<T> device_mem, unsigned int* device_count)
{
int numBlocks = gridDim.x * gridDim.y * gridDim.z;
int numThreads = blockDim.x * blockDim.y * blockDim.z;
int wrap_around = numBlocks - 1;

int blockId = blockIdx.x + gridDim.x * blockIdx.y +
(gridDim.x * gridDim.y) * blockIdx.z;
(gridDim.x * gridDim.y) * blockIdx.z;

int threadId = threadIdx.x + blockDim.x * threadIdx.y +
(blockDim.x * blockDim.y) * threadIdx.z;
(blockDim.x * blockDim.y) * threadIdx.z;

T temp = block_reduce<ReduceOperator>(val, identity);
T temp = block_reduce<ReduceOperator, WarpSize>(val, identity);

// one thread per block writes to device_mem
bool lastBlock = false;
Expand Down Expand Up @@ -266,7 +263,7 @@ grid_reduce(T& val,T identity,SmallSpan<T> device_mem,unsigned int* device_count
ReduceOperator::apply(temp, device_mem[i]);
}

temp = block_reduce<ReduceOperator>(temp, identity);
temp = block_reduce<ReduceOperator, WarpSize>(temp, identity);

// one thread returns value
if (threadId == 0) {
Expand All @@ -280,43 +277,78 @@ grid_reduce(T& val,T identity,SmallSpan<T> device_mem,unsigned int* device_count
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

template<typename DataType,typename ReduceOperator,typename AtomicReduceOperator>
ARCANE_INLINE_REDUCE ARCCORE_DEVICE
void _applyDeviceGeneric(const ReduceDeviceInfo<DataType>& dev_info)
template <typename DataType, typename ReduceOperator, typename AtomicReduceOperator>
ARCANE_INLINE_REDUCE ARCCORE_DEVICE void _applyDeviceGeneric(const ReduceDeviceInfo<DataType>& dev_info)
{
SmallSpan<DataType> grid_buffer = dev_info.m_grid_buffer;
DataType identity = dev_info.m_identity;
unsigned int* device_count = dev_info.m_device_count;
DataType* ptr = dev_info.m_device_final_ptr;
DataType v = dev_info.m_current_value;
bool do_grid_reduce = dev_info.m_use_grid_reduce;
#if HIP_VERSION_MAJOR >= 7
// A partir de ROCM 7, il n'est pas possible de savoir à la compilation
// la taille d'un warp. C'est 32 ou 64. Pour contourner ce problème,
// on utilise deux instantiations de la reduction et on choisit
// dynamiquement. C'est probablement un peu moins performant qu'avec
// l'ancien mécanisme. Une autre solution serait de choisir à la
// compilation la taille d'un warp. Cela est possible sur les architectures
// HPC comme les MI300 car cette valeur est fixe. Mais sur les architectures
// RDNA les deux valeurs sont possibles.
const Int32 warp_size = dev_info.m_warp_size;
#else
#if defined(__HIP__)
constexpr const Int32 WARP_SIZE = warpSize;
#else
constexpr const Int32 WARP_SIZE = 32;
#endif
#endif

//if (impl::getThreadId()==0){
// printf("BLOCK ID=%d %p s=%d ptr=%p %p use_grid_reduce=%d\n",
// getBlockId(),grid_buffer.data(),grid_buffer.size(),ptr,
// (void*)device_count,(do_grid_reduce)?1:0);
//}
if (do_grid_reduce){
bool is_done = grid_reduce<ReduceOperator>(v,identity,grid_buffer,device_count);
if (is_done){
if (do_grid_reduce) {
#if HIP_VERSION_MAJOR >= 7
bool is_done = false;
if (warp_size == 64)
is_done = grid_reduce<ReduceOperator, 64>(v, identity, grid_buffer, device_count);
else if (warp_size == 32)
is_done = grid_reduce<ReduceOperator, 32>(v, identity, grid_buffer, device_count);
else
assert("Bad warp size (should be 32 or 64)");
#else
bool is_done = grid_reduce<ReduceOperator, WARP_SIZE>(v, identity, grid_buffer, device_count);
#endif
if (is_done) {
*ptr = v;
// Il est important de remettre cette à zéro pour la prochaine utilisation d'un Reducer.
// Il est important de remettre cette variable à zéro pour la prochaine utilisation d'un Reducer.
(*device_count) = 0;
}
}
else{
DataType rv = impl::block_reduce<ReduceOperator>(v,identity);
if (impl::getThreadId()==0){
AtomicReduceOperator::apply(ptr,rv);
else {
#if HIP_VERSION_MAJOR >= 7
DataType rv;
if (warp_size == 64)
rv = impl::block_reduce<ReduceOperator, 64>(v, identity);
else if (warp_size == 32)
rv = impl::block_reduce<ReduceOperator, 32>(v, identity);
else
assert("Bad warp size (should be 32 or 64)");
#else
DataType rv = impl::block_reduce<ReduceOperator, WARP_SIZE>(v, identity);
#endif
if (impl::getThreadId() == 0) {
AtomicReduceOperator::apply(ptr, rv);
}
}
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

template <typename DataType> ARCANE_INLINE_REDUCE ARCCORE_DEVICE
void ReduceFunctorSum<DataType>::
template <typename DataType> ARCANE_INLINE_REDUCE ARCCORE_DEVICE void ReduceFunctorSum<DataType>::
_applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
{
using ReduceOperator = impl::SimpleReduceOperator<DataType, eAtomicOperation::Add>;
Expand All @@ -327,20 +359,18 @@ _applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

template<typename DataType> ARCANE_INLINE_REDUCE ARCCORE_DEVICE
void ReduceFunctorMax<DataType>::
template <typename DataType> ARCANE_INLINE_REDUCE ARCCORE_DEVICE void ReduceFunctorMax<DataType>::
_applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
{
using ReduceOperator = impl::SimpleReduceOperator<DataType,eAtomicOperation::Max>;
using AtomicReduceOperator = impl::CommonCudaHipAtomic<DataType,eAtomicOperation::Max>;
_applyDeviceGeneric<DataType,ReduceOperator,AtomicReduceOperator>(dev_info);
using ReduceOperator = impl::SimpleReduceOperator<DataType, eAtomicOperation::Max>;
using AtomicReduceOperator = impl::CommonCudaHipAtomic<DataType, eAtomicOperation::Max>;
_applyDeviceGeneric<DataType, ReduceOperator, AtomicReduceOperator>(dev_info);
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

template <typename DataType> ARCANE_INLINE_REDUCE ARCCORE_DEVICE
void ReduceFunctorMin<DataType>::
template <typename DataType> ARCANE_INLINE_REDUCE ARCCORE_DEVICE void ReduceFunctorMin<DataType>::
_applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
{
using ReduceOperator = impl::SimpleReduceOperator<DataType, eAtomicOperation::Min>;
Expand Down
4 changes: 4 additions & 0 deletions arcane/src/arcane/accelerator/Reduce.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,9 @@ class ReduceDeviceInfo

//! Indique si on utilise la réduction par grille (sinon on utilise les atomiques)
bool m_use_grid_reduce = true;

//! Taille d'un warp
Int32 m_warp_size = 0;
};

/*---------------------------------------------------------------------------*/
Expand Down Expand Up @@ -440,6 +443,7 @@ class HostDeviceReducerBase
dvi.m_current_value = m_local_value;
dvi.m_identity = m_identity;
dvi.m_use_grid_reduce = m_grid_memory_info.m_reduce_policy != eDeviceReducePolicy::Atomic;
dvi.m_warp_size = m_grid_memory_info.m_warp_size;
ReduceFunctor::applyDevice(dvi); //grid_buffer,m_grid_device_count,m_host_or_device_memory_for_reduced_value,m_local_value,m_identity);
#else
// printf("Destroy host parent_value=%p this=%p\n",(void*)m_parent_value,(void*)this);
Expand Down
9 changes: 7 additions & 2 deletions arcane/src/arcane/accelerator/core/DeviceInfo.h
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
//-----------------------------------------------------------------------------
// Copyright 2000-2022 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
// Copyright 2000-2025 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
// See the top-level COPYRIGHT file for details.
// SPDX-License-Identifier: Apache-2.0
//-----------------------------------------------------------------------------
/*---------------------------------------------------------------------------*/
/* DeviceInfo.h (C) 2000-2022 */
/* DeviceInfo.h (C) 2000-2025 */
/* */
/* Information sur un device. */
/*---------------------------------------------------------------------------*/
Expand Down Expand Up @@ -44,19 +44,24 @@ class ARCANE_ACCELERATOR_CORE_EXPORT DeviceInfo
//! Description du device.
String description() const { return m_description; }

//! Taille d'un warp
Int32 warpSize() const { return m_warp_size; }

public:

void setDeviceId(DeviceId id) { m_device_id = id; }
void setUUIDAsString(const String& v) { m_uuid_as_string = v; }
void setDescription(const String& v) { m_description = v; }
void setName(const String& v) { m_name = v; }
void setWarpSize(Int32 v) { m_warp_size = v; }

private:

DeviceId m_device_id;
String m_name;
String m_uuid_as_string;
String m_description;
Int32 m_warp_size = 0;
};

/*---------------------------------------------------------------------------*/
Expand Down
14 changes: 6 additions & 8 deletions arcane/src/arcane/accelerator/core/IReduceMemoryImpl.h
Original file line number Diff line number Diff line change
@@ -1,25 +1,23 @@
// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
//-----------------------------------------------------------------------------
// Copyright 2000-2023 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
// Copyright 2000-2025 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
// See the top-level COPYRIGHT file for details.
// SPDX-License-Identifier: Apache-2.0
//-----------------------------------------------------------------------------
/*---------------------------------------------------------------------------*/
/* IReduceMemoryImpl.h (C) 2000-2023 */
/* IReduceMemoryImpl.h (C) 2000-2025 */
/* */
/* Interface de la gestion mémoire pour les réductions. */
/*---------------------------------------------------------------------------*/
#ifndef ARCANE_ACCELERATOR_IREDUCEMEMORYIMPL_H
#define ARCANE_ACCELERATOR_IREDUCEMEMORYIMPL_H
#ifndef ARCANE_ACCELERATOR_CORE_IREDUCEMEMORYIMPL_H
#define ARCANE_ACCELERATOR_CORE_IREDUCEMEMORYIMPL_H
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#include "arcane/accelerator/core/AcceleratorCoreGlobal.h"

#include "arcane/utils/MemoryView.h"

#include <stack>

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

Expand All @@ -40,8 +38,6 @@ class ARCANE_ACCELERATOR_CORE_EXPORT IReduceMemoryImpl
//! Informations mémoire pour la réduction sur les accélérateurs
struct GridMemoryInfo
{
public:

//! Mémoire allouée pour la réduction sur une grille (de taille nb_bloc * sizeof(T))
MutableMemoryView m_grid_memory_values;
//! Entier utilisé pour compter le nombre de blocs ayant déjà fait leur partie de la réduction
Expand All @@ -50,6 +46,8 @@ class ARCANE_ACCELERATOR_CORE_EXPORT IReduceMemoryImpl
eDeviceReducePolicy m_reduce_policy = eDeviceReducePolicy::Grid;
//! Pointeur vers la mémoire sur l'hôte contenant la valeur réduite.
void* m_host_memory_for_reduced_value = nullptr;
//! Taille d'un warp
Int32 m_warp_size = 64;
};

public:
Expand Down
5 changes: 3 additions & 2 deletions arcane/src/arcane/accelerator/core/ReduceMemoryImpl.cc
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
//-----------------------------------------------------------------------------
// Copyright 2000-2024 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
// Copyright 2000-2025 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
// See the top-level COPYRIGHT file for details.
// SPDX-License-Identifier: Apache-2.0
//-----------------------------------------------------------------------------
/*---------------------------------------------------------------------------*/
/* ReduceMemoryImpl.cc (C) 2000-2024 */
/* ReduceMemoryImpl.cc (C) 2000-2025 */
/* */
/* Gestion de la mémoire pour les réductions. */
/*---------------------------------------------------------------------------*/
Expand Down Expand Up @@ -49,6 +49,7 @@ ReduceMemoryImpl(RunCommandImpl* p)
{
_allocateMemoryForReduceData(128);
_allocateMemoryForGridDeviceCount();
m_grid_memory_info.m_warp_size = p->runner()->deviceInfo().warpSize();
}

/*---------------------------------------------------------------------------*/
Expand Down
1 change: 1 addition & 0 deletions arcane/src/arcane/accelerator/core/Runner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ initialize(Runner* runner, eExecutionPolicy v, DeviceId device)
m_execution_policy = v;
m_device_id = device;
m_runtime = _getRuntime(v);
m_device_info = m_runtime->deviceInfoList()->deviceInfo(m_device_id.asInt32());
m_is_init = true;
m_is_auto_prefetch_command = false;

Expand Down
4 changes: 3 additions & 1 deletion arcane/src/arcane/accelerator/core/internal/RunnerImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

#include "arcane/accelerator/core/AcceleratorCoreGlobal.h"

#include "arcane/accelerator/core/DeviceId.h"
#include "arcane/accelerator/core/DeviceInfo.h"
#include "arcane/accelerator/core/internal/RunnerInternal.h"

#include <stack>
Expand Down Expand Up @@ -125,6 +125,7 @@ class RunnerImpl
bool isInit() const { return m_is_init; }
eDeviceReducePolicy reducePolicy() const { return m_reduce_policy; }
DeviceId deviceId() const { return m_device_id; }
const DeviceInfo& deviceInfo() const { return m_device_info; }

public:

Expand All @@ -141,6 +142,7 @@ class RunnerImpl
bool m_is_init = false;
const eDeviceReducePolicy m_reduce_policy = eDeviceReducePolicy::Grid;
DeviceId m_device_id;
DeviceInfo m_device_info;
impl::IRunnerRuntime* m_runtime = nullptr;
RunQueueImplStack m_run_queue_pool;
std::mutex m_pool_mutex;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -603,6 +603,7 @@ fillDevices(bool is_verbose)
device_info.setDescription(description);
device_info.setDeviceId(DeviceId(i));
device_info.setName(dp.name);
device_info.setWarpSize(dp.warpSize);
m_device_info_list.addDevice(device_info);
}

Expand Down
Loading
Loading