diff --git a/arcane/src/arcane/accelerator/CommonCudaHipReduceImpl.h b/arcane/src/arcane/accelerator/CommonCudaHipReduceImpl.h index db45f03a11..6095c5a978 100644 --- a/arcane/src/arcane/accelerator/CommonCudaHipReduceImpl.h +++ b/arcane/src/arcane/accelerator/CommonCudaHipReduceImpl.h @@ -1,11 +1,11 @@ // -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*- //----------------------------------------------------------------------------- -// Copyright 2000-2024 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com) +// Copyright 2000-2025 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com) // See the top-level COPYRIGHT file for details. // SPDX-License-Identifier: Apache-2.0 //----------------------------------------------------------------------------- /*---------------------------------------------------------------------------*/ -/* CommonCudaHipReduceImpl.h (C) 2000-2024 */ +/* CommonCudaHipReduceImpl.h (C) 2000-2025 */ /* */ /* Implémentation CUDA et HIP des réductions. */ /*---------------------------------------------------------------------------*/ @@ -33,7 +33,7 @@ namespace Arcane::Accelerator::impl __device__ __forceinline__ unsigned int getThreadId() { int threadId = threadIdx.x + blockDim.x * threadIdx.y + - (blockDim.x * blockDim.y) * threadIdx.z; + (blockDim.x * blockDim.y) * threadIdx.z; return threadId; } @@ -43,13 +43,7 @@ __device__ __forceinline__ unsigned int getBlockId() return blockId; } -#if defined(__HIP__) -constexpr const Int32 WARP_SIZE = warpSize; -#else -constexpr const Int32 WARP_SIZE = 32; -#endif constexpr const Int32 MAX_BLOCK_SIZE = 1024; -constexpr const Int32 MAX_WARPS = MAX_BLOCK_SIZE / WARP_SIZE; template class SimpleReduceOperator; @@ -154,9 +148,11 @@ ARCCORE_DEVICE inline Int64 shfl_sync(Int64 var, int laneMask) /*---------------------------------------------------------------------------*/ // Cette implémentation est celle de RAJA //! reduce values in block into thread 0 -template +template ARCCORE_DEVICE inline T block_reduce(T val, T identity) { + constexpr Int32 WARP_SIZE = WarpSize; + constexpr const Int32 MAX_WARPS = MAX_BLOCK_SIZE / WARP_SIZE; int numThreads = blockDim.x * blockDim.y * blockDim.z; int threadId = getThreadId(); @@ -173,8 +169,8 @@ ARCCORE_DEVICE inline T block_reduce(T val, T identity) T rhs = impl::shfl_xor_sync(temp, i); ReduceOperator::apply(temp, rhs); } - - } else { + } + else { // reduce each warp for (int i = 1; i < WARP_SIZE; i *= 2) { @@ -206,7 +202,8 @@ ARCCORE_DEVICE inline T block_reduce(T val, T identity) // read per warp values if (warpId * WARP_SIZE < numThreads) { temp = sd[warpId]; - } else { + } + else { temp = identity; } for (int i = 1; i < WARP_SIZE; i *= 2) { @@ -224,21 +221,21 @@ ARCCORE_DEVICE inline T block_reduce(T val, T identity) /*---------------------------------------------------------------------------*/ //! reduce values in grid into thread 0 of last running block // returns true if put reduced value in val -template +template ARCCORE_DEVICE inline bool -grid_reduce(T& val,T identity,SmallSpan device_mem,unsigned int* device_count) +grid_reduce(T& val, T identity, SmallSpan device_mem, unsigned int* device_count) { int numBlocks = gridDim.x * gridDim.y * gridDim.z; int numThreads = blockDim.x * blockDim.y * blockDim.z; int wrap_around = numBlocks - 1; int blockId = blockIdx.x + gridDim.x * blockIdx.y + - (gridDim.x * gridDim.y) * blockIdx.z; + (gridDim.x * gridDim.y) * blockIdx.z; int threadId = threadIdx.x + blockDim.x * threadIdx.y + - (blockDim.x * blockDim.y) * threadIdx.z; + (blockDim.x * blockDim.y) * threadIdx.z; - T temp = block_reduce(val, identity); + T temp = block_reduce(val, identity); // one thread per block writes to device_mem bool lastBlock = false; @@ -266,7 +263,7 @@ grid_reduce(T& val,T identity,SmallSpan device_mem,unsigned int* device_count ReduceOperator::apply(temp, device_mem[i]); } - temp = block_reduce(temp, identity); + temp = block_reduce(temp, identity); // one thread returns value if (threadId == 0) { @@ -280,9 +277,8 @@ grid_reduce(T& val,T identity,SmallSpan device_mem,unsigned int* device_count /*---------------------------------------------------------------------------*/ /*---------------------------------------------------------------------------*/ -template -ARCANE_INLINE_REDUCE ARCCORE_DEVICE -void _applyDeviceGeneric(const ReduceDeviceInfo& dev_info) +template +ARCANE_INLINE_REDUCE ARCCORE_DEVICE void _applyDeviceGeneric(const ReduceDeviceInfo& dev_info) { SmallSpan grid_buffer = dev_info.m_grid_buffer; DataType identity = dev_info.m_identity; @@ -290,24 +286,61 @@ void _applyDeviceGeneric(const ReduceDeviceInfo& dev_info) DataType* ptr = dev_info.m_device_final_ptr; DataType v = dev_info.m_current_value; bool do_grid_reduce = dev_info.m_use_grid_reduce; +#if HIP_VERSION_MAJOR >= 7 + // A partir de ROCM 7, il n'est pas possible de savoir à la compilation + // la taille d'un warp. C'est 32 ou 64. Pour contourner ce problème, + // on utilise deux instantiations de la reduction et on choisit + // dynamiquement. C'est probablement un peu moins performant qu'avec + // l'ancien mécanisme. Une autre solution serait de choisir à la + // compilation la taille d'un warp. Cela est possible sur les architectures + // HPC comme les MI300 car cette valeur est fixe. Mais sur les architectures + // RDNA les deux valeurs sont possibles. + const Int32 warp_size = dev_info.m_warp_size; +#else +#if defined(__HIP__) + constexpr const Int32 WARP_SIZE = warpSize; +#else + constexpr const Int32 WARP_SIZE = 32; +#endif +#endif //if (impl::getThreadId()==0){ // printf("BLOCK ID=%d %p s=%d ptr=%p %p use_grid_reduce=%d\n", // getBlockId(),grid_buffer.data(),grid_buffer.size(),ptr, // (void*)device_count,(do_grid_reduce)?1:0); //} - if (do_grid_reduce){ - bool is_done = grid_reduce(v,identity,grid_buffer,device_count); - if (is_done){ + if (do_grid_reduce) { +#if HIP_VERSION_MAJOR >= 7 + bool is_done = false; + if (warp_size == 64) + is_done = grid_reduce(v, identity, grid_buffer, device_count); + else if (warp_size == 32) + is_done = grid_reduce(v, identity, grid_buffer, device_count); + else + assert("Bad warp size (should be 32 or 64)"); +#else + bool is_done = grid_reduce(v, identity, grid_buffer, device_count); +#endif + if (is_done) { *ptr = v; - // Il est important de remettre cette à zéro pour la prochaine utilisation d'un Reducer. + // Il est important de remettre cette variable à zéro pour la prochaine utilisation d'un Reducer. (*device_count) = 0; } } - else{ - DataType rv = impl::block_reduce(v,identity); - if (impl::getThreadId()==0){ - AtomicReduceOperator::apply(ptr,rv); + else { +#if HIP_VERSION_MAJOR >= 7 + DataType rv; + if (warp_size == 64) + rv = impl::block_reduce(v, identity); + else if (warp_size == 32) + rv = impl::block_reduce(v, identity); + else + assert("Bad warp size (should be 32 or 64)"); +#else + DataType rv = impl::block_reduce(v, identity); +#endif + if (impl::getThreadId() == 0) { + AtomicReduceOperator::apply(ptr, rv); } } } @@ -315,8 +348,7 @@ void _applyDeviceGeneric(const ReduceDeviceInfo& dev_info) /*---------------------------------------------------------------------------*/ /*---------------------------------------------------------------------------*/ -template ARCANE_INLINE_REDUCE ARCCORE_DEVICE -void ReduceFunctorSum:: +template ARCANE_INLINE_REDUCE ARCCORE_DEVICE void ReduceFunctorSum:: _applyDevice(const ReduceDeviceInfo& dev_info) { using ReduceOperator = impl::SimpleReduceOperator; @@ -327,20 +359,18 @@ _applyDevice(const ReduceDeviceInfo& dev_info) /*---------------------------------------------------------------------------*/ /*---------------------------------------------------------------------------*/ -template ARCANE_INLINE_REDUCE ARCCORE_DEVICE -void ReduceFunctorMax:: +template ARCANE_INLINE_REDUCE ARCCORE_DEVICE void ReduceFunctorMax:: _applyDevice(const ReduceDeviceInfo& dev_info) { - using ReduceOperator = impl::SimpleReduceOperator; - using AtomicReduceOperator = impl::CommonCudaHipAtomic; - _applyDeviceGeneric(dev_info); + using ReduceOperator = impl::SimpleReduceOperator; + using AtomicReduceOperator = impl::CommonCudaHipAtomic; + _applyDeviceGeneric(dev_info); } /*---------------------------------------------------------------------------*/ /*---------------------------------------------------------------------------*/ -template ARCANE_INLINE_REDUCE ARCCORE_DEVICE -void ReduceFunctorMin:: +template ARCANE_INLINE_REDUCE ARCCORE_DEVICE void ReduceFunctorMin:: _applyDevice(const ReduceDeviceInfo& dev_info) { using ReduceOperator = impl::SimpleReduceOperator; diff --git a/arcane/src/arcane/accelerator/Reduce.h b/arcane/src/arcane/accelerator/Reduce.h index 6df8e698a7..36dec62004 100644 --- a/arcane/src/arcane/accelerator/Reduce.h +++ b/arcane/src/arcane/accelerator/Reduce.h @@ -110,6 +110,9 @@ class ReduceDeviceInfo //! Indique si on utilise la réduction par grille (sinon on utilise les atomiques) bool m_use_grid_reduce = true; + + //! Taille d'un warp + Int32 m_warp_size = 0; }; /*---------------------------------------------------------------------------*/ @@ -440,6 +443,7 @@ class HostDeviceReducerBase dvi.m_current_value = m_local_value; dvi.m_identity = m_identity; dvi.m_use_grid_reduce = m_grid_memory_info.m_reduce_policy != eDeviceReducePolicy::Atomic; + dvi.m_warp_size = m_grid_memory_info.m_warp_size; ReduceFunctor::applyDevice(dvi); //grid_buffer,m_grid_device_count,m_host_or_device_memory_for_reduced_value,m_local_value,m_identity); #else // printf("Destroy host parent_value=%p this=%p\n",(void*)m_parent_value,(void*)this); diff --git a/arcane/src/arcane/accelerator/core/DeviceInfo.h b/arcane/src/arcane/accelerator/core/DeviceInfo.h index 6d71d66b34..51263fdd32 100644 --- a/arcane/src/arcane/accelerator/core/DeviceInfo.h +++ b/arcane/src/arcane/accelerator/core/DeviceInfo.h @@ -1,11 +1,11 @@ // -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*- //----------------------------------------------------------------------------- -// Copyright 2000-2022 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com) +// Copyright 2000-2025 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com) // See the top-level COPYRIGHT file for details. // SPDX-License-Identifier: Apache-2.0 //----------------------------------------------------------------------------- /*---------------------------------------------------------------------------*/ -/* DeviceInfo.h (C) 2000-2022 */ +/* DeviceInfo.h (C) 2000-2025 */ /* */ /* Information sur un device. */ /*---------------------------------------------------------------------------*/ @@ -44,12 +44,16 @@ class ARCANE_ACCELERATOR_CORE_EXPORT DeviceInfo //! Description du device. String description() const { return m_description; } + //! Taille d'un warp + Int32 warpSize() const { return m_warp_size; } + public: void setDeviceId(DeviceId id) { m_device_id = id; } void setUUIDAsString(const String& v) { m_uuid_as_string = v; } void setDescription(const String& v) { m_description = v; } void setName(const String& v) { m_name = v; } + void setWarpSize(Int32 v) { m_warp_size = v; } private: @@ -57,6 +61,7 @@ class ARCANE_ACCELERATOR_CORE_EXPORT DeviceInfo String m_name; String m_uuid_as_string; String m_description; + Int32 m_warp_size = 0; }; /*---------------------------------------------------------------------------*/ diff --git a/arcane/src/arcane/accelerator/core/IReduceMemoryImpl.h b/arcane/src/arcane/accelerator/core/IReduceMemoryImpl.h index 8e75ed8893..0a05fc040a 100644 --- a/arcane/src/arcane/accelerator/core/IReduceMemoryImpl.h +++ b/arcane/src/arcane/accelerator/core/IReduceMemoryImpl.h @@ -1,16 +1,16 @@ // -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*- //----------------------------------------------------------------------------- -// Copyright 2000-2023 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com) +// Copyright 2000-2025 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com) // See the top-level COPYRIGHT file for details. // SPDX-License-Identifier: Apache-2.0 //----------------------------------------------------------------------------- /*---------------------------------------------------------------------------*/ -/* IReduceMemoryImpl.h (C) 2000-2023 */ +/* IReduceMemoryImpl.h (C) 2000-2025 */ /* */ /* Interface de la gestion mémoire pour les réductions. */ /*---------------------------------------------------------------------------*/ -#ifndef ARCANE_ACCELERATOR_IREDUCEMEMORYIMPL_H -#define ARCANE_ACCELERATOR_IREDUCEMEMORYIMPL_H +#ifndef ARCANE_ACCELERATOR_CORE_IREDUCEMEMORYIMPL_H +#define ARCANE_ACCELERATOR_CORE_IREDUCEMEMORYIMPL_H /*---------------------------------------------------------------------------*/ /*---------------------------------------------------------------------------*/ @@ -18,8 +18,6 @@ #include "arcane/utils/MemoryView.h" -#include - /*---------------------------------------------------------------------------*/ /*---------------------------------------------------------------------------*/ @@ -40,8 +38,6 @@ class ARCANE_ACCELERATOR_CORE_EXPORT IReduceMemoryImpl //! Informations mémoire pour la réduction sur les accélérateurs struct GridMemoryInfo { - public: - //! Mémoire allouée pour la réduction sur une grille (de taille nb_bloc * sizeof(T)) MutableMemoryView m_grid_memory_values; //! Entier utilisé pour compter le nombre de blocs ayant déjà fait leur partie de la réduction @@ -50,6 +46,8 @@ class ARCANE_ACCELERATOR_CORE_EXPORT IReduceMemoryImpl eDeviceReducePolicy m_reduce_policy = eDeviceReducePolicy::Grid; //! Pointeur vers la mémoire sur l'hôte contenant la valeur réduite. void* m_host_memory_for_reduced_value = nullptr; + //! Taille d'un warp + Int32 m_warp_size = 64; }; public: diff --git a/arcane/src/arcane/accelerator/core/ReduceMemoryImpl.cc b/arcane/src/arcane/accelerator/core/ReduceMemoryImpl.cc index 2058d282ed..9606565650 100644 --- a/arcane/src/arcane/accelerator/core/ReduceMemoryImpl.cc +++ b/arcane/src/arcane/accelerator/core/ReduceMemoryImpl.cc @@ -1,11 +1,11 @@ // -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*- //----------------------------------------------------------------------------- -// Copyright 2000-2024 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com) +// Copyright 2000-2025 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com) // See the top-level COPYRIGHT file for details. // SPDX-License-Identifier: Apache-2.0 //----------------------------------------------------------------------------- /*---------------------------------------------------------------------------*/ -/* ReduceMemoryImpl.cc (C) 2000-2024 */ +/* ReduceMemoryImpl.cc (C) 2000-2025 */ /* */ /* Gestion de la mémoire pour les réductions. */ /*---------------------------------------------------------------------------*/ @@ -49,6 +49,7 @@ ReduceMemoryImpl(RunCommandImpl* p) { _allocateMemoryForReduceData(128); _allocateMemoryForGridDeviceCount(); + m_grid_memory_info.m_warp_size = p->runner()->deviceInfo().warpSize(); } /*---------------------------------------------------------------------------*/ diff --git a/arcane/src/arcane/accelerator/core/Runner.cc b/arcane/src/arcane/accelerator/core/Runner.cc index e2ae0a6b25..f26163a5c1 100644 --- a/arcane/src/arcane/accelerator/core/Runner.cc +++ b/arcane/src/arcane/accelerator/core/Runner.cc @@ -112,6 +112,7 @@ initialize(Runner* runner, eExecutionPolicy v, DeviceId device) m_execution_policy = v; m_device_id = device; m_runtime = _getRuntime(v); + m_device_info = m_runtime->deviceInfoList()->deviceInfo(m_device_id.asInt32()); m_is_init = true; m_is_auto_prefetch_command = false; diff --git a/arcane/src/arcane/accelerator/core/internal/RunnerImpl.h b/arcane/src/arcane/accelerator/core/internal/RunnerImpl.h index e49eb909a3..2a660bf14c 100644 --- a/arcane/src/arcane/accelerator/core/internal/RunnerImpl.h +++ b/arcane/src/arcane/accelerator/core/internal/RunnerImpl.h @@ -16,7 +16,7 @@ #include "arcane/accelerator/core/AcceleratorCoreGlobal.h" -#include "arcane/accelerator/core/DeviceId.h" +#include "arcane/accelerator/core/DeviceInfo.h" #include "arcane/accelerator/core/internal/RunnerInternal.h" #include @@ -125,6 +125,7 @@ class RunnerImpl bool isInit() const { return m_is_init; } eDeviceReducePolicy reducePolicy() const { return m_reduce_policy; } DeviceId deviceId() const { return m_device_id; } + const DeviceInfo& deviceInfo() const { return m_device_info; } public: @@ -141,6 +142,7 @@ class RunnerImpl bool m_is_init = false; const eDeviceReducePolicy m_reduce_policy = eDeviceReducePolicy::Grid; DeviceId m_device_id; + DeviceInfo m_device_info; impl::IRunnerRuntime* m_runtime = nullptr; RunQueueImplStack m_run_queue_pool; std::mutex m_pool_mutex; diff --git a/arcane/src/arcane/accelerator/cuda/runtime/CudaAcceleratorRuntime.cc b/arcane/src/arcane/accelerator/cuda/runtime/CudaAcceleratorRuntime.cc index 946ebf0e69..e78405039b 100644 --- a/arcane/src/arcane/accelerator/cuda/runtime/CudaAcceleratorRuntime.cc +++ b/arcane/src/arcane/accelerator/cuda/runtime/CudaAcceleratorRuntime.cc @@ -603,6 +603,7 @@ fillDevices(bool is_verbose) device_info.setDescription(description); device_info.setDeviceId(DeviceId(i)); device_info.setName(dp.name); + device_info.setWarpSize(dp.warpSize); m_device_info_list.addDevice(device_info); } diff --git a/arcane/src/arcane/accelerator/hip/runtime/HipAcceleratorRuntime.cc b/arcane/src/arcane/accelerator/hip/runtime/HipAcceleratorRuntime.cc index 6c6de44d9c..8716fb1ba5 100644 --- a/arcane/src/arcane/accelerator/hip/runtime/HipAcceleratorRuntime.cc +++ b/arcane/src/arcane/accelerator/hip/runtime/HipAcceleratorRuntime.cc @@ -401,7 +401,24 @@ fillDevices(bool is_verbose) int has_managed_memory = 0; ARCANE_CHECK_HIP(hipDeviceGetAttribute(&has_managed_memory, hipDeviceAttributeManagedMemory, i)); + // Le format des versions dans HIP est: + // HIP_VERSION = (HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + HIP_VERSION_PATCH) + + int runtime_version = 0; + ARCANE_CHECK_HIP(hipRuntimeGetVersion(&runtime_version)); + //runtime_version /= 10000; + int runtime_major = runtime_version / 10000000; + int runtime_minor = (runtime_version / 100000) % 100; + + int driver_version = 0; + ARCANE_CHECK_HIP(hipDriverGetVersion(&driver_version)); + //driver_version /= 10000; + int driver_major = driver_version / 10000000; + int driver_minor = (driver_version / 100000) % 100; + o << "\nDevice " << i << " name=" << dp.name << "\n"; + o << " Driver version = " << driver_major << "." << (driver_minor) << "." << (driver_version % 100000) << "\n"; + o << " Runtime version = " << runtime_major << "." << (runtime_minor) << "." << (runtime_version % 100000) << "\n"; o << " computeCapability = " << dp.major << "." << dp.minor << "\n"; o << " totalGlobalMem = " << dp.totalGlobalMem << "\n"; o << " sharedMemPerBlock = " << dp.sharedMemPerBlock << "\n"; @@ -450,6 +467,7 @@ fillDevices(bool is_verbose) device_info.setDescription(description); device_info.setDeviceId(DeviceId(i)); device_info.setName(dp.name); + device_info.setWarpSize(dp.warpSize); m_device_info_list.addDevice(device_info); } }