From 4b570a7e05fe3456ddbbb3270a244b3f55225363 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 1 Oct 2019 12:20:23 -0500 Subject: [PATCH 01/20] refactor --- ..._v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 16 +++++++-------- ...chw_kcyx_nkhw_padded_lds_double_buffer.hpp | 20 +++++++++---------- ..._v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp | 18 ++++++++--------- ...chw_kcyx_nkhw_padded_lds_double_buffer.hpp | 20 +++++++++---------- .../blockwise_generic_tensor_slice_copy.hpp | 16 +++++++-------- ...e_generic_tensor_slice_copy_deprecated.hpp | 16 +++++++-------- .../threadwise_generic_tensor_slice_copy.hpp | 20 +++++++++---------- ...e_generic_tensor_slice_copy_deprecated.hpp | 8 ++++---- .../include/utility/config_amd.hpp.in | 6 +++--- .../include/utility/config_nvidia.hpp.in | 6 +++--- driver/src/driver.cpp | 6 +++--- 11 files changed, 76 insertions(+), 76 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp index 204b7ab86..ea1412064 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -265,9 +265,9 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer // LDS double buffer: preload data into LDS { - blockwise_in_copy.template Run( - p_in_global, p_in_block_double); - blockwise_wei_copy.template Run( + blockwise_in_copy.template Run(p_in_global, + p_in_block_double); + blockwise_wei_copy.template Run( p_wei_global, p_wei_block_double); } @@ -300,10 +300,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer // LDS doubel buffer: load next data from device mem blockwise_in_copy - .template RunLoadThreadBuffer( + .template RunLoadThreadBuffer( p_in_global, p_in_thread_buffer); blockwise_wei_copy - .template RunLoadThreadBuffer( + .template RunLoadThreadBuffer( p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data @@ -327,9 +327,9 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( + blockwise_in_copy.template RunLoadThreadBuffer( p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( + blockwise_wei_copy.template RunLoadThreadBuffer( p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data @@ -398,7 +398,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer 0, b_thread_data_on_global, 0}) - .template Run( + .template Run( p_out_thread, p_out_global); } } diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp index faf876450..a5a753158 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp @@ -281,9 +281,9 @@ struct 
GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf // LDS double buffer: preload data into LDS { - blockwise_in_copy.template Run( - p_in_global, p_in_block_double); - blockwise_wei_copy.template Run( + blockwise_in_copy.template Run(p_in_global, + p_in_block_double); + blockwise_wei_copy.template Run( p_wei_global, p_wei_block_double); } @@ -316,10 +316,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf // LDS doubel buffer: load next data from device mem blockwise_in_copy - .template RunLoadThreadBuffer( + .template RunLoadThreadBuffer( p_in_global, p_in_thread_buffer); blockwise_wei_copy - .template RunLoadThreadBuffer( + .template RunLoadThreadBuffer( p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data @@ -343,9 +343,9 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( + blockwise_in_copy.template RunLoadThreadBuffer( p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( + blockwise_wei_copy.template RunLoadThreadBuffer( p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data @@ -427,12 +427,12 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf b_thread_data_on_global, 0}) #if 1 - .template Run + .template Run #else // tweaking .template Run_optimized_dst_address_calculation + AddressSpace_t::generic, + AddressSpace_t::global> #endif (p_out_thread, p_out_global); } diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp index bc9a7c8be..55ca61926 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -251,10 +251,10 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer // LDS double buffer: preload data into LDS { - blockwise_in_copy.template Run(p_in_global, - p_in_block_double); - blockwise_wei_copy.template Run(p_wei_global, - p_wei_block_double); + blockwise_in_copy.template Run(p_in_global, + p_in_block_double); + blockwise_wei_copy.template Run(p_wei_global, + p_wei_block_double); } // LDS double buffer: main body @@ -285,9 +285,9 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( + blockwise_in_copy.template RunLoadThreadBuffer( p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( + blockwise_wei_copy.template RunLoadThreadBuffer( p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data @@ -311,9 +311,9 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( + blockwise_in_copy.template RunLoadThreadBuffer( p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( + blockwise_wei_copy.template RunLoadThreadBuffer( p_wei_global, p_wei_thread_buffer); // LDS double 
buffer: GEMM on current data @@ -391,7 +391,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat) { threadwise_out_copy - .template Run( + .template Run( p_out_thread, p_out_global); threadwise_out_copy.MoveSrcSliceWindow(Sequence<0, 0, GemmNPerThreadSubC>{}, True); diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp index bee553f62..d39e11de2 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp @@ -255,9 +255,9 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf // LDS double buffer: preload data into LDS { - blockwise_in_copy.template Run( - p_in_global, p_in_block_double); - blockwise_wei_copy.template Run( + blockwise_in_copy.template Run(p_in_global, + p_in_block_double); + blockwise_wei_copy.template Run( p_wei_global, p_wei_block_double); } @@ -290,10 +290,10 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf // LDS doubel buffer: load next data from device mem blockwise_in_copy - .template RunLoadThreadBuffer( + .template RunLoadThreadBuffer( p_in_global, p_in_thread_buffer); blockwise_wei_copy - .template RunLoadThreadBuffer( + .template RunLoadThreadBuffer( p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data @@ -317,9 +317,9 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( + blockwise_in_copy.template RunLoadThreadBuffer( p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( + blockwise_wei_copy.template RunLoadThreadBuffer( p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data @@ -390,12 +390,12 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf b_thread_data_on_global / B1, b_thread_data_on_global % B1}) #if 1 - .template Run + .template Run #else // tweaking .template Run_optimized_dst_address_calculation + AddressSpace_t::generic, + AddressSpace_t::global> #endif (p_out_thread, p_out_global); } diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index 5e4ee81d2..69e98d4c8 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -68,8 +68,8 @@ struct BlockwiseGenericTensorSliceCopy_v4 template + AddressSpace_t BlockSrcAddressSpace = AddressSpace_t::generic, + AddressSpace_t ThreadBufferAddressSpace = AddressSpace_t::generic> __device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src, ThreadBufferData* p_thread_buffer) const { @@ -89,8 +89,8 @@ struct BlockwiseGenericTensorSliceCopy_v4 template + AddressSpace_t ThreadBufferAddressSpace = AddressSpace_t::generic, + AddressSpace_t BlockDstAddressSpace = AddressSpace_t::generic> __device__ void RunStoreThreadBuffer(const 
ThreadBufferData* p_thread_buffer, BlockDstData* p_block_dst) const { @@ -110,8 +110,8 @@ struct BlockwiseGenericTensorSliceCopy_v4 template + AddressSpace_t BlockSrcAddressSpace = AddressSpace_t::generic, + AddressSpace_t BlockDstAddressSpace = AddressSpace_t::generic> __device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst) const { BlockSrcData p_thread_buffer[GetThreadBufferSize()]; @@ -119,12 +119,12 @@ struct BlockwiseGenericTensorSliceCopy_v4 RunLoadThreadBuffer(p_block_src, p_thread_buffer); + AddressSpace_t::generic>(p_block_src, p_thread_buffer); // if there is type conversion, it's done during store RunStoreThreadBuffer(p_thread_buffer, p_block_dst); } diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp index 9776b5413..881a88771 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp @@ -480,8 +480,8 @@ struct BlockwiseGenericTensorSliceCopy_v2 template + AddressSpace_t BlockSrcAddressSpace = AddressSpace_t::generic, + AddressSpace_t ThreadBufferAddressSpace = AddressSpace_t::generic> __device__ void RunLoadThreadBuffer(const SrcData* p_block_src, DstData* p_thread_buffer) const { mThreadwiseLoad @@ -491,8 +491,8 @@ struct BlockwiseGenericTensorSliceCopy_v2 template + AddressSpace_t ThreadBufferAddressSpace = AddressSpace_t::generic, + AddressSpace_t BlockDstAddressSpace = AddressSpace_t::generic> __device__ void RunStoreThreadBuffer(const SrcData* p_thread_buffer, DstData* p_block_dst) const { mThreadwiseStore @@ -502,17 +502,17 @@ struct BlockwiseGenericTensorSliceCopy_v2 template + AddressSpace_t BlockSrcAddressSpace = AddressSpace_t::generic, + AddressSpace_t BlockDstAddressSpace = AddressSpace_t::generic> __device__ void Run(const SrcData* p_block_src, DstData* p_block_dst) const { SrcData p_thread_buffer[GetThreadBufferSize()]; - RunLoadThreadBuffer( + RunLoadThreadBuffer( p_block_src, p_thread_buffer); // if there is type conversion, it's done during store - RunStoreThreadBuffer( + RunStoreThreadBuffer( p_thread_buffer, p_block_dst); } diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index c79089d31..0bd147d1b 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -76,8 +76,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // Will do padding check on dst data: No write if dst data is in paddin area. 
template + AddressSpace_t SrcAddressSpace = AddressSpace_t::generic, + AddressSpace_t DstAddressSpace = AddressSpace_t::generic> __device__ void Run(const SrcData* p_src, DstData* p_dst) const { using src_vector_t = typename vector_type::MemoryType; @@ -126,7 +126,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // the same padding situation if(src_coord.IsUpperIndexMappedToValidOffset()) { - static_if{}([&](auto) { + static_if{}([&](auto) { #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE *reinterpret_cast(&p_src_long_vector[buffer_offset]) = __buffer_load( @@ -167,7 +167,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // the same padding situation if(dst_coord.IsUpperIndexMappedToValidOffset()) { - static_if{}([&](auto) { + static_if{}([&](auto) { #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE __buffer_store( *reinterpret_cast(&p_dst_long_vector[buffer_offset]), @@ -204,8 +204,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // This version is optimized for address calculation of src tensor template + AddressSpace_t SrcAddressSpace = AddressSpace_t::generic, + AddressSpace_t DstAddressSpace = AddressSpace_t::generic> __device__ void Run_optimized_src_address_calculation(const SrcData* p_src, DstData* p_dst) const { @@ -302,7 +302,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // the src vector has the same padding situation if(src_coord.IsUpperIndexMappedToValidOffset()) { - static_if{}([&](auto) { + static_if{}([&](auto) { #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE *reinterpret_cast(&p_src_long_vector[buffer_offset]) = __buffer_load( @@ -362,8 +362,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // This version is optimized for address calculation of dst tensor template + AddressSpace_t SrcAddressSpace = AddressSpace_t::generic, + AddressSpace_t DstAddressSpace = AddressSpace_t::generic> __device__ void Run_optimized_dst_address_calculation(const SrcData* p_src, DstData* p_dst) const { @@ -491,7 +491,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // the dst vector has the same padding situation if(dst_coord.IsUpperIndexMappedToValidOffset()) { - static_if{}([&](auto) { + static_if{}([&](auto) { #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE __buffer_store( *reinterpret_cast(&p_dst_long_vector[buffer_offset]), diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp index c271c6553..78684abe9 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp @@ -539,8 +539,8 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 template + AddressSpace_t SrcAddressSpace = AddressSpace_t::generic, + AddressSpace_t DstAddressSpace = AddressSpace_t::generic> __device__ void Run(const SrcData* p_src, DstData* p_dst) const { constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{}); @@ -613,7 +613,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 // 2. src_normal_offset must be calculatd at compile time (guaranteed by // algorithm) // 3. 
src_merged_offset can be runtime value (no assumption imposed) - static_if{}([&](auto) { + static_if{}([&](auto) { #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE vector_data = __buffer_load( p_src, src_merged_offset, src_normal_offset); @@ -722,7 +722,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 // 2. dst_normal_offset must be calculatd at compile time (guaranteed by // algorithm) // 3. dst_merged_offset can be runtime value (no assumption imposed) - static_if{}([&](auto) { + static_if{}([&](auto) { #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE __buffer_store( vector_data, p_dst, dst_merged_offset, dst_normal_offset); diff --git a/composable_kernel/include/utility/config_amd.hpp.in b/composable_kernel/include/utility/config_amd.hpp.in index 437ed3ee8..e603ffcf0 100644 --- a/composable_kernel/include/utility/config_amd.hpp.in +++ b/composable_kernel/include/utility/config_amd.hpp.in @@ -16,10 +16,10 @@ namespace ck { -enum address_space_t +enum AddressSpace_t { - generic = 0, - global = 3 + generic, + global }; #if CK_UNSIGNED_INDEX_TYPE diff --git a/composable_kernel/include/utility/config_nvidia.hpp.in b/composable_kernel/include/utility/config_nvidia.hpp.in index 9afce0298..67cd93136 100644 --- a/composable_kernel/include/utility/config_nvidia.hpp.in +++ b/composable_kernel/include/utility/config_nvidia.hpp.in @@ -18,10 +18,10 @@ namespace ck { -enum address_space_t +enum AddressSpace_t { - generic = 0, - global = generic + generic, + global = generic }; #if CK_UNSIGNED_INDEX_TYPE diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index ab5b8826a..4319c4f7d 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -295,7 +295,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>; -#elif 1 +#elif 0 // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81% constexpr index_t N = 128; @@ -341,7 +341,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<3, 0>; using RightPads = Sequence<3, 0>; -#elif 0 +#elif 1 // 1x7 filter, 0x3 pad, 17x17 input constexpr index_t N = 128; constexpr index_t C = 128; @@ -438,7 +438,7 @@ int main(int argc, char* argv[]) #elif 0 device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw( (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); -#elif 1 +#elif 0 device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc, in_nchw, wei_kcyx_desc, From 6559b0c0ec41ad90fbb0e6358e7a133ff3b8f630 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Wed, 2 Oct 2019 23:47:17 -0500 Subject: [PATCH 02/20] refactored fp32 blockwise and threadwise gemm --- .../ConstantMatrixDescriptor.hpp | 5 + .../tensor_operation/blockwise_gemm.hpp | 406 +++++------ .../tensor_operation/threadwise_gemm.hpp | 181 +++-- .../include/utility/amd_inline_asm.hpp | 678 +----------------- .../include/utility/bfloat16_dev.hpp | 125 ++++ .../include/utility/config_amd.hpp.in | 56 +- composable_kernel/include/utility/math.hpp | 62 ++ 7 files changed, 555 insertions(+), 958 deletions(-) create mode 100644 composable_kernel/include/utility/bfloat16_dev.hpp diff --git a/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp b/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp index f2f842e11..ada40e8ba 100644 --- a/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp +++ b/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp 
@@ -32,6 +32,11 @@ struct ConstantMatrixDescriptor return irow * RowStride_ + icol; } + __host__ __device__ static index_t CalculateOffset(index_t irow, index_t icol) + { + return GetOffsetFromMultiIndex(irow, icol); + } + template __host__ __device__ static constexpr auto MakeSubMatrixDescriptor(Number, Number) diff --git a/composable_kernel/include/tensor_operation/blockwise_gemm.hpp b/composable_kernel/include/tensor_operation/blockwise_gemm.hpp index 819ecf0c4..71245a7a9 100644 --- a/composable_kernel/include/tensor_operation/blockwise_gemm.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_gemm.hpp @@ -5,19 +5,15 @@ #include "ConstantMatrixDescriptor.hpp" #include "threadwise_gemm.hpp" -#ifndef CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM -#define CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM 1 -#endif - namespace ck { // if following number are power of 2, index calculation shall be greatly reduced: // MPerThreadSubC, NPerThreadSubC, MLevel0ThreadCluster, NLevel0ThreadCluster, // MLevel1ThreadCluster, NLevel1ThreadCluster template - __device__ void Run_amd_asm(const FloatA* __restrict__ p_a_block, - const FloatB* __restrict__ p_b_block, - FloatC* __restrict__ p_c_thread) const + template + __device__ void + Run_naive(const FloatA* p_a_block, const FloatB* p_b_block, FloatC* p_c_thread) const { + constexpr auto True = integral_constant{}; + constexpr auto False = integral_constant{}; + constexpr auto a_block_mtx = BlockMatrixA{}; constexpr auto b_block_mtx = BlockMatrixB{}; constexpr auto c_thread_mtx = ThreadMatrixC{}; - constexpr index_t M = a_block_mtx.NCol(); - constexpr index_t N = b_block_mtx.NCol(); constexpr index_t K = a_block_mtx.NRow(); constexpr index_t MPerThread = c_thread_mtx.NRow(); constexpr index_t NPerThread = c_thread_mtx.NCol(); - // thread A, B for GEMM - constexpr auto a_thread_mtx = - make_ConstantMatrixDescriptor_packed(Number{}, Number{}); - - constexpr auto b_thread_mtx = - make_ConstantMatrixDescriptor_packed(Number{}, Number{}); - - FloatA p_a_thread[a_thread_mtx.GetElementSpace()]; - FloatB p_b_thread[b_thread_mtx.GetElementSpace()]; - constexpr index_t MPerLevel1Cluster = MPerThreadSubC * MLevel0ThreadCluster * MLevel1ThreadCluster; constexpr index_t NPerLevel1Cluster = NPerThreadSubC * NLevel0ThreadCluster * NLevel1ThreadCluster; - // assertion for inline asm - static_assert(is_same{} && is_same{} && - is_same{}, - "Run_amd_asm only deal with float"); - - static_assert(MPerThreadSubC == 4 && NPerThreadSubC == 4 && KPerThreadLoop == 1 && - MPerThread == 8 && NPerThread == 8, - "Run_amd_asm cannot deal with this GEMM shape yet"); - - static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_amd_asm only do float4 read"); - - using Float4 = vector_type::MemoryType; - - Float4* reg_a = reinterpret_cast(p_a_thread); - Float4* reg_b = reinterpret_cast(p_b_thread); - Float4* reg_c = reinterpret_cast(p_c_thread); - - reg_a[0] = *reinterpret_cast(&p_a_block[mMyThreadOffsetA]); - reg_b[0] = *reinterpret_cast(&p_b_block[mMyThreadOffsetB]); - reg_b[1] = - *reinterpret_cast(&p_b_block[mMyThreadOffsetB + NPerLevel1Cluster]); - reg_a[1] = - *reinterpret_cast(&p_a_block[mMyThreadOffsetA + MPerLevel1Cluster]); - outerProduct4x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2], reg_c[4], reg_c[6]); - outerProduct4x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3], reg_c[5], reg_c[7]); -#pragma unroll - for(index_t k = 1; k < K; ++k) - { - reg_a[0] = *reinterpret_cast(&p_a_block[mMyThreadOffsetA + k * M]); - outerProduct4x4(reg_a[1], reg_b[0], reg_c[8], reg_c[10], reg_c[12], reg_c[14]); - 
reg_b[0] = *reinterpret_cast(&p_b_block[mMyThreadOffsetB + k * N]); - outerProduct4x4(reg_a[1], reg_b[1], reg_c[9], reg_c[11], reg_c[13], reg_c[15]); - reg_b[1] = *reinterpret_cast( - &p_b_block[mMyThreadOffsetB + k * N + NPerLevel1Cluster]); - reg_a[1] = *reinterpret_cast( - &p_a_block[mMyThreadOffsetA + k * M + MPerLevel1Cluster]); - outerProduct4x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2], reg_c[4], reg_c[6]); - outerProduct4x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3], reg_c[5], reg_c[7]); - } - outerProduct4x4(reg_a[1], reg_b[0], reg_c[8], reg_c[10], reg_c[12], reg_c[14]); - outerProduct4x4(reg_a[1], reg_b[1], reg_c[9], reg_c[11], reg_c[13], reg_c[15]); - } - - __device__ void Run_amd_asm_v2(const float* __restrict__ p_a_block, - const float* __restrict__ p_b_block, - float* __restrict__ p_c_thread) const - { - constexpr auto a_block_mtx = BlockMatrixA{}; - constexpr auto b_block_mtx = BlockMatrixB{}; - constexpr auto c_thread_mtx = ThreadMatrixC{}; - - constexpr index_t M = a_block_mtx.NCol(); - constexpr index_t N = b_block_mtx.NCol(); - constexpr index_t K = a_block_mtx.NRow(); - - constexpr index_t MPerThread = c_thread_mtx.NRow(); - constexpr index_t NPerThread = c_thread_mtx.NCol(); + constexpr index_t MRepeat = MPerThread / MPerThreadSubC; + constexpr index_t NRepeat = NPerThread / NPerThreadSubC; // thread A, B for GEMM constexpr auto a_thread_mtx = @@ -214,110 +144,65 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 constexpr auto b_thread_mtx = make_ConstantMatrixDescriptor_packed(Number{}, Number{}); - float p_a_thread[a_thread_mtx.GetElementSpace()]; - float p_b_thread[b_thread_mtx.GetElementSpace()]; - - constexpr index_t MThreadCluster = MLevel0ThreadCluster * MLevel1ThreadCluster; - constexpr index_t NThreadCluster = NLevel0ThreadCluster * NLevel1ThreadCluster; - - constexpr index_t MDataCluster = M / MPerThreadSubC; - constexpr index_t NDataCluster = N / NPerThreadSubC; - - constexpr index_t MRepeat = MDataCluster / MThreadCluster; - constexpr index_t NRepeat = NDataCluster / NThreadCluster; + // thread A-sub, B-sub for copy + constexpr auto a_thread_sub_mtx = make_ConstantMatrixDescriptor( + Number{}, Number{}, Number{}); - // assertion for inline asm - static_assert((MPerThreadSubC == 4 && NPerThreadSubC == 4 && MRepeat == 2 && NRepeat == 2 && - KPerThreadLoop == 1) || - (MPerThreadSubC == 2 && NPerThreadSubC == 4 && MRepeat == 2 && - NRepeat == 2 && KPerThreadLoop == 1), - "Run_amd_asm cannot deal with this GEMM shape yet"); + constexpr auto b_thread_sub_mtx = make_ConstantMatrixDescriptor( + Number{}, Number{}, Number{}); - static_assert(DataPerReadA == MPerThreadSubC && DataPerReadB == NPerThreadSubC, - "wrong! 
Run_amd_asm doesn't support this config"); + FloatA p_a_thread[a_thread_mtx.GetElementSpace()]; + FloatB p_b_thread[b_thread_mtx.GetElementSpace()]; - if(MPerThreadSubC == 4 && NPerThreadSubC == 4 && MRepeat == 2 && NRepeat == 2 && - KPerThreadLoop == 1) + constexpr auto a_thread_copy = ThreadwiseMatrixSliceCopy{}; + + constexpr auto b_thread_copy = ThreadwiseMatrixSliceCopy{}; + + constexpr auto threadwise_gemm = + ThreadwiseGemmTransANormalBNormalC{}; +#pragma unroll + // loop over k + for(index_t k_begin = 0; k_begin < K; k_begin += KPerThreadLoop) { - using float4_type = vector_type::MemoryType; - - float4_type* reg_a = reinterpret_cast(p_a_thread); - float4_type* reg_b = reinterpret_cast(p_b_thread); - float4_type* reg_c = reinterpret_cast(p_c_thread); - - const float4_type* p_a = - reinterpret_cast(&p_a_block[mMyThreadOffsetA]); - const float4_type* p_b = - reinterpret_cast(&p_b_block[mMyThreadOffsetB]); - - reg_a[0] = p_a[0]; - reg_b[0] = p_b[0]; - reg_b[1] = p_b[NThreadCluster]; - reg_a[1] = p_a[MThreadCluster]; - outerProduct4x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2], reg_c[4], reg_c[6]); - outerProduct4x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3], reg_c[5], reg_c[7]); #pragma unroll - for(index_t k = 1; k < K; ++k) + // read A + for(index_t m_repeat = 0; m_repeat < MRepeat; ++m_repeat) { - reg_a[0] = p_a[k * MDataCluster]; - outerProduct4x4(reg_a[1], reg_b[0], reg_c[8], reg_c[10], reg_c[12], reg_c[14]); - reg_b[0] = p_b[k * NDataCluster]; - outerProduct4x4(reg_a[1], reg_b[1], reg_c[9], reg_c[11], reg_c[13], reg_c[15]); - reg_b[1] = p_b[k * NDataCluster + NThreadCluster]; - reg_a[1] = p_a[k * MDataCluster + MThreadCluster]; - outerProduct4x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2], reg_c[4], reg_c[6]); - outerProduct4x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3], reg_c[5], reg_c[7]); + a_thread_copy.Run( + p_a_block + a_block_mtx.CalculateOffset(k_begin, m_repeat * MPerLevel1Cluster) + + mMyThreadOffsetA, + p_a_thread + a_thread_mtx.CalculateOffset(0, m_repeat * MPerThreadSubC)); } - outerProduct4x4(reg_a[1], reg_b[0], reg_c[8], reg_c[10], reg_c[12], reg_c[14]); - outerProduct4x4(reg_a[1], reg_b[1], reg_c[9], reg_c[11], reg_c[13], reg_c[15]); - } - else if(MPerThreadSubC == 2 && NPerThreadSubC == 4 && MRepeat == 2 && NRepeat == 2 && - KPerThreadLoop == 1) - { - using float2_type = vector_type::MemoryType; - using float4_type = vector_type::MemoryType; - - float2_type* reg_a = reinterpret_cast(p_a_thread); - float4_type* reg_b = reinterpret_cast(p_b_thread); - float4_type* reg_c = reinterpret_cast(p_c_thread); - - const float2_type* p_a = - reinterpret_cast(&p_a_block[mMyThreadOffsetA]); - const float4_type* p_b = - reinterpret_cast(&p_b_block[mMyThreadOffsetB]); - - reg_a[0] = p_a[0]; - reg_b[0] = p_b[0]; - reg_b[1] = p_b[NThreadCluster]; - reg_a[1] = p_a[MThreadCluster]; - outerProduct2x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2]); - outerProduct2x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3]); + #pragma unroll - for(index_t k = 1; k < K; ++k) + // read B + for(index_t n_repeat = 0; n_repeat < NRepeat; ++n_repeat) { - reg_a[0] = p_a[k * MDataCluster]; - outerProduct2x4(reg_a[1], reg_b[0], reg_c[4], reg_c[6]); - reg_b[0] = p_b[k * NDataCluster]; - outerProduct2x4(reg_a[1], reg_b[1], reg_c[5], reg_c[7]); - reg_b[1] = p_b[k * NDataCluster + NThreadCluster]; - reg_a[1] = p_a[k * MDataCluster + MThreadCluster]; - outerProduct2x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2]); - outerProduct2x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3]); + b_thread_copy.Run( + p_b_block + b_block_mtx.CalculateOffset(k_begin, 
n_repeat * NPerLevel1Cluster) + + mMyThreadOffsetB, + p_b_thread + b_thread_mtx.CalculateOffset(0, n_repeat * NPerThreadSubC)); } - outerProduct2x4(reg_a[1], reg_b[0], reg_c[4], reg_c[6]); - outerProduct2x4(reg_a[1], reg_b[1], reg_c[5], reg_c[7]); + + // C += A * B + threadwise_gemm.Run(p_a_thread, p_b_thread, p_c_thread); } } -#endif - template - __device__ void Run_source(const FloatA* const __restrict__ p_a_block, - const FloatB* const __restrict__ p_b_block, - FloatC* const __restrict__ p_c_thread) const + template + __device__ void + Run_pipelined_2x2(const FloatA* p_a_block, const FloatB* p_b_block, FloatC* p_c_thread) const { - constexpr auto True = integral_constant{}; - constexpr auto False = integral_constant{}; - constexpr auto a_block_mtx = BlockMatrixA{}; constexpr auto b_block_mtx = BlockMatrixB{}; constexpr auto c_thread_mtx = ThreadMatrixC{}; @@ -327,88 +212,143 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 constexpr index_t MPerThread = c_thread_mtx.NRow(); constexpr index_t NPerThread = c_thread_mtx.NCol(); - // thread A, B for GEMM + constexpr index_t MPerLevel1Cluster = + MPerThreadSubC * MLevel0ThreadCluster * MLevel1ThreadCluster; + constexpr index_t NPerLevel1Cluster = + NPerThreadSubC * NLevel0ThreadCluster * NLevel1ThreadCluster; + + constexpr index_t MRepeat = MPerThread / MPerThreadSubC; + constexpr index_t NRepeat = NPerThread / NPerThreadSubC; + + static_assert(MRepeat == 2 && NRepeat == 2, + "wrong! inline asm cannot deal with this GEMM config yet"); + + // thread A, B constexpr auto a_thread_mtx = make_ConstantMatrixDescriptor_packed(Number{}, Number{}); - constexpr auto b_thread_mtx = make_ConstantMatrixDescriptor_packed(Number{}, Number{}); - // thread A-sub, B-sub for copy - constexpr auto a_thread_sub_mtx = make_ConstantMatrixDescriptor( - Number{}, Number{}, Number{}); + // thread A-sub, B-sub + constexpr auto a_thread_sub_mtx = a_thread_mtx.MakeSubMatrixDescriptor( + Number{}, Number{}); + constexpr auto b_thread_sub_mtx = b_thread_mtx.MakeSubMatrixDescriptor( + Number{}, Number{}); - constexpr auto b_thread_sub_mtx = make_ConstantMatrixDescriptor( - Number{}, Number{}, Number{}); + // thread C-sub + constexpr auto c_thread_sub_mtx = ThreadMatrixC::MakeSubMatrixDescriptor( + Number{}, Number{}); FloatA p_a_thread[a_thread_mtx.GetElementSpace()]; FloatB p_b_thread[b_thread_mtx.GetElementSpace()]; - constexpr index_t MPerLevel1Cluster = - MPerThreadSubC * MLevel0ThreadCluster * MLevel1ThreadCluster; - constexpr index_t NPerLevel1Cluster = - NPerThreadSubC * NLevel0ThreadCluster * NLevel1ThreadCluster; + constexpr auto a_thread_copy = ThreadwiseMatrixSliceCopy{}; - constexpr index_t MRepeat = MPerThread / MPerThreadSubC; - constexpr index_t NRepeat = NPerThread / NPerThreadSubC; + constexpr auto b_thread_copy = ThreadwiseMatrixSliceCopy{}; -#pragma unroll - // loop over k - for(index_t k_begin = 0; k_begin < K; k_begin += KPerThreadLoop) - { -#pragma unroll - // copy A-sub to form A - for(index_t m_repeat = 0; m_repeat < MRepeat; ++m_repeat) - { - threadwise_matrix_copy( - a_block_mtx, - p_a_block + - a_block_mtx.GetOffsetFromMultiIndex(k_begin, m_repeat * MPerLevel1Cluster) + - mMyThreadOffsetA, - a_thread_mtx, - p_a_thread + a_thread_mtx.GetOffsetFromMultiIndex(0, m_repeat * MPerThreadSubC), - a_thread_sub_mtx.GetLengths(), - Number{}); - } + constexpr auto threadwise_gemm = + ThreadwiseGemmTransANormalBNormalC{}; + + const FloatA* p_a_block_off = p_a_block + mMyThreadOffsetA; + const FloatB* p_b_block_off = p_b_block + 
mMyThreadOffsetB; + + // read A_sub_0 + a_thread_copy.Run(p_a_block_off, p_a_thread); + + // read B_sub_0 + b_thread_copy.Run(p_b_block_off, p_b_thread); + + // read B_sub_1 + b_thread_copy.Run(p_b_block_off + b_block_mtx.CalculateOffset(0, NPerLevel1Cluster), + p_b_thread + b_thread_mtx.CalculateOffset(0, NPerThreadSubC)); + + // read A_sub_1 + a_thread_copy.Run(p_a_block_off + a_block_mtx.CalculateOffset(0, MPerLevel1Cluster), + p_a_thread + a_thread_mtx.CalculateOffset(0, MPerThreadSubC)); + + // C_sub_00 += transpose(A_sub_0) * B_sub_0 + threadwise_gemm.Run(p_a_thread, p_b_thread, p_c_thread); + + // C_sub_01 += transpose(A_sub_0) * B_sub_1 + threadwise_gemm.Run(p_a_thread, + p_b_thread + b_thread_mtx.CalculateOffset(0, NPerThreadSubC), + p_c_thread + ThreadMatrixC::CalculateOffset(0, NPerThreadSubC)); #pragma unroll - // copy B-sub to form B - for(index_t n_repeat = 0; n_repeat < NRepeat; ++n_repeat) - { - threadwise_matrix_copy( - b_block_mtx, - p_b_block + - b_block_mtx.GetOffsetFromMultiIndex(k_begin, n_repeat * NPerLevel1Cluster) + - mMyThreadOffsetB, - b_thread_mtx, - p_b_thread + b_thread_mtx.GetOffsetFromMultiIndex(0, n_repeat * NPerThreadSubC), - b_thread_sub_mtx.GetLengths(), - Number{}); - } + // loop over rest of k + for(index_t k = KPerThreadLoop; k < K; k += KPerThreadLoop) + { + // read A_sub_0 + a_thread_copy.Run(p_a_block_off + a_block_mtx.CalculateOffset(k, 0), p_a_thread); + + // C_sub_10 += transpose(A_sub_1) * B_sub_0 + threadwise_gemm.Run(p_a_thread + a_thread_mtx.CalculateOffset(0, MPerThreadSubC), + p_b_thread, + p_c_thread + ThreadMatrixC::CalculateOffset(MPerThreadSubC, 0)); + + // read B_sub_0 + b_thread_copy.Run(p_b_block_off + b_block_mtx.CalculateOffset(k, 0), p_b_thread); + + // C_sub_11 += transpose(A_sub_1) * B_sub_1 + threadwise_gemm.Run(p_a_thread + a_thread_mtx.CalculateOffset(0, MPerThreadSubC), + p_b_thread + b_thread_mtx.CalculateOffset(0, NPerThreadSubC), + p_c_thread + + ThreadMatrixC::CalculateOffset(MPerThreadSubC, NPerThreadSubC)); + + // read B_sub_1 + b_thread_copy.Run(p_b_block_off + b_block_mtx.CalculateOffset(k, NPerLevel1Cluster), + p_b_thread + b_thread_mtx.CalculateOffset(0, NPerThreadSubC)); + + // read A_sub_1 + a_thread_copy.Run(p_a_block_off + a_block_mtx.CalculateOffset(k, MPerLevel1Cluster), + p_a_thread + a_thread_mtx.CalculateOffset(0, MPerThreadSubC)); + + // C_sub_00 += transpose(A_sub_0) * B_sub_0 + threadwise_gemm.Run(p_a_thread, p_b_thread, p_c_thread); + + // C_sub_01 += transpose(A_sub_0) * B_sub_1 + threadwise_gemm.Run(p_a_thread, + p_b_thread + b_thread_mtx.CalculateOffset(0, NPerThreadSubC), + p_c_thread + ThreadMatrixC::CalculateOffset(0, NPerThreadSubC)); + } - // C = A * B - threadwise_gemm(a_thread_mtx, - True, - p_a_thread, - b_thread_mtx, - False, + // C_sub_10 += transpose(A_sub_1) * B_sub_0 + threadwise_gemm.Run(p_a_thread + a_thread_mtx.CalculateOffset(0, MPerThreadSubC), p_b_thread, - c_thread_mtx, - False, - p_c_thread); - } - } + p_c_thread + ThreadMatrixC::CalculateOffset(MPerThreadSubC, 0)); - template - __device__ void Run(const FloatA* __restrict__ p_a_block, - const FloatB* __restrict__ p_b_block, - FloatC* __restrict__ p_c_thread) const + // C_sub_11 += transpose(A_sub_1) * B_sub_1 + threadwise_gemm.Run(p_a_thread + a_thread_mtx.CalculateOffset(0, MPerThreadSubC), + p_b_thread + b_thread_mtx.CalculateOffset(0, NPerThreadSubC), + p_c_thread + + ThreadMatrixC::CalculateOffset(MPerThreadSubC, NPerThreadSubC)); + } + template + __device__ void Run(const FloatA* p_a_block, const FloatB* p_b_block, 
FloatC* p_c_thread) const { -#if CK_USE_AMD_INLINE_ASM && CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM - Run_amd_asm_v2(p_a_block, p_b_block, p_c_thread); +#if CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE + constexpr index_t MPerThread = ThreadMatrixC::NRow(); + constexpr index_t NPerThread = ThreadMatrixC::NCol(); + + constexpr index_t MRepeat = MPerThread / MPerThreadSubC; + constexpr index_t NRepeat = NPerThread / NPerThreadSubC; + + static_if{}([&](auto) { + Run_pipelined_2x2(p_a_block, p_b_block, p_c_thread); + }).Else([&](auto) { Run_naive(p_a_block, p_b_block, p_c_thread); }); #else - Run_source(p_a_block, p_b_block, p_c_thread); + Run_naive(p_a_block, p_b_block, p_c_thread); #endif } }; diff --git a/composable_kernel/include/tensor_operation/threadwise_gemm.hpp b/composable_kernel/include/tensor_operation/threadwise_gemm.hpp index fb1540a98..7fe069a88 100644 --- a/composable_kernel/include/tensor_operation/threadwise_gemm.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_gemm.hpp @@ -3,102 +3,157 @@ #include "common_header.hpp" #include "ConstantMatrixDescriptor.hpp" +#include "math.hpp" namespace ck { -template +template __device__ void threadwise_matrix_set_zero(Matrix, Float* __restrict__ p_thread) { for(index_t i = 0; i < Matrix::NRow(); ++i) { for(index_t j = 0; j < Matrix::NCol(); ++j) { - const index_t id = Matrix::GetOffsetFromMultiIndex(i, j); + const index_t id = Matrix::CalculateOffset(i, j); p_thread[id] = Float(0); } } } -template -__device__ void threadwise_matrix_copy(SrcMatrix, - const Float* __restrict__ p_src, - DstMatrix, - Float* __restrict__ p_dst, - Sequence, - Number) +template +struct ThreadwiseMatrixSliceCopy { - static_assert(NCol % DataPerRead == 0, "wrong! should be NCol % == DataPerRead == 0"); - - using vector_t = typename vector_type::MemoryType; - - constexpr auto src_mtx = SrcMatrix{}; - constexpr auto dst_mtx = DstMatrix{}; + __device__ constexpr ThreadwiseMatrixSliceCopy() + { + static_assert(SrcMatrix::RowStride() % DataPerAccess == 0 && + DstMatrix::RowStride() % DataPerAccess == 0, + "wrong! wrong alignment"); + static_assert(NSliceCol % DataPerAccess == 0, + "wrong! 
should be NSliceCol % DataPerAccess == 0"); + } - for(index_t i = 0; i < NRow; ++i) + template + __device__ static void Run(const Data* p_src, Data* p_dst) { - for(index_t j = 0; j < NCol; j += DataPerRead) + using vector_t = typename vector_type::MemoryType; + + for(index_t i = 0; i < NSliceRow; ++i) { - const index_t src_index = src_mtx.GetOffsetFromMultiIndex(i, j); - const index_t dst_index = dst_mtx.GetOffsetFromMultiIndex(i, j); + for(index_t j = 0; j < NSliceCol; j += DataPerAccess) + { + const index_t src_index = SrcMatrix::CalculateOffset(i, j); + const index_t dst_index = DstMatrix::CalculateOffset(i, j); - *reinterpret_cast(&p_dst[dst_index]) = - *reinterpret_cast(&p_src[src_index]); + *reinterpret_cast(&p_dst[dst_index]) = + *reinterpret_cast(&p_src[src_index]); + } } } -} +}; -template -__device__ void threadwise_gemm(MatrixA, - integral_constant, - const FloatA* __restrict__ p_a_thread, - MatrixB, - integral_constant, - const FloatB* __restrict__ p_b_thread, - MatrixC, - integral_constant, - FloatC* __restrict__ p_c_thread) +// C += transpose(A) * B +// Element of matrix can be vectorized data +template +struct ThreadwiseGemmTransANormalBNormalC { - static_if{}([&](auto) { - constexpr auto a_mtx = MatrixA{}; - constexpr auto b_mtx = MatrixB{}; - constexpr auto c_mtx = MatrixC{}; + __device__ constexpr ThreadwiseGemmTransANormalBNormalC() + { + static_assert(MatrixA::NRow() == MatrixB::NRow() && MatrixA::NCol() == MatrixC::NRow() && + MatrixB::NCol() == MatrixC::NCol(), + "wrong!"); + } - constexpr index_t M = c_mtx.NRow(); - constexpr index_t N = c_mtx.NCol(); - constexpr index_t K = a_mtx.NRow(); // A is transposed + template + __device__ static void Run_source(const FloatA* p_a, const FloatB* p_b, FloatC* p_c) + { + constexpr index_t M = MatrixC::NRow(); + constexpr index_t N = MatrixC::NCol(); + constexpr index_t K = MatrixA::NRow(); // A is transposed for(index_t k = 0; k < K; ++k) { - for(index_t i = 0; i < M; ++i) + for(index_t m = 0; m < M; ++m) { - for(index_t j = 0; j < N; ++j) + for(index_t n = 0; n < N; ++n) { - const index_t aindex = a_mtx.GetOffsetFromMultiIndex(k, i); // A is transposed - const index_t bindex = b_mtx.GetOffsetFromMultiIndex(k, j); - const index_t cindex = c_mtx.GetOffsetFromMultiIndex(i, j); + const index_t aindex = MatrixA::CalculateOffset(k, m); // A is transposed + const index_t bindex = MatrixB::CalculateOffset(k, n); + const index_t cindex = MatrixC::CalculateOffset(m, n); - p_c_thread[cindex] += p_a_thread[aindex] * p_b_thread[bindex]; + p_c[cindex] += + math::inner_product_with_conversion{}(p_a[aindex], p_b[bindex]); } } } - }).Else([&](auto fwd) { - // not implemented - static_assert(fwd(false), "wrong! support for this config is not implemented"); - }); -} + } + +#if CK_USE_AMD_INLINE_ASM + template + __device__ static void Run_amd_asm(const FloatA* p_a, const FloatB* p_b, FloatC* p_c) + { + constexpr index_t M = MatrixC::NRow(); + constexpr index_t N = MatrixC::NCol(); + constexpr index_t K = MatrixA::NRow(); // A is transposed + + static_assert(N == 4 || N == 2, "wrong! 
not supported by asm yet"); + + for(index_t k = 0; k < K; ++k) + { + for(index_t m = 0; m < M; ++m) + { + const index_t aindex = MatrixA::CalculateOffset(k, m); // A is transposed + + static_if{}([&](auto) { + const index_t bindex_0 = MatrixB::CalculateOffset(k, 0); + const index_t bindex_1 = MatrixB::CalculateOffset(k, 1); + + const index_t cindex_0 = MatrixC::CalculateOffset(m, 0); + const index_t cindex_1 = MatrixC::CalculateOffset(m, 1); + + __outer_product_1x2( + p_a[aindex], p_b[bindex_0], p_b[bindex_1], p_c[cindex_0], p_c[cindex_1]); + }); + + static_if{}([&](auto) { + const index_t bindex_0 = MatrixB::CalculateOffset(k, 0); + const index_t bindex_1 = MatrixB::CalculateOffset(k, 1); + const index_t bindex_2 = MatrixB::CalculateOffset(k, 2); + const index_t bindex_3 = MatrixB::CalculateOffset(k, 3); + + const index_t cindex_0 = MatrixC::CalculateOffset(m, 0); + const index_t cindex_1 = MatrixC::CalculateOffset(m, 1); + const index_t cindex_2 = MatrixC::CalculateOffset(m, 2); + const index_t cindex_3 = MatrixC::CalculateOffset(m, 3); + + __outer_product_1x4(p_a[aindex], + p_b[bindex_0], + p_b[bindex_1], + p_b[bindex_2], + p_b[bindex_3], + p_c[cindex_0], + p_c[cindex_1], + p_c[cindex_2], + p_c[cindex_3]); + }); + } + } + } +#endif + + template + __device__ static void Run(const FloatA* p_a, const FloatB* p_b, FloatC* p_c) + { +#if CK_USE_AMD_INLINE_ASM && CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM + Run_amd_asm(p_a, p_b, p_c); +#else + Run_source(p_a, p_b, p_c); +#endif + } +}; } // namespace ck #endif diff --git a/composable_kernel/include/utility/amd_inline_asm.hpp b/composable_kernel/include/utility/amd_inline_asm.hpp index 0a17b4bd3..2d175852e 100644 --- a/composable_kernel/include/utility/amd_inline_asm.hpp +++ b/composable_kernel/include/utility/amd_inline_asm.hpp @@ -3,82 +3,32 @@ #include "vector_type.hpp" +// disable inline asm due to the compiler issue: SWDEV-202749 +#define WORKAROUND_SWDEV_202749 1 + namespace ck { // cast a pointer of LDS to its address extern "C" __attribute__((address_space(3))) __device__ void* __to_local(void* p); -__device__ void vmcnt(index_t cnt) -{ - if(cnt == 0) - { - asm volatile("\n \ - s_waitcnt vmcnt(0) \n \ - " ::); - } - else if(cnt == 1) - { - asm volatile("\n \ - s_waitcnt vmcnt(1) \n \ - " ::); - } - else if(cnt == 2) - { - asm volatile("\n \ - s_waitcnt vmcnt(2) \n \ - " ::); - } - else if(cnt == 4) - { - asm volatile("\n \ - s_waitcnt vmcnt(2) \n \ - " ::); - } - else - { - assert(false); - } -} - -__device__ void lgkmcnt(index_t cnt) +__device__ void __outer_product_1x2(float a, float b0, float b1, float& c0, float& c1) { - if(cnt == 0) - { - asm volatile("\n \ - s_waitcnt lgkmcnt(0) \n \ - " ::); - } - else if(cnt == 1) - { - asm volatile("\n \ - s_waitcnt lgkmcnt(1) \n \ - " ::); - } - else if(cnt == 2) - { - asm volatile("\n \ - s_waitcnt lgkmcnt(2) \n \ - " ::); - } - else if(cnt == 3) - { - asm volatile("\n \ - s_waitcnt lgkmcnt(3) \n \ - " ::); - } - else if(cnt == 4) - { - asm volatile("\n \ - s_waitcnt lgkmcnt(4) \n \ - " ::); - } - else - { - assert(false); - } +///\to-do: enable the inline asm after the compiler fix +#if WORKAROUND_SWDEV_202749 + c0 += a * b0; + c1 += a * b1; +#else + asm volatile("\n \ + v_mac_f32 %0, %2, %3 \n \ + v_mac_f32 %1, %2, %4 \n \ + " + : "=v"(c0), "=v"(c1) + : "v"(a), "v"(b0), "v"(b1), "0"(c0), "1"(c1)); +#endif } -__device__ void outerProduct1x4(const float* a, const float* b, float* c) +__device__ void __outer_product_1x4( + float a, float b0, float b1, float b2, float b3, float& c0, float& c1, 
float& c2, float& c3) { asm volatile("\n \ v_mac_f32 %0, %4, %5 \n \ @@ -86,596 +36,8 @@ __device__ void outerProduct1x4(const float* a, const float* b, float* c) v_mac_f32 %2, %4, %7 \n \ v_mac_f32 %3, %4, %8 \n \ " - : "=v"(c[0]), "=v"(c[1]), "=v"(c[2]), "=v"(c[3]) - : "v"(a[0]), - "v"(b[0]), - "v"(b[1]), - "v"(b[2]), - "v"(b[3]), - "0"(c[0]), - "1"(c[1]), - "2"(c[2]), - "3"(c[3])); -} - -__device__ void outerProduct1x4(const float& a, - const vector_type::MemoryType& b, - vector_type::MemoryType& c) -{ - outerProduct1x4(&a, reinterpret_cast(&b), reinterpret_cast(&c)); -} - -__device__ void outerProduct2x4(const vector_type::MemoryType& a, - const vector_type::MemoryType& b, - vector_type::MemoryType& c0, - vector_type::MemoryType& c1) -{ - outerProduct1x4(a.x, b, c0); - outerProduct1x4(a.y, b, c1); -} - -__device__ void outerProduct4x4(const vector_type::MemoryType& a, - const vector_type::MemoryType& b, - vector_type::MemoryType& c0, - vector_type::MemoryType& c1, - vector_type::MemoryType& c2, - vector_type::MemoryType& c3) -{ - outerProduct1x4(a.x, b, c0); - outerProduct1x4(a.y, b, c1); - outerProduct1x4(a.z, b, c2); - outerProduct1x4(a.w, b, c3); -} - -__device__ void outerProduct8x8(const vector_type::MemoryType* a, - const vector_type::MemoryType* b, - vector_type::MemoryType* c) -{ - outerProduct4x4(a[0], b[0], c[0], c[2], c[4], c[6]); - outerProduct4x4(a[0], b[1], c[1], c[3], c[5], c[7]); - outerProduct4x4(a[1], b[0], c[8], c[10], c[12], c[14]); - outerProduct4x4(a[1], b[1], c[9], c[11], c[13], c[15]); -} - -__device__ void ds_read_b128(vector_type::MemoryType& r, void* lds, index_t offset = 0) -{ - if(offset == 0) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:0\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 64) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:64\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 128) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:128\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 192) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:192\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 256) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:256\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 320) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:320\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 384) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:384\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 448) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:448\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 512) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:512\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 576) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:576\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 640) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:640\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 704) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:704\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 768) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:768\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 832) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:832\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 896) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:896\n \ - " - : "=v"(r) - : 
"v"(__to_local(lds))); - } - if(offset == 960) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:960\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 1024) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:1024\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 1088) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:1088\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 1152) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:1152\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 1216) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:1216\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 1280) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:1280\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 1344) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:1344\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 1408) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:1408\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 1472) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:1472\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 1536) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:1536\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 1600) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:1600\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 1664) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:1664\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 1728) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:1728\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 1792) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:1792\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 1856) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:1856\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 1920) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:1920\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 1984) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:1984\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 2048) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:2048\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 2112) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:2112\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 2176) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:2176\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 2240) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:2240\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 2304) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:2304\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 2368) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:2368\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 2432) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:2432\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 2496) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:2496\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 2560) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:2560\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 2624) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:2624\n \ - " - : "=v"(r) - : 
"v"(__to_local(lds))); - } - if(offset == 2688) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:2688\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 2752) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:2752\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 2816) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:2816\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 2880) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:2880\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 2944) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:2944\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 3008) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:3008\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 3072) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:3072\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 3136) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:3136\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 3200) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:3200\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 3264) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:3264\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 3328) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:3328\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 3392) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:3392\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 3456) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:3456\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 3520) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:3520\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 3584) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:3584\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 3648) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:3648\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 3712) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:3712\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 3776) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:3776\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 3840) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:3840\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 3904) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:3904\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 3968) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:3968\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 4032) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:4032\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 4096) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:4096\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } -} - -__device__ void -ds_write_b128(const vector_type::MemoryType& r, void* lds, index_t offset = 0) -{ - if(offset == 0) - { - asm volatile("\n \ - ds_write_b128 %0, %1 \n \ - " - : - : "v"(__to_local(lds)), "v"(r)); - } - else - { - assert(false); - } + : "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3) + : "v"(a), "v"(b0), "v"(b1), "v"(b2), "v"(b3), "0"(c0), "1"(c1), "2"(c2), "3"(c3)); } } // namespace ck diff --git a/composable_kernel/include/utility/bfloat16_dev.hpp 
b/composable_kernel/include/utility/bfloat16_dev.hpp new file mode 100644 index 000000000..52d00346c --- /dev/null +++ b/composable_kernel/include/utility/bfloat16_dev.hpp @@ -0,0 +1,125 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2019 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef BFLOAT16_DEVICE_HPP +#define BFLOAT16_DEVICE_HPP + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __HIP_PLATFORM_HCC__ +#define EXECUTION_SPECIFIER __device__ +#else +#define EXECUTION_SPECIFIER +#endif // MIOPEN_BACKEND_HIP + +typedef union +{ + uint u32; + ushort2 ushortx2; + +// Composable kernels are written in HIP language. The language doesnt support +// ushort2.hi or ushort2.low. +#ifdef __HIP_PLATFORM_HCC__ + ushort ushortvec[2]; +#endif // MIOPEN_BACKEND_HIP + float f32; +} cvt_bf16_fp32_t; + +EXECUTION_SPECIFIER float bfloat16_to_float(ushort src_val) +{ + cvt_bf16_fp32_t target_val; + +#ifdef __HIP_PLATFORM_HCC__ + target_val.ushortx2 = make_ushort2(0, src_val); +#else + target_val.ushortx2 = (ushort2)(0, src_val); +#endif + + return target_val.f32; +} + +EXECUTION_SPECIFIER ushort float_to_bfloat16(float src_val) +{ + cvt_bf16_fp32_t target_val; + target_val.f32 = src_val; + // BF16 round and NaN preservation code matches + // https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/library/include/rocblas_bfloat16.h + if((~target_val.u32 & 0x7f800000) == 0) // Inf or NaN + { + // When all of the exponent bits are 1, the value is Inf or NaN. + // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero + // mantissa bit. Quiet NaN is indicated by the most significant mantissa + // bit being 1. Signaling NaN is indicated by the most significant + // mantissa bit being 0 but some other bit(s) being 1. If any of the + // lower 16 bits of the mantissa are 1, we set the least significant bit + // of the bfloat16 mantissa, in order to preserve signaling NaN in case + // the bloat16's mantissa bits are all 0. + if((target_val.u32 & 0xffff) != 0) + { + target_val.u32 |= 0x10000; // Preserve signaling NaN + } + } + else + { +#ifdef MIOPEN_USE_RNE_BFLOAT16 +// When the exponent bits are not all 1s, then the value is zero, normal, +// or subnormal. 
We round the bfloat16 mantissa up by adding 0x7FFF, plus +// 1 if the least significant bit of the bfloat16 mantissa is 1 (odd). +// This causes the bfloat16's mantissa to be incremented by 1 if the 16 +// least significant bits of the float mantissa are greater than 0x8000, +// or if they are equal to 0x8000 and the least significant bit of the +// bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when +// the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already +// has the value 0x7f, then incrementing it causes it to become 0x00 and +// the exponent is incremented by one, which is the next higher FP value +// to the unrounded bfloat16 value. When the bfloat16 value is subnormal +// with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up +// to a normal value with an exponent of 0x01 and a mantissa of 0x00. +// When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F, +// incrementing it causes it to become an exponent of 0xFF and a mantissa +// of 0x00, which is Inf, the next higher value to the unrounded value. +#ifdef __HIP_PLATFORM_HCC__ + target_val.u32 += (0x7fff + (target_val.ushortvec[1] & 1)); +#else + target_val.u32 += + (0x7fff + (target_val.ushortx2.hi & 1)); // Round to nearest, round to even +#endif // MIOPEN_BACKEND_HIP +#endif // MIOPEN_USE_RNE_BFLOAT16 + } + +#ifdef __HIP_PLATFORM_HCC__ + return target_val.ushortvec[1]; +#else + return target_val.ushortx2.hi; +#endif // MIOPEN_BACKEND_HIP +} + +#ifdef __cplusplus +} +#endif + +#endif // BFLOAT16_DEVICE_HPP diff --git a/composable_kernel/include/utility/config_amd.hpp.in b/composable_kernel/include/utility/config_amd.hpp.in index e603ffcf0..971d1b35b 100644 --- a/composable_kernel/include/utility/config_amd.hpp.in +++ b/composable_kernel/include/utility/config_amd.hpp.in @@ -3,12 +3,24 @@ #include "hip/hip_runtime.h" #include "hip/hip_fp16.h" +#include "bfloat16_dev.hpp" +// index type: unsigned or signed #define CK_UNSIGNED_INDEX_TYPE 0 + +// device backend #define CK_DEVICE_BACKEND_AMD 1 -#define CK_USE_AMD_INTRINSIC 1 + +// AMD inline asm #define CK_USE_AMD_INLINE_ASM 1 +#define CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM 1 + +// AMD intrinsic +#define CK_USE_AMD_INTRINSIC 1 #define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1 + +// experimental implementation +#define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 1 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0 @@ -32,20 +44,56 @@ using index_t = int32_t; // instruction typedef float float2_t __attribute__((ext_vector_type(2))); typedef float float4_t __attribute__((ext_vector_type(4))); +typedef float float32_t __attribute__((ext_vector_type(32))); typedef int32_t int32x4_t __attribute__((ext_vector_type(4))); +// half +typedef half2 half2_t; + +typedef struct +{ + // TODO: why not use "half scalar[4]"? + half2_t scalar[2]; +} half4_t; + +// bfloat16: use ushort +typedef struct +{ + ushort scalar[2]; +} ushort2_t; + +typedef struct +{ + // TODO: why not use "ushort scalar[4]"? 
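The rounding rule described in the comment above can be checked in isolation. Below is a minimal host-side sketch, not part of this patch (the function name and the standalone harness are assumptions), of the same add-0x7fff-plus-odd-bit trick that float_to_bfloat16 uses:

#include <cstdint>
#include <cstdio>
#include <cstring>

static uint16_t float_to_bf16_rne(float f)
{
    uint32_t u;
    std::memcpy(&u, &f, sizeof(u));
    if((~u & 0x7f800000) == 0) // all exponent bits set: Inf or NaN
    {
        if((u & 0xffff) != 0)
            u |= 0x10000; // keep a nonzero mantissa bit to preserve signaling NaN
    }
    else // zero, normal or subnormal: round to nearest, ties to even
    {
        u += 0x7fff + ((u >> 16) & 1);
    }
    return static_cast<uint16_t>(u >> 16);
}

int main()
{
    // 1.0f + 2^-8 sits exactly halfway between two bf16 values;
    // the tie goes to the even mantissa, so the result stays 1.0f (0x3f80).
    std::printf("0x%04x\n", float_to_bf16_rne(1.00390625f));
    return 0;
}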
+ ushort2_t scalar[2]; +} ushort4_t; + // data type conversion -template +template struct type_convert { - template - __device__ T operator()(const X& x) const + template + __device__ T operator()(X x) const { return static_cast(x); } }; +template <> +template <> +__device__ float type_convert::operator()(ushort x) const +{ + return bfloat16_to_float(x); +} + +template <> +template <> +__device__ ushort type_convert::operator()(float x) const +{ + return float_to_bfloat16(x); +} + } // namespace ck #endif diff --git a/composable_kernel/include/utility/math.hpp b/composable_kernel/include/utility/math.hpp index ba70e7ab2..f6c41cc52 100644 --- a/composable_kernel/include/utility/math.hpp +++ b/composable_kernel/include/utility/math.hpp @@ -117,6 +117,68 @@ struct less __host__ __device__ constexpr bool operator()(T x, T y) const { return x < y; } }; +template +struct inner_product_with_conversion +{ + static constexpr auto convert = type_convert(); + + __device__ T operator()(float a, float b) const { return convert(a) * convert(b); } + + __device__ T operator()(half2_t a, half2_t b) const + { + const half* p_a_half = reinterpret_cast(&a); + const half* p_b_half = reinterpret_cast(&b); + + T acc = 0; + for(index_t v = 0; v < 2; ++v) + { + acc += convert(p_a_half[v]) * convert(p_b_half[v]); + } + + return acc; + } + + __device__ T operator()(half4_t a, half4_t b) const + { + const half* p_a_half = reinterpret_cast(&a); + const half* p_b_half = reinterpret_cast(&b); + + T acc = 0; + for(index_t v = 0; v < 4; ++v) + { + acc += convert(p_a_half[v]) * convert(p_b_half[v]); + } + return acc; + } + + __device__ T operator()(ushort2_t a, ushort2_t b) const + { + const ushort* p_a_bfloat16 = reinterpret_cast(&a); + const ushort* p_b_bfloat16 = reinterpret_cast(&b); + + T acc = 0; + for(index_t v = 0; v < 2; ++v) + { + acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]); + } + + return acc; + } + + __device__ T operator()(ushort4_t a, ushort4_t b) const + { + const ushort* p_a_bfloat16 = reinterpret_cast(&a); + const ushort* p_b_bfloat16 = reinterpret_cast(&b); + + T acc = 0; + for(index_t v = 0; v < 4; ++v) + { + acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]); + } + return acc; + } +}; + } // namespace math } // namspace ck From af1cb272cfb76e075a9ca2d39cda26fb1b936869 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 3 Oct 2019 17:53:41 -0500 Subject: [PATCH 03/20] clean up --- .../include/tensor_operation/blockwise_gemm.hpp | 7 +++---- .../include/tensor_operation/threadwise_gemm.hpp | 2 +- composable_kernel/include/utility/config_amd.hpp.in | 3 ++- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/composable_kernel/include/tensor_operation/blockwise_gemm.hpp b/composable_kernel/include/tensor_operation/blockwise_gemm.hpp index 71245a7a9..cd04c4550 100644 --- a/composable_kernel/include/tensor_operation/blockwise_gemm.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_gemm.hpp @@ -7,7 +7,9 @@ namespace ck { -// if following number are power of 2, index calculation shall be greatly reduced: +// blockwise GEMM: C += transpose(A) * B +// A and B are visable to the whole block, C is distributed among each thread +// If following number are power of 2, index calculation shall be greatly reduced: // MPerThreadSubC, NPerThreadSubC, MLevel0ThreadCluster, NLevel0ThreadCluster, // MLevel1ThreadCluster, NLevel1ThreadCluster template {}; - constexpr auto False = integral_constant{}; - constexpr auto a_block_mtx = BlockMatrixA{}; constexpr auto b_block_mtx = 
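inner_product_with_conversion widens every packed lane to the accumulator type before the multiply-add, so mixed-precision inputs always accumulate in fp32. A self-contained sketch of that behavior for bf16-style inputs follows; the helper name and the hard-coded encodings are illustrative, not taken from the patch:

#include <cstdint>
#include <cstdio>
#include <cstring>

static float bf16_to_float(uint16_t x)
{
    uint32_t u = static_cast<uint32_t>(x) << 16; // bf16 is the high half of an fp32
    float f;
    std::memcpy(&f, &u, sizeof(f));
    return f;
}

int main()
{
    const uint16_t a[4] = {0x3f80, 0x4000, 0x4040, 0x4080}; // 1, 2, 3, 4 in bf16
    const uint16_t b[4] = {0x4080, 0x4040, 0x4000, 0x3f80}; // 4, 3, 2, 1 in bf16

    float acc = 0.0f;
    for(int v = 0; v < 4; ++v)
        acc += bf16_to_float(a[v]) * bf16_to_float(b[v]); // convert first, then fma in fp32

    std::printf("%f\n", acc); // 1*4 + 2*3 + 3*2 + 4*1 = 20
    return 0;
}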
BlockMatrixB{}; constexpr auto c_thread_mtx = ThreadMatrixC{}; diff --git a/composable_kernel/include/tensor_operation/threadwise_gemm.hpp b/composable_kernel/include/tensor_operation/threadwise_gemm.hpp index 7fe069a88..503eb9522 100644 --- a/composable_kernel/include/tensor_operation/threadwise_gemm.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_gemm.hpp @@ -99,7 +99,7 @@ struct ThreadwiseGemmTransANormalBNormalC constexpr index_t N = MatrixC::NCol(); constexpr index_t K = MatrixA::NRow(); // A is transposed - static_assert(N == 4 || N == 2, "wrong! not supported by asm yet"); + static_assert(N == 4 || N == 2, "wrong! this config not supported by asm yet"); for(index_t k = 0; k < K; ++k) { diff --git a/composable_kernel/include/utility/config_amd.hpp.in b/composable_kernel/include/utility/config_amd.hpp.in index 971d1b35b..799d5f8a9 100644 --- a/composable_kernel/include/utility/config_amd.hpp.in +++ b/composable_kernel/include/utility/config_amd.hpp.in @@ -41,7 +41,8 @@ using index_t = int32_t; #endif // For some reason, HIP compiler need this definition to generate optimal load and store -// instruction +// instruction +// float typedef float float2_t __attribute__((ext_vector_type(2))); typedef float float4_t __attribute__((ext_vector_type(4))); typedef float float32_t __attribute__((ext_vector_type(32))); From a0806d0e6b43a24e4847f976ffb5492766980628 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 3 Oct 2019 18:41:41 -0500 Subject: [PATCH 04/20] miopen integration --- ..._v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 24 +- ...chw_kcyx_nkhw_padded_lds_double_buffer.hpp | 28 +- ..._v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp | 20 +- ...chw_kcyx_nkhw_padded_lds_double_buffer.hpp | 28 +- .../tensor_operation/blockwise_gemm.hpp | 7 - .../blockwise_generic_tensor_slice_copy.hpp | 16 +- ...e_generic_tensor_slice_copy_deprecated.hpp | 20 +- .../tensor_operation/threadwise_gemm.hpp | 13 +- .../threadwise_generic_tensor_slice_copy.hpp | 32 +-- ...e_generic_tensor_slice_copy_deprecated.hpp | 261 +----------------- .../include/utility/amd_inline_asm.hpp | 96 ++++++- .../include/utility/config_amd.hpp.in | 64 +++-- .../include/utility/config_nvidia.hpp.in | 4 +- 13 files changed, 222 insertions(+), 391 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp index ea1412064..53366f79d 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -265,10 +265,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer // LDS double buffer: preload data into LDS { - blockwise_in_copy.template Run(p_in_global, - p_in_block_double); - blockwise_wei_copy.template Run( - p_wei_global, p_wei_block_double); + blockwise_in_copy.template Run(p_in_global, + p_in_block_double); + blockwise_wei_copy.template Run(p_wei_global, + p_wei_block_double); } // LDS double buffer: main body @@ -299,12 +299,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy - .template RunLoadThreadBuffer( - p_in_global, p_in_thread_buffer); - blockwise_wei_copy - .template 
RunLoadThreadBuffer( - p_wei_global, p_wei_thread_buffer); + blockwise_in_copy.template RunLoadThreadBuffer( + p_in_global, p_in_thread_buffer); + blockwise_wei_copy.template RunLoadThreadBuffer( + p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread); @@ -327,9 +325,9 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( + blockwise_in_copy.template RunLoadThreadBuffer( p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( + blockwise_wei_copy.template RunLoadThreadBuffer( p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data @@ -398,7 +396,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer 0, b_thread_data_on_global, 0}) - .template Run( + .template Run( p_out_thread, p_out_global); } } diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp index a5a753158..d5d1e496b 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp @@ -281,10 +281,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf // LDS double buffer: preload data into LDS { - blockwise_in_copy.template Run(p_in_global, - p_in_block_double); - blockwise_wei_copy.template Run( - p_wei_global, p_wei_block_double); + blockwise_in_copy.template Run(p_in_global, + p_in_block_double); + blockwise_wei_copy.template Run(p_wei_global, + p_wei_block_double); } // LDS double buffer: main body @@ -315,12 +315,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy - .template RunLoadThreadBuffer( - p_in_global, p_in_thread_buffer); - blockwise_wei_copy - .template RunLoadThreadBuffer( - p_wei_global, p_wei_thread_buffer); + blockwise_in_copy.template RunLoadThreadBuffer( + p_in_global, p_in_thread_buffer); + blockwise_wei_copy.template RunLoadThreadBuffer( + p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread); @@ -343,9 +341,9 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( + blockwise_in_copy.template RunLoadThreadBuffer( p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( + blockwise_wei_copy.template RunLoadThreadBuffer( p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data @@ -427,12 +425,12 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf b_thread_data_on_global, 0}) #if 1 - .template Run + .template Run #else // tweaking .template Run_optimized_dst_address_calculation + AddressSpace::generic, + AddressSpace::global> #endif (p_out_thread, p_out_global); } diff --git 
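The hunks above keep the LDS double-buffering schedule unchanged and only rename the copy entry points. A schematic host-side sketch of that schedule, with placeholder function names rather than the kernel's API: while the GEMM consumes one LDS buffer, the next tile is prefetched into registers and then committed to the other buffer.

#include <cstdio>

static void load_tile_to_registers(int tile) { std::printf("load tile %d -> regs\n", tile); }
static void store_registers_to_lds(int buf)  { std::printf("store regs -> LDS[%d]\n", buf); }
static void gemm_on_lds(int buf)             { std::printf("gemm on LDS[%d]\n", buf); }

int main()
{
    const int num_tiles = 4;

    // preload: tile 0 goes straight into LDS buffer 0
    load_tile_to_registers(0);
    store_registers_to_lds(0);

    for(int t = 1; t < num_tiles; ++t)
    {
        const int cur = (t - 1) % 2; // buffer the GEMM consumes this iteration
        const int nxt = t % 2;       // buffer being refilled for the next one

        load_tile_to_registers(t);   // global -> registers (RunLoadThreadBuffer)
        gemm_on_lds(cur);            // compute on the current buffer
        store_registers_to_lds(nxt); // registers -> the other LDS buffer
        // the real kernel issues __syncthreads() before a buffer is reused
    }

    gemm_on_lds((num_tiles - 1) % 2); // tail: GEMM on the last filled buffer
    return 0;
}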
a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp index 55ca61926..39a28e391 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -251,10 +251,10 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer // LDS double buffer: preload data into LDS { - blockwise_in_copy.template Run(p_in_global, - p_in_block_double); - blockwise_wei_copy.template Run(p_wei_global, - p_wei_block_double); + blockwise_in_copy.template Run(p_in_global, + p_in_block_double); + blockwise_wei_copy.template Run(p_wei_global, + p_wei_block_double); } // LDS double buffer: main body @@ -285,9 +285,9 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( + blockwise_in_copy.template RunLoadThreadBuffer( p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( + blockwise_wei_copy.template RunLoadThreadBuffer( p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data @@ -311,9 +311,9 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( + blockwise_in_copy.template RunLoadThreadBuffer( p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( + blockwise_wei_copy.template RunLoadThreadBuffer( p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data @@ -391,8 +391,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat) { threadwise_out_copy - .template Run( - p_out_thread, p_out_global); + .template Run(p_out_thread, + p_out_global); threadwise_out_copy.MoveSrcSliceWindow(Sequence<0, 0, GemmNPerThreadSubC>{}, True); threadwise_out_copy.MoveDstSliceWindow(Sequence<0, 0, B1>{}, True); diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp index d39e11de2..e93258682 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp @@ -255,10 +255,10 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf // LDS double buffer: preload data into LDS { - blockwise_in_copy.template Run(p_in_global, - p_in_block_double); - blockwise_wei_copy.template Run( - p_wei_global, p_wei_block_double); + blockwise_in_copy.template Run(p_in_global, + p_in_block_double); + blockwise_wei_copy.template Run(p_wei_global, + p_wei_block_double); } // LDS double buffer: main body @@ -289,12 +289,10 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf __syncthreads(); // LDS doubel buffer: load 
next data from device mem - blockwise_in_copy - .template RunLoadThreadBuffer( - p_in_global, p_in_thread_buffer); - blockwise_wei_copy - .template RunLoadThreadBuffer( - p_wei_global, p_wei_thread_buffer); + blockwise_in_copy.template RunLoadThreadBuffer( + p_in_global, p_in_thread_buffer); + blockwise_wei_copy.template RunLoadThreadBuffer( + p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread); @@ -317,9 +315,9 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( + blockwise_in_copy.template RunLoadThreadBuffer( p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( + blockwise_wei_copy.template RunLoadThreadBuffer( p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data @@ -390,12 +388,12 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf b_thread_data_on_global / B1, b_thread_data_on_global % B1}) #if 1 - .template Run + .template Run #else // tweaking .template Run_optimized_dst_address_calculation + AddressSpace::generic, + AddressSpace::global> #endif (p_out_thread, p_out_global); } diff --git a/composable_kernel/include/tensor_operation/blockwise_gemm.hpp b/composable_kernel/include/tensor_operation/blockwise_gemm.hpp index cd04c4550..1c7bb92f6 100644 --- a/composable_kernel/include/tensor_operation/blockwise_gemm.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_gemm.hpp @@ -143,13 +143,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 constexpr auto b_thread_mtx = make_ConstantMatrixDescriptor_packed(Number{}, Number{}); - // thread A-sub, B-sub for copy - constexpr auto a_thread_sub_mtx = make_ConstantMatrixDescriptor( - Number{}, Number{}, Number{}); - - constexpr auto b_thread_sub_mtx = make_ConstantMatrixDescriptor( - Number{}, Number{}, Number{}); - FloatA p_a_thread[a_thread_mtx.GetElementSpace()]; FloatB p_b_thread[b_thread_mtx.GetElementSpace()]; diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index 69e98d4c8..15faeaebf 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -68,8 +68,8 @@ struct BlockwiseGenericTensorSliceCopy_v4 template + AddressSpace BlockSrcAddressSpace = AddressSpace::generic, + AddressSpace ThreadBufferAddressSpace = AddressSpace::generic> __device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src, ThreadBufferData* p_thread_buffer) const { @@ -89,8 +89,8 @@ struct BlockwiseGenericTensorSliceCopy_v4 template + AddressSpace ThreadBufferAddressSpace = AddressSpace::generic, + AddressSpace BlockDstAddressSpace = AddressSpace::generic> __device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer, BlockDstData* p_block_dst) const { @@ -110,8 +110,8 @@ struct BlockwiseGenericTensorSliceCopy_v4 template + AddressSpace BlockSrcAddressSpace = AddressSpace::generic, + AddressSpace BlockDstAddressSpace = AddressSpace::generic> __device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst) const { BlockSrcData p_thread_buffer[GetThreadBufferSize()]; @@ -119,12 +119,12 @@ struct 
BlockwiseGenericTensorSliceCopy_v4 RunLoadThreadBuffer(p_block_src, p_thread_buffer); + AddressSpace::generic>(p_block_src, p_thread_buffer); // if there is type conversion, it's done during store RunStoreThreadBuffer(p_thread_buffer, p_block_dst); } diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp index 881a88771..ca3902039 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp @@ -7,10 +7,6 @@ #include "tensor_coordinate_deprecated.hpp" #include "threadwise_generic_tensor_slice_copy_deprecated.hpp" -#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 -#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1 -#endif - namespace ck { // Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor @@ -480,8 +476,8 @@ struct BlockwiseGenericTensorSliceCopy_v2 template + AddressSpace BlockSrcAddressSpace = AddressSpace::generic, + AddressSpace ThreadBufferAddressSpace = AddressSpace::generic> __device__ void RunLoadThreadBuffer(const SrcData* p_block_src, DstData* p_thread_buffer) const { mThreadwiseLoad @@ -491,8 +487,8 @@ struct BlockwiseGenericTensorSliceCopy_v2 template + AddressSpace ThreadBufferAddressSpace = AddressSpace::generic, + AddressSpace BlockDstAddressSpace = AddressSpace::generic> __device__ void RunStoreThreadBuffer(const SrcData* p_thread_buffer, DstData* p_block_dst) const { mThreadwiseStore @@ -502,17 +498,17 @@ struct BlockwiseGenericTensorSliceCopy_v2 template + AddressSpace BlockSrcAddressSpace = AddressSpace::generic, + AddressSpace BlockDstAddressSpace = AddressSpace::generic> __device__ void Run(const SrcData* p_block_src, DstData* p_block_dst) const { SrcData p_thread_buffer[GetThreadBufferSize()]; - RunLoadThreadBuffer( + RunLoadThreadBuffer( p_block_src, p_thread_buffer); // if there is type conversion, it's done during store - RunStoreThreadBuffer( + RunStoreThreadBuffer( p_thread_buffer, p_block_dst); } diff --git a/composable_kernel/include/tensor_operation/threadwise_gemm.hpp b/composable_kernel/include/tensor_operation/threadwise_gemm.hpp index 503eb9522..0619aaf15 100644 --- a/composable_kernel/include/tensor_operation/threadwise_gemm.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_gemm.hpp @@ -91,7 +91,7 @@ struct ThreadwiseGemmTransANormalBNormalC } } -#if CK_USE_AMD_INLINE_ASM +#if CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM template __device__ static void Run_amd_asm(const FloatA* p_a, const FloatB* p_b, FloatC* p_c) { @@ -147,8 +147,15 @@ struct ThreadwiseGemmTransANormalBNormalC template __device__ static void Run(const FloatA* p_a, const FloatB* p_b, FloatC* p_c) { -#if CK_USE_AMD_INLINE_ASM && CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM - Run_amd_asm(p_a, p_b, p_c); +#if CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM + constexpr bool has_amd_asm = is_same{} && + ((is_same{} && is_same{}) || + (is_same{} && is_same{}) || + (is_same{} && is_same{})); + + static_if{}([&](auto fwd) { + Run_amd_asm(p_a, p_b, fwd(p_c)); + }).Else([&](auto) { Run_source(p_a, p_b, p_c); }); #else Run_source(p_a, p_b, p_c); #endif diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp 
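Run above is simply RunLoadThreadBuffer followed by RunStoreThreadBuffer, with any type conversion deferred to the store. A minimal sketch of that staging in ordinary host code, with illustrative names only:

#include <cstdio>

template <typename SrcData, typename DstData, int N>
static void run_copy(const SrcData* p_src, DstData* p_dst)
{
    SrcData thread_buffer[N];

    for(int i = 0; i < N; ++i) // RunLoadThreadBuffer: stage in the source type, no conversion yet
        thread_buffer[i] = p_src[i];

    for(int i = 0; i < N; ++i) // RunStoreThreadBuffer: convert while writing out
        p_dst[i] = static_cast<DstData>(thread_buffer[i]);
}

int main()
{
    float src[4] = {1.25f, 2.5f, 3.75f, 5.0f};
    int   dst[4] = {};
    run_copy<float, int, 4>(src, dst);
    std::printf("%d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]); // 1 2 3 5
    return 0;
}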
b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index 0bd147d1b..6a61c2c05 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -10,8 +10,8 @@ #define CK_USE_AMD_INTRINSIC 1 #endif -#ifndef CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE -#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1 +#ifndef CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC +#define CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC 1 #endif namespace ck { @@ -76,8 +76,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // Will do padding check on dst data: No write if dst data is in paddin area. template + AddressSpace SrcAddressSpace = AddressSpace::generic, + AddressSpace DstAddressSpace = AddressSpace::generic> __device__ void Run(const SrcData* p_src, DstData* p_dst) const { using src_vector_t = typename vector_type::MemoryType; @@ -126,8 +126,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // the same padding situation if(src_coord.IsUpperIndexMappedToValidOffset()) { - static_if{}([&](auto) { -#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE + static_if{}([&](auto) { +#if CK_USE_AMD_INTRINSIC && CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC *reinterpret_cast(&p_src_long_vector[buffer_offset]) = __buffer_load( p_src, src_coord.GetOffset(), 0); @@ -167,8 +167,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // the same padding situation if(dst_coord.IsUpperIndexMappedToValidOffset()) { - static_if{}([&](auto) { -#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE + static_if{}([&](auto) { +#if CK_USE_AMD_INTRINSIC && CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC __buffer_store( *reinterpret_cast(&p_dst_long_vector[buffer_offset]), p_dst, @@ -204,8 +204,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // This version is optimized for address calculation of src tensor template + AddressSpace SrcAddressSpace = AddressSpace::generic, + AddressSpace DstAddressSpace = AddressSpace::generic> __device__ void Run_optimized_src_address_calculation(const SrcData* p_src, DstData* p_dst) const { @@ -302,8 +302,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // the src vector has the same padding situation if(src_coord.IsUpperIndexMappedToValidOffset()) { - static_if{}([&](auto) { -#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE + static_if{}([&](auto) { +#if CK_USE_AMD_INTRINSIC && CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC *reinterpret_cast(&p_src_long_vector[buffer_offset]) = __buffer_load( p_src, src_nonlinear_coord.GetOffset(), src_linear_offset); @@ -362,8 +362,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // This version is optimized for address calculation of dst tensor template + AddressSpace SrcAddressSpace = AddressSpace::generic, + AddressSpace DstAddressSpace = AddressSpace::generic> __device__ void Run_optimized_dst_address_calculation(const SrcData* p_src, DstData* p_dst) const { @@ -491,8 +491,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // the dst vector has the same padding situation if(dst_coord.IsUpperIndexMappedToValidOffset()) { - static_if{}([&](auto) { -#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE + static_if{}([&](auto) { +#if CK_USE_AMD_INTRINSIC && CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC __buffer_store( *reinterpret_cast(&p_dst_long_vector[buffer_offset]), p_dst, diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp 
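The SrcAddressSpace/DstAddressSpace parameters select, at compile time, between the AMD buffer intrinsics and plain pointer addressing. A rough host-side sketch of that dispatch, assuming an if-constexpr stand-in for static_if and an ordinary dereference in place of the __buffer_load intrinsic:

#include <cstdio>

enum class AddressSpace { generic, global };

template <AddressSpace SrcAddressSpace, typename T>
static T load_scalar(const T* p_src, int offset)
{
    if constexpr(SrcAddressSpace == AddressSpace::global)
    {
        // the device code would go through the __buffer_load intrinsic here
        return p_src[offset];
    }
    else
    {
        // generic / LDS path: ordinary addressing
        return p_src[offset];
    }
}

int main()
{
    float data[4] = {0.5f, 1.5f, 2.5f, 3.5f};
    std::printf("%f %f\n",
                load_scalar<AddressSpace::global>(data, 1),
                load_scalar<AddressSpace::generic>(data, 3));
    return 0;
}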
b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp index 78684abe9..9f6133f8d 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp @@ -6,257 +6,8 @@ #include "ConstantMergedTensorDescriptor.hpp" #include "tensor_coordinate_deprecated.hpp" -#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 -#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0 -#endif - -#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 -#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0 -#endif - -#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 -#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0 -#endif - -#ifndef CK_USE_AMD_INTRINSIC -#define CK_USE_AMD_INTRINSIC 1 -#endif - -#ifndef CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE -#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1 -#endif - namespace ck { -// This threadwise copy allow vector access of src and dst. -// It allows the dimensions of vector access to be different on src and dst. -// It also allows the vector size to be different on src and dst. -// It also allows order of access to be different on src and dst. -// It use register as buffer to hold all data moving from src to dst. -// It is designed for copying small amount of data, and src and dst are -// device memory or LDS. -// When copying large amout of data, let's hope compiler will reduce register -// used for the buffer. -template -struct ThreadwiseGenericTensorSliceCopy_v1r1 -{ - static constexpr index_t nDim = SliceLengths::GetSize(); - - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r1( - Array src_slice_origin, Array dst_slice_origin) - : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin) - { - static_assert(nDim == SrcDesc::GetNumOfDimension() && - nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() && - nDim == SrcDimAccessOrder::GetSize() && - nDim == DstDimAccessOrder::GetSize(), - "wrong! # of dimensions not the same"); - - static_assert(is_valid_sequence_map::value && - is_valid_sequence_map::value, - "wrong! map is not valid"); - - static_assert(SliceLengths{}[SrcVectorAccessDim] % SrcDataPerAccess == 0 && - SliceLengths{}[DstVectorAccessDim] % DstDataPerAccess == 0, - "wrong! cannot evenly divide"); - - // check vectorized memory access - constexpr auto src_vector_access_dim = Number{}; - constexpr auto dst_vector_access_dim = Number{}; - - static_if{}( - [&](auto fwd) { - static_assert( - (fwd(SrcDesc{}).GetStride(src_vector_access_dim) == 1 || SrcDataPerAccess == 1), - "wrong! vectorized access is allowed only if stride == 1"); - }) - .Else([&](auto fwd) { - static_assert( - (fwd(SrcDesc{}).GetLastOriginalDimensionStride(src_vector_access_dim) == 1 || - SrcDataPerAccess == 1), - "wrong! vectorized access is allowed only if stride == 1"); - }); - - static_if{}( - [&](auto fwd) { - static_assert( - (fwd(DstDesc{}).GetStride(dst_vector_access_dim) == 1 || DstDataPerAccess == 1), - "wrong! vectorized access is allowed only if stride == 1"); - }) - .Else([&](auto fwd) { - static_assert( - (fwd(DstDesc{}).GetLastOriginalDimensionStride(dst_vector_access_dim) == 1 || - DstDataPerAccess == 1), - "wrong! 
vectorized access is allowed only if stride == 1"); - }); - } - - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r1() - : ThreadwiseGenericTensorSliceCopy_v1r1(make_zero_array(), - make_zero_array()) - { - } - - __device__ void SetSrcSliceOrigin(Array src_slice_origin) - { - mSrcSliceOrigin = src_slice_origin; - } - - __device__ void SetDstSliceOrigin(Array dst_slice_origin) - { - mDstSliceOrigin = dst_slice_origin; - } - - template - __device__ void Run(const TData* p_src, TData* p_dst) const - { - constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{}); - - TData p_buffer_[buffer_desc.GetElementSpace()]; - TData* p_buffer = p_buffer_; - - // copy data from src into buffer - { - using vector_t = typename vector_type::MemoryType; - - constexpr auto src_vector_access_dim = Number{}; - constexpr auto src_data_per_access = Number{}; - - constexpr auto src_access_lengths = SliceLengths::Modify( - src_vector_access_dim, - SliceLengths::Get(src_vector_access_dim) / src_data_per_access); - -#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 - static_ford{}([&](auto src_access_id) { - constexpr auto src_data_begin_id = src_access_id.Modify( - src_vector_access_dim, - src_access_id[src_vector_access_dim] * src_data_per_access); - - const index_t src_offset = - SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_begin_id); - - // load vector from src - const vector_t vector_data = *reinterpret_cast(&p_src[src_offset]); - - // unpack vector into buffer - static_for<0, SrcDataPerAccess, 1>{}([&](auto i) { - constexpr auto scalar_id = - typename uniform_sequence_gen::type{}.Modify(src_vector_access_dim, - i); - - constexpr index_t buffer_offset = - buffer_desc.GetOffsetFromMultiIndex(src_data_begin_id + scalar_id); - - p_buffer[buffer_offset] = reinterpret_cast(&vector_data)[i]; - }); - }); -#else - ford{}([&](auto src_access_id) { - auto src_data_begin_id = src_access_id; - src_data_begin_id(src_vector_access_dim) = - src_access_id[src_vector_access_dim] * src_data_per_access; - - const index_t src_offset = - SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_begin_id); - - // load vector from src - const vector_t vector_data = *reinterpret_cast(&p_src[src_offset]); - - // unpack vector into buffer - for(index_t i = 0; i < SrcDataPerAccess; ++i) - { - auto scalar_id = make_zero_array(); - scalar_id(src_vector_access_dim) = i; - - const index_t buffer_offset = - buffer_desc.GetOffsetFromMultiIndex(src_data_begin_id + scalar_id); - - p_buffer[buffer_offset] = reinterpret_cast(&vector_data)[i]; - } - }); -#endif - } - - // copy data from buffer to dst - { - using vector_t = typename vector_type::MemoryType; - - constexpr auto dst_vector_access_dim = Number{}; - constexpr auto dst_data_per_access = Number{}; - - constexpr auto dst_access_lengths = SliceLengths::Modify( - dst_vector_access_dim, - SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access); - -#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 - static_ford{}([&](auto dst_access_id) { - constexpr auto dst_data_begin_id = dst_access_id.Modify( - dst_vector_access_dim, - dst_access_id[dst_vector_access_dim] * dst_data_per_access); - - vector_t vector_data{}; - - // pack vector from buffer - static_for<0, DstDataPerAccess, 1>{}([&](auto i) { - constexpr auto scalar_id = - typename uniform_sequence_gen::type{}.Modify(dst_vector_access_dim, - i); - - constexpr index_t buffer_offset = - 
buffer_desc.GetOffsetFromMultiIndex(dst_data_begin_id + scalar_id); - - reinterpret_cast(&vector_data)[i] = p_buffer[buffer_offset]; - }); - - const index_t dst_offset = - DstDesc::GetOffsetFromMultiIndex(mDstSliceOrigin + dst_data_begin_id); - - // store vector into dst - *reinterpret_cast(&p_dst[dst_offset]) = vector_data; - }); -#else - ford{}([&](auto dst_access_id) { - auto dst_data_begin_id = dst_access_id; - dst_data_begin_id(dst_vector_access_dim) = - dst_access_id[dst_vector_access_dim] * dst_data_per_access; - - vector_t vector_data{}; - - // pack vector from buffer - for(index_t i = 0; i < DstDataPerAccess; ++i) - { - auto scalar_id = make_zero_array(); - scalar_id(dst_vector_access_dim) = i; - - const index_t buffer_offset = - buffer_desc.GetOffsetFromMultiIndex(dst_data_begin_id + scalar_id); - - reinterpret_cast(&vector_data)[i] = p_buffer[buffer_offset]; - } - - const index_t dst_offset = - DstDesc::GetOffsetFromMultiIndex(mDstSliceOrigin + dst_data_begin_id); - - // store vector into dst - *reinterpret_cast(&p_dst[dst_offset]) = vector_data; - }); -#endif - } - } - - private: - Array mSrcSliceOrigin; - Array mDstSliceOrigin; -}; - // This threadwise copy allow vector access of src and dst. // It allows the vector size to be different on src and dst. // The dimensions of vector access should be the same on src and dst. @@ -539,8 +290,8 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 template + AddressSpace SrcAddressSpace = AddressSpace::generic, + AddressSpace DstAddressSpace = AddressSpace::generic> __device__ void Run(const SrcData* p_src, DstData* p_dst) const { constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{}); @@ -613,8 +364,8 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 // 2. src_normal_offset must be calculatd at compile time (guaranteed by // algorithm) // 3. src_merged_offset can be runtime value (no assumption imposed) - static_if{}([&](auto) { -#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE + static_if{}([&](auto) { +#if CK_USE_AMD_INTRINSIC && CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC vector_data = __buffer_load( p_src, src_merged_offset, src_normal_offset); #else @@ -722,8 +473,8 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 // 2. dst_normal_offset must be calculatd at compile time (guaranteed by // algorithm) // 3. 
dst_merged_offset can be runtime value (no assumption imposed) - static_if{}([&](auto) { -#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE + static_if{}([&](auto) { +#if CK_USE_AMD_INTRINSIC && CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC __buffer_store( vector_data, p_dst, dst_merged_offset, dst_normal_offset); #else diff --git a/composable_kernel/include/utility/amd_inline_asm.hpp b/composable_kernel/include/utility/amd_inline_asm.hpp index 2d175852e..c764b27d2 100644 --- a/composable_kernel/include/utility/amd_inline_asm.hpp +++ b/composable_kernel/include/utility/amd_inline_asm.hpp @@ -3,18 +3,14 @@ #include "vector_type.hpp" -// disable inline asm due to the compiler issue: SWDEV-202749 -#define WORKAROUND_SWDEV_202749 1 - namespace ck { -// cast a pointer of LDS to its address -extern "C" __attribute__((address_space(3))) __device__ void* __to_local(void* p); - +// outer-product: c[i,j] += inner_product(a[i], b[j]) __device__ void __outer_product_1x2(float a, float b0, float b1, float& c0, float& c1) { +// disable inline asm due to the compiler issue: SWDEV-202749 ///\to-do: enable the inline asm after the compiler fix -#if WORKAROUND_SWDEV_202749 +#if CK_WORKAROUND_SWDEV_202749 c0 += a * b0; c1 += a * b1; #else @@ -27,6 +23,7 @@ __device__ void __outer_product_1x2(float a, float b0, float b1, float& c0, floa #endif } +// outer-product: c[i,j] += inner_product(a[i], b[j]) __device__ void __outer_product_1x4( float a, float b0, float b1, float b2, float b3, float& c0, float& c1, float& c2, float& c3) { @@ -40,5 +37,90 @@ __device__ void __outer_product_1x4( : "v"(a), "v"(b0), "v"(b1), "v"(b2), "v"(b3), "0"(c0), "1"(c1), "2"(c2), "3"(c3)); } +// outer-product: c[i,j] += inner_product(a[i], b[j]) +__device__ void __outer_product_1x2(half2_t a, half2_t b0, half2_t b1, float& c0, float& c1) +{ + asm volatile("\n \ + v_dot2_f32_f16 %0, %2, %3 %0\n \ + v_dot2_f32_f16 %1, %2, %4 %1\n \ + " + : "=v"(c0), "=v"(c1) // Dest registers + : "v"(a), // 1st Src register for 1 half2 registers + "v"(b0), // 2nd Src register + "v"(b1), + "0"(c0), // 3rd Src register + "1"(c1)); +} + +// outer-product: c[i,j] += inner_product(a[i], b[j]) +__device__ void __outer_product_1x2(half4_t a, half4_t b0, half4_t b1, float& c0, float& c1) +{ + const half2_t* p_a_half2 = reinterpret_cast(&a); + const half2_t* p_b0_half2 = reinterpret_cast(&b0); + const half2_t* p_b1_half2 = reinterpret_cast(&b1); + + // do dot2 two times + asm volatile("\n \ + v_dot2_f32_f16 %0, %2, %4 %0\n \ + v_dot2_f32_f16 %1, %2, %6 %1\n \ + v_dot2_f32_f16 %0, %3, %5 %0\n \ + v_dot2_f32_f16 %1, %3, %7 %1\n \ + " + : "=v"(c0), "=v"(c1) // Dest registers + : "v"(p_a_half2[0]), + "v"(p_a_half2[1]), // 1st Src registers for 2 half2 registers + "v"(p_b0_half2[0]), + "v"(p_b0_half2[1]), + "v"(p_b1_half2[0]), + "v"(p_b1_half2[1]), // 2nd Src registers for 2 half2 registers + "0"(c0), + "1"(c1)); // 3rd Src Acc registers for 2 half2 registers +} + +// outer-product: c[i,j] += inner_product(a[i], b[j]) +__device__ void __outer_product_1x4(half4_t a, + half4_t b0, + half4_t b1, + half4_t b2, + half4_t b3, + float& c0, + float& c1, + float& c2, + float& c3) +{ + const half2_t* p_a_half2 = reinterpret_cast(&a); + const half2_t* p_b0_half2 = reinterpret_cast(&b0); + const half2_t* p_b1_half2 = reinterpret_cast(&b1); + const half2_t* p_b2_half2 = reinterpret_cast(&b2); + const half2_t* p_b3_half2 = reinterpret_cast(&b3); + + // do dot2 two times + asm volatile("\n \ + v_dot2_f32_f16 %0, %4, %6 %0\n \ + v_dot2_f32_f16 %1, %4, %8 %1\n \ + 
v_dot2_f32_f16 %2, %4, %10 %2\n \ + v_dot2_f32_f16 %3, %4, %12 %3\n \ + v_dot2_f32_f16 %0, %5, %7 %0\n \ + v_dot2_f32_f16 %1, %5, %9 %1\n \ + v_dot2_f32_f16 %2, %5, %11 %2\n \ + v_dot2_f32_f16 %3, %5, %13 %3\n \ + " + : "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3) // Dest registers + : "v"(p_a_half2[0]), + "v"(p_a_half2[1]), // 1st Src registers for 2 half2 registers + "v"(p_b0_half2[0]), + "v"(p_b0_half2[1]), + "v"(p_b1_half2[0]), + "v"(p_b1_half2[1]), // 2nd Src registers for 2 half2 registers + "v"(p_b2_half2[0]), + "v"(p_b2_half2[1]), + "v"(p_b3_half2[0]), + "v"(p_b3_half2[1]), // 2nd Src registers for 2 half2 registers + "0"(c0), + "1"(c1), + "2"(c2), + "3"(c3)); // 3rd Src Acc registers for 2 half2 registers +} + } // namespace ck #endif diff --git a/composable_kernel/include/utility/config_amd.hpp.in b/composable_kernel/include/utility/config_amd.hpp.in index 799d5f8a9..fe82ba992 100644 --- a/composable_kernel/include/utility/config_amd.hpp.in +++ b/composable_kernel/include/utility/config_amd.hpp.in @@ -12,12 +12,31 @@ #define CK_DEVICE_BACKEND_AMD 1 // AMD inline asm +#ifndef CK_USE_AMD_INLINE_ASM #define CK_USE_AMD_INLINE_ASM 1 +#endif + +#ifndef CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM #define CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM 1 +#endif + +// AMD XDLOPS +#ifndef CK_USE_AMD_XDLOPS +#define CK_USE_AMD_XDLOPS 1 +#endif -// AMD intrinsic +#ifndef CK_USE_AMD_XDLOPS_INLINE_ASM +#define CK_USE_AMD_XDLOPS_INLINE_ASM 1 +#endif + +// AMD llvm intrinsic +#ifndef CK_USE_AMD_INTRINSIC #define CK_USE_AMD_INTRINSIC 1 -#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1 +#endif + +#ifndef CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC +#define CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC 1 +#endif // experimental implementation #define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 1 @@ -26,9 +45,12 @@ #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0 +// workaround +#define CK_WORKAROUND_SWDEV_202749 1 + namespace ck { -enum AddressSpace_t +enum AddressSpace { generic, global @@ -40,41 +62,28 @@ using index_t = uint32_t; using index_t = int32_t; #endif -// For some reason, HIP compiler need this definition to generate optimal load and store -// instruction +// For some reason, HIP compiler need this definition to generate optimal ISA // float typedef float float2_t __attribute__((ext_vector_type(2))); typedef float float4_t __attribute__((ext_vector_type(4))); typedef float float32_t __attribute__((ext_vector_type(32))); -typedef int32_t int32x4_t __attribute__((ext_vector_type(4))); +// float16 +typedef _Float16 half2_t __attribute__((ext_vector_type(2))); +typedef _Float16 half4_t __attribute__((ext_vector_type(4))); -// half -typedef half2 half2_t; +// bfloat16 +typedef ushort ushort2_t __attribute__((ext_vector_type(2))); +typedef ushort ushort4_t __attribute__((ext_vector_type(4))); -typedef struct -{ - // TODO: why not use "half scalar[4]"? - half2_t scalar[2]; -} half4_t; - -// bfloat16: use ushort -typedef struct -{ - ushort scalar[2]; -} ushort2_t; - -typedef struct -{ - // TODO: why not use "ushort scalar[4]"? 
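The inline asm above realizes each 4-wide fp16 dot product as two v_dot2_f32_f16 instructions per accumulator. A plain reference sketch of the math __outer_product_1x4 computes; float stands in for half so the sketch runs on the host:

#include <cstdio>

static void outer_product_1x4_ref(const float (&a)[4],
                                  const float (&b0)[4],
                                  const float (&b1)[4],
                                  const float (&b2)[4],
                                  const float (&b3)[4],
                                  float& c0,
                                  float& c1,
                                  float& c2,
                                  float& c3)
{
    const float* bs[4] = {b0, b1, b2, b3};
    float* cs[4]       = {&c0, &c1, &c2, &c3};

    for(int j = 0; j < 4; ++j)
        for(int k = 0; k < 4; ++k)
            *cs[j] += a[k] * bs[j][k]; // c[j] += inner_product(a, b[j])
}

int main()
{
    float a[4]  = {1, 2, 3, 4};
    float b0[4] = {1, 0, 0, 0}, b1[4] = {0, 1, 0, 0}, b2[4] = {0, 0, 1, 0}, b3[4] = {0, 0, 0, 1};
    float c0 = 0, c1 = 0, c2 = 0, c3 = 0;

    outer_product_1x4_ref(a, b0, b1, b2, b3, c0, c1, c2, c3);
    std::printf("%g %g %g %g\n", c0, c1, c2, c3); // 1 2 3 4
    return 0;
}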
- ushort2_t scalar[2]; -} ushort4_t; +// int +typedef int32_t int32x4_t __attribute__((ext_vector_type(4))); // data type conversion -template +template struct type_convert { - template + template __device__ T operator()(X x) const { return static_cast(x); @@ -96,5 +105,4 @@ __device__ ushort type_convert::operator()(float x) const } } // namespace ck - #endif diff --git a/composable_kernel/include/utility/config_nvidia.hpp.in b/composable_kernel/include/utility/config_nvidia.hpp.in index 67cd93136..2eea4a867 100644 --- a/composable_kernel/include/utility/config_nvidia.hpp.in +++ b/composable_kernel/include/utility/config_nvidia.hpp.in @@ -10,7 +10,7 @@ #define CK_DEVICE_BACKEND_NVIDIA 1 #define CK_USE_AMD_INTRINSIC 0 #define CK_USE_AMD_INLINE_ASM 0 -#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 0 +#define CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0 @@ -18,7 +18,7 @@ namespace ck { -enum AddressSpace_t +enum AddressSpace { generic, global = generic From db268ae0c88982dff4ac6bebb594f3c6463e3824 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 3 Oct 2019 19:07:17 -0500 Subject: [PATCH 05/20] mark deprecated code --- ...e_convolution_direct_v2_nchw_kcyx_nkhw.hpp | 2 +- ...tion_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp | 2 +- ...tion_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp | 2 +- ...tion_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp | 92 ++++++++++--------- ..._v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp | 44 ++++----- ...plicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp | 2 +- ...tion_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp | 2 +- ..._v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp | 2 +- ...lution_implicit_gemm_v2_chwn_cyxk_khwn.hpp | 2 +- ...mm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp | 2 +- ...lution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp | 33 ++++--- ...mm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp | 33 ++++--- ..._v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 61 ++++++------ ..._v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp | 30 +++--- ..._v4r3_nchw_kcyx_nkhw_lds_double_buffer.hpp | 30 +++--- ..._v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp | 83 +++++++++-------- .../ConstantMatrixDescriptor.hpp | 7 +- ...tantMergedTensorDescriptor_deprecated.hpp} | 18 ++-- ...> ConstantTensorDescriptor_deprecated.hpp} | 57 ++++++------ .../tensor_coordinate_deprecated.hpp | 18 ++-- ...e_generic_tensor_slice_copy_deprecated.hpp | 89 +++++++++--------- .../threadwise_direct_convolution.hpp | 2 +- .../threadwise_generic_tensor_op.hpp | 4 +- ...e_generic_tensor_slice_copy_deprecated.hpp | 26 +++--- .../include/utility/amd_intrinsic.hpp | 50 +++++----- driver/include/conv_common.hpp | 2 +- driver/include/host_conv.hpp | 2 +- driver/src/driver.cpp | 2 +- 28 files changed, 351 insertions(+), 348 deletions(-) rename composable_kernel/include/tensor_description/{ConstantMergedTensorDescriptor.hpp => ConstantMergedTensorDescriptor_deprecated.hpp} (92%) rename composable_kernel/include/tensor_description/{ConstantTensorDescriptor.hpp => ConstantTensorDescriptor_deprecated.hpp} (90%) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp index 495835384..aae74b613 100644 --- 
a/composable_kernel/include/kernel_algorithm/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp @@ -2,7 +2,7 @@ #define CK_GRIDWISE_CONVOLUTION_DIRECT_V2_NCHW_KCYX_NKHW #include "common_header.hpp" -#include "ConstantTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" #include "blockwise_2d_tensor_op.hpp" #include "blockwise_4d_tensor_op.hpp" #include "threadwise_tensor_slice_copy.hpp" diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp index ce6965ec6..d33a4adf9 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp @@ -2,7 +2,7 @@ #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R1_CHWN_CYXK_KHWN #include "common_header.hpp" -#include "ConstantTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" #include "ConstantMatrixDescriptor.hpp" #include "blockwise_4d_tensor_op.hpp" #include "blockwise_2d_tensor_op.hpp" diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp index 23c1be527..6975b1e24 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp @@ -2,7 +2,7 @@ #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R2_CHWN_CYXK_KHWN #include "common_header.hpp" -#include "ConstantTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" #include "ConstantMatrixDescriptor.hpp" #include "blockwise_2d_tensor_op.hpp" #include "blockwise_3d_tensor_op.hpp" diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp index dd3cd21c6..def4ae086 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp @@ -2,7 +2,7 @@ #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_CHWN_CYXK_KHWN_HPP #include "common_header.hpp" -#include "ConstantTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" #include "ConstantMatrixDescriptor.hpp" #include "blockwise_generic_tensor_slice_copy.hpp" #include "threadwise_generic_tensor_slice_copy.hpp" @@ -125,38 +125,38 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn // blockwise copy // input: format is [C, Hi, Wi, N] - auto blockwise_in_copy = - BlockwiseGenericTensorSliceCopy_v1, - Sequence<0, 1, 2, 3>, - Sequence<0, 1, 2, 3>, - 3, - 3, - InBlockCopyDataPerAccess_N, - InBlockCopyDataPerAccess_N>({0, 0, 0, 0}, - {0, 0, 0, 0}); + auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1_deprecated< + BlockSize, + decltype(in_c_h_w_n_global_desc), + decltype(in_c_h_w_n_block_desc), + decltype(in_c_h_w_n_block_desc.GetLengths()), + InBlockCopySubLengths_CHWN, + InBlockCopyClusterLengths_CHWN, + Sequence<0, 1, 2, 3>, + 
Sequence<0, 1, 2, 3>, + Sequence<0, 1, 2, 3>, + 3, + 3, + InBlockCopyDataPerAccess_N, + InBlockCopyDataPerAccess_N>({0, 0, 0, 0}, {0, 0, 0, 0}); // blockwise wei copy // format is [CPerBlock, X * KPerBlock] const auto blockwise_wei_copy = - BlockwiseGenericTensorSliceCopy_v1, - Sequence<0, 1>, - Sequence<0, 1>, - 1, - 1, - WeiBlockCopyDataPerAccess_K, - WeiBlockCopyDataPerAccess_K>({0, 0}, {0, 0}); + BlockwiseGenericTensorSliceCopy_v1_deprecated, + Sequence<0, 1>, + Sequence<0, 1>, + 1, + 1, + WeiBlockCopyDataPerAccess_K, + WeiBlockCopyDataPerAccess_K>({0, 0}, + {0, 0}); // a series of blockwise batched GEMM // C_matrix += transpose(A_matrix) * B_matrix @@ -318,14 +318,15 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn n_block_data_begin + n_thread_data_begin); #if 1 - ThreadwiseGenericTensorSliceCopy_v1r2::type, - 9, - OutThreadCopyDataPerAccess_N, - OutThreadCopyDataPerAccess_N>( - make_zero_array(), make_zero_array()) + ThreadwiseGenericTensorSliceCopy_v1r2_deprecated< + decltype(out_10d_thread_desc), + decltype(out_10d_global_desc), + decltype(out_10d_thread_desc.GetLengths()), + arithmetic_sequence_gen<0, 10, 1>::type, + 9, + OutThreadCopyDataPerAccess_N, + OutThreadCopyDataPerAccess_N>(make_zero_array(), + make_zero_array()) .Run(p_out_thread, p_out_thread_on_global); #elif 0 ThreadwiseGenericTensorSliceCopy_v1r1::type, - 9, - OutThreadCopyDataPerAccess_N, - OutThreadCopyDataPerAccess_N>( - make_zero_array(), make_zero_array()) + ThreadwiseGenericTensorSliceCopy_v1r2_deprecated< + decltype(out_10d_thread_desc), + decltype(out_10d_global_desc), + decltype(out_10d_thread_desc.GetLengths()), + arithmetic_sequence_gen<0, 10, 1>::type, + 9, + OutThreadCopyDataPerAccess_N, + OutThreadCopyDataPerAccess_N>(make_zero_array(), + make_zero_array()) .Run(p_out_thread, p_out_thread_on_global); #elif 0 ThreadwiseGenericTensorSliceCopy_v1r1::type, - 9, - OutThreadCopyDataPerAccess_N, - OutThreadCopyDataPerAccess_N>( - make_zero_array(), make_zero_array()) + ThreadwiseGenericTensorSliceCopy_v1r2_deprecated< + decltype(out_10d_thread_desc), + decltype(out_10d_global_desc), + decltype(out_10d_thread_desc.GetLengths()), + arithmetic_sequence_gen<0, 10, 1>::type, + 9, + OutThreadCopyDataPerAccess_N, + OutThreadCopyDataPerAccess_N>(make_zero_array(), + make_zero_array()) .Run(p_out_thread, p_out_thread_on_global); #elif 0 ThreadwiseGenericTensorSliceCopy_v1r1::type, - 9, - OutThreadCopyDataPerAccess_N, - OutThreadCopyDataPerAccess_N>( - make_zero_array(), make_zero_array()) + ThreadwiseGenericTensorSliceCopy_v1r2_deprecated< + decltype(out_10d_thread_desc), + decltype(out_10d_global_desc), + decltype(out_10d_thread_desc.GetLengths()), + arithmetic_sequence_gen<0, 10, 1>::type, + 9, + OutThreadCopyDataPerAccess_N, + OutThreadCopyDataPerAccess_N>(make_zero_array(), + make_zero_array()) .Run(p_out_thread, p_out_thread_on_global); #elif 0 ThreadwiseGenericTensorSliceCopy_v1r1, // thread_arrange_order [C, K] - Sequence<0, 1>, // src_access_order [C, K] - Sequence<0, 1>, // dst_access_order [C, K] - WeiBlockCopyDataPerAccess_K, - WeiBlockCopyDataPerAccess_K>( - {0, k_block_data_on_global}, {0, 0}); + auto blockwise_wei_copy = BlockwiseGenericTensorSliceCopy_v1_deprecated< + BlockSize, + Float, + decltype(wei_c_k_global_desc), + decltype(wei_c_k_block_desc), + decltype(wei_c_k_block_desc.GetLengths()), + WeiBlockCopySubLengths_C_K, + WeiBlockCopyClusterLengths_C_K, + Sequence<0, 1>, // thread_arrange_order [C, K] + Sequence<0, 1>, // src_access_order [C, K] + Sequence<0, 1>, // dst_access_order 
[C, K] + WeiBlockCopyDataPerAccess_K, + WeiBlockCopyDataPerAccess_K>({0, k_block_data_on_global}, {0, 0}); // GEMM definition // c_mtx += transpose(a_mtx) * b_mtx diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp index 25d73df49..2a08be324 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp @@ -2,8 +2,8 @@ #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V3_NCHW_CYXK_NKHW_LDS_DOUBLE_BUFFER #include "common_header.hpp" -#include "ConstantTensorDescriptor.hpp" -#include "ConstantMergedTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" +#include "ConstantMergedTensorDescriptor_deprecated.hpp" #include "ConstantMatrixDescriptor.hpp" #include "blockwise_generic_tensor_slice_copy.hpp" #include "blockwise_gemm.hpp" @@ -125,7 +125,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw_lds_double_buffer // input blockwise copy // slice a merged tensor, reorder and copy to a normal tensor // this copy operator already has blockwise offset built-in - const auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1< + const auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1_deprecated< BlockSize, Float, decltype(in_c_n1_b_n2_global_merged_desc), @@ -152,20 +152,19 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw_lds_double_buffer // operator for blockwise copy of weight into LDS // slice a tensor, and copy it into another tensor // this copy operator already have blockwise offset built-in - const auto blockwise_wei_copy = - BlockwiseGenericTensorSliceCopy_v1, // thread_arrange_order [C, K] - Sequence<0, 1>, // src_access_order [C, K] - Sequence<0, 1>, // dst_access_order [C, K] - WeiBlockCopyDataPerAccess_K, - WeiBlockCopyDataPerAccess_K>( - {0, k_block_data_on_global}, {0, 0}); + const auto blockwise_wei_copy = BlockwiseGenericTensorSliceCopy_v1_deprecated< + BlockSize, + Float, + decltype(wei_c_k_global_desc), + decltype(wei_c_k_block_desc), + decltype(wei_c_k_block_desc.GetLengths()), + WeiBlockCopySubLengths_C_K, + WeiBlockCopyClusterLengths_C_K, + Sequence<0, 1>, // thread_arrange_order [C, K] + Sequence<0, 1>, // src_access_order [C, K] + Sequence<0, 1>, // dst_access_order [C, K] + WeiBlockCopyDataPerAccess_K, + WeiBlockCopyDataPerAccess_K>({0, k_block_data_on_global}, {0, 0}); // GEMM definition // c_mtx += transpose(a_mtx) * b_mtx diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp index 53366f79d..1b6c87717 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -2,8 +2,8 @@ #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R1_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_HPP #include "common_header.hpp" -#include "ConstantTensorDescriptor.hpp" -#include "ConstantMergedTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" +#include 
"ConstantMergedTensorDescriptor_deprecated.hpp" #include "ConstantMatrixDescriptor.hpp" #include "blockwise_generic_tensor_slice_copy_deprecated.hpp" #include "blockwise_gemm.hpp" @@ -157,21 +157,20 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer // input blockwise copy // slice a merged tensor, reorder and copy to a normal tensor // this copy operator already has blockwise offset built-in - auto blockwise_in_copy = - BlockwiseGenericTensorSliceCopy_v2( - {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0}); + auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v2_deprecated< + BlockSize, + decltype(in_e_n1_b_n2_global_merged_desc), + decltype(in_e_n1_b_n2_block_desc), + decltype(in_e_n1_b_n2_block_desc.GetLengths()), + InBlockCopySubLengths_E_N1_B_N2, + InBlockCopyClusterLengths_E_N1_B_N2, + InBlockCopyThreadClusterArrangeOrder, + InBlockCopySrcAccessOrder, + InBlockCopyDstAccessOrder, + 2, + 3, + InBlockCopySrcDataPerRead_B, + InBlockCopyDstDataPerWrite_N2>({0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0}); // weight tensor // tensor descriptor in device memory, src of blockwise copy @@ -188,19 +187,19 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer // slice a tensor, and copy it into another tensor // this copy operator already have blockwise offset built-in auto blockwise_wei_copy = - BlockwiseGenericTensorSliceCopy_v2( + BlockwiseGenericTensorSliceCopy_v2_deprecated( {0, k_block_data_on_global}, {0, 0}); // GEMM definition @@ -381,7 +380,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer const index_t b_thread_data_on_global = b_block_data_on_global + c_thread_mtx_on_block.col / N2; - ThreadwiseGenericTensorSliceCopy_v2r1< + ThreadwiseGenericTensorSliceCopy_v2r1_deprecated< decltype(out_k0_k1_n1_b_n2_thread_mem_desc), decltype(out_k0_k1_n1_b_n2_global_merged_desc), decltype(out_k0_k1_n1_b_n2_thread_mem_desc.GetLengths()), diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp index bedaa0cad..3fe68ca3a 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -2,8 +2,8 @@ #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R2_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER #include "common_header.hpp" -#include "ConstantTensorDescriptor.hpp" -#include "ConstantMergedTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" +#include "ConstantMergedTensorDescriptor_deprecated.hpp" #include "ConstantMatrixDescriptor.hpp" #include "blockwise_generic_tensor_slice_copy.hpp" #include "blockwise_gemm.hpp" @@ -166,7 +166,7 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer // input blockwise copy // slice a merged tensor, reorder and copy to a normal tensor // this copy operator already has blockwise offset built-in - auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1< + auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1_deprecated< BlockSize, Float, decltype(in_e_n0_ho0_wo0_b_n2_ho2_wo2_global_merged_desc), @@ -196,18 +196,18 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer // slice a tensor, and copy it into another tensor // 
this copy operator already have blockwise offset built-in auto blockwise_wei_copy = - BlockwiseGenericTensorSliceCopy_v1( + BlockwiseGenericTensorSliceCopy_v1_deprecated( {0, k_block_data_on_global}, {0, 0}); // GEMM definition diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer.hpp index c894f69bd..bc50bf19c 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -2,8 +2,8 @@ #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R3_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER #include "common_header.hpp" -#include "ConstantTensorDescriptor.hpp" -#include "ConstantMergedTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" +#include "ConstantMergedTensorDescriptor_deprecated.hpp" #include "ConstantMatrixDescriptor.hpp" #include "blockwise_generic_tensor_slice_copy.hpp" #include "blockwise_gemm.hpp" @@ -165,7 +165,7 @@ struct GridwiseConvolutionImplicitGemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer // input blockwise copy // slice a merged tensor, reorder and copy to a normal tensor // this copy operator already has blockwise offset built-in - auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1< + auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1_deprecated< BlockSize, Float, decltype(in_e_n1_ho1_wo1_b_n2_ho2_wo2_global_merged_desc), @@ -195,18 +195,18 @@ struct GridwiseConvolutionImplicitGemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer // slice a tensor, and copy it into another tensor // this copy operator already have blockwise offset built-in auto blockwise_wei_copy = - BlockwiseGenericTensorSliceCopy_v1( + BlockwiseGenericTensorSliceCopy_v1_deprecated( {0, k_block_data_on_global}, {0, 0}); #if 0 diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp index 39a28e391..e741a83c4 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -2,8 +2,8 @@ #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R4_NCHW_KCYX_NKHW_HPP_LDS_DOUBLE_BUFFER_HPP #include "common_header.hpp" -#include "ConstantTensorDescriptor.hpp" -#include "ConstantMergedTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" +#include "ConstantMergedTensorDescriptor_deprecated.hpp" #include "ConstantMatrixDescriptor.hpp" #include "blockwise_generic_tensor_slice_copy_deprecated.hpp" #include "blockwise_gemm.hpp" @@ -133,19 +133,19 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer // slice a merged tensor, reorder and copy to a normal tensor // this copy operator already has blockwise offset built-in auto blockwise_in_copy = - BlockwiseGenericTensorSliceCopy_v2( + BlockwiseGenericTensorSliceCopy_v2_deprecated( {0, b_block_data_on_global}, {0, 0}); // weight tensor @@ -169,19 +169,19 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer // slice a 
tensor, and copy it into another tensor // this copy operator already have blockwise offset built-in auto blockwise_wei_copy = - BlockwiseGenericTensorSliceCopy_v2( + BlockwiseGenericTensorSliceCopy_v2_deprecated( {0, k_block_data_on_global}, {0, 0}); // GEMM definition @@ -373,20 +373,19 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer using OutThreadCopySliceLengths = Sequence; - auto threadwise_out_copy = - ThreadwiseGenericTensorSliceCopy_v2r1::type, - arithmetic_sequence_gen<0, 3, 1>::type, - 2, - 2, - OutThreadCopyDataPerAccess_B, - OutThreadCopyDataPerAccess_B>( - {0, 0, 0}, - {k_thread_data_on_global / K1, - k_thread_data_on_global % K1, - b_thread_data_on_global}); + auto threadwise_out_copy = ThreadwiseGenericTensorSliceCopy_v2r1_deprecated< + decltype(out_k0_k1_b_thread_desc), + decltype(out_k0_k1_b_global_desc), + OutThreadCopySliceLengths, + arithmetic_sequence_gen<0, 3, 1>::type, + arithmetic_sequence_gen<0, 3, 1>::type, + 2, + 2, + OutThreadCopyDataPerAccess_B, + OutThreadCopyDataPerAccess_B>({0, 0, 0}, + {k_thread_data_on_global / K1, + k_thread_data_on_global % K1, + b_thread_data_on_global}); for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat) { diff --git a/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp b/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp index ada40e8ba..0ebd9dc4a 100644 --- a/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp +++ b/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp @@ -2,7 +2,7 @@ #define CK_CONSTANT_MATRIX_DESCRIPTOR_HPP #include "common_header.hpp" -#include "ConstantTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" #include "tensor_descriptor.hpp" namespace ck { @@ -59,9 +59,10 @@ __host__ __device__ constexpr auto } template -__host__ __device__ constexpr auto make_ConstantMatrixDescriptor(ConstantTensorDescriptor) +__host__ __device__ constexpr auto + make_ConstantMatrixDescriptor(ConstantTensorDescriptor_deprecated) { - using TDesc = ConstantTensorDescriptor; + using TDesc = ConstantTensorDescriptor_deprecated; static_assert(TDesc::GetNumOfDimension() == 2, "wrong"); static_assert(TDesc::GetStrides()[1] == 1, "wrong"); return ConstantMatrixDescriptor +// OriginalTensorDesc : ConstantTensorDescriptor_deprecated<...> // it's the tensor whose dimensions are to be merged // OriginalDimMergeSeqs : Sequence<...>... // each is a sequence of original dimensions (of OriginalTensorDesc) to be merged template -struct ConstantMergedTensorDescriptor +struct ConstantMergedTensorDescriptor_deprecated { - using Type = ConstantMergedTensorDescriptor; + using Type = ConstantMergedTensorDescriptor_deprecated; static constexpr auto mOriginalDimMergeSeqs = std::tuple{}; static constexpr index_t nDim = sizeof...(OriginalDimMergeSeqs); static constexpr index_t nOriginalDim = OriginalTensorDesc::GetNumOfDimension(); - __host__ __device__ constexpr ConstantMergedTensorDescriptor() + __host__ __device__ constexpr ConstantMergedTensorDescriptor_deprecated() { static_assert(nDim <= nOriginalDim, "wrong!"); @@ -189,7 +189,7 @@ struct ConstantMergedTensorDescriptor { constexpr auto lengths = GetLengths(); constexpr auto strides = calculate_tensor_strides_packed(lengths); - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor_deprecated{}; } }; @@ -197,7 +197,7 @@ template __host__ __device__ constexpr auto make_ConstantMergedTensorDescriptor(OriginalTensorDesc, OriginalDimMergeSeqs...) 
{ - return ConstantMergedTensorDescriptor{}; + return ConstantMergedTensorDescriptor_deprecated{}; } template diff --git a/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp b/composable_kernel/include/tensor_description/ConstantTensorDescriptor_deprecated.hpp similarity index 90% rename from composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp rename to composable_kernel/include/tensor_description/ConstantTensorDescriptor_deprecated.hpp index 6dfbe5f79..d14696414 100644 --- a/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp +++ b/composable_kernel/include/tensor_description/ConstantTensorDescriptor_deprecated.hpp @@ -1,5 +1,5 @@ -#ifndef CK_CONSTANT_TENSOR_DESCRIPTOR_HPP -#define CK_CONSTANT_TENSOR_DESCRIPTOR_HPP +#ifndef CK_CONSTANT_TENSOR_DESCRIPTOR_DEPRECATED_HPP +#define CK_CONSTANT_TENSOR_DESCRIPTOR_DEPRECATED_HPP #include "common_header.hpp" @@ -24,13 +24,13 @@ __host__ __device__ constexpr auto calculate_tensor_strides_aligned_old(Lengths, } template -struct ConstantTensorDescriptor +struct ConstantTensorDescriptor_deprecated { - using Type = ConstantTensorDescriptor; + using Type = ConstantTensorDescriptor_deprecated; static constexpr index_t nDim = Lengths::GetSize(); - __host__ __device__ constexpr ConstantTensorDescriptor() + __host__ __device__ constexpr ConstantTensorDescriptor_deprecated() { static_assert(Lengths::GetSize() == Strides::GetSize(), "nDim not consistent"); } @@ -284,7 +284,7 @@ struct ConstantTensorDescriptor using extract_lengths = decltype(Lengths::Extract(extract_dims...)); using extract_strides = decltype(Strides::Extract(extract_dims...)); - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor_deprecated{}; } template @@ -294,13 +294,13 @@ struct ConstantTensorDescriptor } template - __host__ __device__ static constexpr auto Embed(ConstantTensorDescriptor) + __host__ __device__ static constexpr auto Embed(ConstantTensorDescriptor_deprecated) { - using leaf_tensor = ConstantTensorDescriptor; + using leaf_tensor = ConstantTensorDescriptor_deprecated; - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor_deprecated< + decltype(GetLengths().PushBack(leaf_tensor::GetLengths())), + decltype(GetStrides().PushBack(leaf_tensor::GetStrides()))>{}; } template @@ -351,7 +351,7 @@ struct ConstantTensorDescriptor using vectorized_strides = decltype((Strides{} / Number{}).Modify(Number{}, Number<1>{})); - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor_deprecated{}; } template @@ -359,7 +359,7 @@ struct ConstantTensorDescriptor { using slice_lengths = decltype(Lengths::Modify(Number{}, Number{})); - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor_deprecated{}; } template @@ -367,7 +367,7 @@ struct ConstantTensorDescriptor { static_assert(slice_lengths.GetSize() == nDim, "wrong!"); - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor_deprecated{}; } template @@ -379,7 +379,7 @@ struct ConstantTensorDescriptor using new_lengths = decltype(Lengths::Modify(Number{}, Number{})); using new_strides = decltype(Strides::Modify(Number{}, Number{})); - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor_deprecated{}; } template @@ -418,7 +418,7 @@ struct ConstantTensorDescriptor constexpr auto new_strides = GetStrides().Extract(left).PushBack(fold_strides).PushBack(GetStrides().Extract(right)); - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor_deprecated{}; } template @@ -462,27 
+462,29 @@ struct ConstantTensorDescriptor .PushBack(Number{}) .PushBack(GetStrides().Extract(right)); - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor_deprecated{}; } __host__ __device__ static constexpr auto Pack() { using packed_strides = decltype(calculate_tensor_strides_packed_old(Lengths{})); - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor_deprecated{}; } template __host__ __device__ static constexpr auto ReorderGivenNew2Old(MapNew2Old) { - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor_deprecated< + decltype(Lengths::ReorderGivenNew2Old(MapNew2Old{})), + decltype(Strides::ReorderGivenNew2Old(MapNew2Old{}))>{}; } template __host__ __device__ static constexpr auto ReorderGivenOld2New(MapOld2New) { - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor_deprecated< + decltype(Lengths::ReorderGivenOld2New(MapOld2New{})), + decltype(Strides::ReorderGivenOld2New(MapOld2New{}))>{}; } }; @@ -490,26 +492,25 @@ template __host__ __device__ constexpr auto make_ConstantTensorDescriptor_packed(Lengths) { using Strides = decltype(calculate_tensor_strides_packed_old(Lengths{})); - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor_deprecated{}; } template __host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Strides) { - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor_deprecated{}; } template __host__ __device__ constexpr auto make_ConstantTensorDescriptor_aligned(Lengths, Number) { using Strides = decltype(calculate_tensor_strides_aligned_old(Lengths{}, Number{})); - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor_deprecated{}; } template -__host__ __device__ void -print_ConstantTensorDescriptor(const char* s, - ConstantTensorDescriptor, Sequence>) +__host__ __device__ void print_ConstantTensorDescriptor( + const char* s, ConstantTensorDescriptor_deprecated, Sequence>) { constexpr index_t ndim = sizeof...(Lengths); diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp index 46e551ddd..aaddc1251 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp @@ -2,12 +2,12 @@ #define CK_TENSOR_COORDINATE_DEPRECATED_HPP #include "common_header.hpp" -#include "ConstantTensorDescriptor.hpp" -#include "ConstantMergedTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" +#include "ConstantMergedTensorDescriptor_deprecated.hpp" namespace ck { -// TensorDesc is ConstantTensorDescriptor +// TensorDesc is ConstantTensorDescriptor_deprecated template struct NormalTensorCoordinate_deprecated { @@ -95,7 +95,7 @@ struct NormalTensorCoordinate_deprecated index_t mOffset; }; -// TensorDesc is ConstantMergedTensorDescriptor +// TensorDesc is ConstantMergedTensorDescriptor_deprecated template struct MergedTensorCoordinate { @@ -311,7 +311,7 @@ struct MergedTensorCoordinate // dimensions, and those merged dimensions, that would never be involved in index // arithmetic after construction of TensorCoordinate. 
// TODO: refactor TensorCoordinate, after introducing the concept of "dimensions" - // and simplify implementation of ConstantMergedTensorDescriptor, so we don't need to + // and simplify implementation of ConstantMergedTensorDescriptor_deprecated, so we don't need to // count on compiler to optimize away those register memory for us Array mOriginalIndex; Array mPartialOffsets; @@ -326,16 +326,16 @@ struct TensorCoordinate_deprecated private: template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantTensorDescriptor) + MakeDummyTensorCoordinate(ConstantTensorDescriptor_deprecated) { - return NormalTensorCoordinate_deprecated>(); + return NormalTensorCoordinate_deprecated>(); } template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor) + MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor_deprecated) { - return MergedTensorCoordinate>(); + return MergedTensorCoordinate>(); } public: diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp index ca3902039..c922384a9 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp @@ -2,8 +2,8 @@ #define CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP #include "common_header.hpp" -#include "ConstantTensorDescriptor.hpp" -#include "ConstantMergedTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" +#include "ConstantMergedTensorDescriptor_deprecated.hpp" #include "tensor_coordinate_deprecated.hpp" #include "threadwise_generic_tensor_slice_copy_deprecated.hpp" @@ -16,7 +16,7 @@ namespace ck { // that, on a merged dimension that constains multiple original dimensions, the length of // the last original dimension need to be evenly dividable by its sub-lengths. Also, the // repeat-length on the merged dimension need to be 1. These sanity checks are performed -// in constructor of BlockwiseGenericTensorSliceCopy_v1 +// in constructor of BlockwiseGenericTensorSliceCopy_v1_deprecated template -struct BlockwiseGenericTensorSliceCopy_v1 +struct BlockwiseGenericTensorSliceCopy_v1_deprecated { static constexpr index_t nDim = SrcDesc::GetNumOfDimension(); @@ -58,7 +58,8 @@ struct BlockwiseGenericTensorSliceCopy_v1 Array mThreadSrcOriginalMultiId; Array mThreadDstOriginalMultiId; - __device__ BlockwiseGenericTensorSliceCopy_v1(Array src_block_data_id_begin, + __device__ + BlockwiseGenericTensorSliceCopy_v1_deprecated(Array src_block_data_id_begin, Array dst_block_data_id_begin) { // check NDim consistency @@ -240,15 +241,15 @@ struct BlockwiseGenericTensorSliceCopy_v1 // that constains multiple original dimensions, the length of the last original // dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on // the merged dimension need to be 1. 
These sanity checks are performed in constructor - // of BlockwiseGenericTensorSliceCopy_v1 - ThreadwiseGenericTensorSliceCopy_v1r2(make_zero_array(), - make_zero_array()) + // of BlockwiseGenericTensorSliceCopy_v1_deprecated + ThreadwiseGenericTensorSliceCopy_v1r2_deprecated(make_zero_array(), + make_zero_array()) .Run(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset); }); } @@ -295,14 +296,14 @@ struct BlockwiseGenericTensorSliceCopy_v1 // that constains multiple original dimensions, the length of the last original // dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on // the merged dimension need to be 1. These sanity checks are performed in constructor - // of BlockwiseGenericTensorSliceCopy_v1 - ThreadwiseGenericTensorSliceCopy_v1r2( + // of BlockwiseGenericTensorSliceCopy_v1_deprecated + ThreadwiseGenericTensorSliceCopy_v1r2_deprecated( make_zero_array(), make_zero_array()) .Run(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset); }); @@ -428,14 +429,14 @@ template -struct BlockwiseGenericTensorSliceCopy_v2 +struct BlockwiseGenericTensorSliceCopy_v2_deprecated { static constexpr index_t nDim = SrcDesc::GetNumOfDimension(); using Index = MultiIndex; - __device__ constexpr BlockwiseGenericTensorSliceCopy_v2(const Index& src_block_slice_origin, - const Index& dst_block_slice_origin) + __device__ constexpr BlockwiseGenericTensorSliceCopy_v2_deprecated( + const Index& src_block_slice_origin, const Index& dst_block_slice_origin) { static_assert( nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() && @@ -529,25 +530,25 @@ struct BlockwiseGenericTensorSliceCopy_v2 private: using ThreadBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{})); - using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v2r1; - - using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v2r1; + using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v2r1_deprecated; + + using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v2r1_deprecated; ThreadwiseLoad mThreadwiseLoad; ThreadwiseStore mThreadwiseStore; diff --git a/composable_kernel/include/tensor_operation/threadwise_direct_convolution.hpp b/composable_kernel/include/tensor_operation/threadwise_direct_convolution.hpp index 3e84cbd8b..bae080b04 100644 --- a/composable_kernel/include/tensor_operation/threadwise_direct_convolution.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_direct_convolution.hpp @@ -2,7 +2,7 @@ #define CK_THREADWISE_DIRECT_CONVOLUTION_HPP #include "common_header.hpp" -#include "ConstantTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" #include "threadwise_tensor_slice_copy.hpp" namespace ck { diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_op.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_op.hpp index c0b4e8939..8b83b68c7 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_op.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_op.hpp @@ -2,8 +2,8 @@ #define CK_THREADWISE_GENERIC_TENSOR_OP_HPP #include "common_header.hpp" -#include "ConstantTensorDescriptor.hpp" -#include "ConstantMergedTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" +#include "ConstantMergedTensorDescriptor_deprecated.hpp" namespace ck { template diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp 
b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp index 9f6133f8d..0310addd3 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp @@ -2,8 +2,8 @@ #define CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP #include "common_header.hpp" -#include "ConstantTensorDescriptor.hpp" -#include "ConstantMergedTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" +#include "ConstantMergedTensorDescriptor_deprecated.hpp" #include "tensor_coordinate_deprecated.hpp" namespace ck { @@ -21,11 +21,11 @@ template -struct ThreadwiseGenericTensorSliceCopy_v1r2 +struct ThreadwiseGenericTensorSliceCopy_v1r2_deprecated { static constexpr index_t nDim = SliceLengths::GetSize(); - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2( + __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2_deprecated( Array src_slice_origin, Array dst_slice_origin) : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin) { @@ -64,9 +64,9 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2 }); } - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2() - : ThreadwiseGenericTensorSliceCopy_v1r2(make_zero_array(), - make_zero_array()) + __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2_deprecated() + : ThreadwiseGenericTensorSliceCopy_v1r2_deprecated(make_zero_array(), + make_zero_array()) { } @@ -204,7 +204,7 @@ template -struct ThreadwiseGenericTensorSliceCopy_v2r1 +struct ThreadwiseGenericTensorSliceCopy_v2r1_deprecated { static constexpr index_t nDim = SliceLengths::GetSize(); @@ -213,8 +213,8 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 using SrcCoordinate = typename TensorCoordinate_deprecated::type; using DstCoordinate = typename TensorCoordinate_deprecated::type; - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1(const Index& src_slice_origin, - const Index& dst_slice_origin) + __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1_deprecated( + const Index& src_slice_origin, const Index& dst_slice_origin) : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin) { static_assert(nDim == SrcDesc::GetNumOfDimension() && @@ -262,9 +262,9 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 }); } - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1() - : ThreadwiseGenericTensorSliceCopy_v2r1(make_zero_array(), - make_zero_array()) + __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1_deprecated() + : ThreadwiseGenericTensorSliceCopy_v2r1_deprecated(make_zero_array(), + make_zero_array()) { } diff --git a/composable_kernel/include/utility/amd_intrinsic.hpp b/composable_kernel/include/utility/amd_intrinsic.hpp index 193a55bc7..0927b2c62 100644 --- a/composable_kernel/include/utility/amd_intrinsic.hpp +++ b/composable_kernel/include/utility/amd_intrinsic.hpp @@ -11,19 +11,17 @@ __device__ float __llvm_amdgcn_buffer_load(int32x4_t rsrc, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load"); -__device__ vector_type::MemoryType -__llvm_amdgcn_buffer_loadx2(int32x4_t rsrc, - uint32_t vindex, - uint32_t offset, - bool glc, - bool slc) __asm("llvm.amdgcn.buffer.load.dwordx2"); - -__device__ vector_type::MemoryType -__llvm_amdgcn_buffer_loadx4(int32x4_t rsrc, - uint32_t vindex, - uint32_t offset, - bool glc, - bool slc) __asm("llvm.amdgcn.buffer.load.dwordx4"); +__device__ float2_t __llvm_amdgcn_buffer_loadx2(int32x4_t rsrc, + uint32_t vindex, + 
uint32_t offset, + bool glc, + bool slc) __asm("llvm.amdgcn.buffer.load.dwordx2"); + +__device__ float4_t __llvm_amdgcn_buffer_loadx4(int32x4_t rsrc, + uint32_t vindex, + uint32_t offset, + bool glc, + bool slc) __asm("llvm.amdgcn.buffer.load.dwordx4"); __device__ void __llvm_amdgcn_buffer_store(float vdata, int32x4_t rsrc, @@ -32,14 +30,14 @@ __device__ void __llvm_amdgcn_buffer_store(float vdata, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store"); -__device__ void __llvm_amdgcn_buffer_storex2(vector_type::MemoryType vdata, +__device__ void __llvm_amdgcn_buffer_storex2(float2_t vdata, int32x4_t rsrc, uint32_t vindex, uint32_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store.dwordx2"); -__device__ void __llvm_amdgcn_buffer_storex4(vector_type::MemoryType vdata, +__device__ void __llvm_amdgcn_buffer_storex4(float4_t vdata, int32x4_t rsrc, uint32_t vindex, uint32_t offset, @@ -106,11 +104,12 @@ __device__ float __buffer_load(const float* p_src_block, } template <> -__device__ vector_type::MemoryType __buffer_load( - const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset) +__device__ float2_t __buffer_load(const float* p_src_block, + uint32_t src_thread_data_offset, + uint32_t src_const_data_offset) { #if 0 - vector_type::MemoryType dst; + float2_t dst; uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float); @@ -132,7 +131,7 @@ __device__ vector_type::MemoryType __buffer_load( return dst; #else - vector_type::MemoryType dst; + float2_t dst; uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float); @@ -153,11 +152,12 @@ __device__ vector_type::MemoryType __buffer_load( } template <> -__device__ vector_type::MemoryType __buffer_load( - const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset) +__device__ float4_t __buffer_load(const float* p_src_block, + uint32_t src_thread_data_offset, + uint32_t src_const_data_offset) { #if 0 - vector_type::MemoryType dst; + float4_t dst; uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float); @@ -179,7 +179,7 @@ __device__ vector_type::MemoryType __buffer_load( return dst; #elif 1 - vector_type::MemoryType dst; + float4_t dst; uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float); @@ -243,7 +243,7 @@ __device__ void __buffer_store(const float& src, } template <> -__device__ void __buffer_store(const vector_type::MemoryType& src, +__device__ void __buffer_store(const float2_t& src, float* p_dst_block, uint32_t dst_thread_data_offset, uint32_t dst_const_data_offset) @@ -286,7 +286,7 @@ __device__ void __buffer_store(const vector_type::MemoryType } template <> -__device__ void __buffer_store(const vector_type::MemoryType& src, +__device__ void __buffer_store(const float4_t& src, float* p_dst_block, uint32_t dst_thread_data_offset, uint32_t dst_const_data_offset) diff --git a/driver/include/conv_common.hpp b/driver/include/conv_common.hpp index 636e22290..f37645df2 100644 --- a/driver/include/conv_common.hpp +++ b/driver/include/conv_common.hpp @@ -1,7 +1,7 @@ #ifndef CONV_COMMON_HPP #define CONV_COMMON_HPP -#include "ConstantTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" // this is ugly, only for 4d 
template diff --git a/driver/include/host_conv.hpp b/driver/include/host_conv.hpp index 9ca7fc10d..880fd5efe 100644 --- a/driver/include/host_conv.hpp +++ b/driver/include/host_conv.hpp @@ -1,7 +1,7 @@ #pragma once #include "tensor.hpp" #include "common_header.hpp" -#include "ConstantTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" // this is ugly, only for 4d template diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index 4319c4f7d..9d47b96f9 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -4,7 +4,7 @@ #include #include #include "config.hpp" -#include "ConstantTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" #include "device.hpp" #include "conv_common.hpp" #include "host_conv.hpp" From abe9c0bf7b860fa2eb82283a69e1d417cdd46495 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 3 Oct 2019 19:13:14 -0500 Subject: [PATCH 06/20] change data type of offset arguments of __buffer_load and __buffer_store from uint32_t to index_t --- .../include/utility/amd_intrinsic.hpp | 104 +++++++++--------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/composable_kernel/include/utility/amd_intrinsic.hpp b/composable_kernel/include/utility/amd_intrinsic.hpp index 0927b2c62..a5bbd8782 100644 --- a/composable_kernel/include/utility/amd_intrinsic.hpp +++ b/composable_kernel/include/utility/amd_intrinsic.hpp @@ -6,65 +6,65 @@ namespace ck { __device__ float __llvm_amdgcn_buffer_load(int32x4_t rsrc, - uint32_t vindex, - uint32_t offset, + index_t vindex, + index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load"); __device__ float2_t __llvm_amdgcn_buffer_loadx2(int32x4_t rsrc, - uint32_t vindex, - uint32_t offset, + index_t vindex, + index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.dwordx2"); __device__ float4_t __llvm_amdgcn_buffer_loadx4(int32x4_t rsrc, - uint32_t vindex, - uint32_t offset, + index_t vindex, + index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.dwordx4"); __device__ void __llvm_amdgcn_buffer_store(float vdata, int32x4_t rsrc, - uint32_t vindex, - uint32_t offset, + index_t vindex, + index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store"); __device__ void __llvm_amdgcn_buffer_storex2(float2_t vdata, int32x4_t rsrc, - uint32_t vindex, - uint32_t offset, + index_t vindex, + index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store.dwordx2"); __device__ void __llvm_amdgcn_buffer_storex4(float4_t vdata, int32x4_t rsrc, - uint32_t vindex, - uint32_t offset, + index_t vindex, + index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store.dwordx4"); // buffer_load and buffer_store template -__device__ typename vector_type::MemoryType __buffer_load( - const T* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset); +__device__ typename vector_type::MemoryType +__buffer_load(const T* p_src_block, index_t src_thread_data_offset, index_t src_const_data_offset); template __device__ void __buffer_store(const typename vector_type::MemoryType& src, T* p_dst_block, - uint32_t dst_thread_data_offset, - uint32_t dst_const_data_offset); + index_t dst_thread_data_offset, + index_t dst_const_data_offset); template <> __device__ float __buffer_load(const float* p_src_block, - uint32_t src_thread_data_offset, - uint32_t src_const_data_offset) + index_t src_thread_data_offset, + index_t src_const_data_offset) { #if 0 float dst; - uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); - uint32_t 
src_const_addr_offset = src_const_data_offset * sizeof(float); + index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); + index_t src_const_addr_offset = src_const_data_offset * sizeof(float); int32x4_t src_block_setting{0}; // fill in byte 0 - 1 @@ -85,8 +85,8 @@ __device__ float __buffer_load(const float* p_src_block, #else float dst; - uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); - uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float); + index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); + index_t src_const_addr_offset = src_const_data_offset * sizeof(float); int32x4_t src_block_setting{0}; // fill in byte 0 - 1 @@ -105,14 +105,14 @@ __device__ float __buffer_load(const float* p_src_block, template <> __device__ float2_t __buffer_load(const float* p_src_block, - uint32_t src_thread_data_offset, - uint32_t src_const_data_offset) + index_t src_thread_data_offset, + index_t src_const_data_offset) { #if 0 float2_t dst; - uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); - uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float); + index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); + index_t src_const_addr_offset = src_const_data_offset * sizeof(float); int32x4_t src_block_setting{0}; // fill in byte 0 - 1 @@ -133,8 +133,8 @@ __device__ float2_t __buffer_load(const float* p_src_block, #else float2_t dst; - uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); - uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float); + index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); + index_t src_const_addr_offset = src_const_data_offset * sizeof(float); int32x4_t src_block_setting{0}; // fill in byte 0 - 1 @@ -153,14 +153,14 @@ __device__ float2_t __buffer_load(const float* p_src_block, template <> __device__ float4_t __buffer_load(const float* p_src_block, - uint32_t src_thread_data_offset, - uint32_t src_const_data_offset) + index_t src_thread_data_offset, + index_t src_const_data_offset) { #if 0 float4_t dst; - uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); - uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float); + index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); + index_t src_const_addr_offset = src_const_data_offset * sizeof(float); int32x4_t src_block_setting{0}; // fill in byte 0 - 1 @@ -181,8 +181,8 @@ __device__ float4_t __buffer_load(const float* p_src_block, #elif 1 float4_t dst; - uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); - uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float); + index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); + index_t src_const_addr_offset = src_const_data_offset * sizeof(float); int32x4_t src_block_setting{0}; // fill in byte 0 - 1 @@ -202,12 +202,12 @@ __device__ float4_t __buffer_load(const float* p_src_block, template <> __device__ void __buffer_store(const float& src, float* p_dst_block, - uint32_t dst_thread_data_offset, - uint32_t dst_const_data_offset) + index_t dst_thread_data_offset, + index_t dst_const_data_offset) { #if 0 - uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); - uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); + index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); + index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); int32x4_t 
dst_block_setting{0}; // fill in byte 0 - 1 @@ -226,8 +226,8 @@ __device__ void __buffer_store(const float& src, "v"(dst_thread_addr_offset), "s"(dst_const_addr_offset)); #else - uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); - uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); + index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); + index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); int32x4_t dst_block_setting{0}; // fill in byte 0 - 1 @@ -245,12 +245,12 @@ __device__ void __buffer_store(const float& src, template <> __device__ void __buffer_store(const float2_t& src, float* p_dst_block, - uint32_t dst_thread_data_offset, - uint32_t dst_const_data_offset) + index_t dst_thread_data_offset, + index_t dst_const_data_offset) { #if 0 - uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); - uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); + index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); + index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); int32x4_t dst_block_setting{0}; // fill in byte 0 - 1 @@ -269,8 +269,8 @@ __device__ void __buffer_store(const float2_t& src, "v"(dst_thread_addr_offset), "s"(dst_const_addr_offset)); #else - uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); - uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); + index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); + index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); int32x4_t dst_block_setting{0}; // fill in byte 0 - 1 @@ -288,12 +288,12 @@ __device__ void __buffer_store(const float2_t& src, template <> __device__ void __buffer_store(const float4_t& src, float* p_dst_block, - uint32_t dst_thread_data_offset, - uint32_t dst_const_data_offset) + index_t dst_thread_data_offset, + index_t dst_const_data_offset) { #if 0 - uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); - uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); + index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); + index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); int32x4_t dst_block_setting{0}; // fill in byte 0 - 1 @@ -312,8 +312,8 @@ __device__ void __buffer_store(const float4_t& src, "v"(dst_thread_addr_offset), "s"(dst_const_addr_offset)); #else - uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); - uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); + index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); + index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); int32x4_t dst_block_setting{0}; // fill in byte 0 - 1 From f2a2c583744b0d183285b70c8edeec83f548df5b Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 3 Oct 2019 19:51:03 -0500 Subject: [PATCH 07/20] nvidia build --- CMakeLists.txt | 6 +- .../tensor_operation/threadwise_gemm.hpp | 2 +- .../include/utility/common_header.hpp | 1 + .../{config_amd.hpp.in => config.amd.hpp.in} | 63 +++++++++++++++++++ ...fig_nvidia.hpp.in => config.nvidia.hpp.in} | 32 ++++------ .../include/utility/float_type.nvidia.hpp.in | 51 +++++++++++++++ composable_kernel/include/utility/math.hpp | 62 ------------------ 7 files changed, 132 insertions(+), 85 deletions(-) rename composable_kernel/include/utility/{config_amd.hpp.in => config.amd.hpp.in} (59%) rename composable_kernel/include/utility/{config_nvidia.hpp.in 
=> config.nvidia.hpp.in} (65%) create mode 100644 composable_kernel/include/utility/float_type.nvidia.hpp.in diff --git a/CMakeLists.txt b/CMakeLists.txt index 9798220ca..21e5dc682 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,9 +52,11 @@ include_directories(BEFORE ) if(DEVICE_BACKEND STREQUAL "AMD") - configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/config_amd.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/config.hpp") + configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/config.amd.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/config.hpp") + configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/float_type.amd.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/float_type.hpp") elseif(DEVICE_BACKEND STREQUAL "NVIDIA") - configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/config_nvidia.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/config.hpp") + configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/config.nvidia.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/config.hpp") + configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/float_type.nvidia.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/float_type.hpp") endif() add_subdirectory(driver) diff --git a/composable_kernel/include/tensor_operation/threadwise_gemm.hpp b/composable_kernel/include/tensor_operation/threadwise_gemm.hpp index 0619aaf15..00d81410e 100644 --- a/composable_kernel/include/tensor_operation/threadwise_gemm.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_gemm.hpp @@ -85,7 +85,7 @@ struct ThreadwiseGemmTransANormalBNormalC const index_t cindex = MatrixC::CalculateOffset(m, n); p_c[cindex] += - math::inner_product_with_conversion{}(p_a[aindex], p_b[bindex]); + inner_product_with_conversion{}(p_a[aindex], p_b[bindex]); } } } diff --git a/composable_kernel/include/utility/common_header.hpp b/composable_kernel/include/utility/common_header.hpp index ad6b26735..441eecae9 100644 --- a/composable_kernel/include/utility/common_header.hpp +++ b/composable_kernel/include/utility/common_header.hpp @@ -2,6 +2,7 @@ #define CK_COMMON_HEADER_HPP #include "config.hpp" +#include "float_type.hpp" #include "utility.hpp" #include "integral_constant.hpp" #include "number.hpp" diff --git a/composable_kernel/include/utility/config_amd.hpp.in b/composable_kernel/include/utility/config.amd.hpp.in similarity index 59% rename from composable_kernel/include/utility/config_amd.hpp.in rename to composable_kernel/include/utility/config.amd.hpp.in index fe82ba992..b3349e425 100644 --- a/composable_kernel/include/utility/config_amd.hpp.in +++ b/composable_kernel/include/utility/config.amd.hpp.in @@ -104,5 +104,68 @@ __device__ ushort type_convert::operator()(float x) const return float_to_bfloat16(x); } +template +struct inner_product_with_conversion +{ + static constexpr auto convert = type_convert(); + + __device__ T operator()(float a, float b) const { return convert(a) * convert(b); } + + __device__ T operator()(half2_t a, half2_t b) const + { + const half* p_a_half = reinterpret_cast(&a); + const half* p_b_half = reinterpret_cast(&b); + + T acc = 0; + for(index_t v = 0; v < 2; ++v) + { + acc += convert(p_a_half[v]) * convert(p_b_half[v]); + } + + return acc; + } + +#if CK_DEVICE_BACKEND_AMD + __device__ T operator()(half4_t a, half4_t b) const + { + const half* p_a_half = reinterpret_cast(&a); + const half* 
p_b_half = reinterpret_cast(&b); + + T acc = 0; + for(index_t v = 0; v < 4; ++v) + { + acc += convert(p_a_half[v]) * convert(p_b_half[v]); + } + return acc; + } + + __device__ T operator()(ushort2_t a, ushort2_t b) const + { + const ushort* p_a_bfloat16 = reinterpret_cast(&a); + const ushort* p_b_bfloat16 = reinterpret_cast(&b); + + T acc = 0; + for(index_t v = 0; v < 2; ++v) + { + acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]); + } + + return acc; + } + + __device__ T operator()(ushort4_t a, ushort4_t b) const + { + const ushort* p_a_bfloat16 = reinterpret_cast(&a); + const ushort* p_b_bfloat16 = reinterpret_cast(&b); + + T acc = 0; + for(index_t v = 0; v < 4; ++v) + { + acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]); + } + return acc; + } +#endif +}; } // namespace ck #endif diff --git a/composable_kernel/include/utility/config_nvidia.hpp.in b/composable_kernel/include/utility/config.nvidia.hpp.in similarity index 65% rename from composable_kernel/include/utility/config_nvidia.hpp.in rename to composable_kernel/include/utility/config.nvidia.hpp.in index 2eea4a867..6e9198893 100644 --- a/composable_kernel/include/utility/config_nvidia.hpp.in +++ b/composable_kernel/include/utility/config.nvidia.hpp.in @@ -6,11 +6,22 @@ #include "nvToolsExt.h" #include "helper_cuda.h" +// index type: unsigned or signed #define CK_UNSIGNED_INDEX_TYPE 0 + +// device backend #define CK_DEVICE_BACKEND_NVIDIA 1 -#define CK_USE_AMD_INTRINSIC 0 + +// disable AMD inline asm and intrinsic #define CK_USE_AMD_INLINE_ASM 0 +#define CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM 0 +#define CK_USE_AMD_XDLOPS 0 +#define CK_USE_AMD_XDLOPS_INLINE_ASM 0 +#define CK_USE_AMD_INTRINSIC 0 #define CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC 0 + +// experimental implementation +#define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0 @@ -30,24 +41,5 @@ using index_t = uint32_t; using index_t = int32_t; #endif -// For some reason, CUDA need this definition, otherwise -// compiler won't generate optimal load and store instruction, and -// kernel would produce wrong result, indicating the compiler fail to generate correct -// instruction, -using float2_t = float2; -using float4_t = float4; - -// data type conversion -template -struct type_convert -{ - template - __device__ T operator()(const X& x) const - { - return static_cast(x); - } -}; - } // namespace ck - #endif diff --git a/composable_kernel/include/utility/float_type.nvidia.hpp.in b/composable_kernel/include/utility/float_type.nvidia.hpp.in new file mode 100644 index 000000000..fbb93a437 --- /dev/null +++ b/composable_kernel/include/utility/float_type.nvidia.hpp.in @@ -0,0 +1,51 @@ +#ifndef CK_FLOAT_TYPE_NVIDIA_HPP +#define CK_FLOAT_TYPE_NVIDIA_HPP + +namespace ck { + +// For some reason, CUDA need this definition, otherwise +// compiler won't generate optimal load and store instruction, and +// kernel would produce wrong result, indicating the compiler fail to generate correct +// instruction, +// float +using float2_t = float2; +using float4_t = float4; + +// float16 +using half2_t = half2; + +// data type conversion +template +struct type_convert +{ + template + __device__ T operator()(const X& x) const + { + return static_cast(x); + } +}; + +template +struct inner_product_with_conversion +{ + static 
constexpr auto convert = type_convert(); + + __device__ T operator()(float a, float b) const { return convert(a) * convert(b); } + + __device__ T operator()(half2_t a, half2_t b) const + { + const half* p_a_half = reinterpret_cast(&a); + const half* p_b_half = reinterpret_cast(&b); + + T acc = 0; + for(index_t v = 0; v < 2; ++v) + { + acc += convert(p_a_half[v]) * convert(p_b_half[v]); + } + + return acc; + } +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/math.hpp b/composable_kernel/include/utility/math.hpp index f6c41cc52..ba70e7ab2 100644 --- a/composable_kernel/include/utility/math.hpp +++ b/composable_kernel/include/utility/math.hpp @@ -117,68 +117,6 @@ struct less __host__ __device__ constexpr bool operator()(T x, T y) const { return x < y; } }; -template -struct inner_product_with_conversion -{ - static constexpr auto convert = type_convert(); - - __device__ T operator()(float a, float b) const { return convert(a) * convert(b); } - - __device__ T operator()(half2_t a, half2_t b) const - { - const half* p_a_half = reinterpret_cast(&a); - const half* p_b_half = reinterpret_cast(&b); - - T acc = 0; - for(index_t v = 0; v < 2; ++v) - { - acc += convert(p_a_half[v]) * convert(p_b_half[v]); - } - - return acc; - } - - __device__ T operator()(half4_t a, half4_t b) const - { - const half* p_a_half = reinterpret_cast(&a); - const half* p_b_half = reinterpret_cast(&b); - - T acc = 0; - for(index_t v = 0; v < 4; ++v) - { - acc += convert(p_a_half[v]) * convert(p_b_half[v]); - } - return acc; - } - - __device__ T operator()(ushort2_t a, ushort2_t b) const - { - const ushort* p_a_bfloat16 = reinterpret_cast(&a); - const ushort* p_b_bfloat16 = reinterpret_cast(&b); - - T acc = 0; - for(index_t v = 0; v < 2; ++v) - { - acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]); - } - - return acc; - } - - __device__ T operator()(ushort4_t a, ushort4_t b) const - { - const ushort* p_a_bfloat16 = reinterpret_cast(&a); - const ushort* p_b_bfloat16 = reinterpret_cast(&b); - - T acc = 0; - for(index_t v = 0; v < 4; ++v) - { - acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]); - } - return acc; - } -}; - } // namespace math } // namspace ck From 434e4f2596afb307cf6e506898eac919c128090b Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 3 Oct 2019 20:08:05 -0500 Subject: [PATCH 08/20] amd build, reorganized files --- CMakeLists.txt | 1 + .../include/utility/config.amd.hpp.in | 104 +---------------- .../include/utility/float_type.amd.hpp.in | 110 ++++++++++++++++++ .../include}/bfloat16_dev.hpp | 0 4 files changed, 112 insertions(+), 103 deletions(-) create mode 100644 composable_kernel/include/utility/float_type.amd.hpp.in rename {composable_kernel/include/utility => external/include}/bfloat16_dev.hpp (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 21e5dc682..20fc8028f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,6 +47,7 @@ include_directories(BEFORE ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation ${PROJECT_SOURCE_DIR}/composable_kernel/include/kernel_algorithm + ${PROJECT_SOURCE_DIR}/external/include ${PROJECT_SOURCE_DIR}/driver/include ${PROJECT_BINARY_DIR}/composable_kernel/include/utility ) diff --git a/composable_kernel/include/utility/config.amd.hpp.in b/composable_kernel/include/utility/config.amd.hpp.in index b3349e425..1da362b81 100644 --- a/composable_kernel/include/utility/config.amd.hpp.in +++ b/composable_kernel/include/utility/config.amd.hpp.in 
@@ -62,110 +62,8 @@ using index_t = uint32_t; using index_t = int32_t; #endif -// For some reason, HIP compiler need this definition to generate optimal ISA -// float -typedef float float2_t __attribute__((ext_vector_type(2))); -typedef float float4_t __attribute__((ext_vector_type(4))); -typedef float float32_t __attribute__((ext_vector_type(32))); - -// float16 -typedef _Float16 half2_t __attribute__((ext_vector_type(2))); -typedef _Float16 half4_t __attribute__((ext_vector_type(4))); - -// bfloat16 -typedef ushort ushort2_t __attribute__((ext_vector_type(2))); -typedef ushort ushort4_t __attribute__((ext_vector_type(4))); - -// int +// int32x4_t use by buffer_load and buffer_store llvm intrinsic typedef int32_t int32x4_t __attribute__((ext_vector_type(4))); -// data type conversion -template -struct type_convert -{ - template - __device__ T operator()(X x) const - { - return static_cast(x); - } -}; - -template <> -template <> -__device__ float type_convert::operator()(ushort x) const -{ - return bfloat16_to_float(x); -} - -template <> -template <> -__device__ ushort type_convert::operator()(float x) const -{ - return float_to_bfloat16(x); -} - -template -struct inner_product_with_conversion -{ - static constexpr auto convert = type_convert(); - - __device__ T operator()(float a, float b) const { return convert(a) * convert(b); } - - __device__ T operator()(half2_t a, half2_t b) const - { - const half* p_a_half = reinterpret_cast(&a); - const half* p_b_half = reinterpret_cast(&b); - - T acc = 0; - for(index_t v = 0; v < 2; ++v) - { - acc += convert(p_a_half[v]) * convert(p_b_half[v]); - } - - return acc; - } - -#if CK_DEVICE_BACKEND_AMD - __device__ T operator()(half4_t a, half4_t b) const - { - const half* p_a_half = reinterpret_cast(&a); - const half* p_b_half = reinterpret_cast(&b); - - T acc = 0; - for(index_t v = 0; v < 4; ++v) - { - acc += convert(p_a_half[v]) * convert(p_b_half[v]); - } - return acc; - } - - __device__ T operator()(ushort2_t a, ushort2_t b) const - { - const ushort* p_a_bfloat16 = reinterpret_cast(&a); - const ushort* p_b_bfloat16 = reinterpret_cast(&b); - - T acc = 0; - for(index_t v = 0; v < 2; ++v) - { - acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]); - } - - return acc; - } - - __device__ T operator()(ushort4_t a, ushort4_t b) const - { - const ushort* p_a_bfloat16 = reinterpret_cast(&a); - const ushort* p_b_bfloat16 = reinterpret_cast(&b); - - T acc = 0; - for(index_t v = 0; v < 4; ++v) - { - acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]); - } - return acc; - } -#endif -}; } // namespace ck #endif diff --git a/composable_kernel/include/utility/float_type.amd.hpp.in b/composable_kernel/include/utility/float_type.amd.hpp.in new file mode 100644 index 000000000..337d12fa3 --- /dev/null +++ b/composable_kernel/include/utility/float_type.amd.hpp.in @@ -0,0 +1,110 @@ +#ifndef CK_FLOAT_TYPE_AMD_HPP +#define CK_FLOAT_TYPE_AMD_HPP + +#include "bfloat16_dev.hpp" + +namespace ck { + +// For some reason, HIP compiler need this definition to generate optimal ISA +// float +typedef float float2_t __attribute__((ext_vector_type(2))); +typedef float float4_t __attribute__((ext_vector_type(4))); +typedef float float32_t __attribute__((ext_vector_type(32))); + +// float16 +typedef _Float16 half2_t __attribute__((ext_vector_type(2))); +typedef _Float16 half4_t __attribute__((ext_vector_type(4))); + +// bfloat16 +typedef ushort ushort2_t __attribute__((ext_vector_type(2))); +typedef ushort ushort4_t __attribute__((ext_vector_type(4))); + +// data type 
conversion +template +struct type_convert +{ + template + __device__ T operator()(X x) const + { + return static_cast(x); + } +}; + +template <> +template <> +__device__ float type_convert::operator()(ushort x) const +{ + return bfloat16_to_float(x); +} + +template <> +template <> +__device__ ushort type_convert::operator()(float x) const +{ + return float_to_bfloat16(x); +} + +template +struct inner_product_with_conversion +{ + static constexpr auto convert = type_convert(); + + __device__ T operator()(float a, float b) const { return convert(a) * convert(b); } + + __device__ T operator()(half2_t a, half2_t b) const + { + const half* p_a_half = reinterpret_cast(&a); + const half* p_b_half = reinterpret_cast(&b); + + T acc = 0; + for(index_t v = 0; v < 2; ++v) + { + acc += convert(p_a_half[v]) * convert(p_b_half[v]); + } + + return acc; + } + + __device__ T operator()(half4_t a, half4_t b) const + { + const half* p_a_half = reinterpret_cast(&a); + const half* p_b_half = reinterpret_cast(&b); + + T acc = 0; + for(index_t v = 0; v < 4; ++v) + { + acc += convert(p_a_half[v]) * convert(p_b_half[v]); + } + return acc; + } + + __device__ T operator()(ushort2_t a, ushort2_t b) const + { + const ushort* p_a_bfloat16 = reinterpret_cast(&a); + const ushort* p_b_bfloat16 = reinterpret_cast(&b); + + T acc = 0; + for(index_t v = 0; v < 2; ++v) + { + acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]); + } + + return acc; + } + + __device__ T operator()(ushort4_t a, ushort4_t b) const + { + const ushort* p_a_bfloat16 = reinterpret_cast(&a); + const ushort* p_b_bfloat16 = reinterpret_cast(&b); + + T acc = 0; + for(index_t v = 0; v < 4; ++v) + { + acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]); + } + return acc; + } +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/bfloat16_dev.hpp b/external/include/bfloat16_dev.hpp similarity index 100% rename from composable_kernel/include/utility/bfloat16_dev.hpp rename to external/include/bfloat16_dev.hpp From e080041bd317b80f3be5eb522dac52edb6454917 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 3 Oct 2019 23:47:44 -0500 Subject: [PATCH 09/20] mark deprecated code --- ..._v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 276 +++++++++------- ...cyx_nkhw_lds_double_buffer_deprecated.hpp} | 282 +++++++--------- ..._v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp | 300 ++++++++--------- ...cyx_nkhw_lds_double_buffer_deprecated.hpp} | 302 +++++++++--------- .../include/utility/float_type.amd.hpp.in | 2 - ...tion_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp | 109 ++++--- ...t_gemm_v4r1_nchw_kcyx_nkhw_deprecated.hpp} | 131 ++++---- ...tion_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp | 14 +- ...t_gemm_v4r4_nchw_kcyx_nkhw_deprecated.hpp} | 36 +-- driver/src/driver.cpp | 56 ++-- 10 files changed, 753 insertions(+), 755 deletions(-) rename composable_kernel/include/kernel_algorithm/{gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp => gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp} (55%) rename composable_kernel/include/kernel_algorithm/{gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp => gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp} (56%) rename driver/include/{device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp => device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated.hpp} (76%) rename driver/include/{device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp => 
device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_deprecated.hpp} (92%) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp index 1b6c87717..724a042c9 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -2,24 +2,26 @@ #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R1_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_HPP #include "common_header.hpp" -#include "ConstantTensorDescriptor_deprecated.hpp" -#include "ConstantMergedTensorDescriptor_deprecated.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" #include "ConstantMatrixDescriptor.hpp" -#include "blockwise_generic_tensor_slice_copy_deprecated.hpp" +#include "blockwise_generic_tensor_slice_copy.hpp" +#include "threadwise_generic_tensor_slice_copy.hpp" #include "blockwise_gemm.hpp" -#include "threadwise_generic_tensor_slice_copy_deprecated.hpp" namespace ck { // define B = merge(N0, Ho, Wo) template struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer @@ -67,20 +69,21 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - constexpr auto I5 = Number<5>{}; constexpr auto True = integral_constant{}; - constexpr auto in_n_c_h_w_global_desc = InGlobalDesc{}; - constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{}; - constexpr auto out_n_k_h_w_global_desc = OutGlobalDesc{}; + constexpr auto in_n_c_hi_wi_global_desc = InGlobalDesc{}; + constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{}; + constexpr auto out_n_k_ho_wo_global_desc = OutGlobalDesc{}; - constexpr index_t N = in_n_c_h_w_global_desc.GetLength(I0); - constexpr index_t C = in_n_c_h_w_global_desc.GetLength(I1); + constexpr index_t N = in_n_c_hi_wi_global_desc.GetLength(I0); + constexpr index_t C = in_n_c_hi_wi_global_desc.GetLength(I1); + constexpr index_t Hi = in_n_c_hi_wi_global_desc.GetLength(I2); + constexpr index_t Wi = in_n_c_hi_wi_global_desc.GetLength(I3); - constexpr index_t K = out_n_k_h_w_global_desc.GetLength(I1); - constexpr index_t Ho = out_n_k_h_w_global_desc.GetLength(I2); - constexpr index_t Wo = out_n_k_h_w_global_desc.GetLength(I3); + constexpr index_t K = out_n_k_ho_wo_global_desc.GetLength(I1); + constexpr index_t Ho = out_n_k_ho_wo_global_desc.GetLength(I2); + constexpr index_t Wo = out_n_k_ho_wo_global_desc.GetLength(I3); constexpr index_t Y = wei_k_c_y_x_global_desc.GetLength(I2); constexpr index_t X = wei_k_c_y_x_global_desc.GetLength(I3); @@ -113,39 +116,43 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer constexpr index_t BBlockWork = B / BPerBlock; constexpr auto block_work_desc = - make_ConstantTensorDescriptor_packed(Sequence{}); + make_cluster_descriptor(Sequence{}); - const auto block_work_multi_id = - block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id()); + const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id()); - const index_t k_block_data_on_global = block_work_multi_id[0] * KPerBlock; - const index_t b_block_data_on_global = block_work_multi_id[1] * BPerBlock; + const index_t k_block_data_on_global = 
block_work_id[0] * KPerBlock; + const index_t b_block_data_on_global = block_work_id[1] * BPerBlock; // input tensor - // tensor descriptor in device memory [N0, N1, N2, Ho, Wo] - constexpr auto in_n0_n1_n2_h_w_global_desc = - in_n_c_h_w_global_desc.StridedSlice(I2, Number{}, Number{}) - .StridedSlice(I3, Number{}, Number{}) - .Fold(I0, Number{}, Number{}) - .Extract(Sequence<0, 1, 2, 4, 5>{}); - - // batch descritpor for device memory - constexpr auto in_c_y_x_global_desc = - in_n_c_h_w_global_desc.StridedSlice(I2, Number{}, Number{}) - .StridedSlice(I3, Number{}, Number{}) - .Extract(Sequence<1, 2, 3>{}); - - // merged tensor descriptor in device memory [E, N1, B, N2], src of blockwise copy - constexpr auto in_e_n1_b_n2_global_merged_desc = make_ConstantMergedTensorDescriptor( - in_c_y_x_global_desc.Embed(in_n0_n1_n2_h_w_global_desc), - Sequence<0, 1, 2>{}, - Sequence<4>{}, - Sequence<3, 6, 7>{}, - Sequence<5>{}); + // global memory + constexpr auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor( + in_n_c_hi_wi_global_desc, + make_tuple( + PassThrough{}, PassThrough{}, Pad, LeftPads, RightPads>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{})); + + constexpr auto in_n0_n1_n2_c_y_ho_x_wo_global_desc = transform_tensor_descriptor( + in_n_c_hip_wip_global_desc, + make_tuple(UnMerge>{}, + PassThrough{}, + Embed, Sequence>{}, + Embed, Sequence>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}, Sequence<4, 5>{}, Sequence<6, 7>{})); + + constexpr auto in_e_n1_b_n2_global_desc = transform_tensor_descriptor( + in_n0_n1_n2_c_y_ho_x_wo_global_desc, + make_tuple(Merge>{}, + PassThrough{}, + Merge>{}, + PassThrough{}), + make_tuple(Sequence<3, 4, 6>{}, Sequence<1>{}, Sequence<0, 5, 7>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); // memory layout descriptor in LDS [E, N1, B, N2], dst of blockwise copy // be careful of LDS alignment - constexpr auto in_e_n1_b_n2_block_desc = make_ConstantTensorDescriptor_aligned( + constexpr auto in_e_n1_b_n2_block_desc = make_native_tensor_descriptor_aligned( Sequence{}, Number{}); // this check is ad-hoc @@ -157,49 +164,56 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer // input blockwise copy // slice a merged tensor, reorder and copy to a normal tensor // this copy operator already has blockwise offset built-in - auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v2_deprecated< - BlockSize, - decltype(in_e_n1_b_n2_global_merged_desc), - decltype(in_e_n1_b_n2_block_desc), - decltype(in_e_n1_b_n2_block_desc.GetLengths()), - InBlockCopySubLengths_E_N1_B_N2, - InBlockCopyClusterLengths_E_N1_B_N2, - InBlockCopyThreadClusterArrangeOrder, - InBlockCopySrcAccessOrder, - InBlockCopyDstAccessOrder, - 2, - 3, - InBlockCopySrcDataPerRead_B, - InBlockCopyDstDataPerWrite_N2>({0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0}); + auto blockwise_in_copy = + BlockwiseGenericTensorSliceCopy_v4( + {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0}); // weight tensor // tensor descriptor in device memory, src of blockwise copy - constexpr auto wei_e_k_global_desc = - wei_k_c_y_x_global_desc.Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{}); + constexpr auto wei_e_k_global_desc = reorder_tensor_descriptor_given_upper2lower( + unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3), Sequence<1, 0>{}); // tensor descriptor in LDS, dst of 
blockwise copy // be careful of LDS alignment - constexpr auto wei_e_k_block_desc = make_ConstantTensorDescriptor_aligned( + constexpr auto wei_e_k_block_desc = make_native_tensor_descriptor_aligned( Sequence{}, Number{}); + // this check is ad-hoc + // TODO: need to properly implement tensor descriptor with multiple alignment + // requirements + static_assert(wei_e_k_block_desc.GetStride(I0) % GemmDataPerReadA == 0, + "GemmDataPerReadA alignment requirement is not satisfied"); + // operator for blockwise copy of weight into LDS // slice a tensor, and copy it into another tensor // this copy operator already have blockwise offset built-in auto blockwise_wei_copy = - BlockwiseGenericTensorSliceCopy_v2_deprecated( + BlockwiseGenericTensorSliceCopy_v4( {0, k_block_data_on_global}, {0, 0}); // GEMM definition @@ -210,8 +224,11 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer // register constexpr auto a_e_k_block_mtx_desc = make_ConstantMatrixDescriptor(wei_e_k_block_desc); - constexpr auto b_e_n1bn2_block_mtx_desc = - make_ConstantMatrixDescriptor(in_e_n1_b_n2_block_desc.Unfold(I1, I3)); + constexpr auto b_e_n1bn2_block_mtx_desc = make_ConstantMatrixDescriptor( + in_e_n1_b_n2_block_desc.GetLength(I0), + in_e_n1_b_n2_block_desc.GetLength(I1) * in_e_n1_b_n2_block_desc.GetLength(I2) * + in_e_n1_b_n2_block_desc.GetLength(I3), + in_e_n1_b_n2_block_desc.GetStride(I0)); // sanity check static_assert(KPerBlock % (GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster) == @@ -223,14 +240,14 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer // c_thread_mtx definition: this is a mess // TODO:: more elegent way of defining c_thread_mtx - constexpr auto c_k0k1_n1n2_thread_mtx_desc = make_ConstantMatrixDescriptor_packed( - Number{}, Number{}); + constexpr auto c_k0k2_n1n2_thread_mtx_desc = make_ConstantMatrixDescriptor_packed( + Number{}, Number{}); const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2< BlockSize, decltype(a_e_k_block_mtx_desc), decltype(b_e_n1bn2_block_mtx_desc), - decltype(c_k0k1_n1n2_thread_mtx_desc), + decltype(c_k0k2_n1n2_thread_mtx_desc), GemmMPerThreadSubC, GemmNPerThreadSubC, GemmMLevel0Cluster, @@ -257,10 +274,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer __shared__ Float p_wei_block_double[2 * wei_block_space]; // register allocation for output - Float p_out_thread[c_k0k1_n1n2_thread_mtx_desc.GetElementSpace()]; + Float p_out_thread[c_k0k2_n1n2_thread_mtx_desc.GetElementSpace()]; // zero out threadwise output - threadwise_matrix_set_zero(c_k0k1_n1n2_thread_mtx_desc, p_out_thread); + threadwise_matrix_set_zero(c_k0k2_n1n2_thread_mtx_desc, p_out_thread); // LDS double buffer: preload data into LDS { @@ -350,24 +367,38 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer // copy output: register to global memory { constexpr index_t K1 = GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster; + constexpr index_t K0 = K / K1; // define tensor descriptor for threadwise copy // output memory layout descriptor in register, src of threadwise copy - constexpr auto out_k0_k1_n1_b_n2_thread_mem_desc = make_ConstantTensorDescriptor_packed( + constexpr auto out_k0_k1_n1_b_n2_thread_desc = make_native_tensor_descriptor_packed( Sequence{}); // output memory layout descriptor in device memory - constexpr auto out_n0_n1_n2_k0_k1_h_w_global_mem_desc = - out_n_k_h_w_global_desc.Fold(I1, Number{}).Fold(I0, Number{}, Number{}); + 
constexpr auto out_n0_n1_n2_k0_k1_ho_wo_global_desc = transform_tensor_descriptor( + out_n_k_ho_wo_global_desc, + make_tuple(UnMerge>{}, + UnMerge>{}, + PassThrough{}, + PassThrough{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}, Sequence<6>{})); // output merged global tensor descriptor, dst of threadwise copy - constexpr auto out_k0_k1_n1_b_n2_global_merged_desc = - make_ConstantMergedTensorDescriptor(out_n0_n1_n2_k0_k1_h_w_global_mem_desc, - Sequence<3>{}, - Sequence<4>{}, - Sequence<1>{}, - Sequence<0, 5, 6>{}, - Sequence<2>{}); + constexpr auto out_k0_k1_n1_b_n2_global_desc = transform_tensor_descriptor( + out_n0_n1_n2_k0_k1_ho_wo_global_desc, + make_tuple(PassThrough{}, + PassThrough{}, + PassThrough{}, + Merge>{}, + PassThrough{}), + make_tuple(Sequence<3>{}, + Sequence<4>{}, + Sequence<1>{}, + Sequence<0, 5, 6>{}, + Sequence<2>{}), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); // calculate origin of thread output tensor on global memory // blockwise GEMM c matrix starting index @@ -380,26 +411,31 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer const index_t b_thread_data_on_global = b_block_data_on_global + c_thread_mtx_on_block.col / N2; - ThreadwiseGenericTensorSliceCopy_v2r1_deprecated< - decltype(out_k0_k1_n1_b_n2_thread_mem_desc), - decltype(out_k0_k1_n1_b_n2_global_merged_desc), - decltype(out_k0_k1_n1_b_n2_thread_mem_desc.GetLengths()), - arithmetic_sequence_gen<0, 5, 1>::type, - arithmetic_sequence_gen<0, 5, 1>::type, - 3, - 3, - 1, - 1>({0, 0, 0, 0, 0}, - {k_thread_data_on_global / K1, - k_thread_data_on_global % K1, - 0, - b_thread_data_on_global, - 0}) - .template Run( - p_out_thread, p_out_global); + ThreadwiseGenericTensorSliceCopy_v4r2::type, + 3, + 1, + 1>({0, 0, 0, 0, 0}, + {k_thread_data_on_global / K1, + k_thread_data_on_global % K1, + 0, + b_thread_data_on_global, + 0}) +#if 1 + .template Run +#else // tweaking + .template Run_optimized_dst_address_calculation +#endif + (p_out_thread, p_out_global); } } }; } // namespace ck -#endif // CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R1_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_HPP +#endif diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp similarity index 55% rename from composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp rename to composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp index d5d1e496b..267e8e0a6 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp @@ -1,27 +1,25 @@ -#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R1_NCHW_KCYX_NKHW_PADDED_LDS_DOUBLE_BUFFER_HPP -#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R1_NCHW_KCYX_NKHW_PADDED_LDS_DOUBLE_BUFFER_HPP +#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R1_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_DEPRECATED_HPP +#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R1_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_DEPRECATED_HPP 
#include "common_header.hpp" -#include "tensor_descriptor.hpp" -#include "tensor_descriptor_helper.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" +#include "ConstantMergedTensorDescriptor_deprecated.hpp" #include "ConstantMatrixDescriptor.hpp" -#include "blockwise_generic_tensor_slice_copy.hpp" -#include "threadwise_generic_tensor_slice_copy.hpp" +#include "blockwise_generic_tensor_slice_copy_deprecated.hpp" #include "blockwise_gemm.hpp" +#include "threadwise_generic_tensor_slice_copy_deprecated.hpp" namespace ck { // define B = merge(N0, Ho, Wo) template -struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer +struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated { __device__ void Run(const Float* const __restrict__ p_in_global, const Float* const __restrict__ p_wei_global, @@ -69,21 +67,20 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; + constexpr auto I5 = Number<5>{}; constexpr auto True = integral_constant{}; - constexpr auto in_n_c_hi_wi_global_desc = InGlobalDesc{}; - constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{}; - constexpr auto out_n_k_ho_wo_global_desc = OutGlobalDesc{}; + constexpr auto in_n_c_h_w_global_desc = InGlobalDesc{}; + constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{}; + constexpr auto out_n_k_h_w_global_desc = OutGlobalDesc{}; - constexpr index_t N = in_n_c_hi_wi_global_desc.GetLength(I0); - constexpr index_t C = in_n_c_hi_wi_global_desc.GetLength(I1); - constexpr index_t Hi = in_n_c_hi_wi_global_desc.GetLength(I2); - constexpr index_t Wi = in_n_c_hi_wi_global_desc.GetLength(I3); + constexpr index_t N = in_n_c_h_w_global_desc.GetLength(I0); + constexpr index_t C = in_n_c_h_w_global_desc.GetLength(I1); - constexpr index_t K = out_n_k_ho_wo_global_desc.GetLength(I1); - constexpr index_t Ho = out_n_k_ho_wo_global_desc.GetLength(I2); - constexpr index_t Wo = out_n_k_ho_wo_global_desc.GetLength(I3); + constexpr index_t K = out_n_k_h_w_global_desc.GetLength(I1); + constexpr index_t Ho = out_n_k_h_w_global_desc.GetLength(I2); + constexpr index_t Wo = out_n_k_h_w_global_desc.GetLength(I3); constexpr index_t Y = wei_k_c_y_x_global_desc.GetLength(I2); constexpr index_t X = wei_k_c_y_x_global_desc.GetLength(I3); @@ -116,43 +113,39 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf constexpr index_t BBlockWork = B / BPerBlock; constexpr auto block_work_desc = - make_cluster_descriptor(Sequence{}); + make_ConstantTensorDescriptor_packed(Sequence{}); - const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id()); + const auto block_work_multi_id = + block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id()); - const index_t k_block_data_on_global = block_work_id[0] * KPerBlock; - const index_t b_block_data_on_global = block_work_id[1] * BPerBlock; + const index_t k_block_data_on_global = block_work_multi_id[0] * KPerBlock; + const index_t b_block_data_on_global = block_work_multi_id[1] * BPerBlock; // input tensor - // global memory - constexpr auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor( - in_n_c_hi_wi_global_desc, - make_tuple( - PassThrough{}, PassThrough{}, Pad, LeftPads, RightPads>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{})); - - constexpr auto in_n0_n1_n2_c_y_ho_x_wo_global_desc = 
transform_tensor_descriptor( - in_n_c_hip_wip_global_desc, - make_tuple(UnMerge>{}, - PassThrough{}, - Embed, Sequence>{}, - Embed, Sequence>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), - make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}, Sequence<4, 5>{}, Sequence<6, 7>{})); - - constexpr auto in_e_n1_b_n2_global_desc = transform_tensor_descriptor( - in_n0_n1_n2_c_y_ho_x_wo_global_desc, - make_tuple(Merge>{}, - PassThrough{}, - Merge>{}, - PassThrough{}), - make_tuple(Sequence<3, 4, 6>{}, Sequence<1>{}, Sequence<0, 5, 7>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + // tensor descriptor in device memory [N0, N1, N2, Ho, Wo] + constexpr auto in_n0_n1_n2_h_w_global_desc = + in_n_c_h_w_global_desc.StridedSlice(I2, Number{}, Number{}) + .StridedSlice(I3, Number{}, Number{}) + .Fold(I0, Number{}, Number{}) + .Extract(Sequence<0, 1, 2, 4, 5>{}); + + // batch descritpor for device memory + constexpr auto in_c_y_x_global_desc = + in_n_c_h_w_global_desc.StridedSlice(I2, Number{}, Number{}) + .StridedSlice(I3, Number{}, Number{}) + .Extract(Sequence<1, 2, 3>{}); + + // merged tensor descriptor in device memory [E, N1, B, N2], src of blockwise copy + constexpr auto in_e_n1_b_n2_global_merged_desc = make_ConstantMergedTensorDescriptor( + in_c_y_x_global_desc.Embed(in_n0_n1_n2_h_w_global_desc), + Sequence<0, 1, 2>{}, + Sequence<4>{}, + Sequence<3, 6, 7>{}, + Sequence<5>{}); // memory layout descriptor in LDS [E, N1, B, N2], dst of blockwise copy // be careful of LDS alignment - constexpr auto in_e_n1_b_n2_block_desc = make_native_tensor_descriptor_aligned( + constexpr auto in_e_n1_b_n2_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); // this check is ad-hoc @@ -164,56 +157,49 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf // input blockwise copy // slice a merged tensor, reorder and copy to a normal tensor // this copy operator already has blockwise offset built-in - auto blockwise_in_copy = - BlockwiseGenericTensorSliceCopy_v4( - {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0}); + auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v2_deprecated< + BlockSize, + decltype(in_e_n1_b_n2_global_merged_desc), + decltype(in_e_n1_b_n2_block_desc), + decltype(in_e_n1_b_n2_block_desc.GetLengths()), + InBlockCopySubLengths_E_N1_B_N2, + InBlockCopyClusterLengths_E_N1_B_N2, + InBlockCopyThreadClusterArrangeOrder, + InBlockCopySrcAccessOrder, + InBlockCopyDstAccessOrder, + 2, + 3, + InBlockCopySrcDataPerRead_B, + InBlockCopyDstDataPerWrite_N2>({0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0}); // weight tensor // tensor descriptor in device memory, src of blockwise copy - constexpr auto wei_e_k_global_desc = reorder_tensor_descriptor_given_upper2lower( - unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3), Sequence<1, 0>{}); + constexpr auto wei_e_k_global_desc = + wei_k_c_y_x_global_desc.Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{}); // tensor descriptor in LDS, dst of blockwise copy // be careful of LDS alignment - constexpr auto wei_e_k_block_desc = make_native_tensor_descriptor_aligned( + constexpr auto wei_e_k_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); - // this check is ad-hoc - // TODO: need to properly implement tensor descriptor with multiple alignment - // requirements - static_assert(wei_e_k_block_desc.GetStride(I0) % GemmDataPerReadA == 0, - "GemmDataPerReadA alignment requirement is not satisfied"); - // operator 
for blockwise copy of weight into LDS // slice a tensor, and copy it into another tensor // this copy operator already have blockwise offset built-in auto blockwise_wei_copy = - BlockwiseGenericTensorSliceCopy_v4( + BlockwiseGenericTensorSliceCopy_v2_deprecated( {0, k_block_data_on_global}, {0, 0}); // GEMM definition @@ -224,11 +210,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf // register constexpr auto a_e_k_block_mtx_desc = make_ConstantMatrixDescriptor(wei_e_k_block_desc); - constexpr auto b_e_n1bn2_block_mtx_desc = make_ConstantMatrixDescriptor( - in_e_n1_b_n2_block_desc.GetLength(I0), - in_e_n1_b_n2_block_desc.GetLength(I1) * in_e_n1_b_n2_block_desc.GetLength(I2) * - in_e_n1_b_n2_block_desc.GetLength(I3), - in_e_n1_b_n2_block_desc.GetStride(I0)); + constexpr auto b_e_n1bn2_block_mtx_desc = + make_ConstantMatrixDescriptor(in_e_n1_b_n2_block_desc.Unfold(I1, I3)); // sanity check static_assert(KPerBlock % (GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster) == @@ -240,14 +223,14 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf // c_thread_mtx definition: this is a mess // TODO:: more elegent way of defining c_thread_mtx - constexpr auto c_k0k2_n1n2_thread_mtx_desc = make_ConstantMatrixDescriptor_packed( - Number{}, Number{}); + constexpr auto c_k0k1_n1n2_thread_mtx_desc = make_ConstantMatrixDescriptor_packed( + Number{}, Number{}); const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2< BlockSize, decltype(a_e_k_block_mtx_desc), decltype(b_e_n1bn2_block_mtx_desc), - decltype(c_k0k2_n1n2_thread_mtx_desc), + decltype(c_k0k1_n1n2_thread_mtx_desc), GemmMPerThreadSubC, GemmNPerThreadSubC, GemmMLevel0Cluster, @@ -274,10 +257,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf __shared__ Float p_wei_block_double[2 * wei_block_space]; // register allocation for output - Float p_out_thread[c_k0k2_n1n2_thread_mtx_desc.GetElementSpace()]; + Float p_out_thread[c_k0k1_n1n2_thread_mtx_desc.GetElementSpace()]; // zero out threadwise output - threadwise_matrix_set_zero(c_k0k2_n1n2_thread_mtx_desc, p_out_thread); + threadwise_matrix_set_zero(c_k0k1_n1n2_thread_mtx_desc, p_out_thread); // LDS double buffer: preload data into LDS { @@ -367,38 +350,24 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf // copy output: register to global memory { constexpr index_t K1 = GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster; - constexpr index_t K0 = K / K1; // define tensor descriptor for threadwise copy // output memory layout descriptor in register, src of threadwise copy - constexpr auto out_k0_k1_n1_b_n2_thread_desc = make_native_tensor_descriptor_packed( + constexpr auto out_k0_k1_n1_b_n2_thread_mem_desc = make_ConstantTensorDescriptor_packed( Sequence{}); // output memory layout descriptor in device memory - constexpr auto out_n0_n1_n2_k0_k1_ho_wo_global_desc = transform_tensor_descriptor( - out_n_k_ho_wo_global_desc, - make_tuple(UnMerge>{}, - UnMerge>{}, - PassThrough{}, - PassThrough{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), - make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}, Sequence<6>{})); + constexpr auto out_n0_n1_n2_k0_k1_h_w_global_mem_desc = + out_n_k_h_w_global_desc.Fold(I1, Number{}).Fold(I0, Number{}, Number{}); // output merged global tensor descriptor, dst of threadwise copy - constexpr auto out_k0_k1_n1_b_n2_global_desc = transform_tensor_descriptor( 
- out_n0_n1_n2_k0_k1_ho_wo_global_desc, - make_tuple(PassThrough{}, - PassThrough{}, - PassThrough{}, - Merge>{}, - PassThrough{}), - make_tuple(Sequence<3>{}, - Sequence<4>{}, - Sequence<1>{}, - Sequence<0, 5, 6>{}, - Sequence<2>{}), - make_tuple( - Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); + constexpr auto out_k0_k1_n1_b_n2_global_merged_desc = + make_ConstantMergedTensorDescriptor(out_n0_n1_n2_k0_k1_h_w_global_mem_desc, + Sequence<3>{}, + Sequence<4>{}, + Sequence<1>{}, + Sequence<0, 5, 6>{}, + Sequence<2>{}); // calculate origin of thread output tensor on global memory // blockwise GEMM c matrix starting index @@ -411,31 +380,26 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf const index_t b_thread_data_on_global = b_block_data_on_global + c_thread_mtx_on_block.col / N2; - ThreadwiseGenericTensorSliceCopy_v4r2::type, - 3, - 1, - 1>({0, 0, 0, 0, 0}, - {k_thread_data_on_global / K1, - k_thread_data_on_global % K1, - 0, - b_thread_data_on_global, - 0}) -#if 1 - .template Run -#else // tweaking - .template Run_optimized_dst_address_calculation -#endif - (p_out_thread, p_out_global); + ThreadwiseGenericTensorSliceCopy_v2r1_deprecated< + decltype(out_k0_k1_n1_b_n2_thread_mem_desc), + decltype(out_k0_k1_n1_b_n2_global_merged_desc), + decltype(out_k0_k1_n1_b_n2_thread_mem_desc.GetLengths()), + arithmetic_sequence_gen<0, 5, 1>::type, + arithmetic_sequence_gen<0, 5, 1>::type, + 3, + 3, + 1, + 1>({0, 0, 0, 0, 0}, + {k_thread_data_on_global / K1, + k_thread_data_on_global % K1, + 0, + b_thread_data_on_global, + 0}) + .template Run( + p_out_thread, p_out_global); } } }; } // namespace ck -#endif +#endif // CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R1_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_DEPRECATED_HPP diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp index e741a83c4..a547db7e3 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -1,25 +1,27 @@ -#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R4_NCHW_KCYX_NKHW_HPP_LDS_DOUBLE_BUFFER_HPP -#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R4_NCHW_KCYX_NKHW_HPP_LDS_DOUBLE_BUFFER_HPP +#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R4_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_HPP +#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R4_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_HPP #include "common_header.hpp" -#include "ConstantTensorDescriptor_deprecated.hpp" -#include "ConstantMergedTensorDescriptor_deprecated.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" #include "ConstantMatrixDescriptor.hpp" -#include "blockwise_generic_tensor_slice_copy_deprecated.hpp" +#include "blockwise_generic_tensor_slice_copy.hpp" +#include "threadwise_generic_tensor_slice_copy.hpp" #include "blockwise_gemm.hpp" -#include "threadwise_generic_tensor_slice_copy_deprecated.hpp" namespace ck { // B = merge(N, Ho, Wo) template @@ -56,23 +58,27 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - constexpr auto I5 = Number<5>{}; constexpr auto True = integral_constant{}; - 
constexpr auto in_n_c_h_w_global_desc = InGlobalDesc{}; - constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{}; - constexpr auto out_n_k_h_w_global_desc = OutGlobalDesc{}; + constexpr auto in_n_c_hi_wi_global_desc = + make_native_tensor_descriptor(InGlobalDesc::GetLengths(), InGlobalDesc::GetStrides()); + constexpr auto wei_k_c_y_x_global_desc = + make_native_tensor_descriptor(WeiGlobalDesc::GetLengths(), WeiGlobalDesc::GetStrides()); + constexpr auto out_n_k_ho_wo_global_desc = + make_native_tensor_descriptor(OutGlobalDesc::GetLengths(), OutGlobalDesc::GetStrides()); - constexpr index_t N = in_n_c_h_w_global_desc.GetLengths()[0]; - constexpr index_t C = in_n_c_h_w_global_desc.GetLengths()[1]; + constexpr index_t N = in_n_c_hi_wi_global_desc.GetLength(I0); + constexpr index_t C = in_n_c_hi_wi_global_desc.GetLength(I1); + constexpr index_t Hi = in_n_c_hi_wi_global_desc.GetLength(I2); + constexpr index_t Wi = in_n_c_hi_wi_global_desc.GetLength(I3); - constexpr index_t K = out_n_k_h_w_global_desc.GetLengths()[1]; - constexpr index_t Ho = out_n_k_h_w_global_desc.GetLengths()[2]; - constexpr index_t Wo = out_n_k_h_w_global_desc.GetLengths()[3]; + constexpr index_t K = out_n_k_ho_wo_global_desc.GetLength(I1); + constexpr index_t Ho = out_n_k_ho_wo_global_desc.GetLength(I2); + constexpr index_t Wo = out_n_k_ho_wo_global_desc.GetLength(I3); - constexpr index_t Y = wei_k_c_y_x_global_desc.GetLengths()[2]; - constexpr index_t X = wei_k_c_y_x_global_desc.GetLengths()[3]; + constexpr index_t Y = wei_k_c_y_x_global_desc.GetLength(I2); + constexpr index_t X = wei_k_c_y_x_global_desc.GetLength(I3); constexpr index_t ConvStrideH = ConvStrides{}[0]; constexpr index_t ConvStrideW = ConvStrides{}[1]; @@ -97,65 +103,67 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer constexpr index_t BBlockWork = B / BPerBlock; constexpr auto block_work_desc = - make_ConstantTensorDescriptor_packed(Sequence{}); + make_cluster_descriptor(Sequence{}); - const auto block_work_multi_id = - block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id()); + const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id()); - const index_t k_block_data_on_global = block_work_multi_id[0] * KPerBlock; - const index_t b_block_data_on_global = block_work_multi_id[1] * BPerBlock; + const index_t k_block_data_on_global = block_work_id[0] * KPerBlock; + const index_t b_block_data_on_global = block_work_id[1] * BPerBlock; // input tensor - // tensor descriptor in device memory [N, Ho, Wo] - constexpr auto in_n_ho_wo_global_desc = - in_n_c_h_w_global_desc.Extract(I0, I2, I3) - .StridedSlice(I1, Number{}, Number{}) - .StridedSlice(I2, Number{}, Number{}); - - // batch descritpor for device memory - constexpr auto in_c_y_x_global_desc = - in_n_c_h_w_global_desc.StridedSlice(I2, Number{}, Number{}) - .StridedSlice(I3, Number{}, Number{}) - .Extract(Sequence<1, 2, 3>{}); - - // merged tensor descriptor in device memory [E, B], src of blockwise copy - constexpr auto in_e_b_global_desc = - make_ConstantMergedTensorDescriptor(in_c_y_x_global_desc.Embed(in_n_ho_wo_global_desc), - Sequence<0, 1, 2>{}, - Sequence<3, 4, 5>{}); - - // memory layout descriptor in LDS [E, B], dst of blockwise copy + // global mem + constexpr auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor( + in_n_c_hi_wi_global_desc, + make_tuple( + PassThrough{}, PassThrough{}, Pad, LeftPads, RightPads>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 
3>{})); + + constexpr auto in_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor( + in_n_c_hip_wip_global_desc, + make_tuple(PassThrough{}, + PassThrough{}, + Embed, Sequence>{}, + Embed, Sequence>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); + + constexpr auto in_e_b_global_desc = transform_tensor_descriptor( + in_n_c_y_ho_x_wo_global_desc, + make_tuple(Merge>{}, Merge>{}), + make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + // LDS mem // be careful of LDS alignment constexpr auto in_e_b_block_desc = - make_ConstantTensorDescriptor_packed(Sequence{}); + make_native_tensor_descriptor_packed(Sequence{}); // input blockwise copy - // slice a merged tensor, reorder and copy to a normal tensor - // this copy operator already has blockwise offset built-in auto blockwise_in_copy = - BlockwiseGenericTensorSliceCopy_v2_deprecated( + BlockwiseGenericTensorSliceCopy_v4( {0, b_block_data_on_global}, {0, 0}); // weight tensor - // tensor descriptor in device memory, src of blockwise copy - constexpr auto wei_e_k_global_desc = - wei_k_c_y_x_global_desc.Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{}); + // global mem + constexpr auto wei_e_k_global_desc = reorder_tensor_descriptor_given_upper2lower( + unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3), Sequence<1, 0>{}); - // tensor descriptor in LDS, dst of blockwise copy + // LDS // be careful of LDS alignment - constexpr auto wei_e_k_block_desc = make_ConstantTensorDescriptor_aligned( + constexpr auto wei_e_k_block_desc = make_native_tensor_descriptor_aligned( Sequence{}, Number{}); @@ -165,23 +173,21 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer static_assert(wei_e_k_block_desc.GetStride(I0) % GemmDataPerReadA == 0, "GemmDataPerReadA alignment requirement is not satisfied"); - // operator for blockwise copy of weight into LDS - // slice a tensor, and copy it into another tensor - // this copy operator already have blockwise offset built-in + // weight blockwise copy auto blockwise_wei_copy = - BlockwiseGenericTensorSliceCopy_v2_deprecated( + BlockwiseGenericTensorSliceCopy_v4( {0, k_block_data_on_global}, {0, 0}); // GEMM definition @@ -247,14 +253,12 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer // zero out threadwise output threadwise_matrix_set_zero(c_k0k1_b0b1_thread_mtx_desc, p_out_thread); - const Float* p_wei_block_on_global = p_wei_global; - // LDS double buffer: preload data into LDS { - blockwise_in_copy.template Run(p_in_global, - p_in_block_double); - blockwise_wei_copy.template Run(p_wei_global, - p_wei_block_double); + blockwise_in_copy.template Run(p_in_global, + p_in_block_double); + blockwise_wei_copy.template Run(p_wei_global, + p_wei_block_double); } // LDS double buffer: main body @@ -285,9 +289,9 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( + blockwise_in_copy.template RunLoadThreadBuffer( p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( + blockwise_wei_copy.template RunLoadThreadBuffer( p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data @@ -311,9 +315,9 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer __syncthreads(); // LDS doubel 
buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( + blockwise_in_copy.template RunLoadThreadBuffer( p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( + blockwise_wei_copy.template RunLoadThreadBuffer( p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data @@ -336,15 +340,6 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer // copy output: register to global memory { - constexpr index_t K1 = GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster; - constexpr index_t B1 = GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster; - - // define tensor descriptor for threadwise copy - // output global descriptor, for calculating origin of thread tensor - // in global memory - constexpr auto out_k_b_global_desc = make_ConstantMergedTensorDescriptor( - out_n_k_h_w_global_desc, Sequence<1>{}, Sequence<0, 2, 3>{}); - // calculate origin of thread output tensor on global memory // blockwise GEMM c matrix starting index const auto c_thread_mtx_on_block = @@ -356,46 +351,51 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer const index_t b_thread_data_on_global = b_block_data_on_global + c_thread_mtx_on_block.col; - // This is a hack, because slicing a merged dimension is not supported yet. - // This should be replaced with logic above, once slicing a merged dimension support - // become available - // dst descriptor - constexpr auto out_k0_k1_b_global_desc = - make_ConstantMergedTensorDescriptor(out_n_k_h_w_global_desc.Fold(I1, Number{}), - Sequence<1>{}, - Sequence<2>{}, - Sequence<0, 3, 4>{}); - - // src descriptor - constexpr auto out_k0_k1_b_thread_desc = make_ConstantTensorDescriptor_packed( - Sequence{}); - - using OutThreadCopySliceLengths = - Sequence; - - auto threadwise_out_copy = ThreadwiseGenericTensorSliceCopy_v2r1_deprecated< - decltype(out_k0_k1_b_thread_desc), - decltype(out_k0_k1_b_global_desc), - OutThreadCopySliceLengths, - arithmetic_sequence_gen<0, 3, 1>::type, - arithmetic_sequence_gen<0, 3, 1>::type, - 2, - 2, + // src descriptor + constexpr auto out_k0_k1_b0_b1_thread_desc = make_native_tensor_descriptor_packed( + Sequence{}); + + // dst descriptor + constexpr index_t K1 = GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster; + constexpr index_t B1 = GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster; + + constexpr index_t K0 = K / K1; + constexpr index_t B0 = B / B1; + + constexpr auto out_k_b_global_desc = transform_tensor_descriptor( + out_n_k_ho_wo_global_desc, + make_tuple(PassThrough{}, Merge>{}), + make_tuple(Sequence<1>{}, Sequence<0, 2, 3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + constexpr auto out_k0_k1_b0_b1_global_desc = transform_tensor_descriptor( + out_k_b_global_desc, + make_tuple(UnMerge>{}, UnMerge>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{})); + + // output threadwise copy + ThreadwiseGenericTensorSliceCopy_v4r2< + decltype(out_k0_k1_b0_b1_thread_desc), + decltype(out_k0_k1_b0_b1_global_desc), + decltype(out_k0_k1_b0_b1_thread_desc.GetLengths()), + arithmetic_sequence_gen<0, 4, 1>::type, + 3, OutThreadCopyDataPerAccess_B, - OutThreadCopyDataPerAccess_B>({0, 0, 0}, + OutThreadCopyDataPerAccess_B>({0, 0, 0, 0}, {k_thread_data_on_global / K1, k_thread_data_on_global % K1, - b_thread_data_on_global}); - - for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat) - { - threadwise_out_copy - .template Run(p_out_thread, 
- p_out_global); - - threadwise_out_copy.MoveSrcSliceWindow(Sequence<0, 0, GemmNPerThreadSubC>{}, True); - threadwise_out_copy.MoveDstSliceWindow(Sequence<0, 0, B1>{}, True); - } + b_thread_data_on_global / B1, + b_thread_data_on_global % B1}) +#if 1 + .template Run +#else // tweaking + .template Run_optimized_dst_address_calculation +#endif + (p_out_thread, p_out_global); } } }; diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp similarity index 56% rename from composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp rename to composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp index e93258682..8b3f8445d 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp @@ -1,27 +1,25 @@ -#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R4_NCHW_KCYX_NKHW_PADDED_LDS_DOUBLE_BUFFER_HPP -#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R4_NCHW_KCYX_NKHW_PADDED_LDS_DOUBLE_BUFFER_HPP +#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R4_NCHW_KCYX_NKHW_HPP_LDS_DOUBLE_BUFFER_DEPRECATRD_HPP +#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R4_NCHW_KCYX_NKHW_HPP_LDS_DOUBLE_BUFFER_DEPRECATRD_HPP #include "common_header.hpp" -#include "tensor_descriptor.hpp" -#include "tensor_descriptor_helper.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" +#include "ConstantMergedTensorDescriptor_deprecated.hpp" #include "ConstantMatrixDescriptor.hpp" -#include "blockwise_generic_tensor_slice_copy.hpp" -#include "threadwise_generic_tensor_slice_copy.hpp" +#include "blockwise_generic_tensor_slice_copy_deprecated.hpp" #include "blockwise_gemm.hpp" +#include "threadwise_generic_tensor_slice_copy_deprecated.hpp" namespace ck { // B = merge(N, Ho, Wo) template -struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer +struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer_deprecated { __device__ void Run(const Float* const __restrict__ p_in_global, const Float* const __restrict__ p_wei_global, @@ -58,27 +56,23 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; + constexpr auto I5 = Number<5>{}; constexpr auto True = integral_constant{}; - constexpr auto in_n_c_hi_wi_global_desc = - make_native_tensor_descriptor(InGlobalDesc::GetLengths(), InGlobalDesc::GetStrides()); - constexpr auto wei_k_c_y_x_global_desc = - make_native_tensor_descriptor(WeiGlobalDesc::GetLengths(), WeiGlobalDesc::GetStrides()); - constexpr auto out_n_k_ho_wo_global_desc = - make_native_tensor_descriptor(OutGlobalDesc::GetLengths(), OutGlobalDesc::GetStrides()); + constexpr auto in_n_c_h_w_global_desc = InGlobalDesc{}; + constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{}; + constexpr auto out_n_k_h_w_global_desc = OutGlobalDesc{}; - constexpr index_t N = in_n_c_hi_wi_global_desc.GetLength(I0); - constexpr index_t C = in_n_c_hi_wi_global_desc.GetLength(I1); - constexpr index_t Hi = 
in_n_c_hi_wi_global_desc.GetLength(I2); - constexpr index_t Wi = in_n_c_hi_wi_global_desc.GetLength(I3); + constexpr index_t N = in_n_c_h_w_global_desc.GetLengths()[0]; + constexpr index_t C = in_n_c_h_w_global_desc.GetLengths()[1]; - constexpr index_t K = out_n_k_ho_wo_global_desc.GetLength(I1); - constexpr index_t Ho = out_n_k_ho_wo_global_desc.GetLength(I2); - constexpr index_t Wo = out_n_k_ho_wo_global_desc.GetLength(I3); + constexpr index_t K = out_n_k_h_w_global_desc.GetLengths()[1]; + constexpr index_t Ho = out_n_k_h_w_global_desc.GetLengths()[2]; + constexpr index_t Wo = out_n_k_h_w_global_desc.GetLengths()[3]; - constexpr index_t Y = wei_k_c_y_x_global_desc.GetLength(I2); - constexpr index_t X = wei_k_c_y_x_global_desc.GetLength(I3); + constexpr index_t Y = wei_k_c_y_x_global_desc.GetLengths()[2]; + constexpr index_t X = wei_k_c_y_x_global_desc.GetLengths()[3]; constexpr index_t ConvStrideH = ConvStrides{}[0]; constexpr index_t ConvStrideW = ConvStrides{}[1]; @@ -103,67 +97,65 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf constexpr index_t BBlockWork = B / BPerBlock; constexpr auto block_work_desc = - make_cluster_descriptor(Sequence{}); + make_ConstantTensorDescriptor_packed(Sequence{}); - const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id()); + const auto block_work_multi_id = + block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id()); - const index_t k_block_data_on_global = block_work_id[0] * KPerBlock; - const index_t b_block_data_on_global = block_work_id[1] * BPerBlock; + const index_t k_block_data_on_global = block_work_multi_id[0] * KPerBlock; + const index_t b_block_data_on_global = block_work_multi_id[1] * BPerBlock; // input tensor - // global mem - constexpr auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor( - in_n_c_hi_wi_global_desc, - make_tuple( - PassThrough{}, PassThrough{}, Pad, LeftPads, RightPads>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{})); - - constexpr auto in_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor( - in_n_c_hip_wip_global_desc, - make_tuple(PassThrough{}, - PassThrough{}, - Embed, Sequence>{}, - Embed, Sequence>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); - - constexpr auto in_e_b_global_desc = transform_tensor_descriptor( - in_n_c_y_ho_x_wo_global_desc, - make_tuple(Merge>{}, Merge>{}), - make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - // LDS mem + // tensor descriptor in device memory [N, Ho, Wo] + constexpr auto in_n_ho_wo_global_desc = + in_n_c_h_w_global_desc.Extract(I0, I2, I3) + .StridedSlice(I1, Number{}, Number{}) + .StridedSlice(I2, Number{}, Number{}); + + // batch descritpor for device memory + constexpr auto in_c_y_x_global_desc = + in_n_c_h_w_global_desc.StridedSlice(I2, Number{}, Number{}) + .StridedSlice(I3, Number{}, Number{}) + .Extract(Sequence<1, 2, 3>{}); + + // merged tensor descriptor in device memory [E, B], src of blockwise copy + constexpr auto in_e_b_global_desc = + make_ConstantMergedTensorDescriptor(in_c_y_x_global_desc.Embed(in_n_ho_wo_global_desc), + Sequence<0, 1, 2>{}, + Sequence<3, 4, 5>{}); + + // memory layout descriptor in LDS [E, B], dst of blockwise copy // be careful of LDS alignment constexpr auto in_e_b_block_desc = - make_native_tensor_descriptor_packed(Sequence{}); 
+ make_ConstantTensorDescriptor_packed(Sequence{}); // input blockwise copy + // slice a merged tensor, reorder and copy to a normal tensor + // this copy operator already has blockwise offset built-in auto blockwise_in_copy = - BlockwiseGenericTensorSliceCopy_v4( + BlockwiseGenericTensorSliceCopy_v2_deprecated( {0, b_block_data_on_global}, {0, 0}); // weight tensor - // global mem - constexpr auto wei_e_k_global_desc = reorder_tensor_descriptor_given_upper2lower( - unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3), Sequence<1, 0>{}); + // tensor descriptor in device memory, src of blockwise copy + constexpr auto wei_e_k_global_desc = + wei_k_c_y_x_global_desc.Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{}); - // LDS + // tensor descriptor in LDS, dst of blockwise copy // be careful of LDS alignment - constexpr auto wei_e_k_block_desc = make_native_tensor_descriptor_aligned( + constexpr auto wei_e_k_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); @@ -173,21 +165,23 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf static_assert(wei_e_k_block_desc.GetStride(I0) % GemmDataPerReadA == 0, "GemmDataPerReadA alignment requirement is not satisfied"); - // weight blockwise copy + // operator for blockwise copy of weight into LDS + // slice a tensor, and copy it into another tensor + // this copy operator already have blockwise offset built-in auto blockwise_wei_copy = - BlockwiseGenericTensorSliceCopy_v4( + BlockwiseGenericTensorSliceCopy_v2_deprecated( {0, k_block_data_on_global}, {0, 0}); // GEMM definition @@ -253,12 +247,14 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf // zero out threadwise output threadwise_matrix_set_zero(c_k0k1_b0b1_thread_mtx_desc, p_out_thread); + const Float* p_wei_block_on_global = p_wei_global; + // LDS double buffer: preload data into LDS { - blockwise_in_copy.template Run(p_in_global, - p_in_block_double); - blockwise_wei_copy.template Run(p_wei_global, - p_wei_block_double); + blockwise_in_copy.template Run(p_in_global, + p_in_block_double); + blockwise_wei_copy.template Run(p_wei_global, + p_wei_block_double); } // LDS double buffer: main body @@ -289,9 +285,9 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( + blockwise_in_copy.template RunLoadThreadBuffer( p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( + blockwise_wei_copy.template RunLoadThreadBuffer( p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data @@ -315,9 +311,9 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( + blockwise_in_copy.template RunLoadThreadBuffer( p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( + blockwise_wei_copy.template RunLoadThreadBuffer( p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data @@ -340,6 +336,15 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf // copy output: register to global memory { + constexpr index_t K1 = GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster; + constexpr index_t B1 = GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster; + + // define tensor descriptor for 
threadwise copy + // output global descriptor, for calculating origin of thread tensor + // in global memory + constexpr auto out_k_b_global_desc = make_ConstantMergedTensorDescriptor( + out_n_k_h_w_global_desc, Sequence<1>{}, Sequence<0, 2, 3>{}); + // calculate origin of thread output tensor on global memory // blockwise GEMM c matrix starting index const auto c_thread_mtx_on_block = @@ -351,51 +356,46 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf const index_t b_thread_data_on_global = b_block_data_on_global + c_thread_mtx_on_block.col; - // src descriptor - constexpr auto out_k0_k1_b0_b1_thread_desc = make_native_tensor_descriptor_packed( - Sequence{}); - - // dst descriptor - constexpr index_t K1 = GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster; - constexpr index_t B1 = GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster; - - constexpr index_t K0 = K / K1; - constexpr index_t B0 = B / B1; - - constexpr auto out_k_b_global_desc = transform_tensor_descriptor( - out_n_k_ho_wo_global_desc, - make_tuple(PassThrough{}, Merge>{}), - make_tuple(Sequence<1>{}, Sequence<0, 2, 3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - constexpr auto out_k0_k1_b0_b1_global_desc = transform_tensor_descriptor( - out_k_b_global_desc, - make_tuple(UnMerge>{}, UnMerge>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{})); - - // output threadwise copy - ThreadwiseGenericTensorSliceCopy_v4r2< - decltype(out_k0_k1_b0_b1_thread_desc), - decltype(out_k0_k1_b0_b1_global_desc), - decltype(out_k0_k1_b0_b1_thread_desc.GetLengths()), - arithmetic_sequence_gen<0, 4, 1>::type, - 3, + // This is a hack, because slicing a merged dimension is not supported yet. + // This should be replaced with logic above, once slicing a merged dimension support + // become available + // dst descriptor + constexpr auto out_k0_k1_b_global_desc = + make_ConstantMergedTensorDescriptor(out_n_k_h_w_global_desc.Fold(I1, Number{}), + Sequence<1>{}, + Sequence<2>{}, + Sequence<0, 3, 4>{}); + + // src descriptor + constexpr auto out_k0_k1_b_thread_desc = make_ConstantTensorDescriptor_packed( + Sequence{}); + + using OutThreadCopySliceLengths = + Sequence; + + auto threadwise_out_copy = ThreadwiseGenericTensorSliceCopy_v2r1_deprecated< + decltype(out_k0_k1_b_thread_desc), + decltype(out_k0_k1_b_global_desc), + OutThreadCopySliceLengths, + arithmetic_sequence_gen<0, 3, 1>::type, + arithmetic_sequence_gen<0, 3, 1>::type, + 2, + 2, OutThreadCopyDataPerAccess_B, - OutThreadCopyDataPerAccess_B>({0, 0, 0, 0}, + OutThreadCopyDataPerAccess_B>({0, 0, 0}, {k_thread_data_on_global / K1, k_thread_data_on_global % K1, - b_thread_data_on_global / B1, - b_thread_data_on_global % B1}) -#if 1 - .template Run -#else // tweaking - .template Run_optimized_dst_address_calculation -#endif - (p_out_thread, p_out_global); + b_thread_data_on_global}); + + for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat) + { + threadwise_out_copy + .template Run(p_out_thread, + p_out_global); + + threadwise_out_copy.MoveSrcSliceWindow(Sequence<0, 0, GemmNPerThreadSubC>{}, True); + threadwise_out_copy.MoveDstSliceWindow(Sequence<0, 0, B1>{}, True); + } } } }; diff --git a/composable_kernel/include/utility/float_type.amd.hpp.in b/composable_kernel/include/utility/float_type.amd.hpp.in index 337d12fa3..06368305d 100644 --- a/composable_kernel/include/utility/float_type.amd.hpp.in +++ b/composable_kernel/include/utility/float_type.amd.hpp.in @@ -1,8 +1,6 @@ #ifndef 
CK_FLOAT_TYPE_AMD_HPP #define CK_FLOAT_TYPE_AMD_HPP -#include "bfloat16_dev.hpp" - namespace ck { // For some reason, HIP compiler need this definition to generate optimal ISA diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp index 56420cfb7..d361db801 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp @@ -5,12 +5,14 @@ #include "gridwise_convolution_kernel_wrapper.hpp" #include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp" -template +template void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, const Tensor& in_nchw, WeiDesc, @@ -19,6 +21,8 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, Tensor& out_nkhw, ConvStrides, ConvDilations, + LeftPads, + RightPads, ck::index_t nrepeat) { using namespace ck; @@ -28,9 +32,12 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - constexpr auto in_nchw_desc = InDesc{}; - constexpr auto wei_kcyx_desc = WeiDesc{}; - constexpr auto out_nkhw_desc = OutDesc{}; + constexpr auto in_nchw_desc = + make_native_tensor_descriptor(InDesc::GetLengths(), InDesc::GetStrides()); + constexpr auto wei_kcyx_desc = + make_native_tensor_descriptor(WeiDesc::GetLengths(), WeiDesc::GetStrides()); + constexpr auto out_nkhw_desc = + make_native_tensor_descriptor(OutDesc::GetLengths(), OutDesc::GetStrides()); constexpr index_t N = out_nkhw_desc.GetLength(I0); constexpr index_t K = out_nkhw_desc.GetLength(I1); @@ -47,7 +54,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); #if 1 - // BlockSize = 256, blockwise-GEMM 128x128, each thread hold 64 data + // BlockSize = 256, each thread hold 64 data constexpr index_t BlockSize = 256; constexpr index_t BPerBlock = 16; @@ -84,7 +91,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; #elif 0 - // BlockSize = 64, blockwise-GEMM 64x64, each thread hold 64 data + // BlockSize = 64, each thread hold 64 data constexpr index_t BlockSize = 64; constexpr index_t BPerBlock = 8; @@ -120,7 +127,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; -#elif 1 +#elif 0 // BlockSize = 256, blockwise-GEMM 64x128, each thread hold 32 data constexpr index_t BlockSize = 256; @@ -170,42 +177,48 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); constexpr auto gridwise_conv = - GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer< - GridSize, - BlockSize, - T, - decltype(in_nchw_desc), - decltype(wei_kcyx_desc), - decltype(out_nkhw_desc), - ConvStrides, - ConvDilations, - BPerBlock, - KPerBlock, - EPerBlock, - GemmNRepeat, - GemmMPerThreadSubC, - GemmNPerThreadSubC, - GemmMLevel0Cluster, - GemmNLevel0Cluster, - GemmMLevel1Cluster, - GemmNLevel1Cluster, - GemmKPerThreadLoop, - GemmDataPerReadA, - GemmDataPerReadB, - InBlockCopySubLengths_E_N1_B_N2, - InBlockCopyClusterLengths_E_N1_B_N2, - InBlockCopyThreadClusterArrangeOrder, - InBlockCopySrcAccessOrder, - 
InBlockCopyDstAccessOrder, - InBlockCopySrcDataPerRead_B, - InBlockCopyDstDataPerWrite_N2, - WeiBlockCopySubLengths_E_K, - WeiBlockCopyClusterLengths_E_K, - WeiBlockCopyThreadClusterArrangeOrder, - WeiBlockCopySrcAccessOrder, - WeiBlockCopyDstAccessOrder, - WeiBlockCopySrcDataPerRead_E, - WeiBlockCopyDstDataPerWrite_K>{}; +#if 0 + GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded +#else + GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer +#endif + {}; for(index_t i = 0; i < nrepeat; ++i) { diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated.hpp similarity index 76% rename from driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp rename to driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated.hpp index b4dd0558f..5a47feb6e 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated.hpp @@ -3,27 +3,23 @@ #include "device.hpp" #include "tensor.hpp" #include "gridwise_convolution_kernel_wrapper.hpp" -#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp" - -template -void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc, - const Tensor& in_nchw, - WeiDesc, - const Tensor& wei_kcyx, - OutDesc, - Tensor& out_nkhw, - ConvStrides, - ConvDilations, - LeftPads, - RightPads, - ck::index_t nrepeat) +#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp" + +template +void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(InDesc, + const Tensor& in_nchw, + WeiDesc, + const Tensor& wei_kcyx, + OutDesc, + Tensor& out_nkhw, + ConvStrides, + ConvDilations, + ck::index_t nrepeat) { using namespace ck; @@ -32,12 +28,9 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc, constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - constexpr auto in_nchw_desc = - make_native_tensor_descriptor(InDesc::GetLengths(), InDesc::GetStrides()); - constexpr auto wei_kcyx_desc = - make_native_tensor_descriptor(WeiDesc::GetLengths(), WeiDesc::GetStrides()); - constexpr auto out_nkhw_desc = - make_native_tensor_descriptor(OutDesc::GetLengths(), OutDesc::GetStrides()); + constexpr auto in_nchw_desc = InDesc{}; + constexpr auto wei_kcyx_desc = WeiDesc{}; + constexpr auto out_nkhw_desc = OutDesc{}; constexpr index_t N = out_nkhw_desc.GetLength(I0); constexpr index_t K = out_nkhw_desc.GetLength(I1); @@ -54,7 +47,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc, out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); #if 1 - // BlockSize = 256, each thread hold 64 data + // BlockSize = 256, blockwise-GEMM 128x128, each thread hold 64 data constexpr index_t BlockSize = 256; constexpr index_t BPerBlock = 16; @@ -91,7 +84,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc, constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; #elif 0 - // BlockSize = 64, each thread hold 64 data + // BlockSize = 64, blockwise-GEMM 64x64, each thread hold 64 data constexpr index_t BlockSize = 64; constexpr index_t BPerBlock = 8; @@ -127,7 +120,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc, constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; constexpr index_t 
WeiBlockCopyDstDataPerWrite_K = 1; -#elif 0 +#elif 1 // BlockSize = 256, blockwise-GEMM 64x128, each thread hold 32 data constexpr index_t BlockSize = 256; @@ -177,48 +170,42 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc, printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); constexpr auto gridwise_conv = -#if 0 - GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded -#else - GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer -#endif - {}; + GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated< + GridSize, + BlockSize, + T, + decltype(in_nchw_desc), + decltype(wei_kcyx_desc), + decltype(out_nkhw_desc), + ConvStrides, + ConvDilations, + BPerBlock, + KPerBlock, + EPerBlock, + GemmNRepeat, + GemmMPerThreadSubC, + GemmNPerThreadSubC, + GemmMLevel0Cluster, + GemmNLevel0Cluster, + GemmMLevel1Cluster, + GemmNLevel1Cluster, + GemmKPerThreadLoop, + GemmDataPerReadA, + GemmDataPerReadB, + InBlockCopySubLengths_E_N1_B_N2, + InBlockCopyClusterLengths_E_N1_B_N2, + InBlockCopyThreadClusterArrangeOrder, + InBlockCopySrcAccessOrder, + InBlockCopyDstAccessOrder, + InBlockCopySrcDataPerRead_B, + InBlockCopyDstDataPerWrite_N2, + WeiBlockCopySubLengths_E_K, + WeiBlockCopyClusterLengths_E_K, + WeiBlockCopyThreadClusterArrangeOrder, + WeiBlockCopySrcAccessOrder, + WeiBlockCopyDstAccessOrder, + WeiBlockCopySrcDataPerRead_E, + WeiBlockCopyDstDataPerWrite_K>{}; for(index_t i = 0; i < nrepeat; ++i) { diff --git a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp index 98e5bf489..df8073917 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp @@ -5,14 +5,14 @@ #include "gridwise_convolution_kernel_wrapper.hpp" #include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp" -using namespace ck; - template + class ConvDilations, + class LeftPads, + class RightPads> void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc, const Tensor& in_nchw, WeiDesc, @@ -21,8 +21,12 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc, Tensor& out_nkhw, ConvStrides, ConvDilations, + LeftPads, + RightPads, ck::index_t nrepeat) { + using namespace ck; + constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; @@ -164,7 +168,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc, constexpr auto gridwise_conv = #if 0 - GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw + GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded #else GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer #endif @@ -176,6 +180,8 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc, decltype(out_nkhw_desc), ConvStrides, ConvDilations, + LeftPads, + RightPads, BPerBlock, KPerBlock, EPerBlock, diff --git a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_deprecated.hpp similarity index 92% rename from driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp rename to driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_deprecated.hpp index 86ba43d7d..cb51bfc1d 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp +++ 
b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_deprecated.hpp @@ -3,30 +3,26 @@ #include "device.hpp" #include "tensor.hpp" #include "gridwise_convolution_kernel_wrapper.hpp" -#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp" +#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp" + +using namespace ck; template -void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded(InDesc, - const Tensor& in_nchw, - WeiDesc, - const Tensor& wei_kcyx, - OutDesc, - Tensor& out_nkhw, - ConvStrides, - ConvDilations, - LeftPads, - RightPads, - ck::index_t nrepeat) + class ConvDilations> +void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_deprecated(InDesc, + const Tensor& in_nchw, + WeiDesc, + const Tensor& wei_kcyx, + OutDesc, + Tensor& out_nkhw, + ConvStrides, + ConvDilations, + ck::index_t nrepeat) { - using namespace ck; - constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; @@ -168,9 +164,9 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded(InDesc, constexpr auto gridwise_conv = #if 0 - GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded + GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw #else - GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer + GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer_deprecated #endif ; using RightPads = Sequence<2, 2>; -#elif 0 +#elif 1 // 7x1 filter, 3x0 pad, 17x17 input constexpr index_t N = 128; constexpr index_t C = 128; @@ -341,7 +341,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<3, 0>; using RightPads = Sequence<3, 0>; -#elif 1 +#elif 0 // 1x7 filter, 0x3 pad, 17x17 input constexpr index_t N = 128; constexpr index_t C = 128; @@ -439,6 +439,16 @@ int main(int argc, char* argv[]) device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw( (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); #elif 0 + device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(in_nchw_desc, + in_nchw, + wei_kcyx_desc, + wei_kcyx, + out_nkhw_desc, + out_nkhw_device, + ConvStrides{}, + ConvDilations{}, + nrepeat); +#elif 1 device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc, in_nchw, wei_kcyx_desc, @@ -447,19 +457,9 @@ int main(int argc, char* argv[]) out_nkhw_device, ConvStrides{}, ConvDilations{}, + LeftPads{}, + RightPads{}, nrepeat); -#elif 1 - device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(in_nchw_desc, - in_nchw, - wei_kcyx_desc, - wei_kcyx, - out_nkhw_desc, - out_nkhw_device, - ConvStrides{}, - ConvDilations{}, - LeftPads{}, - RightPads{}, - nrepeat); #elif 0 device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(in_nchw_desc, in_nchw, @@ -481,6 +481,16 @@ int main(int argc, char* argv[]) ConvDilations{}, nrepeat); #elif 0 + device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_deprecated(in_nchw_desc, + in_nchw, + wei_kcyx_desc, + wei_kcyx, + out_nkhw_desc, + out_nkhw_device, + ConvStrides{}, + ConvDilations{}, + nrepeat); +#elif 1 device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc, in_nchw, wei_kcyx_desc, @@ -489,19 +499,9 @@ int main(int argc, char* argv[]) out_nkhw_device, ConvStrides{}, ConvDilations{}, + LeftPads{}, + RightPads{}, nrepeat); -#elif 1 - device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded(in_nchw_desc, - in_nchw, - wei_kcyx_desc, - wei_kcyx, - out_nkhw_desc, - out_nkhw_device, - ConvStrides{}, - ConvDilations{}, - 
LeftPads{}, - RightPads{}, - nrepeat); #endif if(do_verification) From 15fd8d22dd4d8ac7c3c9a6375f6e8707b37bb392 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 7 Oct 2019 10:31:24 -0500 Subject: [PATCH 10/20] update amd build script --- driver/src/driver.cpp | 4 ++-- script/compile-hip.sh | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index f11183750..006d13bc6 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -326,7 +326,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<2, 2>; using RightPads = Sequence<2, 2>; -#elif 1 +#elif 0 // 7x1 filter, 3x0 pad, 17x17 input constexpr index_t N = 128; constexpr index_t C = 128; @@ -341,7 +341,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<3, 0>; using RightPads = Sequence<3, 0>; -#elif 0 +#elif 1 // 1x7 filter, 0x3 pad, 17x17 input constexpr index_t N = 128; constexpr index_t C = 128; diff --git a/script/compile-hip.sh b/script/compile-hip.sh index 5a61bc138..bae4d677e 100755 --- a/script/compile-hip.sh +++ b/script/compile-hip.sh @@ -1,8 +1,8 @@ #!/bin/bash + export KMOPTLLC="-mattr=+enable-ds128 -amdgpu-enable-global-sgpr-addr" export KMDUMPISA=1 export KMDUMPLLVM=1 -#export KMOPTLLC="-mattr=+enable-ds128" - export KMOPTLLC="-mattr=+enable-ds128 -amdgpu-enable-global-sgpr-addr" + export KMDUMPDIR=$PWD -make -j driver -/opt/rocm/hcc/bin/llvm-objdump -mcpu=gfx906 -source -line-numbers driver/dump-gfx906.isabin > driver/dump-gfx906.isabin.asm + make -j driver +#/opt/rocm/hcc/bin/llvm-objdump -mcpu=gfx906 -source -line-numbers driver/dump-gfx906.isabin > driver/dump-gfx906.isabin.asm From 0afc27e9d13072ecc8324eaf7063dafeeb064b32 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 7 Oct 2019 10:47:50 -0500 Subject: [PATCH 11/20] miopen_integration --- .../kernel_algorithm/convolution_common.hpp | 14 ++ ..._v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 72 ++++-- .../print_tensor_descriptor.hpp | 173 +++++++++++++ .../tensor_description/tensor_coordinate.hpp | 4 +- .../tensor_coordinate_deprecated.hpp | 14 +- .../tensor_coordinate_helper.hpp | 4 +- .../tensor_descriptor_helper.hpp | 236 +++--------------- ...e_generic_tensor_slice_copy_deprecated.hpp | 16 +- .../threadwise_generic_tensor_slice_copy.hpp | 8 - ...e_generic_tensor_slice_copy_deprecated.hpp | 62 ++--- .../include/utility/amd_inline_asm.hpp | 29 +++ .../include/utility/amd_intrinsic.hpp | 48 ++-- .../include/utility/config.amd.hpp.in | 21 +- .../include/utility/float_type.hpp | 108 ++++++++ .../include/utility/print_array.hpp | 177 +++++++++++++ .../include/utility/print_sequence.hpp | 46 ++++ .../include/utility/vector_type.hpp | 141 ++++++++++- ...tion_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp | 83 +++--- driver/src/driver.cpp | 6 +- 19 files changed, 887 insertions(+), 375 deletions(-) create mode 100644 composable_kernel/include/kernel_algorithm/convolution_common.hpp create mode 100644 composable_kernel/include/tensor_description/print_tensor_descriptor.hpp create mode 100644 composable_kernel/include/utility/float_type.hpp create mode 100644 composable_kernel/include/utility/print_array.hpp create mode 100644 composable_kernel/include/utility/print_sequence.hpp diff --git a/composable_kernel/include/kernel_algorithm/convolution_common.hpp b/composable_kernel/include/kernel_algorithm/convolution_common.hpp new file mode 100644 index 000000000..4bcb3347a --- /dev/null +++ b/composable_kernel/include/kernel_algorithm/convolution_common.hpp @@ -0,0 +1,14 @@ +#ifndef 
CK_CONVOLUTION_COMMON_HPP +#define CK_CONVOLUTION_COMMON_HPP + +namespace ck { + +enum ConvolutionDirection +{ + Forward, + BackwardData, + BackwardWeight +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp index 724a042c9..f3a98d773 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -8,13 +8,14 @@ #include "blockwise_generic_tensor_slice_copy.hpp" #include "threadwise_generic_tensor_slice_copy.hpp" #include "blockwise_gemm.hpp" +#include "convolution_common.hpp" namespace ck { -// define B = merge(N0, Ho, Wo) template struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer { + template + struct make_wei_e_k_global_desc; + + template <> + struct make_wei_e_k_global_desc + { + template + __device__ constexpr auto operator()(WeiDesc) const + { + constexpr auto I1 = Number<1>{}; + constexpr auto I3 = Number<3>{}; + + return reorder_tensor_descriptor_given_upper2lower( + unfold_tensor_descriptor(WeiDesc{}, I1, I3), Sequence<1, 0>{}); + } + }; + + template <> + struct make_wei_e_k_global_desc + { + template + __device__ constexpr auto operator()(WeiDesc) const + { + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto wei_k_c_y_x_global_desc = WeiDesc{}; + + constexpr index_t K = wei_k_c_y_x_global_desc.GetLength(I0); + constexpr index_t C = wei_k_c_y_x_global_desc.GetLength(I1); + constexpr index_t Y = wei_k_c_y_x_global_desc.GetLength(I2); + constexpr index_t X = wei_k_c_y_x_global_desc.GetLength(I3); + + return transform_tensor_descriptor( + unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I2, I3), + make_tuple(Merge>{}, PassThrough{}), + make_tuple(Sequence<1, 2>{}, Sequence<0>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + }; + __device__ void Run(const Float* const __restrict__ p_in_global, const Float* const __restrict__ p_wei_global, Float* const __restrict__ p_out_global) const { + static_assert(ConvDirection == ConvolutionDirection::Forward || + ConvDirection == ConvolutionDirection::BackwardWeight, + "wrong! 
this kernel only support convolution forward and backward-weight"); + // this is a mess // TODO: find more elegent way of specifying (or calculating) performance parameters constexpr index_t N1 = GemmNRepeat; @@ -181,9 +230,11 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0}); // weight tensor - // tensor descriptor in device memory, src of blockwise copy - constexpr auto wei_e_k_global_desc = reorder_tensor_descriptor_given_upper2lower( - unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3), Sequence<1, 0>{}); + // Tensor descriptor in device memory, src of blockwise copy + // It is constructed differently, depending on whether forward or backward weight + // convolution + constexpr auto wei_e_k_global_desc = + make_wei_e_k_global_desc{}(wei_k_c_y_x_global_desc); // tensor descriptor in LDS, dst of blockwise copy // be careful of LDS alignment @@ -274,7 +325,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer __shared__ Float p_wei_block_double[2 * wei_block_space]; // register allocation for output - Float p_out_thread[c_k0k2_n1n2_thread_mtx_desc.GetElementSpace()]; + AccDataType p_out_thread[c_k0k2_n1n2_thread_mtx_desc.GetElementSpace()]; // zero out threadwise output threadwise_matrix_set_zero(c_k0k2_n1n2_thread_mtx_desc, p_out_thread); @@ -424,15 +475,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer 0, b_thread_data_on_global, 0}) -#if 1 - .template Run -#else // tweaking - .template Run_optimized_dst_address_calculation -#endif - (p_out_thread, p_out_global); + .template Run( + p_out_thread, p_out_global); } } }; diff --git a/composable_kernel/include/tensor_description/print_tensor_descriptor.hpp b/composable_kernel/include/tensor_description/print_tensor_descriptor.hpp new file mode 100644 index 000000000..89174e27b --- /dev/null +++ b/composable_kernel/include/tensor_description/print_tensor_descriptor.hpp @@ -0,0 +1,173 @@ +#ifndef CK_PRINT_TENSOR_DESCRIPTOR_HPP +#define CK_PRINT_TENSOR_DESCRIPTOR_HPP + +#include "common_header.hpp" +#include "tensor_descriptor.hpp" + +namespace ck { + +template +__host__ __device__ void +print_tensor_descriptor(const char* s, const NativeTensorDescriptor& desc) +{ + print_tensor_descriptor_impl(s, desc.GetLengths(), desc.GetStrides()); +} + +template +__host__ __device__ void print_tensor_descriptor(const char* s, + const TransformedTensorDescriptor& desc) +{ + print_tensor_descriptor_impl(s, desc.GetLengths()); +} + +template +__host__ __device__ void +print_tensor_descriptor_impl(const char* s, Sequence, Sequence) +{ + constexpr index_t nDim = sizeof...(Lengths); + + static_assert(nDim > 0 && nDim <= 12, "wrong!"); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u}, strides {%u}\n", s, nDim, Lengths..., Strides...); + }); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u}, strides {%u %u}\n", s, nDim, Lengths..., Strides...); + }); + + static_if{}([&](auto) { + printf( + "%s dim %u, lengths {%u %u %u}, strides {%u %u %u}\n", s, nDim, Lengths..., Strides...); + }); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u}, strides {%u %u %u %u}\n", + s, + nDim, + Lengths..., + Strides...); + }); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u %u}, strides {%u %u %u %u %u}\n", + s, + nDim, + Lengths..., + Strides...); + }); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u %u %u}, strides {%u %u %u %u %u %u}\n", + s, + 
nDim, + Lengths..., + Strides...); + }); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u}\n", + s, + nDim, + Lengths..., + Strides...); + }); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u}\n", + s, + nDim, + Lengths..., + Strides...); + }); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u " + "%u}\n", + s, + nDim, + Lengths..., + Strides...); + }); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u " + "%u %u %u}\n", + s, + nDim, + Lengths..., + Strides...); + }); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u " + "%u %u " + "%u %u %u}\n", + s, + nDim, + Lengths..., + Strides...); + }); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u " + "%u %u %u %u " + "%u %u %u}\n", + s, + nDim, + Lengths..., + Strides...); + }); +} + +template +__host__ __device__ void print_tensor_descriptor_impl(const char* s, Sequence) +{ + constexpr index_t nDim = sizeof...(Lengths); + + static_assert(nDim > 0 && nDim <= 12, "wrong!"); + + static_if{}([&](auto) { printf("%s dim %u, lengths {%u}\n", s, nDim, Lengths...); }); + + static_if{}( + [&](auto) { printf("%s dim %u, lengths {%u %u}\n", s, nDim, Lengths...); }); + + static_if{}( + [&](auto) { printf("%s dim %u, lengths {%u %u %u}\n", s, nDim, Lengths...); }); + + static_if{}( + [&](auto) { printf("%s dim %u, lengths {%u %u %u %u}\n", s, nDim, Lengths...); }); + + static_if{}( + [&](auto) { printf("%s dim %u, lengths {%u %u %u %u %u}\n", s, nDim, Lengths...); }); + + static_if{}( + [&](auto) { printf("%s dim %u, lengths {%u %u %u %u %u %u}, \n", s, nDim, Lengths...); }); + + static_if{}( + [&](auto) { printf("%s dim %u, lengths {%u %u %u %u %u %u %u}\n", s, nDim, Lengths...); }); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...); + }); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...); + }); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...); + }); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...); + }); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...); + }); +} + +} // namespace ck + +#endif diff --git a/composable_kernel/include/tensor_description/tensor_coordinate.hpp b/composable_kernel/include/tensor_description/tensor_coordinate.hpp index ae7e58778..5ce5bc700 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate.hpp @@ -1,5 +1,5 @@ -#ifndef CK_TENSOR_COORDINATE_V2_HPP -#define CK_TENSOR_COORDINATE_V2_HPP +#ifndef CK_TENSOR_COORDINATE_HPP +#define CK_TENSOR_COORDINATE_HPP #include "common_header.hpp" #include "dimension.hpp" diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp index aaddc1251..69659445a 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp +++ 
b/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp @@ -97,16 +97,17 @@ struct NormalTensorCoordinate_deprecated // TensorDesc is ConstantMergedTensorDescriptor_deprecated template -struct MergedTensorCoordinate +struct MergedTensorCoordinate_deprecated { - using type = MergedTensorCoordinate; + using type = MergedTensorCoordinate_deprecated; using tensor_desc_type = TensorDesc; static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension(); static constexpr index_t nOriginalDim = tensor_desc_type::GetOriginalTensorDescriptor().GetNumOfDimension(); - __host__ __device__ constexpr MergedTensorCoordinate(Array tensor_index) + __host__ + __device__ constexpr MergedTensorCoordinate_deprecated(Array tensor_index) : mOriginalIndex{tensor_desc_type::GetOriginalMultiIndexFromMultiIndex(tensor_index)} { // partial offset on each dimension @@ -127,8 +128,8 @@ struct MergedTensorCoordinate } template - __host__ __device__ constexpr MergedTensorCoordinate(Xs... xs) - : MergedTensorCoordinate(Array{xs...}) + __host__ __device__ constexpr MergedTensorCoordinate_deprecated(Xs... xs) + : MergedTensorCoordinate_deprecated(Array{xs...}) { } @@ -335,7 +336,8 @@ struct TensorCoordinate_deprecated __host__ __device__ static constexpr auto MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor_deprecated) { - return MergedTensorCoordinate>(); + return MergedTensorCoordinate_deprecated< + ConstantMergedTensorDescriptor_deprecated>(); } public: diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_helper.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_helper.hpp index 93cb077c2..2cacb329c 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate_helper.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate_helper.hpp @@ -1,13 +1,13 @@ #ifndef CK_TENSOR_COORDINATE_HELPER_HPP #define CK_TENSOR_COORDINATE_HELPER_HPP -#include "tensor_coordiante_v2.hpp" +#include "tensor_coordiante_hpp" namespace ck { template __host__ __device__ constexpr auto -make_tensor_coordinate_v2(TensorDesc, MultiIndex idx) +make_tensor_coordinate(TensorDesc, MultiIndex idx) { return typename TensorCoordinate::type(idx); } diff --git a/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp b/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp index 65fe69850..d7ef38672 100644 --- a/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp +++ b/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp @@ -63,10 +63,11 @@ template -__host__ __device__ constexpr auto reorder_tensor_descriptor_impl(LowerTensorDescriptor, - Sequence, - Sequence, - Sequence) +__host__ __device__ constexpr auto + reorder_transformed_tensor_descriptor_impl(LowerTensorDescriptor, + Sequence, + Sequence, + Sequence) { return TransformedTensorDescriptor...>, @@ -74,17 +75,40 @@ __host__ __device__ constexpr auto reorder_tensor_descriptor_impl(LowerTensorDes Tuple...>>{}; } -template +// reorder a NativeTensorDescriptor +template +__host__ __device__ constexpr auto + reorder_tensor_descriptor_given_lower2upper(NativeTensorDescriptor, MapLower2Upper) +{ + static_assert(is_valid_sequence_map{}, + "wrong! 
MapLower2Upper is not a valid map"); + + constexpr auto old_desc = NativeTensorDescriptor{}; + + static_assert(old_desc.GetNumOfDimension() == MapLower2Upper::Size(), "wrong!"); + + constexpr auto new_lengths = old_desc.GetLengths().ReorderGivenOld2New(MapLower2Upper{}); + constexpr auto new_strides = old_desc.GetStrides().ReorderGivenOld2New(MapLower2Upper{}); + + return make_native_tensor_descriptor(new_lengths, new_strides); +} + +// reorder a TransformedTensorDescriptor +template __host__ __device__ constexpr auto - reorder_tensor_descriptor_given_lower2upper(LowerTensorDescriptor, MapLower2Upper) + reorder_tensor_descriptor_given_lower2upper(TransformedTensorDescriptor, MapLower2Upper) { static_assert(is_valid_sequence_map{}, "wrong! MapLower2Upper is not a valid map"); - return reorder_tensor_descriptor_impl( - LowerTensorDescriptor{}, - LowerTensorDescriptor::GetLengths(), - typename arithmetic_sequence_gen<0, LowerTensorDescriptor::GetNumOfDimension(), 1>::type{}, + constexpr auto low_desc = TransformedTensorDescriptor{}; + + static_assert(low_desc.GetNumOfDimension() == MapLower2Upper::Size(), "wrong!"); + + return reorder_transformed_tensor_descriptor_impl( + low_desc, + low_desc.GetLengths(), + typename arithmetic_sequence_gen<0, low_desc.GetNumOfDimension(), 1>::type{}, MapLower2Upper{}); } @@ -97,7 +121,7 @@ __host__ __device__ constexpr auto } template -__host__ __device__ constexpr bool AreDimensionsUnfoldable(Lengths, Strides) +__host__ __device__ constexpr bool are_dimensions_unfoldable(Lengths, Strides) { static_assert(Lengths::Size() == Strides::Size(), "wrong!"); @@ -129,7 +153,7 @@ __host__ __device__ constexpr auto unfold_tensor_descriptor(NativeTensorDescript constexpr auto right = typename arithmetic_sequence_gen::type{}; // sanity-checknfoldable - static_assert(AreDimensionsUnfoldable(desc.GetLengths(middle), desc.GetStrides(middle)), + static_assert(are_dimensions_unfoldable(desc.GetLengths(middle), desc.GetStrides(middle)), "wrong! 
not unfoldable"); // unfolded length, stride @@ -148,30 +172,6 @@ __host__ __device__ constexpr auto unfold_tensor_descriptor(NativeTensorDescript return make_native_tensor_descriptor(new_lengths, new_strides); } -#if 0 -// not implemented -template -__host__ __device__ constexpr auto - pad_tensor_descriptor(LowerTensorDescriptor, PadLowerDimensionIds, LeftPads, RightPads) -{ - constexpr index_t nDim = LowerTensorDescriptor::GetNumOfDimension(); - - constexpr auto non_pad_low_dim_ids = xxx; - - return transform_tensor_descriptor( - LowerTensorDescriptor{}, - make_tuple(Pad{}) - .PushBack(PassThrough...), - make_tuple(PadLowerDimensionIds{}).PushBack(xxxx), - sequence_to_tuple(typename arithmetic_sequence_gen<0, nDim, 1> i::type{})); -} -#endif - // a cluster map 1d index to N-d index template struct ClusterDescriptor @@ -205,169 +205,7 @@ template ::type{}) { - return ClusterDescriptor{}; -} - -template -__host__ __device__ void -print_tensor_descriptor(const char* s, const NativeTensorDescriptor& desc) -{ - print_tensor_descriptor_impl(s, desc.GetLengths(), desc.GetStrides()); -} - -template -__host__ __device__ void print_tensor_descriptor(const char* s, - const TransformedTensorDescriptor& desc) -{ - print_tensor_descriptor_impl(s, desc.GetLengths()); -} - -template -__host__ __device__ void -print_tensor_descriptor_impl(const char* s, Sequence, Sequence) -{ - constexpr index_t nDim = sizeof...(Lengths); - - static_assert(nDim > 0 && nDim <= 12, "wrong!"); - - static_if{}([&](auto) { - printf("%s dim %u, lengths {%u}, strides {%u}\n", s, nDim, Lengths..., Strides...); - }); - - static_if{}([&](auto) { - printf("%s dim %u, lengths {%u %u}, strides {%u %u}\n", s, nDim, Lengths..., Strides...); - }); - - static_if{}([&](auto) { - printf( - "%s dim %u, lengths {%u %u %u}, strides {%u %u %u}\n", s, nDim, Lengths..., Strides...); - }); - - static_if{}([&](auto) { - printf("%s dim %u, lengths {%u %u %u %u}, strides {%u %u %u %u}\n", - s, - nDim, - Lengths..., - Strides...); - }); - - static_if{}([&](auto) { - printf("%s dim %u, lengths {%u %u %u %u %u}, strides {%u %u %u %u %u}\n", - s, - nDim, - Lengths..., - Strides...); - }); - - static_if{}([&](auto) { - printf("%s dim %u, lengths {%u %u %u %u %u %u}, strides {%u %u %u %u %u %u}\n", - s, - nDim, - Lengths..., - Strides...); - }); - - static_if{}([&](auto) { - printf("%s dim %u, lengths {%u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u}\n", - s, - nDim, - Lengths..., - Strides...); - }); - - static_if{}([&](auto) { - printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u}\n", - s, - nDim, - Lengths..., - Strides...); - }); - - static_if{}([&](auto) { - printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u " - "%u}\n", - s, - nDim, - Lengths..., - Strides...); - }); - - static_if{}([&](auto) { - printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u " - "%u %u %u}\n", - s, - nDim, - Lengths..., - Strides...); - }); - - static_if{}([&](auto) { - printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u " - "%u %u " - "%u %u %u}\n", - s, - nDim, - Lengths..., - Strides...); - }); - - static_if{}([&](auto) { - printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u " - "%u %u %u %u " - "%u %u %u}\n", - s, - nDim, - Lengths..., - Strides...); - }); -} - -template -__host__ __device__ void print_tensor_descriptor_impl(const char* s, Sequence) -{ - constexpr index_t nDim = 
sizeof...(Lengths); - - static_assert(nDim > 0 && nDim <= 12, "wrong!"); - - static_if{}([&](auto) { printf("%s dim %u, lengths {%u}\n", s, nDim, Lengths...); }); - - static_if{}( - [&](auto) { printf("%s dim %u, lengths {%u %u}\n", s, nDim, Lengths...); }); - - static_if{}( - [&](auto) { printf("%s dim %u, lengths {%u %u %u}\n", s, nDim, Lengths...); }); - - static_if{}( - [&](auto) { printf("%s dim %u, lengths {%u %u %u %u}\n", s, nDim, Lengths...); }); - - static_if{}( - [&](auto) { printf("%s dim %u, lengths {%u %u %u %u %u}\n", s, nDim, Lengths...); }); - - static_if{}( - [&](auto) { printf("%s dim %u, lengths {%u %u %u %u %u %u}, \n", s, nDim, Lengths...); }); - - static_if{}( - [&](auto) { printf("%s dim %u, lengths {%u %u %u %u %u %u %u}\n", s, nDim, Lengths...); }); - - static_if{}([&](auto) { - printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...); - }); - - static_if{}([&](auto) { - printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...); - }); - - static_if{}([&](auto) { - printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...); - }); - - static_if{}([&](auto) { - printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...); - }); - - static_if{}([&](auto) { - printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...); - }); + return ClusterDescriptor{}; } } // namespace ck diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp index c922384a9..399a47407 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp @@ -193,14 +193,14 @@ struct BlockwiseGenericTensorSliceCopy_v1_deprecated return make_ConstantTensorDescriptor_packed(SubLengths{} * repeat_lengths); } - __device__ static constexpr index_t GetRegisterBufferSize() + __device__ static constexpr index_t GetThreadBufferSize() { return GetRegisterBufferDescriptor().GetElementSpace(); } template - __device__ void RunLoadRegisterBuffer(const TData* __restrict__ p_src, - TData* __restrict__ p_buffer) const + __device__ void RunLoadThreadBuffer(const TData* __restrict__ p_src, + TData* __restrict__ p_buffer) const { constexpr auto thread_sub_tensor_lengths = SubLengths{}; @@ -255,8 +255,8 @@ struct BlockwiseGenericTensorSliceCopy_v1_deprecated } template - __device__ void RunStoreRegisterBuffer(const TData* __restrict__ p_buffer, - TData* __restrict__ p_dst) const + __device__ void RunStoreThreadBuffer(const TData* __restrict__ p_buffer, + TData* __restrict__ p_dst) const { constexpr auto thread_sub_tensor_lengths = SubLengths{}; @@ -312,10 +312,10 @@ struct BlockwiseGenericTensorSliceCopy_v1_deprecated template __device__ void Run(const TData* __restrict__ p_src, TData* __restrict__ p_dst) const { - TData p_buffer[GetRegisterBufferSize()]; + TData p_buffer[GetThreadBufferSize()]; - RunLoadRegisterBuffer(p_src, p_buffer); - RunStoreRegisterBuffer(p_buffer, p_dst); + RunLoadThreadBuffer(p_src, p_buffer); + RunStoreThreadBuffer(p_buffer, p_dst); } // When moving the slicing windows along a merged dimension, if the strides of the diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp 
b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index 6a61c2c05..378473e1f 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -6,14 +6,6 @@ #include "tensor_descriptor_helper.hpp" #include "tensor_coordinate.hpp" -#ifndef CK_USE_AMD_INTRINSIC -#define CK_USE_AMD_INTRINSIC 1 -#endif - -#ifndef CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC -#define CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC 1 -#endif - namespace ck { // This version use multi-index transformation diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp index 0310addd3..c70929f3f 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp @@ -80,11 +80,11 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2_deprecated mDstSliceOrigin = dst_slice_origin; } - template - __device__ void Run(const TData* p_src, TData* p_dst) const + template + __device__ void Run(const SrcData* p_src, DstData* p_dst) const { - using src_vector_t = typename vector_type::MemoryType; - using dst_vector_t = typename vector_type::MemoryType; + using src_vector_t = typename vector_type::MemoryType; + using dst_vector_t = typename vector_type::MemoryType; constexpr auto vector_access_dim = Number{}; @@ -96,46 +96,6 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2_deprecated constexpr auto long_vector_access_lengths = SliceLengths::Modify( vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size); -#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 - static_ford{}([&]( - auto long_vector_access_id) { - - // data id w.r.t slicing-window - constexpr auto long_vector_data_begin_id = long_vector_access_id.Modify( - vector_access_dim, long_vector_access_id[vector_access_dim] * long_vector_size); - - // buffer to hold a long-vector - TData p_long_vector[long_vector_size]; - - // load data from src to the long-vector buffer - static_for<0, long_vector_size / src_data_per_access, 1>{}([&](auto i) { - constexpr auto scalar_id = typename uniform_sequence_gen::type{}.Modify( - vector_access_dim, i * src_data_per_access); - - const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex( - mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id)); - - constexpr index_t buffer_offset = i * src_data_per_access; - - *reinterpret_cast(&p_long_vector[buffer_offset]) = - *reinterpret_cast(&p_src[src_offset]); - }); - - // store data from the long-vector buffer to dst - static_for<0, long_vector_size / dst_data_per_access, 1>{}([&](auto i) { - constexpr auto scalar_id = typename uniform_sequence_gen::type{}.Modify( - vector_access_dim, i * dst_data_per_access); - - constexpr index_t buffer_offset = i * dst_data_per_access; - - const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex( - mDstSliceOrigin + (long_vector_data_begin_id + scalar_id)); - - *reinterpret_cast(&p_dst[dst_offset]) = - *reinterpret_cast(&p_long_vector[buffer_offset]); - }); - }); -#else ford{}( [&](auto long_vector_access_id) { @@ -145,7 +105,8 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2_deprecated long_vector_size * long_vector_access_id[vector_access_dim]; // buffer to hold a long-vector - 
TData p_long_vector[long_vector_size]; + SrcData p_src_long_vector[long_vector_size]; + DstData p_dst_long_vector[long_vector_size]; // load data from src to the long-vector buffer for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i) @@ -158,10 +119,16 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2_deprecated const index_t buffer_offset = i * src_data_per_access; - *reinterpret_cast(&p_long_vector[buffer_offset]) = + *reinterpret_cast(&p_src_long_vector[buffer_offset]) = *reinterpret_cast(&p_src[src_offset]); } + // type conversion + for(index_t i = 0; i < long_vector_size; ++i) + { + p_dst_long_vector[i] = type_convert{}(p_src_long_vector[i]); + } + // store data from the long-vector buffer to dst for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i) { @@ -174,10 +141,9 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2_deprecated mDstSliceOrigin + (long_vector_data_begin_id + scalar_id)); *reinterpret_cast(&p_dst[dst_offset]) = - *reinterpret_cast(&p_long_vector[buffer_offset]); + *reinterpret_cast(&p_dst_long_vector[buffer_offset]); } }); -#endif } private: diff --git a/composable_kernel/include/utility/amd_inline_asm.hpp b/composable_kernel/include/utility/amd_inline_asm.hpp index c764b27d2..006659710 100644 --- a/composable_kernel/include/utility/amd_inline_asm.hpp +++ b/composable_kernel/include/utility/amd_inline_asm.hpp @@ -77,6 +77,35 @@ __device__ void __outer_product_1x2(half4_t a, half4_t b0, half4_t b1, float& c0 "1"(c1)); // 3rd Src Acc registers for 2 half2 registers } +// outer-product: c[i,j] += inner_product(a[i], b[j]) +__device__ void __outer_product_1x4(half2_t a, + half2_t b0, + half2_t b1, + half2_t b2, + half2_t b3, + float& c0, + float& c1, + float& c2, + float& c3) +{ + asm volatile("\n \ + v_dot2_f32_f16 %0, %4, %5 %0\n \ + v_dot2_f32_f16 %1, %4, %6 %1\n \ + v_dot2_f32_f16 %2, %4, %7 %2\n \ + v_dot2_f32_f16 %3, %4, %8 %3\n \ + " + : "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3) // Dest registers + : "v"(a), // 1st Src register for 1 half2 registers + "v"(b0), // 2nd Src register + "v"(b1), + "v"(b2), + "v"(b3), + "0"(c0), // 3rd Src register + "1"(c1), + "2"(c2), + "3"(c3)); +} + // outer-product: c[i,j] += inner_product(a[i], b[j]) __device__ void __outer_product_1x4(half4_t a, half4_t b0, diff --git a/composable_kernel/include/utility/amd_intrinsic.hpp b/composable_kernel/include/utility/amd_intrinsic.hpp index a5bbd8782..d161edd98 100644 --- a/composable_kernel/include/utility/amd_intrinsic.hpp +++ b/composable_kernel/include/utility/amd_intrinsic.hpp @@ -70,9 +70,9 @@ __device__ float __buffer_load(const float* p_src_block, // fill in byte 0 - 1 *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); // fill in byte 2 - reinterpret_cast(&src_block_setting)[2] = -1; + reinterpret_cast(&src_block_setting)[2] = -1; // fill in byte 3 - reinterpret_cast(&src_block_setting)[3] = 0x00027000; + reinterpret_cast(&src_block_setting)[3] = 0x00027000; asm volatile("\n \ buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \ @@ -92,9 +92,9 @@ __device__ float __buffer_load(const float* p_src_block, // fill in byte 0 - 1 *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); // fill in byte 2 - reinterpret_cast(&src_block_setting)[2] = -1; + reinterpret_cast(&src_block_setting)[2] = -1; // fill in byte 3 - reinterpret_cast(&src_block_setting)[3] = 0x00027000; + reinterpret_cast(&src_block_setting)[3] = 0x00027000; dst = __llvm_amdgcn_buffer_load( src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false); @@ 
-118,9 +118,9 @@ __device__ float2_t __buffer_load(const float* p_src_block, // fill in byte 0 - 1 *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); // fill in byte 2 - reinterpret_cast(&src_block_setting)[2] = -1; + reinterpret_cast(&src_block_setting)[2] = -1; // fill in byte 3 - reinterpret_cast(&src_block_setting)[3] = 0x00027000; + reinterpret_cast(&src_block_setting)[3] = 0x00027000; asm volatile("\n \ buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \ @@ -140,9 +140,9 @@ __device__ float2_t __buffer_load(const float* p_src_block, // fill in byte 0 - 1 *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); // fill in byte 2 - reinterpret_cast(&src_block_setting)[2] = -1; + reinterpret_cast(&src_block_setting)[2] = -1; // fill in byte 3 - reinterpret_cast(&src_block_setting)[3] = 0x00027000; + reinterpret_cast(&src_block_setting)[3] = 0x00027000; dst = __llvm_amdgcn_buffer_loadx2( src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false); @@ -166,9 +166,9 @@ __device__ float4_t __buffer_load(const float* p_src_block, // fill in byte 0 - 1 *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); // fill in byte 2 - reinterpret_cast(&src_block_setting)[2] = -1; + reinterpret_cast(&src_block_setting)[2] = -1; // fill in byte 3 - reinterpret_cast(&src_block_setting)[3] = 0x00027000; + reinterpret_cast(&src_block_setting)[3] = 0x00027000; asm volatile("\n \ buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \ @@ -188,9 +188,9 @@ __device__ float4_t __buffer_load(const float* p_src_block, // fill in byte 0 - 1 *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); // fill in byte 2 - reinterpret_cast(&src_block_setting)[2] = -1; + reinterpret_cast(&src_block_setting)[2] = -1; // fill in byte 3 - reinterpret_cast(&src_block_setting)[3] = 0x00027000; + reinterpret_cast(&src_block_setting)[3] = 0x00027000; dst = __llvm_amdgcn_buffer_loadx4( src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false); @@ -213,9 +213,9 @@ __device__ void __buffer_store(const float& src, // fill in byte 0 - 1 *reinterpret_cast(&dst_block_setting) = p_dst_block; // fill in byte 2 - reinterpret_cast(&dst_block_setting)[2] = -1; + reinterpret_cast(&dst_block_setting)[2] = -1; // fill in byte 3 - reinterpret_cast(&dst_block_setting)[3] = 0x00027000; + reinterpret_cast(&dst_block_setting)[3] = 0x00027000; asm volatile("\n \ buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \ @@ -233,9 +233,9 @@ __device__ void __buffer_store(const float& src, // fill in byte 0 - 1 *reinterpret_cast(&dst_block_setting) = p_dst_block; // fill in byte 2 - reinterpret_cast(&dst_block_setting)[2] = -1; + reinterpret_cast(&dst_block_setting)[2] = -1; // fill in byte 3 - reinterpret_cast(&dst_block_setting)[3] = 0x00027000; + reinterpret_cast(&dst_block_setting)[3] = 0x00027000; __llvm_amdgcn_buffer_store( src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false); @@ -256,9 +256,9 @@ __device__ void __buffer_store(const float2_t& src, // fill in byte 0 - 1 *reinterpret_cast(&dst_block_setting) = p_dst_block; // fill in byte 2 - reinterpret_cast(&dst_block_setting)[2] = -1; + reinterpret_cast(&dst_block_setting)[2] = -1; // fill in byte 3 - reinterpret_cast(&dst_block_setting)[3] = 0x00027000; + reinterpret_cast(&dst_block_setting)[3] = 0x00027000; asm volatile("\n \ buffer_store_dwordx2 %1, %2, %0, %3 offen offset:0 \n \ @@ -276,9 +276,9 @@ __device__ void __buffer_store(const float2_t& src, // fill in byte 0 - 
1 *reinterpret_cast(&dst_block_setting) = p_dst_block; // fill in byte 2 - reinterpret_cast(&dst_block_setting)[2] = -1; + reinterpret_cast(&dst_block_setting)[2] = -1; // fill in byte 3 - reinterpret_cast(&dst_block_setting)[3] = 0x00027000; + reinterpret_cast(&dst_block_setting)[3] = 0x00027000; __llvm_amdgcn_buffer_storex2( src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false); @@ -299,9 +299,9 @@ __device__ void __buffer_store(const float4_t& src, // fill in byte 0 - 1 *reinterpret_cast(&dst_block_setting) = p_dst_block; // fill in byte 2 - reinterpret_cast(&dst_block_setting)[2] = -1; + reinterpret_cast(&dst_block_setting)[2] = -1; // fill in byte 3 - reinterpret_cast(&dst_block_setting)[3] = 0x00027000; + reinterpret_cast(&dst_block_setting)[3] = 0x00027000; asm volatile("\n \ buffer_store_dwordx4 %1, %2, %0, %3 offen offset:0 \n \ @@ -319,9 +319,9 @@ __device__ void __buffer_store(const float4_t& src, // fill in byte 0 - 1 *reinterpret_cast(&dst_block_setting) = p_dst_block; // fill in byte 2 - reinterpret_cast(&dst_block_setting)[2] = -1; + reinterpret_cast(&dst_block_setting)[2] = -1; // fill in byte 3 - reinterpret_cast(&dst_block_setting)[3] = 0x00027000; + reinterpret_cast(&dst_block_setting)[3] = 0x00027000; __llvm_amdgcn_buffer_storex4( src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false); diff --git a/composable_kernel/include/utility/config.amd.hpp.in b/composable_kernel/include/utility/config.amd.hpp.in index 1da362b81..beb9e083b 100644 --- a/composable_kernel/include/utility/config.amd.hpp.in +++ b/composable_kernel/include/utility/config.amd.hpp.in @@ -20,15 +20,6 @@ #define CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM 1 #endif -// AMD XDLOPS -#ifndef CK_USE_AMD_XDLOPS -#define CK_USE_AMD_XDLOPS 1 -#endif - -#ifndef CK_USE_AMD_XDLOPS_INLINE_ASM -#define CK_USE_AMD_XDLOPS_INLINE_ASM 1 -#endif - // AMD llvm intrinsic #ifndef CK_USE_AMD_INTRINSIC #define CK_USE_AMD_INTRINSIC 1 @@ -38,10 +29,18 @@ #define CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC 1 #endif +// AMD XDLOPS +#ifndef CK_USE_AMD_XDLOPS +#define CK_USE_AMD_XDLOPS 1 +#endif + +#ifndef CK_USE_AMD_XDLOPS_INLINE_ASM +#define CK_USE_AMD_XDLOPS_INLINE_ASM 1 +#endif + // experimental implementation #define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 1 -#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1 -#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0 +#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0 diff --git a/composable_kernel/include/utility/float_type.hpp b/composable_kernel/include/utility/float_type.hpp new file mode 100644 index 000000000..06368305d --- /dev/null +++ b/composable_kernel/include/utility/float_type.hpp @@ -0,0 +1,108 @@ +#ifndef CK_FLOAT_TYPE_AMD_HPP +#define CK_FLOAT_TYPE_AMD_HPP + +namespace ck { + +// For some reason, HIP compiler need this definition to generate optimal ISA +// float +typedef float float2_t __attribute__((ext_vector_type(2))); +typedef float float4_t __attribute__((ext_vector_type(4))); +typedef float float32_t __attribute__((ext_vector_type(32))); + +// float16 +typedef _Float16 half2_t __attribute__((ext_vector_type(2))); +typedef _Float16 half4_t __attribute__((ext_vector_type(4))); + +// bfloat16 +typedef ushort ushort2_t 
__attribute__((ext_vector_type(2))); +typedef ushort ushort4_t __attribute__((ext_vector_type(4))); + +// data type conversion +template +struct type_convert +{ + template + __device__ T operator()(X x) const + { + return static_cast(x); + } +}; + +template <> +template <> +__device__ float type_convert::operator()(ushort x) const +{ + return bfloat16_to_float(x); +} + +template <> +template <> +__device__ ushort type_convert::operator()(float x) const +{ + return float_to_bfloat16(x); +} + +template +struct inner_product_with_conversion +{ + static constexpr auto convert = type_convert(); + + __device__ T operator()(float a, float b) const { return convert(a) * convert(b); } + + __device__ T operator()(half2_t a, half2_t b) const + { + const half* p_a_half = reinterpret_cast(&a); + const half* p_b_half = reinterpret_cast(&b); + + T acc = 0; + for(index_t v = 0; v < 2; ++v) + { + acc += convert(p_a_half[v]) * convert(p_b_half[v]); + } + + return acc; + } + + __device__ T operator()(half4_t a, half4_t b) const + { + const half* p_a_half = reinterpret_cast(&a); + const half* p_b_half = reinterpret_cast(&b); + + T acc = 0; + for(index_t v = 0; v < 4; ++v) + { + acc += convert(p_a_half[v]) * convert(p_b_half[v]); + } + return acc; + } + + __device__ T operator()(ushort2_t a, ushort2_t b) const + { + const ushort* p_a_bfloat16 = reinterpret_cast(&a); + const ushort* p_b_bfloat16 = reinterpret_cast(&b); + + T acc = 0; + for(index_t v = 0; v < 2; ++v) + { + acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]); + } + + return acc; + } + + __device__ T operator()(ushort4_t a, ushort4_t b) const + { + const ushort* p_a_bfloat16 = reinterpret_cast(&a); + const ushort* p_b_bfloat16 = reinterpret_cast(&b); + + T acc = 0; + for(index_t v = 0; v < 4; ++v) + { + acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]); + } + return acc; + } +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/print_array.hpp b/composable_kernel/include/utility/print_array.hpp new file mode 100644 index 000000000..34769af2f --- /dev/null +++ b/composable_kernel/include/utility/print_array.hpp @@ -0,0 +1,177 @@ +#ifndef CK_ARRAY_HELPER_HPP +#define CK_ARRAY_HELPER_HPP + +#include "array.hpp" + +namespace ck { + +template +__host__ __device__ void print_array(const char* s, Array a) +{ + constexpr index_t nsize = a.GetSize(); + + static_assert(nsize > 0 && nsize <= 10, "wrong!"); + + static_if{}([&](auto) { printf("%s size %u, {%u}\n", s, nsize, a[0]); }); + + static_if{}([&](auto) { printf("%s size %u, {%u %u}\n", s, nsize, a[0], a[1]); }); + + static_if{}( + [&](auto) { printf("%s size %u, {%u %u %u}\n", s, nsize, a[0], a[1], a[2]); }); + + static_if{}( + [&](auto) { printf("%s size %u, {%u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3]); }); + + static_if{}([&](auto) { + printf("%s size %u, {%u %u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3], a[4]); + }); + + static_if{}([&](auto) { + printf("%s size %u, {%u %u %u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3], a[4], a[5]); + }); + + static_if{}([&](auto) { + printf("%s size %u, {%u %u %u %u %u %u %u}\n", + s, + nsize, + a[0], + a[1], + a[2], + a[3], + a[4], + a[5], + a[6]); + }); + + static_if{}([&](auto) { + printf("%s size %u, {%u %u %u %u %u %u %u %u}\n", + s, + nsize, + a[0], + a[1], + a[2], + a[3], + a[4], + a[5], + a[6], + a[7]); + }); + + static_if{}([&](auto) { + printf("%s size %u, {%u %u %u %u %u %u %u %u %u}\n", + s, + nsize, + a[0], + a[1], + a[2], + a[3], + a[4], + a[5], + a[6], + a[7], + a[8]); + }); + + 
static_if{}([&](auto) { + printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n", + s, + nsize, + a[0], + a[1], + a[2], + a[3], + a[4], + a[5], + a[6], + a[7], + a[8], + a[9]); + }); +} + +template +__host__ __device__ void print_array(const char* s, Array a) +{ + constexpr index_t nsize = a.GetSize(); + + static_assert(nsize > 0 && nsize <= 10, "wrong!"); + + static_if{}([&](auto) { printf("%s size %d, {%d}\n", s, nsize, a[0]); }); + + static_if{}([&](auto) { printf("%s size %d, {%d %d}\n", s, nsize, a[0], a[1]); }); + + static_if{}( + [&](auto) { printf("%s size %d, {%d %d %d}\n", s, nsize, a[0], a[1], a[2]); }); + + static_if{}( + [&](auto) { printf("%s size %d, {%d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3]); }); + + static_if{}([&](auto) { + printf("%s size %d, {%d %d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3], a[4]); + }); + + static_if{}([&](auto) { + printf("%s size %d, {%d %d %d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3], a[4], a[5]); + }); + + static_if{}([&](auto) { + printf("%s size %d, {%d %d %d %d %d %d %d}\n", + s, + nsize, + a[0], + a[1], + a[2], + a[3], + a[4], + a[5], + a[6]); + }); + + static_if{}([&](auto) { + printf("%s size %d, {%d %d %d %d %d %d %d %d}\n", + s, + nsize, + a[0], + a[1], + a[2], + a[3], + a[4], + a[5], + a[6], + a[7]); + }); + + static_if{}([&](auto) { + printf("%s size %d, {%d %d %d %d %d %d %d %d %d}\n", + s, + nsize, + a[0], + a[1], + a[2], + a[3], + a[4], + a[5], + a[6], + a[7], + a[8]); + }); + + static_if{}([&](auto) { + printf("%s size %d, {%d %d %d %d %d %d %d %d %d %d}\n", + s, + nsize, + a[0], + a[1], + a[2], + a[3], + a[4], + a[5], + a[6], + a[7], + a[8], + a[9]); + }); +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/print_sequence.hpp b/composable_kernel/include/utility/print_sequence.hpp new file mode 100644 index 000000000..71abfea1f --- /dev/null +++ b/composable_kernel/include/utility/print_sequence.hpp @@ -0,0 +1,46 @@ +#ifndef CK_SEQUENCE_HELPER_HPP +#define CK_SEQUENCE_HELPER_HPP + +#include "sequence.hpp" + +namespace ck { + +template +__host__ __device__ void print_sequence(const char* s, Sequence) +{ + constexpr index_t nsize = Sequence::Size(); + + static_assert(nsize <= 10, "wrong!"); + + static_if{}([&](auto) { printf("%s size %u, {}\n", s, nsize, Xs...); }); + + static_if{}([&](auto) { printf("%s size %u, {%u}\n", s, nsize, Xs...); }); + + static_if{}([&](auto) { printf("%s size %u, {%u %u}\n", s, nsize, Xs...); }); + + static_if{}([&](auto) { printf("%s size %u, {%u %u %u}\n", s, nsize, Xs...); }); + + static_if{}([&](auto) { printf("%s size %u, {%u %u %u %u}\n", s, nsize, Xs...); }); + + static_if{}( + [&](auto) { printf("%s size %u, {%u %u %u %u %u}\n", s, nsize, Xs...); }); + + static_if{}( + [&](auto) { printf("%s size %u, {%u %u %u %u %u %u}\n", s, nsize, Xs...); }); + + static_if{}( + [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u}\n", s, nsize, Xs...); }); + + static_if{}( + [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); }); + + static_if{}( + [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); }); + + static_if{}( + [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); }); +} + +} // namespace ck + +#endif diff --git a/composable_kernel/include/utility/vector_type.hpp b/composable_kernel/include/utility/vector_type.hpp index 01c6539b2..e9b3fe36d 100644 --- a/composable_kernel/include/utility/vector_type.hpp +++ b/composable_kernel/include/utility/vector_type.hpp @@ -9,6 
+9,10 @@ namespace ck { template struct vector_type { + typedef struct + { + T scalar[N]; + } MemoryType; }; template <> @@ -29,7 +33,7 @@ struct vector_type { using MemoryType = float2_t; - union Data + union DataType { MemoryType vector; float scalar[2]; @@ -44,7 +48,7 @@ struct vector_type __host__ __device__ static MemoryType Pack(float s0, float s1) { - Data data; + DataType data; data.scalar[0] = s0; data.scalar[1] = s1; return data.vector; @@ -56,6 +60,8 @@ struct vector_type { using MemoryType = float4_t; + __host__ __device__ static constexpr index_t GetSize() { return 4; } + template __host__ __device__ static void SetScalar(MemoryType& v, float s, Number) { @@ -65,23 +71,142 @@ struct vector_type }; template <> -struct vector_type +struct vector_type { - using MemoryType = const float; + using MemoryType = half; + + template + __host__ __device__ static void SetScalar(MemoryType& v, half s, Number) + { + static_assert(I < 1, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } }; template <> -struct vector_type +struct vector_type { - using MemoryType = const float2_t; + using MemoryType = half2_t; + + union DataType + { + MemoryType vector; + half scalar[2]; + }; + + template + __host__ __device__ static void SetScalar(MemoryType& v, half s, Number) + { + static_assert(I < 2, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } + + __host__ __device__ static MemoryType Pack(half s0, half s1) + { + DataType data; + data.scalar[0] = s0; + data.scalar[1] = s1; + return data.vector; + } }; template <> -struct vector_type +struct vector_type { - using MemoryType = const float4_t; + using MemoryType = half4_t; + + union DataType + { + MemoryType vector; + half scalar[4]; + }; + + template + __host__ __device__ static void SetScalar(MemoryType& v, half s, Number) + { + static_assert(I < 4, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } + + __host__ __device__ static MemoryType Pack(half s0, half s1, half s2, half s3) + { + DataType data; + data.scalar[0] = s0; + data.scalar[1] = s1; + data.scalar[2] = s2; + data.scalar[3] = s3; + return data.vector; + } }; +template <> +struct vector_type +{ + using MemoryType = ushort; + + template + __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number) + { + static_assert(I < 1, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } +}; + +template <> +struct vector_type +{ + using MemoryType = ushort2_t; + + union DataType + { + MemoryType vector; + ushort scalar[2]; + }; + + template + __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number) + { + static_assert(I < 2, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } + + __host__ __device__ static MemoryType Pack(ushort s0, ushort s1) + { + DataType data; + data.scalar[0] = s0; + data.scalar[1] = s1; + return data.vector; + } +}; + +template <> +struct vector_type +{ + using MemoryType = ushort4_t; + + union DataType + { + MemoryType vector; + ushort scalar[4]; + }; + + template + __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number) + { + static_assert(I < 4, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } + + __host__ __device__ static MemoryType Pack(ushort s0, ushort s1, ushort s2, ushort s3) + { + DataType data; + data.scalar[0] = s0; + data.scalar[1] = s1; + data.scalar[2] = s2; + data.scalar[3] = s3; + return data.vector; + } +}; } // namespace ck #endif diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp index d361db801..ccff9e725 100644 
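A minimal usage sketch for the vector_type specializations above, assuming float_type.hpp and vector_type.hpp are included and that the half/half2_t types resolve as elsewhere in this series (pack_and_patch is only an illustrative name, not part of the patch). Pack builds a half2_t from two fp16 scalars, and SetScalar overwrites one lane selected by a compile-time Number<> index:

    namespace ck {

    // sketch only: pack two fp16 scalars, then rewrite lane 1 in place
    __device__ half2_t pack_and_patch(half a, half b, half c)
    {
        half2_t v = vector_type<half, 2>::Pack(a, b);        // v = {a, b}
        vector_type<half, 2>::SetScalar(v, c, Number<1>{});  // v = {a, c}; lane index is bounds-checked by static_assert
        return v;
    }

    } // namespace ck

The ushort2_t/ushort4_t (bfloat16) specializations expose the same Pack/SetScalar surface, which is what lets the copy and GEMM paths stay agnostic of the element type.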
--- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp @@ -3,6 +3,7 @@ #include "device.hpp" #include "tensor.hpp" #include "gridwise_convolution_kernel_wrapper.hpp" +#include "convolution_common.hpp" #include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp" template {}; + GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer< + GridSize, + BlockSize, + T, + T, + decltype(in_nchw_desc), + decltype(wei_kcyx_desc), + decltype(out_nkhw_desc), + ConvStrides, + ConvDilations, + LeftPads, + RightPads, + ConvolutionDirection::Forward, + BPerBlock, + KPerBlock, + EPerBlock, + GemmNRepeat, + GemmMPerThreadSubC, + GemmNPerThreadSubC, + GemmMLevel0Cluster, + GemmNLevel0Cluster, + GemmMLevel1Cluster, + GemmNLevel1Cluster, + GemmKPerThreadLoop, + GemmDataPerReadA, + GemmDataPerReadB, + InBlockCopySubLengths_E_N1_B_N2, + InBlockCopyClusterLengths_E_N1_B_N2, + InBlockCopyThreadClusterArrangeOrder, + InBlockCopySrcAccessOrder, + InBlockCopyDstAccessOrder, + InBlockCopySrcDataPerRead_B, + InBlockCopyDstDataPerWrite_N2, + WeiBlockCopySubLengths_E_K, + WeiBlockCopyClusterLengths_E_K, + WeiBlockCopyThreadClusterArrangeOrder, + WeiBlockCopySrcAccessOrder, + WeiBlockCopyDstAccessOrder, + WeiBlockCopySrcDataPerRead_E, + WeiBlockCopyDstDataPerWrite_K>{}; for(index_t i = 0; i < nrepeat; ++i) { diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index 006d13bc6..957134842 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -295,7 +295,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>; -#elif 0 +#elif 1 // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81% constexpr index_t N = 128; @@ -341,7 +341,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<3, 0>; using RightPads = Sequence<3, 0>; -#elif 1 +#elif 0 // 1x7 filter, 0x3 pad, 17x17 input constexpr index_t N = 128; constexpr index_t C = 128; @@ -438,7 +438,7 @@ int main(int argc, char* argv[]) #elif 0 device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw( (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); -#elif 0 +#elif 1 device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(in_nchw_desc, in_nchw, wei_kcyx_desc, From 906b384018d980cb13257852f03614404dcf1529 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 7 Oct 2019 11:04:58 -0500 Subject: [PATCH 12/20] refactor for nvidia build --- ..._v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 88 ++++---- ...kcyx_nkhw_lds_double_buffer_deprecated.hpp | 1 - .../ConstantMatrixDescriptor.hpp | 2 +- .../tensor_description/tensor_coordinate.hpp | 4 +- .../tensor_coordinate_deprecated.hpp | 4 +- .../tensor_descriptor_helper.hpp | 12 +- .../include/utility/common_header.hpp | 3 +- .../include/utility/float_type.amd.hpp.in | 202 +++++++++++++++++ .../include/utility/float_type.hpp | 108 --------- .../include/utility/float_type.nvidia.hpp.in | 106 +++++++++ .../include/utility/vector_type.hpp | 212 ------------------ driver/src/driver.cpp | 2 +- 12 files changed, 365 insertions(+), 379 deletions(-) delete mode 100644 composable_kernel/include/utility/float_type.hpp delete mode 100644 composable_kernel/include/utility/vector_type.hpp diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp 
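Each driver case toggled above pins a full problem description (N, C, Hi, Wi, K, filter size, strides, dilations, pads) and states the expected output size in its comment. The relationship is the usual convolution output-size formula; a small standalone check, not part of the patch:

#include <cstdio>

// ho = (hi + left_pad + right_pad - dilation*(y - 1) - 1) / stride + 1
constexpr int conv_out_size(int in, int left_pad, int right_pad, int filter, int dilation, int stride)
{
    return (in + left_pad + right_pad - dilation * (filter - 1) - 1) / stride + 1;
}

int main()
{
    // the "3x3 filter, 2x2 stride, 35x35 input, 17x17 output" case enabled in driver.cpp
    printf("ho = %d\n", conv_out_size(35, 0, 0, 3, 1, 2)); // prints ho = 17
    return 0;
}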
b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp index f3a98d773..09d275913 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -12,6 +12,49 @@ namespace ck { +template +struct make_wei_e_k_global_desc_v4r1; + +template <> +struct make_wei_e_k_global_desc_v4r1 +{ + template + __device__ constexpr auto operator()(WeiDesc) const + { + constexpr auto I1 = Number<1>{}; + constexpr auto I3 = Number<3>{}; + + return reorder_tensor_descriptor_given_upper2lower( + unfold_tensor_descriptor(WeiDesc{}, I1, I3), Sequence<1, 0>{}); + } +}; + +template <> +struct make_wei_e_k_global_desc_v4r1 +{ + template + __device__ constexpr auto operator()(WeiDesc) const + { + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto wei_k_c_y_x_global_desc = WeiDesc{}; + + constexpr index_t K = wei_k_c_y_x_global_desc.GetLength(I0); + constexpr index_t C = wei_k_c_y_x_global_desc.GetLength(I1); + constexpr index_t Y = wei_k_c_y_x_global_desc.GetLength(I2); + constexpr index_t X = wei_k_c_y_x_global_desc.GetLength(I3); + + return transform_tensor_descriptor( + unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I2, I3), + make_tuple(Merge>{}, PassThrough{}), + make_tuple(Sequence<1, 2>{}, Sequence<0>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } +}; + template struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer { - template - struct make_wei_e_k_global_desc; - - template <> - struct make_wei_e_k_global_desc - { - template - __device__ constexpr auto operator()(WeiDesc) const - { - constexpr auto I1 = Number<1>{}; - constexpr auto I3 = Number<3>{}; - - return reorder_tensor_descriptor_given_upper2lower( - unfold_tensor_descriptor(WeiDesc{}, I1, I3), Sequence<1, 0>{}); - } - }; - - template <> - struct make_wei_e_k_global_desc - { - template - __device__ constexpr auto operator()(WeiDesc) const - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - constexpr auto wei_k_c_y_x_global_desc = WeiDesc{}; - - constexpr index_t K = wei_k_c_y_x_global_desc.GetLength(I0); - constexpr index_t C = wei_k_c_y_x_global_desc.GetLength(I1); - constexpr index_t Y = wei_k_c_y_x_global_desc.GetLength(I2); - constexpr index_t X = wei_k_c_y_x_global_desc.GetLength(I3); - - return transform_tensor_descriptor( - unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I2, I3), - make_tuple(Merge>{}, PassThrough{}), - make_tuple(Sequence<1, 2>{}, Sequence<0>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - }; - __device__ void Run(const Float* const __restrict__ p_in_global, const Float* const __restrict__ p_wei_global, Float* const __restrict__ p_out_global) const @@ -234,7 +234,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer // It is constructed differently, depending on whether forward or backward weight // convolution constexpr auto wei_e_k_global_desc = - make_wei_e_k_global_desc{}(wei_k_c_y_x_global_desc); + make_wei_e_k_global_desc_v4r1{}(wei_k_c_y_x_global_desc); // tensor descriptor in LDS, dst of blockwise copy // be careful of LDS alignment diff --git 
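Hoisting make_wei_e_k_global_desc_v4r1 out of the gridwise struct does not change what it computes: both specializations build a 2-D E x K view of the packed K-C-Y-X weight tensor, with E = C*Y*X in the forward direction. In plain index arithmetic the forward mapping looks like the sketch below (hypothetical helpers, assuming a fully packed row-major KCYX layout; the real code expresses this through unfold/reorder descriptor transforms):

// Offset of weight element (k, c, y, x) in a packed KCYX tensor.
constexpr int kcyx_offset(int k, int c, int y, int x, int C, int Y, int X)
{
    return ((k * C + c) * Y + y) * X + x;
}

// Offset of the same element seen through the E x K view, with e = (c*Y + y)*X + x.
constexpr int e_k_offset(int e, int k, int C, int Y, int X)
{
    return k * (C * Y * X) + e;
}

int main()
{
    constexpr int C = 3, Y = 2, X = 2;
    constexpr int k = 4, c = 1, y = 1, x = 0;
    constexpr int e = (c * Y + y) * X + x;
    static_assert(kcyx_offset(k, c, y, x, C, Y, X) == e_k_offset(e, k, C, Y, X),
                  "the E x K view addresses the same memory");
    return 0;
}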
a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp index 267e8e0a6..db92631a3 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp @@ -67,7 +67,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_dep constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - constexpr auto I5 = Number<5>{}; constexpr auto True = integral_constant{}; diff --git a/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp b/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp index 0ebd9dc4a..e2a5836ed 100644 --- a/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp +++ b/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp @@ -60,7 +60,7 @@ __host__ __device__ constexpr auto template __host__ __device__ constexpr auto - make_ConstantMatrixDescriptor(ConstantTensorDescriptor_deprecated) +make_ConstantMatrixDescriptor(ConstantTensorDescriptor_deprecated) { using TDesc = ConstantTensorDescriptor_deprecated; static_assert(TDesc::GetNumOfDimension() == 2, "wrong"); diff --git a/composable_kernel/include/tensor_description/tensor_coordinate.hpp b/composable_kernel/include/tensor_description/tensor_coordinate.hpp index 5ce5bc700..4b3a60c67 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate.hpp @@ -215,7 +215,7 @@ struct TensorCoordinate private: template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(NativeTensorDescriptor) + MakeDummyTensorCoordinate(NativeTensorDescriptor) { return NativeTensorCoordinate>( make_zero_array()); @@ -223,7 +223,7 @@ struct TensorCoordinate template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(TransformedTensorDescriptor) + MakeDummyTensorCoordinate(TransformedTensorDescriptor) { return TransformedTensorCoordinate>( make_zero_array()); diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp index 69659445a..da02abdd5 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp @@ -327,14 +327,14 @@ struct TensorCoordinate_deprecated private: template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantTensorDescriptor_deprecated) + MakeDummyTensorCoordinate(ConstantTensorDescriptor_deprecated) { return NormalTensorCoordinate_deprecated>(); } template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor_deprecated) + MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor_deprecated) { return MergedTensorCoordinate_deprecated< ConstantMergedTensorDescriptor_deprecated>(); diff --git a/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp b/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp index d7ef38672..1597e4c57 
100644 --- a/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp +++ b/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp @@ -64,10 +64,10 @@ template __host__ __device__ constexpr auto - reorder_transformed_tensor_descriptor_impl(LowerTensorDescriptor, - Sequence, - Sequence, - Sequence) +reorder_transformed_tensor_descriptor_impl(LowerTensorDescriptor, + Sequence, + Sequence, + Sequence) { return TransformedTensorDescriptor...>, @@ -78,7 +78,7 @@ __host__ __device__ constexpr auto // reorder a NativeTensorDescriptor template __host__ __device__ constexpr auto - reorder_tensor_descriptor_given_lower2upper(NativeTensorDescriptor, MapLower2Upper) +reorder_tensor_descriptor_given_lower2upper(NativeTensorDescriptor, MapLower2Upper) { static_assert(is_valid_sequence_map{}, "wrong! MapLower2Upper is not a valid map"); @@ -96,7 +96,7 @@ __host__ __device__ constexpr auto // reorder a TransformedTensorDescriptor template __host__ __device__ constexpr auto - reorder_tensor_descriptor_given_lower2upper(TransformedTensorDescriptor, MapLower2Upper) +reorder_tensor_descriptor_given_lower2upper(TransformedTensorDescriptor, MapLower2Upper) { static_assert(is_valid_sequence_map{}, "wrong! MapLower2Upper is not a valid map"); diff --git a/composable_kernel/include/utility/common_header.hpp b/composable_kernel/include/utility/common_header.hpp index 441eecae9..20584c335 100644 --- a/composable_kernel/include/utility/common_header.hpp +++ b/composable_kernel/include/utility/common_header.hpp @@ -2,14 +2,13 @@ #define CK_COMMON_HEADER_HPP #include "config.hpp" -#include "float_type.hpp" #include "utility.hpp" #include "integral_constant.hpp" #include "number.hpp" +#include "float_type.hpp" #include "type.hpp" #include "tuple.hpp" #include "math.hpp" -#include "vector_type.hpp" #include "sequence.hpp" #include "sequence_helper.hpp" #include "array.hpp" diff --git a/composable_kernel/include/utility/float_type.amd.hpp.in b/composable_kernel/include/utility/float_type.amd.hpp.in index 06368305d..537d17daf 100644 --- a/composable_kernel/include/utility/float_type.amd.hpp.in +++ b/composable_kernel/include/utility/float_type.amd.hpp.in @@ -17,6 +17,208 @@ typedef _Float16 half4_t __attribute__((ext_vector_type(4))); typedef ushort ushort2_t __attribute__((ext_vector_type(2))); typedef ushort ushort4_t __attribute__((ext_vector_type(4))); +template +struct vector_type +{ + typedef struct + { + T scalar[N]; + } MemoryType; +}; + +template <> +struct vector_type +{ + using MemoryType = float; + + template + __host__ __device__ static void SetScalar(MemoryType& v, float s, Number) + { + static_assert(I < 1, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } +}; + +template <> +struct vector_type +{ + using MemoryType = float2_t; + + union DataType + { + MemoryType vector; + float scalar[2]; + }; + + template + __host__ __device__ static void SetScalar(MemoryType& v, float s, Number) + { + static_assert(I < 2, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } + + __host__ __device__ static MemoryType Pack(float s0, float s1) + { + DataType data; + data.scalar[0] = s0; + data.scalar[1] = s1; + return data.vector; + } +}; + +template <> +struct vector_type +{ + using MemoryType = float4_t; + + __host__ __device__ static constexpr index_t GetSize() { return 4; } + + template + __host__ __device__ static void SetScalar(MemoryType& v, float s, Number) + { + static_assert(I < 4, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } +}; + +template <> +struct vector_type +{ + using 
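SetScalar in the specializations above takes the element index as a Number<I> so that an out-of-range index is rejected by static_assert at compile time rather than at run time. A standalone analogue using std::integral_constant in place of ck::Number (illustration only):

#include <cstdio>
#include <type_traits>

typedef float float4_t __attribute__((ext_vector_type(4)));

template <int I>
void set_scalar(float4_t& v, float s, std::integral_constant<int, I>)
{
    static_assert(I < 4, "wrong");           // index checked at compile time
    *(reinterpret_cast<float*>(&v) + I) = s; // same element-write pattern as the patch
}

int main()
{
    float4_t v = {0, 0, 0, 0};
    set_scalar(v, 3.0f, std::integral_constant<int, 2>{});
    printf("%f\n", v.z); // prints 3.000000
    return 0;
}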
MemoryType = half; + + template + __host__ __device__ static void SetScalar(MemoryType& v, half s, Number) + { + static_assert(I < 1, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } +}; + +template <> +struct vector_type +{ + using MemoryType = half2_t; + + union DataType + { + MemoryType vector; + half scalar[2]; + }; + + template + __host__ __device__ static void SetScalar(MemoryType& v, half s, Number) + { + static_assert(I < 2, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } + + __host__ __device__ static MemoryType Pack(half s0, half s1) + { + DataType data; + data.scalar[0] = s0; + data.scalar[1] = s1; + return data.vector; + } +}; + +template <> +struct vector_type +{ + using MemoryType = half4_t; + + union DataType + { + MemoryType vector; + half scalar[4]; + }; + + template + __host__ __device__ static void SetScalar(MemoryType& v, half s, Number) + { + static_assert(I < 4, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } + + __host__ __device__ static MemoryType Pack(half s0, half s1, half s2, half s3) + { + DataType data; + data.scalar[0] = s0; + data.scalar[1] = s1; + data.scalar[2] = s2; + data.scalar[3] = s3; + return data.vector; + } +}; + +template <> +struct vector_type +{ + using MemoryType = ushort; + + template + __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number) + { + static_assert(I < 1, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } +}; + +template <> +struct vector_type +{ + using MemoryType = ushort2_t; + + union DataType + { + MemoryType vector; + ushort scalar[2]; + }; + + template + __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number) + { + static_assert(I < 2, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } + + __host__ __device__ static MemoryType Pack(ushort s0, ushort s1) + { + DataType data; + data.scalar[0] = s0; + data.scalar[1] = s1; + return data.vector; + } +}; + +template <> +struct vector_type +{ + using MemoryType = ushort4_t; + + union DataType + { + MemoryType vector; + ushort scalar[4]; + }; + + template + __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number) + { + static_assert(I < 4, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } + + __host__ __device__ static MemoryType Pack(ushort s0, ushort s1, ushort s2, ushort s3) + { + DataType data; + data.scalar[0] = s0; + data.scalar[1] = s1; + data.scalar[2] = s2; + data.scalar[3] = s3; + return data.vector; + } +}; + // data type conversion template struct type_convert diff --git a/composable_kernel/include/utility/float_type.hpp b/composable_kernel/include/utility/float_type.hpp deleted file mode 100644 index 06368305d..000000000 --- a/composable_kernel/include/utility/float_type.hpp +++ /dev/null @@ -1,108 +0,0 @@ -#ifndef CK_FLOAT_TYPE_AMD_HPP -#define CK_FLOAT_TYPE_AMD_HPP - -namespace ck { - -// For some reason, HIP compiler need this definition to generate optimal ISA -// float -typedef float float2_t __attribute__((ext_vector_type(2))); -typedef float float4_t __attribute__((ext_vector_type(4))); -typedef float float32_t __attribute__((ext_vector_type(32))); - -// float16 -typedef _Float16 half2_t __attribute__((ext_vector_type(2))); -typedef _Float16 half4_t __attribute__((ext_vector_type(4))); - -// bfloat16 -typedef ushort ushort2_t __attribute__((ext_vector_type(2))); -typedef ushort ushort4_t __attribute__((ext_vector_type(4))); - -// data type conversion -template -struct type_convert -{ - template - __device__ T operator()(X x) const - { - return static_cast(x); - } -}; - -template <> -template <> -__device__ 
float type_convert::operator()(ushort x) const -{ - return bfloat16_to_float(x); -} - -template <> -template <> -__device__ ushort type_convert::operator()(float x) const -{ - return float_to_bfloat16(x); -} - -template -struct inner_product_with_conversion -{ - static constexpr auto convert = type_convert(); - - __device__ T operator()(float a, float b) const { return convert(a) * convert(b); } - - __device__ T operator()(half2_t a, half2_t b) const - { - const half* p_a_half = reinterpret_cast(&a); - const half* p_b_half = reinterpret_cast(&b); - - T acc = 0; - for(index_t v = 0; v < 2; ++v) - { - acc += convert(p_a_half[v]) * convert(p_b_half[v]); - } - - return acc; - } - - __device__ T operator()(half4_t a, half4_t b) const - { - const half* p_a_half = reinterpret_cast(&a); - const half* p_b_half = reinterpret_cast(&b); - - T acc = 0; - for(index_t v = 0; v < 4; ++v) - { - acc += convert(p_a_half[v]) * convert(p_b_half[v]); - } - return acc; - } - - __device__ T operator()(ushort2_t a, ushort2_t b) const - { - const ushort* p_a_bfloat16 = reinterpret_cast(&a); - const ushort* p_b_bfloat16 = reinterpret_cast(&b); - - T acc = 0; - for(index_t v = 0; v < 2; ++v) - { - acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]); - } - - return acc; - } - - __device__ T operator()(ushort4_t a, ushort4_t b) const - { - const ushort* p_a_bfloat16 = reinterpret_cast(&a); - const ushort* p_b_bfloat16 = reinterpret_cast(&b); - - T acc = 0; - for(index_t v = 0; v < 4; ++v) - { - acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]); - } - return acc; - } -}; - -} // namespace ck -#endif diff --git a/composable_kernel/include/utility/float_type.nvidia.hpp.in b/composable_kernel/include/utility/float_type.nvidia.hpp.in index fbb93a437..8be8c704a 100644 --- a/composable_kernel/include/utility/float_type.nvidia.hpp.in +++ b/composable_kernel/include/utility/float_type.nvidia.hpp.in @@ -1,6 +1,8 @@ #ifndef CK_FLOAT_TYPE_NVIDIA_HPP #define CK_FLOAT_TYPE_NVIDIA_HPP +#include "number.hpp" + namespace ck { // For some reason, CUDA need this definition, otherwise @@ -14,6 +16,110 @@ using float4_t = float4; // float16 using half2_t = half2; +template +struct vector_type +{ + typedef struct + { + T scalar[N]; + } MemoryType; +}; + +template <> +struct vector_type +{ + using MemoryType = float; + + template + __host__ __device__ static void SetScalar(MemoryType& v, float s, Number) + { + static_assert(I < 1, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } +}; + +template <> +struct vector_type +{ + using MemoryType = float2_t; + + union DataType + { + MemoryType vector; + float scalar[2]; + }; + + template + __host__ __device__ static void SetScalar(MemoryType& v, float s, Number) + { + static_assert(I < 2, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } + + __host__ __device__ static MemoryType Pack(float s0, float s1) + { + DataType data; + data.scalar[0] = s0; + data.scalar[1] = s1; + return data.vector; + } +}; + +template <> +struct vector_type +{ + using MemoryType = float4_t; + + __host__ __device__ static constexpr index_t GetSize() { return 4; } + + template + __host__ __device__ static void SetScalar(MemoryType& v, float s, Number) + { + static_assert(I < 4, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } +}; + +template <> +struct vector_type +{ + using MemoryType = half; + + template + __host__ __device__ static void SetScalar(MemoryType& v, half s, Number) + { + static_assert(I < 1, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } +}; + +template <> +struct vector_type +{ + using 
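The deleted float_type.hpp above stores bfloat16 values in plain ushort and converts through bfloat16_to_float / float_to_bfloat16 before multiplying in inner_product_with_conversion. For reference, a minimal host-side sketch of that convention (truncating conversion; hypothetical function names, shown only to make the ushort-as-bfloat16 storage concrete):

#include <cstdint>
#include <cstdio>
#include <cstring>

// bfloat16 is the upper 16 bits of an IEEE-754 binary32 value.
inline float bfloat16_to_float_sketch(uint16_t x)
{
    uint32_t bits = static_cast<uint32_t>(x) << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

// Truncating float -> bfloat16 (production code usually adds rounding).
inline uint16_t float_to_bfloat16_sketch(float f)
{
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    return static_cast<uint16_t>(bits >> 16);
}

int main()
{
    uint16_t b = float_to_bfloat16_sketch(1.5f);
    printf("%f\n", bfloat16_to_float_sketch(b)); // 1.5 is exactly representable in bfloat16
    return 0;
}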
MemoryType = half2_t; + + union DataType + { + MemoryType vector; + half scalar[2]; + }; + + template + __host__ __device__ static void SetScalar(MemoryType& v, half s, Number) + { + static_assert(I < 2, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } + + __host__ __device__ static MemoryType Pack(half s0, half s1) + { + DataType data; + data.scalar[0] = s0; + data.scalar[1] = s1; + return data.vector; + } +}; + // data type conversion template struct type_convert diff --git a/composable_kernel/include/utility/vector_type.hpp b/composable_kernel/include/utility/vector_type.hpp deleted file mode 100644 index e9b3fe36d..000000000 --- a/composable_kernel/include/utility/vector_type.hpp +++ /dev/null @@ -1,212 +0,0 @@ -#ifndef CK_VECTOR_TYPE_HPP -#define CK_VECTOR_TYPE_HPP - -#include "config.hpp" -#include "integral_constant.hpp" - -namespace ck { - -template -struct vector_type -{ - typedef struct - { - T scalar[N]; - } MemoryType; -}; - -template <> -struct vector_type -{ - using MemoryType = float; - - template - __host__ __device__ static void SetScalar(MemoryType& v, float s, Number) - { - static_assert(I < 1, "wrong"); - *(reinterpret_cast(&v) + I) = s; - } -}; - -template <> -struct vector_type -{ - using MemoryType = float2_t; - - union DataType - { - MemoryType vector; - float scalar[2]; - }; - - template - __host__ __device__ static void SetScalar(MemoryType& v, float s, Number) - { - static_assert(I < 2, "wrong"); - *(reinterpret_cast(&v) + I) = s; - } - - __host__ __device__ static MemoryType Pack(float s0, float s1) - { - DataType data; - data.scalar[0] = s0; - data.scalar[1] = s1; - return data.vector; - } -}; - -template <> -struct vector_type -{ - using MemoryType = float4_t; - - __host__ __device__ static constexpr index_t GetSize() { return 4; } - - template - __host__ __device__ static void SetScalar(MemoryType& v, float s, Number) - { - static_assert(I < 4, "wrong"); - *(reinterpret_cast(&v) + I) = s; - } -}; - -template <> -struct vector_type -{ - using MemoryType = half; - - template - __host__ __device__ static void SetScalar(MemoryType& v, half s, Number) - { - static_assert(I < 1, "wrong"); - *(reinterpret_cast(&v) + I) = s; - } -}; - -template <> -struct vector_type -{ - using MemoryType = half2_t; - - union DataType - { - MemoryType vector; - half scalar[2]; - }; - - template - __host__ __device__ static void SetScalar(MemoryType& v, half s, Number) - { - static_assert(I < 2, "wrong"); - *(reinterpret_cast(&v) + I) = s; - } - - __host__ __device__ static MemoryType Pack(half s0, half s1) - { - DataType data; - data.scalar[0] = s0; - data.scalar[1] = s1; - return data.vector; - } -}; - -template <> -struct vector_type -{ - using MemoryType = half4_t; - - union DataType - { - MemoryType vector; - half scalar[4]; - }; - - template - __host__ __device__ static void SetScalar(MemoryType& v, half s, Number) - { - static_assert(I < 4, "wrong"); - *(reinterpret_cast(&v) + I) = s; - } - - __host__ __device__ static MemoryType Pack(half s0, half s1, half s2, half s3) - { - DataType data; - data.scalar[0] = s0; - data.scalar[1] = s1; - data.scalar[2] = s2; - data.scalar[3] = s3; - return data.vector; - } -}; - -template <> -struct vector_type -{ - using MemoryType = ushort; - - template - __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number) - { - static_assert(I < 1, "wrong"); - *(reinterpret_cast(&v) + I) = s; - } -}; - -template <> -struct vector_type -{ - using MemoryType = ushort2_t; - - union DataType - { - MemoryType vector; - ushort scalar[2]; - 
}; - - template - __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number) - { - static_assert(I < 2, "wrong"); - *(reinterpret_cast(&v) + I) = s; - } - - __host__ __device__ static MemoryType Pack(ushort s0, ushort s1) - { - DataType data; - data.scalar[0] = s0; - data.scalar[1] = s1; - return data.vector; - } -}; - -template <> -struct vector_type -{ - using MemoryType = ushort4_t; - - union DataType - { - MemoryType vector; - ushort scalar[4]; - }; - - template - __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number) - { - static_assert(I < 4, "wrong"); - *(reinterpret_cast(&v) + I) = s; - } - - __host__ __device__ static MemoryType Pack(ushort s0, ushort s1, ushort s2, ushort s3) - { - DataType data; - data.scalar[0] = s0; - data.scalar[1] = s1; - data.scalar[2] = s2; - data.scalar[3] = s3; - return data.vector; - } -}; -} // namespace ck - -#endif diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index 957134842..1a819e112 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -438,7 +438,7 @@ int main(int argc, char* argv[]) #elif 0 device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw( (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); -#elif 1 +#elif 0 device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(in_nchw_desc, in_nchw, wei_kcyx_desc, From 18a2c5cb87b92103281e660220f3c581c8b78145 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 7 Oct 2019 15:37:33 -0500 Subject: [PATCH 13/20] refactor --- .../ConstantMatrixDescriptor.hpp | 2 +- .../tensor_description/tensor_coordinate.hpp | 4 +- .../tensor_coordinate_deprecated.hpp | 4 +- .../tensor_descriptor_helper.hpp | 12 +- .../include/utility/amd_inline_asm.hpp | 2 +- .../include/utility/amd_intrinsic.hpp | 184 +++++++++++------- .../include/utility/array_helper.hpp | 177 ----------------- .../include/utility/common_header.hpp | 2 - .../include/utility/config.amd.hpp.in | 2 +- .../include/utility/print_array.hpp | 4 +- .../include/utility/print_sequence.hpp | 4 +- .../include/utility/sequence_helper.hpp | 46 ----- driver/src/driver.cpp | 2 + 13 files changed, 129 insertions(+), 316 deletions(-) delete mode 100644 composable_kernel/include/utility/array_helper.hpp delete mode 100644 composable_kernel/include/utility/sequence_helper.hpp diff --git a/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp b/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp index e2a5836ed..0ebd9dc4a 100644 --- a/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp +++ b/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp @@ -60,7 +60,7 @@ __host__ __device__ constexpr auto template __host__ __device__ constexpr auto -make_ConstantMatrixDescriptor(ConstantTensorDescriptor_deprecated) + make_ConstantMatrixDescriptor(ConstantTensorDescriptor_deprecated) { using TDesc = ConstantTensorDescriptor_deprecated; static_assert(TDesc::GetNumOfDimension() == 2, "wrong"); diff --git a/composable_kernel/include/tensor_description/tensor_coordinate.hpp b/composable_kernel/include/tensor_description/tensor_coordinate.hpp index 4b3a60c67..5ce5bc700 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate.hpp @@ -215,7 +215,7 @@ struct TensorCoordinate private: template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(NativeTensorDescriptor) + 
MakeDummyTensorCoordinate(NativeTensorDescriptor) { return NativeTensorCoordinate>( make_zero_array()); @@ -223,7 +223,7 @@ struct TensorCoordinate template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(TransformedTensorDescriptor) + MakeDummyTensorCoordinate(TransformedTensorDescriptor) { return TransformedTensorCoordinate>( make_zero_array()); diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp index da02abdd5..69659445a 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp @@ -327,14 +327,14 @@ struct TensorCoordinate_deprecated private: template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantTensorDescriptor_deprecated) + MakeDummyTensorCoordinate(ConstantTensorDescriptor_deprecated) { return NormalTensorCoordinate_deprecated>(); } template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor_deprecated) + MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor_deprecated) { return MergedTensorCoordinate_deprecated< ConstantMergedTensorDescriptor_deprecated>(); diff --git a/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp b/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp index 1597e4c57..d7ef38672 100644 --- a/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp +++ b/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp @@ -64,10 +64,10 @@ template __host__ __device__ constexpr auto -reorder_transformed_tensor_descriptor_impl(LowerTensorDescriptor, - Sequence, - Sequence, - Sequence) + reorder_transformed_tensor_descriptor_impl(LowerTensorDescriptor, + Sequence, + Sequence, + Sequence) { return TransformedTensorDescriptor...>, @@ -78,7 +78,7 @@ reorder_transformed_tensor_descriptor_impl(LowerTensorDescriptor, // reorder a NativeTensorDescriptor template __host__ __device__ constexpr auto -reorder_tensor_descriptor_given_lower2upper(NativeTensorDescriptor, MapLower2Upper) + reorder_tensor_descriptor_given_lower2upper(NativeTensorDescriptor, MapLower2Upper) { static_assert(is_valid_sequence_map{}, "wrong! MapLower2Upper is not a valid map"); @@ -96,7 +96,7 @@ reorder_tensor_descriptor_given_lower2upper(NativeTensorDescriptor, MapLo // reorder a TransformedTensorDescriptor template __host__ __device__ constexpr auto -reorder_tensor_descriptor_given_lower2upper(TransformedTensorDescriptor, MapLower2Upper) + reorder_tensor_descriptor_given_lower2upper(TransformedTensorDescriptor, MapLower2Upper) { static_assert(is_valid_sequence_map{}, "wrong! 
MapLower2Upper is not a valid map"); diff --git a/composable_kernel/include/utility/amd_inline_asm.hpp b/composable_kernel/include/utility/amd_inline_asm.hpp index 006659710..28eaf1f44 100644 --- a/composable_kernel/include/utility/amd_inline_asm.hpp +++ b/composable_kernel/include/utility/amd_inline_asm.hpp @@ -1,7 +1,7 @@ #ifndef CK_AMD_INLINE_ASM_HPP #define CK_AMD_INLINE_ASM_HPP -#include "vector_type.hpp" +#include "float_type.hpp" namespace ck { diff --git a/composable_kernel/include/utility/amd_intrinsic.hpp b/composable_kernel/include/utility/amd_intrinsic.hpp index d161edd98..2575cbc40 100644 --- a/composable_kernel/include/utility/amd_intrinsic.hpp +++ b/composable_kernel/include/utility/amd_intrinsic.hpp @@ -1,10 +1,19 @@ #ifndef CK_AMD_INTRINSIC_HPP #define CK_AMD_INTRINSIC_HPP -#include "vector_type.hpp" +#include "float_type.hpp" namespace ck { +// for buffer_load and buffer_store +template +union BufferLoadStoreDwordConfig +{ + int32x4_t data; + T* address[2]; + int32_t range[4]; +}; + __device__ float __llvm_amdgcn_buffer_load(int32x4_t rsrc, index_t vindex, index_t offset, @@ -66,20 +75,22 @@ __device__ float __buffer_load(const float* p_src_block, index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); index_t src_const_addr_offset = src_const_data_offset * sizeof(float); - int32x4_t src_block_setting{0}; + BufferLoadStoreDwordConfig src_block_config; + // fill in byte 0 - 1 - *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); + src_block_config.address[0] = const_cast(p_src_block); // fill in byte 2 - reinterpret_cast(&src_block_setting)[2] = -1; + src_block_config.range[2] = -1; // fill in byte 3 - reinterpret_cast(&src_block_setting)[3] = 0x00027000; + src_block_config.range[3] = 0x00027000; - asm volatile("\n \ + asm volatile( + "\n \ buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \ s_waitcnt 0 \n \ " - : "=v"(dst) - : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset)); + : "=v"(dst) + : "v"(src_thread_addr_offset), "s"(src_block_config.data), "s"(src_const_addr_offset)); return dst; #else @@ -88,16 +99,17 @@ __device__ float __buffer_load(const float* p_src_block, index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); index_t src_const_addr_offset = src_const_data_offset * sizeof(float); - int32x4_t src_block_setting{0}; + BufferLoadStoreDwordConfig src_block_config; + // fill in byte 0 - 1 - *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); + src_block_config.address[0] = const_cast(p_src_block); // fill in byte 2 - reinterpret_cast(&src_block_setting)[2] = -1; + src_block_config.range[2] = -1; // fill in byte 3 - reinterpret_cast(&src_block_setting)[3] = 0x00027000; + src_block_config.range[3] = 0x00027000; dst = __llvm_amdgcn_buffer_load( - src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false); + src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false); return dst; #endif @@ -114,20 +126,22 @@ __device__ float2_t __buffer_load(const float* p_src_block, index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); index_t src_const_addr_offset = src_const_data_offset * sizeof(float); - int32x4_t src_block_setting{0}; + BufferLoadStoreDwordConfig src_block_config; + // fill in byte 0 - 1 - *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); + src_block_config.address[0] = const_cast(p_src_block); // fill in byte 2 - reinterpret_cast(&src_block_setting)[2] = -1; + 
src_block_config.range[2] = -1; // fill in byte 3 - reinterpret_cast(&src_block_setting)[3] = 0x00027000; + src_block_config.range[3] = 0x00027000; - asm volatile("\n \ + asm volatile( + "\n \ buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \ s_waitcnt 0 \n \ " - : "=v"(dst) - : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset)); + : "=v"(dst) + : "v"(src_thread_addr_offset), "s"(src_block_config.data), "s"(src_const_addr_offset)); return dst; #else @@ -136,16 +150,17 @@ __device__ float2_t __buffer_load(const float* p_src_block, index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); index_t src_const_addr_offset = src_const_data_offset * sizeof(float); - int32x4_t src_block_setting{0}; + BufferLoadStoreDwordConfig src_block_config; + // fill in byte 0 - 1 - *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); + src_block_config.address[0] = const_cast(p_src_block); // fill in byte 2 - reinterpret_cast(&src_block_setting)[2] = -1; + src_block_config.range[2] = -1; // fill in byte 3 - reinterpret_cast(&src_block_setting)[3] = 0x00027000; + src_block_config.range[3] = 0x00027000; dst = __llvm_amdgcn_buffer_loadx2( - src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false); + src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false); return dst; #endif @@ -162,38 +177,41 @@ __device__ float4_t __buffer_load(const float* p_src_block, index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); index_t src_const_addr_offset = src_const_data_offset * sizeof(float); - int32x4_t src_block_setting{0}; + BufferLoadStoreDwordConfig src_block_config; + // fill in byte 0 - 1 - *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); + src_block_config.address[0] = const_cast(p_src_block); // fill in byte 2 - reinterpret_cast(&src_block_setting)[2] = -1; + src_block_config.range[2] = -1; // fill in byte 3 - reinterpret_cast(&src_block_setting)[3] = 0x00027000; + src_block_config.range[3] = 0x00027000; - asm volatile("\n \ + asm volatile( + "\n \ buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \ s_waitcnt 0 \n \ " - : "=v"(dst) - : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset)); + : "=v"(dst) + : "v"(src_thread_addr_offset), "s"(src_block_config.data), "s"(src_const_addr_offset)); return dst; -#elif 1 +#else float4_t dst; index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); index_t src_const_addr_offset = src_const_data_offset * sizeof(float); - int32x4_t src_block_setting{0}; + BufferLoadStoreDwordConfig src_block_config; + // fill in byte 0 - 1 - *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); + src_block_config.address[0] = const_cast(p_src_block); // fill in byte 2 - reinterpret_cast(&src_block_setting)[2] = -1; + src_block_config.range[2] = -1; // fill in byte 3 - reinterpret_cast(&src_block_setting)[3] = 0x00027000; + src_block_config.range[3] = 0x00027000; dst = __llvm_amdgcn_buffer_loadx4( - src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false); + src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false); return dst; #endif @@ -209,19 +227,20 @@ __device__ void __buffer_store(const float& src, index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); - int32x4_t dst_block_setting{0}; + BufferLoadStoreDwordConfig dst_block_config; + // fill 
in byte 0 - 1 - *reinterpret_cast(&dst_block_setting) = p_dst_block; + dst_block_config.address[0] = p_dst_block; // fill in byte 2 - reinterpret_cast(&dst_block_setting)[2] = -1; + dst_block_config.range[2] = -1; // fill in byte 3 - reinterpret_cast(&dst_block_setting)[3] = 0x00027000; + dst_block_config.range[3] = 0x00027000; asm volatile("\n \ buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \ " : - : "s"(dst_block_setting), + : "s"(dst_block_config.data), "v"(src), "v"(dst_thread_addr_offset), "s"(dst_const_addr_offset)); @@ -229,16 +248,21 @@ __device__ void __buffer_store(const float& src, index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); - int32x4_t dst_block_setting{0}; + BufferLoadStoreDwordConfig dst_block_config; + // fill in byte 0 - 1 - *reinterpret_cast(&dst_block_setting) = p_dst_block; + dst_block_config.address[0] = p_dst_block; // fill in byte 2 - reinterpret_cast(&dst_block_setting)[2] = -1; + dst_block_config.range[2] = -1; // fill in byte 3 - reinterpret_cast(&dst_block_setting)[3] = 0x00027000; - - __llvm_amdgcn_buffer_store( - src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false); + dst_block_config.range[3] = 0x00027000; + + __llvm_amdgcn_buffer_store(src, + dst_block_config.data, + 0, + dst_thread_addr_offset + dst_const_addr_offset, + false, + false); #endif } @@ -252,19 +276,20 @@ __device__ void __buffer_store(const float2_t& src, index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); - int32x4_t dst_block_setting{0}; + BufferLoadStoreDwordConfig dst_block_config; + // fill in byte 0 - 1 - *reinterpret_cast(&dst_block_setting) = p_dst_block; + dst_block_config.address[0] = p_dst_block; // fill in byte 2 - reinterpret_cast(&dst_block_setting)[2] = -1; + dst_block_config.range[2] = -1; // fill in byte 3 - reinterpret_cast(&dst_block_setting)[3] = 0x00027000; + dst_block_config.range[3] = 0x00027000; asm volatile("\n \ buffer_store_dwordx2 %1, %2, %0, %3 offen offset:0 \n \ " : - : "s"(dst_block_setting), + : "s"(dst_block_config.data), "v"(src), "v"(dst_thread_addr_offset), "s"(dst_const_addr_offset)); @@ -272,16 +297,21 @@ __device__ void __buffer_store(const float2_t& src, index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); - int32x4_t dst_block_setting{0}; + BufferLoadStoreDwordConfig dst_block_config; + // fill in byte 0 - 1 - *reinterpret_cast(&dst_block_setting) = p_dst_block; + dst_block_config.address[0] = p_dst_block; // fill in byte 2 - reinterpret_cast(&dst_block_setting)[2] = -1; + dst_block_config.range[2] = -1; // fill in byte 3 - reinterpret_cast(&dst_block_setting)[3] = 0x00027000; - - __llvm_amdgcn_buffer_storex2( - src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false); + dst_block_config.range[3] = 0x00027000; + + __llvm_amdgcn_buffer_storex2(src, + dst_block_config.data, + 0, + dst_thread_addr_offset + dst_const_addr_offset, + false, + false); #endif } @@ -295,19 +325,20 @@ __device__ void __buffer_store(const float4_t& src, index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); - int32x4_t dst_block_setting{0}; + BufferLoadStoreDwordConfig dst_block_config; + // fill in byte 0 - 1 - 
*reinterpret_cast(&dst_block_setting) = p_dst_block; + dst_block_config.address[0] = p_dst_block; // fill in byte 2 - reinterpret_cast(&dst_block_setting)[2] = -1; + dst_block_config.range[2] = -1; // fill in byte 3 - reinterpret_cast(&dst_block_setting)[3] = 0x00027000; + dst_block_config.range[3] = 0x00027000; asm volatile("\n \ buffer_store_dwordx4 %1, %2, %0, %3 offen offset:0 \n \ " : - : "s"(dst_block_setting), + : "s"(dst_block_config.data), "v"(src), "v"(dst_thread_addr_offset), "s"(dst_const_addr_offset)); @@ -315,16 +346,21 @@ __device__ void __buffer_store(const float4_t& src, index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); - int32x4_t dst_block_setting{0}; + BufferLoadStoreDwordConfig dst_block_config; + // fill in byte 0 - 1 - *reinterpret_cast(&dst_block_setting) = p_dst_block; + dst_block_config.address[0] = p_dst_block; // fill in byte 2 - reinterpret_cast(&dst_block_setting)[2] = -1; + dst_block_config.range[2] = -1; // fill in byte 3 - reinterpret_cast(&dst_block_setting)[3] = 0x00027000; - - __llvm_amdgcn_buffer_storex4( - src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false); + dst_block_config.range[3] = 0x00027000; + + __llvm_amdgcn_buffer_storex4(src, + dst_block_config.data, + 0, + dst_thread_addr_offset + dst_const_addr_offset, + false, + false); #endif } diff --git a/composable_kernel/include/utility/array_helper.hpp b/composable_kernel/include/utility/array_helper.hpp deleted file mode 100644 index 34769af2f..000000000 --- a/composable_kernel/include/utility/array_helper.hpp +++ /dev/null @@ -1,177 +0,0 @@ -#ifndef CK_ARRAY_HELPER_HPP -#define CK_ARRAY_HELPER_HPP - -#include "array.hpp" - -namespace ck { - -template -__host__ __device__ void print_array(const char* s, Array a) -{ - constexpr index_t nsize = a.GetSize(); - - static_assert(nsize > 0 && nsize <= 10, "wrong!"); - - static_if{}([&](auto) { printf("%s size %u, {%u}\n", s, nsize, a[0]); }); - - static_if{}([&](auto) { printf("%s size %u, {%u %u}\n", s, nsize, a[0], a[1]); }); - - static_if{}( - [&](auto) { printf("%s size %u, {%u %u %u}\n", s, nsize, a[0], a[1], a[2]); }); - - static_if{}( - [&](auto) { printf("%s size %u, {%u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3]); }); - - static_if{}([&](auto) { - printf("%s size %u, {%u %u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3], a[4]); - }); - - static_if{}([&](auto) { - printf("%s size %u, {%u %u %u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3], a[4], a[5]); - }); - - static_if{}([&](auto) { - printf("%s size %u, {%u %u %u %u %u %u %u}\n", - s, - nsize, - a[0], - a[1], - a[2], - a[3], - a[4], - a[5], - a[6]); - }); - - static_if{}([&](auto) { - printf("%s size %u, {%u %u %u %u %u %u %u %u}\n", - s, - nsize, - a[0], - a[1], - a[2], - a[3], - a[4], - a[5], - a[6], - a[7]); - }); - - static_if{}([&](auto) { - printf("%s size %u, {%u %u %u %u %u %u %u %u %u}\n", - s, - nsize, - a[0], - a[1], - a[2], - a[3], - a[4], - a[5], - a[6], - a[7], - a[8]); - }); - - static_if{}([&](auto) { - printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n", - s, - nsize, - a[0], - a[1], - a[2], - a[3], - a[4], - a[5], - a[6], - a[7], - a[8], - a[9]); - }); -} - -template -__host__ __device__ void print_array(const char* s, Array a) -{ - constexpr index_t nsize = a.GetSize(); - - static_assert(nsize > 0 && nsize <= 10, "wrong!"); - - static_if{}([&](auto) { printf("%s size %d, {%d}\n", s, nsize, a[0]); }); - - static_if{}([&](auto) { printf("%s 
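All of the __buffer_load / __buffer_store variants above fill the same 128-bit buffer resource descriptor; the BufferLoadStoreDwordConfig union simply replaces the earlier reinterpret_cast sequences with named views of that storage. A host-side sketch of the layout being assembled (assuming the usual convention for these descriptors: 64-bit base address in dwords 0-1, num_records in dword 2, config bits in dword 3; illustration only, not a device-usable descriptor):

#include <cstdint>
#include <cstdio>

// The 128-bit descriptor viewed three ways, mirroring the union in the patch.
union BufferResourceSketch
{
    uint32_t dword[4];    // raw dwords handed to buffer_load/buffer_store
    const float* base[2]; // dwords 0-1: base address (on a 64-bit target)
    int32_t range[4];     // dword 2: num_records, dword 3: stride/format config
};

int main()
{
    static const float some_buffer[16] = {};

    BufferResourceSketch srd = {};
    srd.base[0]  = some_buffer;  // bytes 0-7: base pointer of the block
    srd.range[2] = -1;           // bytes 8-11: num_records = -1, effectively no range limit
    srd.range[3] = 0x00027000;   // bytes 12-15: the config word used by the patch
    printf("config dword = 0x%08x\n", srd.dword[3]);
    return 0;
}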
size %d, {%d %d}\n", s, nsize, a[0], a[1]); }); - - static_if{}( - [&](auto) { printf("%s size %d, {%d %d %d}\n", s, nsize, a[0], a[1], a[2]); }); - - static_if{}( - [&](auto) { printf("%s size %d, {%d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3]); }); - - static_if{}([&](auto) { - printf("%s size %d, {%d %d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3], a[4]); - }); - - static_if{}([&](auto) { - printf("%s size %d, {%d %d %d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3], a[4], a[5]); - }); - - static_if{}([&](auto) { - printf("%s size %d, {%d %d %d %d %d %d %d}\n", - s, - nsize, - a[0], - a[1], - a[2], - a[3], - a[4], - a[5], - a[6]); - }); - - static_if{}([&](auto) { - printf("%s size %d, {%d %d %d %d %d %d %d %d}\n", - s, - nsize, - a[0], - a[1], - a[2], - a[3], - a[4], - a[5], - a[6], - a[7]); - }); - - static_if{}([&](auto) { - printf("%s size %d, {%d %d %d %d %d %d %d %d %d}\n", - s, - nsize, - a[0], - a[1], - a[2], - a[3], - a[4], - a[5], - a[6], - a[7], - a[8]); - }); - - static_if{}([&](auto) { - printf("%s size %d, {%d %d %d %d %d %d %d %d %d %d}\n", - s, - nsize, - a[0], - a[1], - a[2], - a[3], - a[4], - a[5], - a[6], - a[7], - a[8], - a[9]); - }); -} - -} // namespace ck -#endif diff --git a/composable_kernel/include/utility/common_header.hpp b/composable_kernel/include/utility/common_header.hpp index 20584c335..e01ec6efc 100644 --- a/composable_kernel/include/utility/common_header.hpp +++ b/composable_kernel/include/utility/common_header.hpp @@ -10,9 +10,7 @@ #include "tuple.hpp" #include "math.hpp" #include "sequence.hpp" -#include "sequence_helper.hpp" #include "array.hpp" -#include "array_helper.hpp" #include "functional.hpp" #include "functional2.hpp" #include "functional3.hpp" diff --git a/composable_kernel/include/utility/config.amd.hpp.in b/composable_kernel/include/utility/config.amd.hpp.in index beb9e083b..7800f5293 100644 --- a/composable_kernel/include/utility/config.amd.hpp.in +++ b/composable_kernel/include/utility/config.amd.hpp.in @@ -31,7 +31,7 @@ // AMD XDLOPS #ifndef CK_USE_AMD_XDLOPS -#define CK_USE_AMD_XDLOPS 1 +#define CK_USE_AMD_XDLOPS 0 #endif #ifndef CK_USE_AMD_XDLOPS_INLINE_ASM diff --git a/composable_kernel/include/utility/print_array.hpp b/composable_kernel/include/utility/print_array.hpp index 34769af2f..b53bbb90f 100644 --- a/composable_kernel/include/utility/print_array.hpp +++ b/composable_kernel/include/utility/print_array.hpp @@ -1,5 +1,5 @@ -#ifndef CK_ARRAY_HELPER_HPP -#define CK_ARRAY_HELPER_HPP +#ifndef CK_PRINT_ARRAY_HPP +#define CK_PRINT_ARRAY_HPP #include "array.hpp" diff --git a/composable_kernel/include/utility/print_sequence.hpp b/composable_kernel/include/utility/print_sequence.hpp index 71abfea1f..463f9d097 100644 --- a/composable_kernel/include/utility/print_sequence.hpp +++ b/composable_kernel/include/utility/print_sequence.hpp @@ -1,5 +1,5 @@ -#ifndef CK_SEQUENCE_HELPER_HPP -#define CK_SEQUENCE_HELPER_HPP +#ifndef CK_PRINT_SEQUENCE_HPP +#define CK_PRINT_SEQUENCE_HPP #include "sequence.hpp" diff --git a/composable_kernel/include/utility/sequence_helper.hpp b/composable_kernel/include/utility/sequence_helper.hpp deleted file mode 100644 index 71abfea1f..000000000 --- a/composable_kernel/include/utility/sequence_helper.hpp +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef CK_SEQUENCE_HELPER_HPP -#define CK_SEQUENCE_HELPER_HPP - -#include "sequence.hpp" - -namespace ck { - -template -__host__ __device__ void print_sequence(const char* s, Sequence) -{ - constexpr index_t nsize = Sequence::Size(); - - static_assert(nsize <= 10, "wrong!"); - 
- static_if{}([&](auto) { printf("%s size %u, {}\n", s, nsize, Xs...); }); - - static_if{}([&](auto) { printf("%s size %u, {%u}\n", s, nsize, Xs...); }); - - static_if{}([&](auto) { printf("%s size %u, {%u %u}\n", s, nsize, Xs...); }); - - static_if{}([&](auto) { printf("%s size %u, {%u %u %u}\n", s, nsize, Xs...); }); - - static_if{}([&](auto) { printf("%s size %u, {%u %u %u %u}\n", s, nsize, Xs...); }); - - static_if{}( - [&](auto) { printf("%s size %u, {%u %u %u %u %u}\n", s, nsize, Xs...); }); - - static_if{}( - [&](auto) { printf("%s size %u, {%u %u %u %u %u %u}\n", s, nsize, Xs...); }); - - static_if{}( - [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u}\n", s, nsize, Xs...); }); - - static_if{}( - [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); }); - - static_if{}( - [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); }); - - static_if{}( - [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); }); -} - -} // namespace ck - -#endif diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index 1a819e112..3a0eedc64 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -5,6 +5,8 @@ #include #include "config.hpp" #include "ConstantTensorDescriptor_deprecated.hpp" +#include "print_array.hpp" +#include "print_sequence.hpp" #include "device.hpp" #include "conv_common.hpp" #include "host_conv.hpp" From 093306c16292a09b2222c6a588fa9fe64a1be257 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Wed, 9 Oct 2019 20:32:52 -0500 Subject: [PATCH 14/20] bring in more miopen changes --- ..._v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 89 +++--- .../ConstantTensorDescriptor_deprecated.hpp | 10 +- .../tensor_description/tensor_coordinate.hpp | 21 +- .../tensor_description/tensor_descriptor.hpp | 22 +- .../blockwise_generic_tensor_slice_copy.hpp | 56 ++-- .../threadwise_generic_tensor_slice_copy.hpp | 31 +- ...e_generic_tensor_slice_copy_deprecated.hpp | 4 +- .../include/utility/amd_buffer_addressing.hpp | 284 ++++++++++++++++++ .../include/utility/common_header.hpp | 4 +- .../include/utility/config.amd.hpp.in | 14 +- 10 files changed, 429 insertions(+), 106 deletions(-) create mode 100644 composable_kernel/include/utility/amd_buffer_addressing.hpp diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp index 09d275913..18c7f9a39 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -158,7 +158,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer "be violated"); // divide block work by [K, B] - static_assert(K % KPerBlock == 0 && B % BPerBlock == 0 && E % (2 * EPerBlock) == 0, + static_assert(K % KPerBlock == 0 && B % BPerBlock == 0 && E % EPerBlock == 0, "wrong! 
cannot divide work evenly among block"); constexpr index_t KBlockWork = K / KPerBlock; @@ -173,7 +173,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer const index_t b_block_data_on_global = block_work_id[1] * BPerBlock; // input tensor - // global memory + // global tensor in global memory constexpr auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor( in_n_c_hi_wi_global_desc, make_tuple( @@ -190,6 +190,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}, Sequence<4, 5>{}, Sequence<6, 7>{})); + // global tensor in global memory, src of blockwise copy constexpr auto in_e_n1_b_n2_global_desc = transform_tensor_descriptor( in_n0_n1_n2_c_y_ho_x_wo_global_desc, make_tuple(Merge>{}, @@ -199,7 +200,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer make_tuple(Sequence<3, 4, 6>{}, Sequence<1>{}, Sequence<0, 5, 7>{}, Sequence<2>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); - // memory layout descriptor in LDS [E, N1, B, N2], dst of blockwise copy + // block tensor in LDS memory, dst of blockwise copy // be careful of LDS alignment constexpr auto in_e_n1_b_n2_block_desc = make_native_tensor_descriptor_aligned( Sequence{}, Number{}); @@ -210,9 +211,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer static_assert(in_e_n1_b_n2_block_desc.GetStride(I1) % GemmDataPerReadB == 0, "GemmDataPerReadB alignment requirement is not satisfied"); - // input blockwise copy - // slice a merged tensor, reorder and copy to a normal tensor - // this copy operator already has blockwise offset built-in + // input tensor blockwise copy auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v4{}(wei_k_c_y_x_global_desc); - // tensor descriptor in LDS, dst of blockwise copy + // block tensor in LDS memory, dst of blockwise copy // be careful of LDS alignment constexpr auto wei_e_k_block_desc = make_native_tensor_descriptor_aligned( Sequence{}, @@ -248,9 +247,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer static_assert(wei_e_k_block_desc.GetStride(I0) % GemmDataPerReadA == 0, "GemmDataPerReadA alignment requirement is not satisfied"); - // operator for blockwise copy of weight into LDS - // slice a tensor, and copy it into another tensor - // this copy operator already have blockwise offset built-in + // weight tensor blockwise copy auto blockwise_wei_copy = BlockwiseGenericTensorSliceCopy_v4{}, True); - blockwise_wei_copy.MoveSrcSliceWindow(Sequence{}, True); + if(has_two_iteration_left) // if has 2 iteration left + { + Float p_in_thread_buffer[blockwise_in_copy.GetThreadBufferSize()]; + Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()]; - __syncthreads(); + blockwise_in_copy.MoveSrcSliceWindow(Sequence{}, True); + blockwise_wei_copy.MoveSrcSliceWindow(Sequence{}, True); - // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( - p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( - p_wei_global, p_wei_thread_buffer); + __syncthreads(); - // LDS double buffer: GEMM on current data - blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread); + // LDS double buffer: load last data from device mem + blockwise_in_copy.template RunLoadThreadBuffer( + p_in_global, p_in_thread_buffer); + 
blockwise_wei_copy.template RunLoadThreadBuffer( + p_wei_global, p_wei_thread_buffer); + + // LDS double buffer: GEMM on 2nd-last data + blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread); - // LDS double buffer: store next data to LDS - blockwise_in_copy.RunStoreThreadBuffer(p_in_thread_buffer, - p_in_block_double + in_block_space); - blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer, - p_wei_block_double + wei_block_space); + // LDS double buffer: store last data to LDS + blockwise_in_copy.RunStoreThreadBuffer(p_in_thread_buffer, + p_in_block_double + in_block_space); + blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer, + p_wei_block_double + wei_block_space); + + __syncthreads(); - // odd iteration - __syncthreads(); + // LDS double buffer: GEMM on last data + blockwise_gemm.Run(p_wei_block_double + wei_block_space, + p_in_block_double + in_block_space, + p_out_thread); + } + else // if has 1 iteration left + { + __syncthreads(); - // LDS double buffer: GEMM on current data - blockwise_gemm.Run(p_wei_block_double + wei_block_space, - p_in_block_double + in_block_space, - p_out_thread); + // LDS double buffer: GEMM on last data + blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread); + } } // copy output: register to global memory @@ -420,12 +427,12 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer constexpr index_t K1 = GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster; constexpr index_t K0 = K / K1; - // define tensor descriptor for threadwise copy - // output memory layout descriptor in register, src of threadwise copy + // define output tensor descriptor for threadwise copy + // thread output tensor, src of threadwise copy constexpr auto out_k0_k1_n1_b_n2_thread_desc = make_native_tensor_descriptor_packed( Sequence{}); - // output memory layout descriptor in device memory + // global output tensor constexpr auto out_n0_n1_n2_k0_k1_ho_wo_global_desc = transform_tensor_descriptor( out_n_k_ho_wo_global_desc, make_tuple(UnMerge>{}, @@ -435,7 +442,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}, Sequence<6>{})); - // output merged global tensor descriptor, dst of threadwise copy + // global output tensor, dst of threadwise copy constexpr auto out_k0_k1_n1_b_n2_global_desc = transform_tensor_descriptor( out_n0_n1_n2_k0_k1_ho_wo_global_desc, make_tuple(PassThrough{}, diff --git a/composable_kernel/include/tensor_description/ConstantTensorDescriptor_deprecated.hpp b/composable_kernel/include/tensor_description/ConstantTensorDescriptor_deprecated.hpp index d14696414..d745f69f8 100644 --- a/composable_kernel/include/tensor_description/ConstantTensorDescriptor_deprecated.hpp +++ b/composable_kernel/include/tensor_description/ConstantTensorDescriptor_deprecated.hpp @@ -6,7 +6,7 @@ namespace ck { template -__host__ __device__ constexpr auto calculate_tensor_strides_packed_old(Lengths) +__host__ __device__ constexpr auto calculate_tensor_strides_packed_deprecated(Lengths) { return reverse_inclusive_scan_sequence( Lengths{}.PopFront(), math::multiplies{}, Number<1>{}) @@ -19,7 +19,7 @@ __host__ __device__ constexpr auto calculate_tensor_strides_aligned_old(Lengths, constexpr index_t L_back_align = Align * math::integer_divide_ceiler{}(Lengths{}.Back(), Align); - return calculate_tensor_strides_packed_old( + return 
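The reworked epilogue above goes hand in hand with relaxing the earlier divisibility requirement from E % (2 * EPerBlock) to E % EPerBlock: after the paired main loop, either one or two slices of work remain, and each case is now handled explicitly. A schematic of the whole double-buffered schedule in plain C++ (hypothetical load/store/gemm stand-ins; the real kernel overlaps these through LDS and __syncthreads()):

#include <cstdio>

void load(int i)         { printf("load  slice %d into registers\n", i); }
void store(int i, int b) { printf("store slice %d into LDS buffer %d\n", i, b); }
void gemm(int i, int b)  { printf("gemm  on slice %d from LDS buffer %d\n", i, b); }

// While the GEMM consumes one LDS buffer, the next slice is loaded and
// written into the other buffer; the tail handles the last one or two slices.
void run(int num_slices)
{
    load(0);
    store(0, 0); // preload the first slice

    int i = 0;
    for(; i + 2 < num_slices; ++i)
    {
        load(i + 1);
        gemm(i, i % 2);
        store(i + 1, (i + 1) % 2);
    }

    if(i + 2 == num_slices) // two slices of work left
    {
        load(i + 1);
        gemm(i, i % 2);
        store(i + 1, (i + 1) % 2);
        gemm(i + 1, (i + 1) % 2);
    }
    else // one slice left
    {
        gemm(i, i % 2);
    }
}

int main() { run(5); return 0; }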
calculate_tensor_strides_packed_deprecated( Lengths{}.Modify(Number{}, Number{})); } @@ -186,7 +186,7 @@ struct ConstantTensorDescriptor_deprecated { Array multi_id; - using PackedStrides = decltype(calculate_tensor_strides_packed_old(GetLengths())); + using PackedStrides = decltype(calculate_tensor_strides_packed_deprecated(GetLengths())); // calculate index in each of the dimensions in the order of their dimension static_for<0, nDim - 1, 1>{}(lambda_GetMultiIndexFrom1dIndex(id, multi_id)); @@ -467,7 +467,7 @@ struct ConstantTensorDescriptor_deprecated __host__ __device__ static constexpr auto Pack() { - using packed_strides = decltype(calculate_tensor_strides_packed_old(Lengths{})); + using packed_strides = decltype(calculate_tensor_strides_packed_deprecated(Lengths{})); return ConstantTensorDescriptor_deprecated{}; } @@ -491,7 +491,7 @@ struct ConstantTensorDescriptor_deprecated template __host__ __device__ constexpr auto make_ConstantTensorDescriptor_packed(Lengths) { - using Strides = decltype(calculate_tensor_strides_packed_old(Lengths{})); + using Strides = decltype(calculate_tensor_strides_packed_deprecated(Lengths{})); return ConstantTensorDescriptor_deprecated{}; } diff --git a/composable_kernel/include/tensor_description/tensor_coordinate.hpp b/composable_kernel/include/tensor_description/tensor_coordinate.hpp index 5ce5bc700..66dda13c4 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate.hpp @@ -8,9 +8,24 @@ namespace ck { +// A "tensor cooridnate" is an opaque object that represents a "point of location" inside a tensor +// At the bare minimun, user should be able to query the following information from a tensor +// coordinate: +// 1. Tensor descriptor +// 2. Location, represented in the form of multi-index +// 3. Location, represented in the form of the offset to the origin of the tensor +// 4. If the location is inside invalid area or not, i.e. the padding area of an implicitly padded +// tensor is considered invalid, because the padding area doesn't have any physical memory +// allocation +// A tensor cooridnate also provides following functionality: +// 1. Given step size in each dimension, update itself, or return a new tensor cooridnate, so user +// can freely move the "point of location" inside the tensor + +// wrapper class for NativeTensorCoordinate and TransformedTensorCoordinate template struct TensorCoordinate; +// tensor coordinate for native tensor template struct NativeTensorCoordinate { @@ -78,12 +93,10 @@ struct NativeTensorCoordinate return coord; } -#if 0 // tweaking __host__ __device__ static constexpr index_t CalculateOffsetDiff(const Index& idx_diff) { return tensor_desc_type::CalculateOffsetDiff(idx_diff); } -#endif __host__ __device__ static constexpr bool IsUpperIndexMappedToValidOffset() { return true; } @@ -96,6 +109,7 @@ struct NativeTensorCoordinate index_t mOffset; }; +// tensor coordinate for transformed tensor template struct TransformedTensorCoordinate { @@ -177,10 +191,10 @@ struct TransformedTensorCoordinate return coord_up; } -#if 0 // tweaking // Calculate offset diff without updating tensor-coordinate // If idx_up_diff is know at compile time, and has only non-zero entries on linear dimensions, // then all calculation can be done at compile-time. 
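The coordinate interface described above boils down to bookkeeping that a NativeTensorCoordinate does for a plain lengths/strides tensor: keep a multi-index and the matching linear offset, update both when stepped, and report whether the index is still inside the tensor. A self-contained sketch of only that arithmetic (CK's actual classes are templated on the descriptor and have different constructors):

#include <array>
#include <cstdio>

// Illustration of the coordinate bookkeeping for a 4-D packed tensor:
// multi-index, linear offset (dot product with strides), step update, and a
// validity check against the lengths.
int main()
{
    const std::array<int, 4> lengths = {2, 3, 4, 5};
    const std::array<int, 4> strides = {60, 20, 5, 1}; // packed strides

    std::array<int, 4> idx = {1, 2, 0, 3};

    auto offset = [&](const std::array<int, 4>& i) {
        int o = 0;
        for(int d = 0; d < 4; ++d)
            o += i[d] * strides[d];
        return o;
    };

    int off = offset(idx); // 60 + 40 + 0 + 3 = 103

    // moving the coordinate only needs the offset of the step itself
    const std::array<int, 4> step = {0, 0, 1, -2};
    for(int d = 0; d < 4; ++d)
        idx[d] += step[d];
    off += offset(step); // +3 -> 106

    // validity: every index entry inside [0, length)
    bool valid = true;
    for(int d = 0; d < 4; ++d)
        valid = valid && idx[d] >= 0 && idx[d] < lengths[d];

    printf("offset = %d, valid = %d\n", off, valid); // offset = 106, valid = 1
    return 0;
}

The fact that the step contributes only an offset difference is what CalculateOffsetDiff exploits when the step is known at compile time.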
+ // TODO: this function is not compiled to expected ISA __host__ __device__ constexpr index_t CalculateOffsetDiff(const UpperIndex& idx_up_diff) const { // For transformation of multi-index difference, not all transformation functions need to @@ -191,7 +205,6 @@ struct TransformedTensorCoordinate return GetLowerCoordinate().CalculateOffsetDiff(idx_low_diff); } -#endif __host__ __device__ constexpr bool IsUpperIndexMappedToValidOffset() const { diff --git a/composable_kernel/include/tensor_description/tensor_descriptor.hpp b/composable_kernel/include/tensor_description/tensor_descriptor.hpp index e202f73e9..dec7e2b8d 100644 --- a/composable_kernel/include/tensor_description/tensor_descriptor.hpp +++ b/composable_kernel/include/tensor_description/tensor_descriptor.hpp @@ -7,6 +7,8 @@ namespace ck { +// tensor descriptor for "native tensor" +// A "native tensor" is a "true" tensor that can be represented by Lengths and Strides template struct NativeTensorDescriptor { @@ -113,12 +115,10 @@ struct NativeTensorDescriptor __host__ __device__ static constexpr auto GetNonLinearDimensions() { return Sequence<>{}; } -#if 0 __host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups() { return Tuple<>{}; } -#endif __host__ __device__ static constexpr bool IsUpperIndexMappedToValidOffset(const Index& /* idx */) @@ -127,14 +127,11 @@ struct NativeTensorDescriptor } }; -// LowerTensorDescriptor -// Transforms: Tuple -// LowerDimensionIds: Tuple> -// UpperDimensionIds: Tuple> -template +// Tensor descriptor for "transformed tensor" +template + typename LowDimensionIds, // Tuple> + typename UpDimensionIds> // Tuple> struct TransformedTensorDescriptor { using type = TransformedTensorDescriptor; @@ -412,6 +409,7 @@ struct TransformedTensorDescriptor { #if 0 // create tuple of linear dimension masks, for all transformations + // TODO: this doesn't compile, because transform_tuples() complain about constexpr constexpr auto tuple_of_linear_dimension_mask = transform_tuples(lambda_get_linear_dimension_mask_of_single_tranform{}, Transforms{}, @@ -419,7 +417,7 @@ struct TransformedTensorDescriptor UpDimensionIds{}); #else // create tuple of linear dimension masks, for all transformations - // TODO: this is a hack, transform_tuples() doesn't compile, complain about constexpr + // TODO: this is a hack constexpr auto tuple_of_linear_dimension_mask = dummy_transform_tuples_impl( lambda_get_linear_dimension_mask_of_single_tranform{}, Transforms{}, @@ -465,7 +463,7 @@ struct TransformedTensorDescriptor #if 0 __host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups() { - // not implemented + // TODO: not implemented } #endif diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index 15faeaebf..38ec363a7 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -73,18 +73,22 @@ struct BlockwiseGenericTensorSliceCopy_v4 __device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src, ThreadBufferData* p_thread_buffer) const { -#if 1 - mThreadwiseLoad.template Run(p_block_src, p_thread_buffer); -#else // tweaking - mThreadwiseLoad.template Run_optimized_src_address_calculation( - p_block_src, p_thread_buffer); -#endif + if(mThreadwiseStore.HasWorkingOptimizedAddressCalculation()) + { + mThreadwiseLoad + .template 
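To make the native/transformed split concrete: a native descriptor is pure lengths-and-strides arithmetic, while a transform such as Merge exposes several lower dimensions as one upper dimension by div/mod index splitting. The sketch below (plain ints, hypothetical lengths) shows a single merge over a packed [C, Y, X] block; when the lower dimensions are not packed, this div/mod carry logic is roughly what makes a merged dimension one of the "non-linear" dimensions tracked by the masks above.

#include <cstdio>

// One Merge transform in isolation: upper index e over E = C*Y*X maps onto
// lower indices (c, y, x), and the native part maps those onto an offset.
int main()
{
    const int C = 2, Y = 3, X = 3;
    const int stride_c = 9, stride_y = 3, stride_x = 1; // packed [C, Y, X]

    const int E = C * Y * X;
    for(int e = 0; e < E; ++e)
    {
        int c = e / (Y * X);       // split the merged index ...
        int y = (e % (Y * X)) / X;
        int x = e % X;

        int offset = c * stride_c + y * stride_y + x * stride_x; // native part

        if(offset != e) // packed case: merged index equals the offset
            printf("unexpected mismatch at e=%d\n", e);
    }
    printf("merge over a packed [C, Y, X] block is offset-preserving\n");
    return 0;
}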
Run_optimized_src_address_calculation( + p_block_src, p_thread_buffer); + } + else + { + mThreadwiseLoad.template Run(p_block_src, p_thread_buffer); + } } template (p_thread_buffer, p_block_dst); -#else // tweaking - mThreadwiseStore.template Run_optimized_dst_address_calculation( - p_thread_buffer, p_block_dst); -#endif + if(mThreadwiseStore.HasWorkingOptimizedAddressCalculation()) + { + mThreadwiseStore + .template Run_optimized_dst_address_calculation( + p_thread_buffer, p_block_dst); + } + else + { + mThreadwiseStore.template Run(p_thread_buffer, p_block_dst); + } } template {}([&](auto) { -#if CK_USE_AMD_INTRINSIC && CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC +#if CK_USE_AMD_BUFFER_ADDRESSING *reinterpret_cast(&p_src_long_vector[buffer_offset]) = __buffer_load( p_src, src_coord.GetOffset(), 0); @@ -160,7 +160,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 if(dst_coord.IsUpperIndexMappedToValidOffset()) { static_if{}([&](auto) { -#if CK_USE_AMD_INTRINSIC && CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC +#if CK_USE_AMD_BUFFER_ADDRESSING __buffer_store( *reinterpret_cast(&p_dst_long_vector[buffer_offset]), p_dst, @@ -194,6 +194,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // Will do padding check on src data: Read 0 if src data is in padding area. // Will do padding check on dst data: No write if dst data is in paddin area. // This version is optimized for address calculation of src tensor + // TODO: this function is not compiled to expected ISA template {}([&](auto) { -#if CK_USE_AMD_INTRINSIC && CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC +#if CK_USE_AMD_BUFFER_ADDRESSING *reinterpret_cast(&p_src_long_vector[buffer_offset]) = __buffer_load( p_src, src_nonlinear_coord.GetOffset(), src_linear_offset); @@ -352,6 +353,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // Will do padding check on src data: Read 0 if src data is in padding area. // Will do padding check on dst data: No write if dst data is in paddin area. // This version is optimized for address calculation of dst tensor + // TODO: this function is not compiled to expected ISA template {}([&](auto) { -#if CK_USE_AMD_INTRINSIC && CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC +#if CK_USE_AMD_BUFFER_ADDRESSING __buffer_store( *reinterpret_cast(&p_dst_long_vector[buffer_offset]), p_dst, @@ -506,6 +508,15 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 }); } + __device__ static constexpr bool HasWorkingOptimizedAddressCalculation() + { +#if CK_EXPERIMENTAL_THREADWISE_COPY_V4R2_USE_OPTIMIZED_ADDRESS_CACLULATION // tweaking + return true; +#else + return false; +#endif + } + template __device__ void MoveSrcSliceWindow(const T& step_sizes_, integral_constant) diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp index c70929f3f..7d85b3838 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp @@ -331,7 +331,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1_deprecated // algorithm) // 3. src_merged_offset can be runtime value (no assumption imposed) static_if{}([&](auto) { -#if CK_USE_AMD_INTRINSIC && CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC +#if CK_USE_AMD_BUFFER_ADDRESSING vector_data = __buffer_load( p_src, src_merged_offset, src_normal_offset); #else @@ -440,7 +440,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1_deprecated // algorithm) // 3. 
dst_merged_offset can be runtime value (no assumption imposed) static_if{}([&](auto) { -#if CK_USE_AMD_INTRINSIC && CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC +#if CK_USE_AMD_BUFFER_ADDRESSING __buffer_store( vector_data, p_dst, dst_merged_offset, dst_normal_offset); #else diff --git a/composable_kernel/include/utility/amd_buffer_addressing.hpp b/composable_kernel/include/utility/amd_buffer_addressing.hpp new file mode 100644 index 000000000..4bb6f2693 --- /dev/null +++ b/composable_kernel/include/utility/amd_buffer_addressing.hpp @@ -0,0 +1,284 @@ +#ifndef CK_AMD_BUFFER_ADDRESSING_HPP +#define CK_AMD_BUFFER_ADDRESSING_HPP + +#include "float_type.hpp" + +namespace ck { + +// For 128bit SGPRs in buffer_load and buffer_store instructions +// https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions +template +union BufferLoadStoreDwordConfig +{ + int32x4_t data; + T* address[2]; + int32_t range[4]; +}; + +__device__ float __llvm_amdgcn_buffer_load(int32x4_t rsrc, + index_t vindex, + index_t offset, + bool glc, + bool slc) __asm("llvm.amdgcn.buffer.load"); + +__device__ float2_t __llvm_amdgcn_buffer_loadx2(int32x4_t rsrc, + index_t vindex, + index_t offset, + bool glc, + bool slc) __asm("llvm.amdgcn.buffer.load.dwordx2"); + +__device__ float4_t __llvm_amdgcn_buffer_loadx4(int32x4_t rsrc, + index_t vindex, + index_t offset, + bool glc, + bool slc) __asm("llvm.amdgcn.buffer.load.dwordx4"); + +__device__ void __llvm_amdgcn_buffer_store(float vdata, + int32x4_t rsrc, + index_t vindex, + index_t offset, + bool glc, + bool slc) __asm("llvm.amdgcn.buffer.store"); + +__device__ void __llvm_amdgcn_buffer_storex2(float2_t vdata, + int32x4_t rsrc, + index_t vindex, + index_t offset, + bool glc, + bool slc) __asm("llvm.amdgcn.buffer.store.dwordx2"); + +__device__ void __llvm_amdgcn_buffer_storex4(float4_t vdata, + int32x4_t rsrc, + index_t vindex, + index_t offset, + bool glc, + bool slc) __asm("llvm.amdgcn.buffer.store.dwordx4"); + +template +__device__ typename vector_type::MemoryType +__buffer_load(const T* p_src_block, index_t src_thread_data_offset, index_t src_const_data_offset); + +template +__device__ void __buffer_store(const typename vector_type::MemoryType& src, + T* p_dst_block, + index_t dst_thread_data_offset, + index_t dst_const_data_offset); + +template <> +__device__ float __buffer_load(const float* p_src_block, + index_t src_thread_data_offset, + index_t src_const_data_offset) +{ + float dst; + + index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); + index_t src_const_addr_offset = src_const_data_offset * sizeof(float); + + BufferLoadStoreDwordConfig src_block_config; + + // fill in byte 0 - 1 + src_block_config.address[0] = const_cast(p_src_block); + // fill in byte 2 + src_block_config.range[2] = -1; + // fill in byte 3 + src_block_config.range[3] = 0x00027000; + +#if CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC + dst = __llvm_amdgcn_buffer_load( + src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false); +#else + asm volatile( + "\n \ + buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \ + s_waitcnt 0 \n \ + " + : "=v"(dst) + : "v"(src_thread_addr_offset), "s"(src_block_config.data), "s"(src_const_addr_offset)); +#endif + + return dst; +} + +template <> +__device__ float2_t __buffer_load(const float* p_src_block, + index_t src_thread_data_offset, + index_t src_const_data_offset) +{ + float2_t dst; + + index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); + index_t 
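The union above assembles the 128-bit buffer resource descriptor consumed by buffer_load/buffer_store: per the comments and constants in this file, dwords 0-1 take the base pointer, dword 2 the number of records (-1 here), and dword 3 the format/configuration bits 0x00027000. A host-side sketch of the same layout, with plain C++ stand-ins for int32x4_t and the device pointer:

#include <cstdint>
#include <cstdio>

// Illustration of how the buffer resource words are filled; member names and
// types are simplified stand-ins for BufferLoadStoreDwordConfig.
union BufferResource
{
    int32_t dword[4];
    const float* address[2]; // the pointer occupies dwords 0 and 1 on a 64-bit target
};

int main()
{
    static float data[16];

    BufferResource rsrc;
    rsrc.address[0] = data;       // dwords 0-1: base pointer
    rsrc.dword[2]   = -1;         // dword 2: buffer size / num records
    rsrc.dword[3]   = 0x00027000; // dword 3: stride/format configuration

    printf("dword2 = %d, dword3 = 0x%08x\n", rsrc.dword[2], (unsigned)rsrc.dword[3]);
    return 0;
}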
src_const_addr_offset = src_const_data_offset * sizeof(float); + + BufferLoadStoreDwordConfig src_block_config; + + // fill in byte 0 - 1 + src_block_config.address[0] = const_cast(p_src_block); + // fill in byte 2 + src_block_config.range[2] = -1; + // fill in byte 3 + src_block_config.range[3] = 0x00027000; + +#if CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC + dst = __llvm_amdgcn_buffer_loadx2( + src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false); +#else + asm volatile( + "\n \ + buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \ + s_waitcnt 0 \n \ + " + : "=v"(dst) + : "v"(src_thread_addr_offset), "s"(src_block_config.data), "s"(src_const_addr_offset)); +#endif + + return dst; +} + +template <> +__device__ float4_t __buffer_load(const float* p_src_block, + index_t src_thread_data_offset, + index_t src_const_data_offset) +{ + float4_t dst; + + index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); + index_t src_const_addr_offset = src_const_data_offset * sizeof(float); + + BufferLoadStoreDwordConfig src_block_config; + + // fill in byte 0 - 1 + src_block_config.address[0] = const_cast(p_src_block); + // fill in byte 2 + src_block_config.range[2] = -1; + // fill in byte 3 + src_block_config.range[3] = 0x00027000; + +#if CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC + dst = __llvm_amdgcn_buffer_loadx4( + src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false); +#else + asm volatile( + "\n \ + buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \ + s_waitcnt 0 \n \ + " + : "=v"(dst) + : "v"(src_thread_addr_offset), "s"(src_block_config.data), "s"(src_const_addr_offset)); +#endif + + return dst; +} + +template <> +__device__ void __buffer_store(const float& src, + float* p_dst_block, + index_t dst_thread_data_offset, + index_t dst_const_data_offset) +{ + index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); + index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); + + BufferLoadStoreDwordConfig dst_block_config; + + // fill in byte 0 - 1 + dst_block_config.address[0] = p_dst_block; + // fill in byte 2 + dst_block_config.range[2] = -1; + // fill in byte 3 + dst_block_config.range[3] = 0x00027000; + +#if CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC + __llvm_amdgcn_buffer_store(src, + dst_block_config.data, + 0, + dst_thread_addr_offset + dst_const_addr_offset, + false, + false); +#else + asm volatile("\n \ + buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \ + " + : + : "s"(dst_block_config.data), + "v"(src), + "v"(dst_thread_addr_offset), + "s"(dst_const_addr_offset)); +#endif +} + +template <> +__device__ void __buffer_store(const float2_t& src, + float* p_dst_block, + index_t dst_thread_data_offset, + index_t dst_const_data_offset) +{ + index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); + index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); + + BufferLoadStoreDwordConfig dst_block_config; + + // fill in byte 0 - 1 + dst_block_config.address[0] = p_dst_block; + // fill in byte 2 + dst_block_config.range[2] = -1; + // fill in byte 3 + dst_block_config.range[3] = 0x00027000; + +#if CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC + __llvm_amdgcn_buffer_storex2(src, + dst_block_config.data, + 0, + dst_thread_addr_offset + dst_const_addr_offset, + false, + false); +#else + asm volatile("\n \ + buffer_store_dwordx2 %1, %2, %0, %3 offen offset:0 \n \ + " + : + : "s"(dst_block_config.data), + "v"(src), + "v"(dst_thread_addr_offset), + 
"s"(dst_const_addr_offset)); +#endif +} + +template <> +__device__ void __buffer_store(const float4_t& src, + float* p_dst_block, + index_t dst_thread_data_offset, + index_t dst_const_data_offset) +{ + index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); + index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); + + BufferLoadStoreDwordConfig dst_block_config; + + // fill in byte 0 - 1 + dst_block_config.address[0] = p_dst_block; + // fill in byte 2 + dst_block_config.range[2] = -1; + // fill in byte 3 + dst_block_config.range[3] = 0x00027000; + +#if CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC + __llvm_amdgcn_buffer_storex4(src, + dst_block_config.data, + 0, + dst_thread_addr_offset + dst_const_addr_offset, + false, + false); +#else + asm volatile("\n \ + buffer_store_dwordx4 %1, %2, %0, %3 offen offset:0 \n \ + " + : + : "s"(dst_block_config.data), + "v"(src), + "v"(dst_thread_addr_offset), + "s"(dst_const_addr_offset)); +#endif +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/common_header.hpp b/composable_kernel/include/utility/common_header.hpp index e01ec6efc..588efca08 100644 --- a/composable_kernel/include/utility/common_header.hpp +++ b/composable_kernel/include/utility/common_header.hpp @@ -20,8 +20,8 @@ #include "amd_inline_asm.hpp" #endif -#if CK_USE_AMD_INTRINSIC -#include "amd_intrinsic.hpp" +#if CK_USE_AMD_BUFFER_ADDRESSING +#include "amd_buffer_addressing.hpp" #endif #endif diff --git a/composable_kernel/include/utility/config.amd.hpp.in b/composable_kernel/include/utility/config.amd.hpp.in index 7800f5293..3e19b5676 100644 --- a/composable_kernel/include/utility/config.amd.hpp.in +++ b/composable_kernel/include/utility/config.amd.hpp.in @@ -20,18 +20,18 @@ #define CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM 1 #endif -// AMD llvm intrinsic -#ifndef CK_USE_AMD_INTRINSIC -#define CK_USE_AMD_INTRINSIC 1 +// AMD buffer addressing +#ifndef CK_USE_AMD_BUFFER_ADDRESSING +#define CK_USE_AMD_BUFFER_ADDRESSING 1 #endif -#ifndef CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC -#define CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC 1 +#ifndef CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC +#define CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC 1 #endif // AMD XDLOPS #ifndef CK_USE_AMD_XDLOPS -#define CK_USE_AMD_XDLOPS 0 +#define CK_USE_AMD_XDLOPS 1 #endif #ifndef CK_USE_AMD_XDLOPS_INLINE_ASM @@ -40,6 +40,8 @@ // experimental implementation #define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 1 +#define CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF 0 +#define CK_EXPERIMENTAL_THREADWISE_COPY_V4R2_USE_OPTIMIZED_ADDRESS_CACLULATION 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0 From 0e5a67f14746aadd95dcae24f215bc03a89536ca Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Wed, 9 Oct 2019 21:21:57 -0500 Subject: [PATCH 15/20] refactor --- ..._v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 46 ++++---- .../blockwise_generic_tensor_slice_copy.hpp | 100 +++++++++++++----- .../threadwise_generic_tensor_slice_copy.hpp | 18 +++- driver/src/driver.cpp | 4 +- 4 files changed, 115 insertions(+), 53 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp 
b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp index 18c7f9a39..289c8621b 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -100,6 +100,18 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer const Float* const __restrict__ p_wei_global, Float* const __restrict__ p_out_global) const { + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto True = integral_constant{}; + + constexpr auto generic_address_space = + integral_constant{}; + constexpr auto global_address_space = + integral_constant{}; + static_assert(ConvDirection == ConvolutionDirection::Forward || ConvDirection == ConvolutionDirection::BackwardWeight, "wrong! this kernel only support convolution forward and backward-weight"); @@ -114,13 +126,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer 0, "wrong!"); - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - constexpr auto True = integral_constant{}; - constexpr auto in_n_c_hi_wi_global_desc = InGlobalDesc{}; constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{}; constexpr auto out_n_k_ho_wo_global_desc = OutGlobalDesc{}; @@ -329,10 +334,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer // LDS double buffer: preload data into LDS { - blockwise_in_copy.template Run(p_in_global, - p_in_block_double); - blockwise_wei_copy.template Run(p_wei_global, - p_wei_block_double); + blockwise_in_copy.Run( + p_in_global, p_in_block_double, global_address_space, generic_address_space); + blockwise_wei_copy.Run( + p_wei_global, p_wei_block_double, global_address_space, generic_address_space); } // LDS double buffer: main body @@ -363,10 +368,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( - p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( - p_wei_global, p_wei_thread_buffer); + blockwise_in_copy.RunLoadThreadBuffer( + p_in_global, p_in_thread_buffer, global_address_space, generic_address_space); + blockwise_wei_copy.RunLoadThreadBuffer( + p_wei_global, p_wei_thread_buffer, global_address_space, generic_address_space); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread); @@ -392,10 +397,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer __syncthreads(); // LDS double buffer: load last data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( - p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( - p_wei_global, p_wei_thread_buffer); + blockwise_in_copy.RunLoadThreadBuffer( + p_in_global, p_in_thread_buffer, global_address_space, generic_address_space); + blockwise_wei_copy.RunLoadThreadBuffer( + p_wei_global, p_wei_thread_buffer, global_address_space, generic_address_space); // LDS double buffer: GEMM on 2nd-last data blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread); @@ -482,8 +487,7 @@ 
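The address spaces are now passed as integral_constant tags (generic_address_space, global_address_space), so the copy functions receive them as deduced compile-time template parameters instead of explicit template arguments at every call site. A stripped-down sketch of that tag-dispatch idiom; std::integral_constant and the enumerator spellings (generic, global) are stand-ins for the CK equivalents:

#include <cstdio>
#include <type_traits>

enum class AddressSpace { generic, global };

// The tag arguments carry no runtime data; they only pin Src and Dst at
// compile time so the body can branch (or specialize) on them.
template <AddressSpace Src, AddressSpace Dst>
void run_copy(const float* src, float* dst,
              std::integral_constant<AddressSpace, Src>,
              std::integral_constant<AddressSpace, Dst>)
{
    // a real implementation would pick buffer_load/buffer_store when Src or
    // Dst is AddressSpace::global; here we only report the choice
    printf("src global: %d, dst global: %d\n",
           Src == AddressSpace::global, Dst == AddressSpace::global);
    *dst = *src;
}

int main()
{
    constexpr auto generic_address_space =
        std::integral_constant<AddressSpace, AddressSpace::generic>{};
    constexpr auto global_address_space =
        std::integral_constant<AddressSpace, AddressSpace::global>{};

    float a = 1.0f, b = 0.0f;
    run_copy(&a, &b, global_address_space, generic_address_space);
    printf("b = %f\n", b);
    return 0;
}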
struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer 0, b_thread_data_on_global, 0}) - .template Run( - p_out_thread, p_out_global); + .Run(p_out_thread, p_out_global, generic_address_space, global_address_space); } } }; diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index 38ec363a7..34560977c 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -68,10 +68,13 @@ struct BlockwiseGenericTensorSliceCopy_v4 template - __device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src, - ThreadBufferData* p_thread_buffer) const + AddressSpace BlockSrcAddressSpace, + AddressSpace ThreadBufferAddressSpace> + __device__ void + RunLoadThreadBuffer(const BlockSrcData* p_block_src, + ThreadBufferData* p_thread_buffer, + integral_constant, + integral_constant) const { if(mThreadwiseStore.HasWorkingOptimizedAddressCalculation()) { @@ -84,19 +87,36 @@ struct BlockwiseGenericTensorSliceCopy_v4 } else { - mThreadwiseLoad.template Run(p_block_src, p_thread_buffer); + constexpr auto block_src_address_space = + integral_constant{}; + constexpr auto thread_buffer_address_space = + integral_constant{}; + + mThreadwiseLoad.Run( + p_block_src, p_thread_buffer, block_src_address_space, thread_buffer_address_space); } } + template + __device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src, + ThreadBufferData* p_thread_buffer) const + { + constexpr auto generic_address_space = + integral_constant{}; + + RunLoadThreadBuffer( + p_block_src, p_thread_buffer, generic_address_space, generic_address_space); + } + template - __device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer, - BlockDstData* p_block_dst) const + AddressSpace ThreadBufferAddressSpace, + AddressSpace BlockDstAddressSpace> + __device__ void + RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer, + BlockDstData* p_block_dst, + integral_constant, + integral_constant) const { if(mThreadwiseStore.HasWorkingOptimizedAddressCalculation()) { @@ -109,31 +129,57 @@ struct BlockwiseGenericTensorSliceCopy_v4 } else { - mThreadwiseStore.template Run(p_thread_buffer, p_block_dst); + constexpr auto thread_buffer_address_space = + integral_constant{}; + constexpr auto block_dst_address_space = + integral_constant{}; + + mThreadwiseStore.Run( + p_thread_buffer, p_block_dst, thread_buffer_address_space, block_dst_address_space); } } + template + __device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer, + BlockDstData* p_block_dst) const + { + constexpr auto generic_address_space = + integral_constant{}; + + RunStoreThreadBuffer( + p_thread_buffer, p_block_dst, generic_address_space, generic_address_space); + } + template - __device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst) const + AddressSpace BlockSrcAddressSpace, + AddressSpace BlockDstAddressSpace> + __device__ void + Run(const BlockSrcData* p_block_src, + BlockDstData* p_block_dst, + integral_constant block_src_address_space, + integral_constant block_dst_address_space) const { BlockSrcData p_thread_buffer[GetThreadBufferSize()]; - RunLoadThreadBuffer(p_block_src, p_thread_buffer); + constexpr auto generic_address_space = + integral_constant{}; + + RunLoadThreadBuffer( + p_block_src, p_thread_buffer, block_src_address_space, 
generic_address_space); // if there is type conversion, it's done during store - RunStoreThreadBuffer(p_thread_buffer, p_block_dst); + RunStoreThreadBuffer( + p_thread_buffer, p_block_dst, generic_address_space, block_dst_address_space); + } + + template + __device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst) const + { + constexpr auto generic_address_space = + integral_constant{}; + + Rnun(p_block_src, p_block_dst, generic_address_space, generic_address_space); } template diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index 8d5b035e9..0cf6d4b4c 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -68,9 +68,12 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // Will do padding check on dst data: No write if dst data is in paddin area. template - __device__ void Run(const SrcData* p_src, DstData* p_dst) const + AddressSpace SrcAddressSpace, + AddressSpace DstAddressSpace> + __device__ void Run(const SrcData* p_src, + DstData* p_dst, + integral_constant, + integral_constant) const { using src_vector_t = typename vector_type::MemoryType; using dst_vector_t = typename vector_type::MemoryType; @@ -180,6 +183,15 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 }); } + template + __device__ void Run(const SrcData* p_src, DstData* p_dst) const + { + constexpr auto generic_address_space = + integral_constant{}; + + Run(p_src, p_dst, generic_address_space, generic_address_space); + } + // Modify Length to 1, if Mask is set to false // Used for isolating linear dimension from non-linear dimensions template diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index 3a0eedc64..251a38124 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -297,7 +297,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>; -#elif 1 +#elif 0 // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81% constexpr index_t N = 128; @@ -343,7 +343,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<3, 0>; using RightPads = Sequence<3, 0>; -#elif 0 +#elif 1 // 1x7 filter, 0x3 pad, 17x17 input constexpr index_t N = 128; constexpr index_t C = 128; From 85bed32ec82a2aa21f41b5429657468c7c0c807b Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Wed, 9 Oct 2019 21:52:22 -0500 Subject: [PATCH 16/20] refactor --- ...kcyx_nkhw_lds_double_buffer_deprecated.hpp | 49 ++- ...e_generic_tensor_slice_copy_deprecated.hpp | 106 +++-- ...e_generic_tensor_slice_copy_deprecated.hpp | 18 +- .../include/utility/amd_intrinsic.hpp | 368 ------------------ ...it_gemm_v4r1_nchw_kcyx_nkhw_deprecated.hpp | 2 + driver/src/driver.cpp | 6 +- 6 files changed, 144 insertions(+), 405 deletions(-) delete mode 100644 composable_kernel/include/utility/amd_intrinsic.hpp diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp index db92631a3..3e5935dc5 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp +++ 
b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp @@ -8,18 +8,51 @@ #include "blockwise_generic_tensor_slice_copy_deprecated.hpp" #include "blockwise_gemm.hpp" #include "threadwise_generic_tensor_slice_copy_deprecated.hpp" +#include "convolution_common.hpp" namespace ck { +template +struct make_wei_e_k_global_desc_v4r1_deprecated; + +template <> +struct make_wei_e_k_global_desc_v4r1_deprecated +{ + template + __device__ constexpr auto operator()(WeiDesc) const + { + constexpr auto I1 = Number<1>{}; + constexpr auto I3 = Number<3>{}; + + return WeiDesc::Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{}); + } +}; + +template <> +struct make_wei_e_k_global_desc_v4r1_deprecated +{ + template + __device__ constexpr auto operator()(WeiDesc) const + { + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + return make_ConstantMergedTensorDescriptor( + WeiDesc::Unfold(I2, I3), Sequence<1, 2>{}, Sequence<0>{}); + } +}; + // define B = merge(N0, Ho, Wo) template ({0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0}); // weight tensor - // tensor descriptor in device memory, src of blockwise copy + // Iensor descriptor in device memory, src of blockwise copy + // It is constructed differently, depending on whether forward or backward weight + // convolution constexpr auto wei_e_k_global_desc = - wei_k_c_y_x_global_desc.Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{}); + make_wei_e_k_global_desc_v4r1_deprecated{}(wei_k_c_y_x_global_desc); // tensor descriptor in LDS, dst of blockwise copy // be careful of LDS alignment @@ -256,7 +295,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_dep __shared__ Float p_wei_block_double[2 * wei_block_space]; // register allocation for output - Float p_out_thread[c_k0k1_n1n2_thread_mtx_desc.GetElementSpace()]; + AccDataType p_out_thread[c_k0k1_n1n2_thread_mtx_desc.GetElementSpace()]; // zero out threadwise output threadwise_matrix_set_zero(c_k0k1_n1n2_thread_mtx_desc, p_out_thread); @@ -394,11 +433,11 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_dep 0, b_thread_data_on_global, 0}) - .template Run( + .template Run( p_out_thread, p_out_global); } } }; } // namespace ck -#endif // CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R1_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_DEPRECATED_HPP +#endif // CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_DEPRECATED_HPP diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp index 399a47407..2272ab017 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp @@ -475,42 +475,96 @@ struct BlockwiseGenericTensorSliceCopy_v2_deprecated return ThreadBufferDesc::GetElementSpace(); } - template - __device__ void RunLoadThreadBuffer(const SrcData* p_block_src, DstData* p_thread_buffer) const + template + __device__ void + RunLoadThreadBuffer(const BlockSrcData* p_block_src, + ThreadBufferData* p_thread_buffer, + integral_constant, + integral_constant) const + { + constexpr auto block_src_address_space = + integral_constant{}; + constexpr auto thread_buffer_address_space = + integral_constant{}; + + mThreadwiseLoad.Run( + p_block_src, p_thread_buffer, 
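make_wei_e_k_global_desc_v4r1_deprecated dispatches on ConvolutionDirection by class-template specialization: the Forward case unfolds the K-C-Y-X descriptor over dims 1..3 and reorders to [E, K], while BackwardWeight unfolds dims 2..3 and merges into [E, K]. A toy version of the same specialization pattern, with the descriptors reduced to labels and the enum declared locally:

#include <cstdio>

enum class ConvolutionDirection { Forward, BackwardWeight };

// Primary template declared, one specialization per direction supplies the
// actual construction (here just a description string).
template <ConvolutionDirection Dir>
struct make_wei_e_k_desc;

template <>
struct make_wei_e_k_desc<ConvolutionDirection::Forward>
{
    const char* operator()() const { return "unfold dims 1..3, reorder to [E, K]"; }
};

template <>
struct make_wei_e_k_desc<ConvolutionDirection::BackwardWeight>
{
    const char* operator()() const { return "unfold dims 2..3, merge to [E, K]"; }
};

int main()
{
    printf("Forward:        %s\n", make_wei_e_k_desc<ConvolutionDirection::Forward>{}());
    printf("BackwardWeight: %s\n", make_wei_e_k_desc<ConvolutionDirection::BackwardWeight>{}());
    return 0;
}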
block_src_address_space, thread_buffer_address_space); + } + + template + __device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src, + ThreadBufferData* p_thread_buffer) const + { + constexpr auto generic_address_space = + integral_constant{}; + + RunLoadThreadBuffer( + p_block_src, p_thread_buffer, generic_address_space, generic_address_space); + } + + template + __device__ void + RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer, + BlockDstData* p_block_dst, + integral_constant, + integral_constant) const { - mThreadwiseLoad - .template Run( - p_block_src, p_thread_buffer); + constexpr auto thread_buffer_address_space = + integral_constant{}; + constexpr auto block_dst_address_space = + integral_constant{}; + + mThreadwiseStore.Run( + p_thread_buffer, p_block_dst, thread_buffer_address_space, block_dst_address_space); } - template - __device__ void RunStoreThreadBuffer(const SrcData* p_thread_buffer, DstData* p_block_dst) const + template + __device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer, + BlockDstData* p_block_dst) const { - mThreadwiseStore - .template Run( - p_thread_buffer, p_block_dst); + constexpr auto generic_address_space = + integral_constant{}; + + RunStoreThreadBuffer( + p_thread_buffer, p_block_dst, generic_address_space, generic_address_space); } - template - __device__ void Run(const SrcData* p_block_src, DstData* p_block_dst) const + template + __device__ void + Run(const BlockSrcData* p_block_src, + BlockDstData* p_block_dst, + integral_constant block_src_address_space, + integral_constant block_dst_address_space) const { - SrcData p_thread_buffer[GetThreadBufferSize()]; + BlockSrcData p_thread_buffer[GetThreadBufferSize()]; + + constexpr auto generic_address_space = + integral_constant{}; - RunLoadThreadBuffer( - p_block_src, p_thread_buffer); + RunLoadThreadBuffer( + p_block_src, p_thread_buffer, block_src_address_space, generic_address_space); // if there is type conversion, it's done during store - RunStoreThreadBuffer( - p_thread_buffer, p_block_dst); + RunStoreThreadBuffer( + p_thread_buffer, p_block_dst, generic_address_space, block_dst_address_space); + } + + template + __device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst) const + { + constexpr auto generic_address_space = + integral_constant{}; + + Rnun(p_block_src, p_block_dst, generic_address_space, generic_address_space); } template diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp index 7d85b3838..ceee79ca6 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp @@ -256,9 +256,12 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1_deprecated template - __device__ void Run(const SrcData* p_src, DstData* p_dst) const + AddressSpace SrcAddressSpace, + AddressSpace DstAddressSpace> + __device__ void Run(const SrcData* p_src, + DstData* p_dst, + integral_constant, + integral_constant) const { constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{}); @@ -457,6 +460,15 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1_deprecated } } + template + __device__ void Run(const SrcData* p_src, DstData* p_dst) const + { + constexpr auto generic_address_space = + integral_constant{}; + + Run(p_src, p_dst, generic_address_space, 
generic_address_space); + } + // T can be Sequence or Array template __device__ void MoveSrcSliceWindow(T step_sizes, integral_constant) diff --git a/composable_kernel/include/utility/amd_intrinsic.hpp b/composable_kernel/include/utility/amd_intrinsic.hpp deleted file mode 100644 index 2575cbc40..000000000 --- a/composable_kernel/include/utility/amd_intrinsic.hpp +++ /dev/null @@ -1,368 +0,0 @@ -#ifndef CK_AMD_INTRINSIC_HPP -#define CK_AMD_INTRINSIC_HPP - -#include "float_type.hpp" - -namespace ck { - -// for buffer_load and buffer_store -template -union BufferLoadStoreDwordConfig -{ - int32x4_t data; - T* address[2]; - int32_t range[4]; -}; - -__device__ float __llvm_amdgcn_buffer_load(int32x4_t rsrc, - index_t vindex, - index_t offset, - bool glc, - bool slc) __asm("llvm.amdgcn.buffer.load"); - -__device__ float2_t __llvm_amdgcn_buffer_loadx2(int32x4_t rsrc, - index_t vindex, - index_t offset, - bool glc, - bool slc) __asm("llvm.amdgcn.buffer.load.dwordx2"); - -__device__ float4_t __llvm_amdgcn_buffer_loadx4(int32x4_t rsrc, - index_t vindex, - index_t offset, - bool glc, - bool slc) __asm("llvm.amdgcn.buffer.load.dwordx4"); - -__device__ void __llvm_amdgcn_buffer_store(float vdata, - int32x4_t rsrc, - index_t vindex, - index_t offset, - bool glc, - bool slc) __asm("llvm.amdgcn.buffer.store"); - -__device__ void __llvm_amdgcn_buffer_storex2(float2_t vdata, - int32x4_t rsrc, - index_t vindex, - index_t offset, - bool glc, - bool slc) __asm("llvm.amdgcn.buffer.store.dwordx2"); - -__device__ void __llvm_amdgcn_buffer_storex4(float4_t vdata, - int32x4_t rsrc, - index_t vindex, - index_t offset, - bool glc, - bool slc) __asm("llvm.amdgcn.buffer.store.dwordx4"); - -// buffer_load and buffer_store -template -__device__ typename vector_type::MemoryType -__buffer_load(const T* p_src_block, index_t src_thread_data_offset, index_t src_const_data_offset); - -template -__device__ void __buffer_store(const typename vector_type::MemoryType& src, - T* p_dst_block, - index_t dst_thread_data_offset, - index_t dst_const_data_offset); - -template <> -__device__ float __buffer_load(const float* p_src_block, - index_t src_thread_data_offset, - index_t src_const_data_offset) -{ -#if 0 - float dst; - - index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); - index_t src_const_addr_offset = src_const_data_offset * sizeof(float); - - BufferLoadStoreDwordConfig src_block_config; - - // fill in byte 0 - 1 - src_block_config.address[0] = const_cast(p_src_block); - // fill in byte 2 - src_block_config.range[2] = -1; - // fill in byte 3 - src_block_config.range[3] = 0x00027000; - - asm volatile( - "\n \ - buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \ - s_waitcnt 0 \n \ - " - : "=v"(dst) - : "v"(src_thread_addr_offset), "s"(src_block_config.data), "s"(src_const_addr_offset)); - - return dst; -#else - float dst; - - index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); - index_t src_const_addr_offset = src_const_data_offset * sizeof(float); - - BufferLoadStoreDwordConfig src_block_config; - - // fill in byte 0 - 1 - src_block_config.address[0] = const_cast(p_src_block); - // fill in byte 2 - src_block_config.range[2] = -1; - // fill in byte 3 - src_block_config.range[3] = 0x00027000; - - dst = __llvm_amdgcn_buffer_load( - src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false); - - return dst; -#endif -} - -template <> -__device__ float2_t __buffer_load(const float* p_src_block, - index_t src_thread_data_offset, - index_t src_const_data_offset) 
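The two-argument Run overloads added throughout these copy classes simply forward to the tagged four-argument version with generic/generic address-space tags, so pre-existing call sites keep compiling unchanged. A compact sketch of that forwarding pattern (again with std::integral_constant and a locally declared enum as stand-ins):

#include <cstdio>
#include <type_traits>

enum class AddressSpace { generic, global };

struct Copier
{
    // tagged version: address spaces are compile-time parameters
    template <AddressSpace Src, AddressSpace Dst>
    void Run(const float* p_src, float* p_dst,
             std::integral_constant<AddressSpace, Src>,
             std::integral_constant<AddressSpace, Dst>) const
    {
        *p_dst = *p_src;
        printf("copy with Src=%d Dst=%d\n", (int)Src, (int)Dst);
    }

    // legacy signature: default both tags to generic and forward
    void Run(const float* p_src, float* p_dst) const
    {
        constexpr auto generic =
            std::integral_constant<AddressSpace, AddressSpace::generic>{};
        Run(p_src, p_dst, generic, generic);
    }
};

int main()
{
    float a = 2.0f, b = 0.0f;
    Copier{}.Run(&a, &b); // old call style still works
    printf("b = %f\n", b);
    return 0;
}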
-{ -#if 0 - float2_t dst; - - index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); - index_t src_const_addr_offset = src_const_data_offset * sizeof(float); - - BufferLoadStoreDwordConfig src_block_config; - - // fill in byte 0 - 1 - src_block_config.address[0] = const_cast(p_src_block); - // fill in byte 2 - src_block_config.range[2] = -1; - // fill in byte 3 - src_block_config.range[3] = 0x00027000; - - asm volatile( - "\n \ - buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \ - s_waitcnt 0 \n \ - " - : "=v"(dst) - : "v"(src_thread_addr_offset), "s"(src_block_config.data), "s"(src_const_addr_offset)); - - return dst; -#else - float2_t dst; - - index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); - index_t src_const_addr_offset = src_const_data_offset * sizeof(float); - - BufferLoadStoreDwordConfig src_block_config; - - // fill in byte 0 - 1 - src_block_config.address[0] = const_cast(p_src_block); - // fill in byte 2 - src_block_config.range[2] = -1; - // fill in byte 3 - src_block_config.range[3] = 0x00027000; - - dst = __llvm_amdgcn_buffer_loadx2( - src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false); - - return dst; -#endif -} - -template <> -__device__ float4_t __buffer_load(const float* p_src_block, - index_t src_thread_data_offset, - index_t src_const_data_offset) -{ -#if 0 - float4_t dst; - - index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); - index_t src_const_addr_offset = src_const_data_offset * sizeof(float); - - BufferLoadStoreDwordConfig src_block_config; - - // fill in byte 0 - 1 - src_block_config.address[0] = const_cast(p_src_block); - // fill in byte 2 - src_block_config.range[2] = -1; - // fill in byte 3 - src_block_config.range[3] = 0x00027000; - - asm volatile( - "\n \ - buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \ - s_waitcnt 0 \n \ - " - : "=v"(dst) - : "v"(src_thread_addr_offset), "s"(src_block_config.data), "s"(src_const_addr_offset)); - - return dst; -#else - float4_t dst; - - index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); - index_t src_const_addr_offset = src_const_data_offset * sizeof(float); - - BufferLoadStoreDwordConfig src_block_config; - - // fill in byte 0 - 1 - src_block_config.address[0] = const_cast(p_src_block); - // fill in byte 2 - src_block_config.range[2] = -1; - // fill in byte 3 - src_block_config.range[3] = 0x00027000; - - dst = __llvm_amdgcn_buffer_loadx4( - src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false); - - return dst; -#endif -} - -template <> -__device__ void __buffer_store(const float& src, - float* p_dst_block, - index_t dst_thread_data_offset, - index_t dst_const_data_offset) -{ -#if 0 - index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); - index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); - - BufferLoadStoreDwordConfig dst_block_config; - - // fill in byte 0 - 1 - dst_block_config.address[0] = p_dst_block; - // fill in byte 2 - dst_block_config.range[2] = -1; - // fill in byte 3 - dst_block_config.range[3] = 0x00027000; - - asm volatile("\n \ - buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \ - " - : - : "s"(dst_block_config.data), - "v"(src), - "v"(dst_thread_addr_offset), - "s"(dst_const_addr_offset)); -#else - index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); - index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); - - BufferLoadStoreDwordConfig dst_block_config; - - // 
fill in byte 0 - 1 - dst_block_config.address[0] = p_dst_block; - // fill in byte 2 - dst_block_config.range[2] = -1; - // fill in byte 3 - dst_block_config.range[3] = 0x00027000; - - __llvm_amdgcn_buffer_store(src, - dst_block_config.data, - 0, - dst_thread_addr_offset + dst_const_addr_offset, - false, - false); -#endif -} - -template <> -__device__ void __buffer_store(const float2_t& src, - float* p_dst_block, - index_t dst_thread_data_offset, - index_t dst_const_data_offset) -{ -#if 0 - index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); - index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); - - BufferLoadStoreDwordConfig dst_block_config; - - // fill in byte 0 - 1 - dst_block_config.address[0] = p_dst_block; - // fill in byte 2 - dst_block_config.range[2] = -1; - // fill in byte 3 - dst_block_config.range[3] = 0x00027000; - - asm volatile("\n \ - buffer_store_dwordx2 %1, %2, %0, %3 offen offset:0 \n \ - " - : - : "s"(dst_block_config.data), - "v"(src), - "v"(dst_thread_addr_offset), - "s"(dst_const_addr_offset)); -#else - index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); - index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); - - BufferLoadStoreDwordConfig dst_block_config; - - // fill in byte 0 - 1 - dst_block_config.address[0] = p_dst_block; - // fill in byte 2 - dst_block_config.range[2] = -1; - // fill in byte 3 - dst_block_config.range[3] = 0x00027000; - - __llvm_amdgcn_buffer_storex2(src, - dst_block_config.data, - 0, - dst_thread_addr_offset + dst_const_addr_offset, - false, - false); -#endif -} - -template <> -__device__ void __buffer_store(const float4_t& src, - float* p_dst_block, - index_t dst_thread_data_offset, - index_t dst_const_data_offset) -{ -#if 0 - index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); - index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); - - BufferLoadStoreDwordConfig dst_block_config; - - // fill in byte 0 - 1 - dst_block_config.address[0] = p_dst_block; - // fill in byte 2 - dst_block_config.range[2] = -1; - // fill in byte 3 - dst_block_config.range[3] = 0x00027000; - - asm volatile("\n \ - buffer_store_dwordx4 %1, %2, %0, %3 offen offset:0 \n \ - " - : - : "s"(dst_block_config.data), - "v"(src), - "v"(dst_thread_addr_offset), - "s"(dst_const_addr_offset)); -#else - index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); - index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); - - BufferLoadStoreDwordConfig dst_block_config; - - // fill in byte 0 - 1 - dst_block_config.address[0] = p_dst_block; - // fill in byte 2 - dst_block_config.range[2] = -1; - // fill in byte 3 - dst_block_config.range[3] = 0x00027000; - - __llvm_amdgcn_buffer_storex4(src, - dst_block_config.data, - 0, - dst_thread_addr_offset + dst_const_addr_offset, - false, - false); -#endif -} - -} // namespace ck -#endif diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated.hpp index 5a47feb6e..626dd77dd 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated.hpp @@ -174,11 +174,13 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(InDesc, GridSize, BlockSize, T, + T, decltype(in_nchw_desc), decltype(wei_kcyx_desc), decltype(out_nkhw_desc), ConvStrides, ConvDilations, + 
ConvolutionDirection::Forward, BPerBlock, KPerBlock, EPerBlock, diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index 251a38124..67fa14db5 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -297,7 +297,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>; -#elif 0 +#elif 1 // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81% constexpr index_t N = 128; @@ -343,7 +343,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<3, 0>; using RightPads = Sequence<3, 0>; -#elif 1 +#elif 0 // 1x7 filter, 0x3 pad, 17x17 input constexpr index_t N = 128; constexpr index_t C = 128; @@ -482,7 +482,7 @@ int main(int argc, char* argv[]) ConvStrides{}, ConvDilations{}, nrepeat); -#elif 0 +#elif 1 device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_deprecated(in_nchw_desc, in_nchw, wei_kcyx_desc, From 89f2cb4a9a8a1db1d8757b7efe2cc7332793734d Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Wed, 9 Oct 2019 22:10:52 -0500 Subject: [PATCH 17/20] refactor --- ...kcyx_nkhw_lds_double_buffer_deprecated.hpp | 46 ++++++++++--------- .../blockwise_generic_tensor_slice_copy.hpp | 36 ++++++--------- .../threadwise_generic_tensor_slice_copy.hpp | 22 +++++---- 3 files changed, 53 insertions(+), 51 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp index 3e5935dc5..b5fde21c9 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp @@ -86,6 +86,18 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_dep const Float* const __restrict__ p_wei_global, Float* const __restrict__ p_out_global) const { + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto True = integral_constant{}; + + constexpr auto generic_address_space = + integral_constant{}; + constexpr auto global_address_space = + integral_constant{}; + static_assert(ConvDirection == ConvolutionDirection::Forward || ConvDirection == ConvolutionDirection::BackwardWeight, "wrong! 
this kernel only support convolution forward and backward-weight"); @@ -100,13 +112,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_dep 0, "wrong!"); - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - constexpr auto True = integral_constant{}; - constexpr auto in_n_c_h_w_global_desc = InGlobalDesc{}; constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{}; constexpr auto out_n_k_h_w_global_desc = OutGlobalDesc{}; @@ -302,10 +307,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_dep // LDS double buffer: preload data into LDS { - blockwise_in_copy.template Run(p_in_global, - p_in_block_double); - blockwise_wei_copy.template Run(p_wei_global, - p_wei_block_double); + blockwise_in_copy.Run( + p_in_global, p_in_block_double, global_address_space, generic_address_space); + blockwise_wei_copy.Run( + p_wei_global, p_wei_block_double, global_address_space, generic_address_space); } // LDS double buffer: main body @@ -336,10 +341,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_dep __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( - p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( - p_wei_global, p_wei_thread_buffer); + blockwise_in_copy.RunLoadThreadBuffer( + p_in_global, p_in_thread_buffer, global_address_space, generic_address_space); + blockwise_wei_copy.RunLoadThreadBuffer( + p_wei_global, p_wei_thread_buffer, global_address_space, generic_address_space); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread); @@ -362,10 +367,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_dep __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( - p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( - p_wei_global, p_wei_thread_buffer); + blockwise_in_copy.RunLoadThreadBuffer( + p_in_global, p_in_thread_buffer, global_address_space, generic_address_space); + blockwise_wei_copy.RunLoadThreadBuffer( + p_wei_global, p_wei_thread_buffer, global_address_space, generic_address_space); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread); @@ -433,8 +438,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_dep 0, b_thread_data_on_global, 0}) - .template Run( - p_out_thread, p_out_global); + .Run(p_out_thread, p_out_global, generic_address_space, global_address_space); } } }; diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index 34560977c..b50e27ed6 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -76,22 +76,18 @@ struct BlockwiseGenericTensorSliceCopy_v4 integral_constant, integral_constant) const { + constexpr auto block_src_address_space = + integral_constant{}; + constexpr auto thread_buffer_address_space = + integral_constant{}; + if(mThreadwiseStore.HasWorkingOptimizedAddressCalculation()) { - mThreadwiseLoad - .template Run_optimized_src_address_calculation( - p_block_src, 
p_thread_buffer); + mThreadwiseLoad.Run_optimized_src_address_calculation( + p_block_src, p_thread_buffer, block_src_address_space, thread_buffer_address_space); } else { - constexpr auto block_src_address_space = - integral_constant{}; - constexpr auto thread_buffer_address_space = - integral_constant{}; - mThreadwiseLoad.Run( p_block_src, p_thread_buffer, block_src_address_space, thread_buffer_address_space); } @@ -118,22 +114,18 @@ struct BlockwiseGenericTensorSliceCopy_v4 integral_constant, integral_constant) const { + constexpr auto thread_buffer_address_space = + integral_constant{}; + constexpr auto block_dst_address_space = + integral_constant{}; + if(mThreadwiseStore.HasWorkingOptimizedAddressCalculation()) { - mThreadwiseStore - .template Run_optimized_dst_address_calculation( - p_thread_buffer, p_block_dst); + mThreadwiseStore.Run_optimized_dst_address_calculation( + p_thread_buffer, p_block_dst, thread_buffer_address_space, block_dst_address_space); } else { - constexpr auto thread_buffer_address_space = - integral_constant{}; - constexpr auto block_dst_address_space = - integral_constant{}; - mThreadwiseStore.Run( p_thread_buffer, p_block_dst, thread_buffer_address_space, block_dst_address_space); } diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index 0cf6d4b4c..db70cbee0 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -209,10 +209,13 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // TODO: this function is not compiled to expected ISA template - __device__ void Run_optimized_src_address_calculation(const SrcData* p_src, - DstData* p_dst) const + AddressSpace SrcAddressSpace, + AddressSpace DstAddressSpace> + __device__ void + Run_optimized_src_address_calculation(const SrcData* p_src, + DstData* p_dst, + integral_constant, + integral_constant) const { using src_vector_t = typename vector_type::MemoryType; using dst_vector_t = typename vector_type::MemoryType; @@ -368,10 +371,13 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // TODO: this function is not compiled to expected ISA template - __device__ void Run_optimized_dst_address_calculation(const SrcData* p_src, - DstData* p_dst) const + AddressSpace SrcAddressSpace, + AddressSpace DstAddressSpace> + __device__ void + Run_optimized_dst_address_calculation(const SrcData* p_src, + DstData* p_dst, + integral_constant, + integral_constant) const { using src_vector_t = typename vector_type::MemoryType; using dst_vector_t = typename vector_type::MemoryType; From 871607a9ddbf5c7b493a6ef2a7d42c44701659dd Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 10 Oct 2019 02:39:20 -0500 Subject: [PATCH 18/20] nvidia build --- .../blockwise_generic_tensor_slice_copy.hpp | 12 ++++++++++-- .../include/utility/config.nvidia.hpp.in | 7 ++++--- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index b50e27ed6..8939ae337 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -81,7 +81,11 @@ struct BlockwiseGenericTensorSliceCopy_v4 constexpr auto 
thread_buffer_address_space = integral_constant{}; - if(mThreadwiseStore.HasWorkingOptimizedAddressCalculation()) + constexpr bool has_optimized_address_calculation = + decltype(mThreadwiseStore)::HasWorkingOptimizedAddressCalculation(); + + // TODO: threadwise copy is still being tweaked + if(has_optimized_address_calculation) { mThreadwiseLoad.Run_optimized_src_address_calculation( p_block_src, p_thread_buffer, block_src_address_space, thread_buffer_address_space); @@ -119,7 +123,11 @@ struct BlockwiseGenericTensorSliceCopy_v4 constexpr auto block_dst_address_space = integral_constant{}; - if(mThreadwiseStore.HasWorkingOptimizedAddressCalculation()) + constexpr bool has_optimized_address_calculation = + decltype(mThreadwiseStore)::HasWorkingOptimizedAddressCalculation(); + + // TODO: threadwise copy is still being tweaked + if(has_optimized_address_calculation) { mThreadwiseStore.Run_optimized_dst_address_calculation( p_thread_buffer, p_block_dst, thread_buffer_address_space, block_dst_address_space); diff --git a/composable_kernel/include/utility/config.nvidia.hpp.in b/composable_kernel/include/utility/config.nvidia.hpp.in index 6e9198893..7c549cda5 100644 --- a/composable_kernel/include/utility/config.nvidia.hpp.in +++ b/composable_kernel/include/utility/config.nvidia.hpp.in @@ -15,15 +15,16 @@ // disable AMD inline asm and intrinsic #define CK_USE_AMD_INLINE_ASM 0 #define CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM 0 +#define CK_USE_AMD_BUFFER_ADDRESSING 0 +#define CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC 0 #define CK_USE_AMD_XDLOPS 0 #define CK_USE_AMD_XDLOPS_INLINE_ASM 0 -#define CK_USE_AMD_INTRINSIC 0 -#define CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC 0 // experimental implementation #define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 0 +#define CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF 0 +#define CK_EXPERIMENTAL_THREADWISE_COPY_V4R2_USE_OPTIMIZED_ADDRESS_CACLULATION 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0 -#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0 From f489a603d73a83d05dc7082079755db7214655e3 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 10 Oct 2019 15:45:36 -0500 Subject: [PATCH 19/20] refactor --- ..._v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp | 90 +++++++++++-------- .../threadwise_generic_tensor_slice_copy.hpp | 14 ++- ...e_generic_tensor_slice_copy_deprecated.hpp | 8 +- driver/src/driver.cpp | 4 +- 4 files changed, 63 insertions(+), 53 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp index a547db7e3..30984136d 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -61,6 +61,11 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer constexpr auto True = integral_constant{}; + constexpr auto generic_address_space = + integral_constant{}; + constexpr auto global_address_space = + integral_constant{}; + constexpr auto in_n_c_hi_wi_global_desc = 
make_native_tensor_descriptor(InGlobalDesc::GetLengths(), InGlobalDesc::GetStrides()); constexpr auto wei_k_c_y_x_global_desc = @@ -96,7 +101,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer "be violated"); // divide block work by [K, B] - static_assert(K % KPerBlock == 0 && B % BPerBlock == 0 && E % (2 * EPerBlock) == 0, + static_assert(K % KPerBlock == 0 && B % BPerBlock == 0 && E % EPerBlock == 0, "wrong! cannot divide work evenly among block"); constexpr index_t KBlockWork = K / KPerBlock; @@ -255,10 +260,10 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer // LDS double buffer: preload data into LDS { - blockwise_in_copy.template Run(p_in_global, - p_in_block_double); - blockwise_wei_copy.template Run(p_wei_global, - p_wei_block_double); + blockwise_in_copy.Run( + p_in_global, p_in_block_double, global_address_space, generic_address_space); + blockwise_wei_copy.Run( + p_wei_global, p_wei_block_double, global_address_space, generic_address_space); } // LDS double buffer: main body @@ -289,10 +294,10 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( - p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( - p_wei_global, p_wei_thread_buffer); + blockwise_in_copy.RunLoadThreadBuffer( + p_in_global, p_in_thread_buffer, global_address_space, generic_address_space); + blockwise_wei_copy.RunLoadThreadBuffer( + p_wei_global, p_wei_thread_buffer, global_address_space, generic_address_space); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread); @@ -305,37 +310,47 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer // LDS double buffer: tail { - Float p_in_thread_buffer[blockwise_in_copy.GetThreadBufferSize()]; - Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()]; + constexpr bool has_two_iteration_left = (E % (2 * EPerBlock) == 0); - // even iteration - blockwise_in_copy.MoveSrcSliceWindow(Sequence{}, True); - blockwise_wei_copy.MoveSrcSliceWindow(Sequence{}, True); + if(has_two_iteration_left) // if has 2 iteration left + { + Float p_in_thread_buffer[blockwise_in_copy.GetThreadBufferSize()]; + Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()]; - __syncthreads(); + blockwise_in_copy.MoveSrcSliceWindow(Sequence{}, True); + blockwise_wei_copy.MoveSrcSliceWindow(Sequence{}, True); - // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( - p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( - p_wei_global, p_wei_thread_buffer); + __syncthreads(); - // LDS double buffer: GEMM on current data - blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread); + // LDS double buffer: load last data from device mem + blockwise_in_copy.RunLoadThreadBuffer( + p_in_global, p_in_thread_buffer, global_address_space, generic_address_space); + blockwise_wei_copy.RunLoadThreadBuffer( + p_wei_global, p_wei_thread_buffer, global_address_space, generic_address_space); - // LDS double buffer: store next data to LDS - blockwise_in_copy.RunStoreThreadBuffer(p_in_thread_buffer, - p_in_block_double + in_block_space); - blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer, - p_wei_block_double + wei_block_space); + // LDS double buffer: GEMM on 2nd-last data + 
blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread); - // odd iteration - __syncthreads(); + // LDS double buffer: store last data to LDS + blockwise_in_copy.RunStoreThreadBuffer(p_in_thread_buffer, + p_in_block_double + in_block_space); + blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer, + p_wei_block_double + wei_block_space); - // LDS double buffer: GEMM on current data - blockwise_gemm.Run(p_wei_block_double + wei_block_space, - p_in_block_double + in_block_space, - p_out_thread); + __syncthreads(); + + // LDS double buffer: GEMM on current data + blockwise_gemm.Run(p_wei_block_double + wei_block_space, + p_in_block_double + in_block_space, + p_out_thread); + } + else // if has 1 iteration left + { + __syncthreads(); + + // LDS double buffer: GEMM on last data + blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread); + } } // copy output: register to global memory @@ -388,14 +403,11 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer b_thread_data_on_global / B1, b_thread_data_on_global % B1}) #if 1 - .template Run + .Run(p_out_thread, p_out_global, generic_address_space, global_address_space); #else // tweaking - .template Run_optimized_dst_address_calculation + .Run_optimized_dst_address_calculation( + p_out_thread, p_out_global, generic_address_space, global_address_space); #endif - (p_out_thread, p_out_global); } } }; diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index db70cbee0..1e3095d72 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -117,15 +117,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // Check src vector's padding situation, only check the first data in this src // vector. It's user's responsiblity to make sure all data in the src vector - // has - // the same padding situation + // has the same padding situation if(src_coord.IsUpperIndexMappedToValidOffset()) { - static_if{}([&](auto) { + static_if{}([&](auto fwd) { #if CK_USE_AMD_BUFFER_ADDRESSING *reinterpret_cast(&p_src_long_vector[buffer_offset]) = __buffer_load( - p_src, src_coord.GetOffset(), 0); + fwd(p_src), src_coord.GetOffset(), 0); #else *reinterpret_cast(&p_src_long_vector[buffer_offset]) = *reinterpret_cast(&p_src[src_coord.GetOffset()]); @@ -158,15 +157,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // Check dst vector's padding situation, only check the first data in this dst // vector. 
It's user's responsiblity to make sure all data in the dst vector - // has - // the same padding situation + // has the same padding situation if(dst_coord.IsUpperIndexMappedToValidOffset()) { - static_if{}([&](auto) { + static_if{}([&](auto fwd) { #if CK_USE_AMD_BUFFER_ADDRESSING __buffer_store( *reinterpret_cast(&p_dst_long_vector[buffer_offset]), - p_dst, + fwd(p_dst), dst_coord.GetOffset(), 0); #else diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp index ceee79ca6..f28ac1892 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp @@ -333,10 +333,10 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1_deprecated // 2. src_normal_offset must be calculatd at compile time (guaranteed by // algorithm) // 3. src_merged_offset can be runtime value (no assumption imposed) - static_if{}([&](auto) { + static_if{}([&](auto fwd) { #if CK_USE_AMD_BUFFER_ADDRESSING vector_data = __buffer_load( - p_src, src_merged_offset, src_normal_offset); + fwd(p_src), src_merged_offset, src_normal_offset); #else vector_data = *reinterpret_cast( &p_src[src_normal_offset + src_merged_offset]); @@ -442,10 +442,10 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1_deprecated // 2. dst_normal_offset must be calculatd at compile time (guaranteed by // algorithm) // 3. dst_merged_offset can be runtime value (no assumption imposed) - static_if{}([&](auto) { + static_if{}([&](auto fwd) { #if CK_USE_AMD_BUFFER_ADDRESSING __buffer_store( - vector_data, p_dst, dst_merged_offset, dst_normal_offset); + vector_data, fwd(p_dst), dst_merged_offset, dst_normal_offset); #else *reinterpret_cast( &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data; diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index 67fa14db5..dccad8a5e 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -450,7 +450,7 @@ int main(int argc, char* argv[]) ConvStrides{}, ConvDilations{}, nrepeat); -#elif 1 +#elif 0 device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc, in_nchw, wei_kcyx_desc, @@ -482,7 +482,7 @@ int main(int argc, char* argv[]) ConvStrides{}, ConvDilations{}, nrepeat); -#elif 1 +#elif 0 device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_deprecated(in_nchw_desc, in_nchw, wei_kcyx_desc, From b03aabf11527d0e26665a334b64c1b4e65a1b65f Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Fri, 11 Oct 2019 11:29:15 -0500 Subject: [PATCH 20/20] rename, fix type --- ...icit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 10 +++++----- .../blockwise_generic_tensor_slice_copy.hpp | 2 +- .../blockwise_generic_tensor_slice_copy_deprecated.hpp | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp index 289c8621b..95fbeb290 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -293,14 +293,14 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer // 
c_thread_mtx definition: this is a mess // TODO:: more elegent way of defining c_thread_mtx - constexpr auto c_k0k2_n1n2_thread_mtx_desc = make_ConstantMatrixDescriptor_packed( - Number{}, Number{}); + constexpr auto c_k0k1_n1n2_thread_mtx_desc = make_ConstantMatrixDescriptor_packed( + Number{}, Number{}); const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2< BlockSize, decltype(a_e_k_block_mtx_desc), decltype(b_e_n1bn2_block_mtx_desc), - decltype(c_k0k2_n1n2_thread_mtx_desc), + decltype(c_k0k1_n1n2_thread_mtx_desc), GemmMPerThreadSubC, GemmNPerThreadSubC, GemmMLevel0Cluster, @@ -327,10 +327,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer __shared__ Float p_wei_block_double[2 * wei_block_space]; // register allocation for output - AccDataType p_out_thread[c_k0k2_n1n2_thread_mtx_desc.GetElementSpace()]; + AccDataType p_out_thread[c_k0k1_n1n2_thread_mtx_desc.GetElementSpace()]; // zero out threadwise output - threadwise_matrix_set_zero(c_k0k2_n1n2_thread_mtx_desc, p_out_thread); + threadwise_matrix_set_zero(c_k0k1_n1n2_thread_mtx_desc, p_out_thread); // LDS double buffer: preload data into LDS { diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index 8939ae337..d31b3902d 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -179,7 +179,7 @@ struct BlockwiseGenericTensorSliceCopy_v4 constexpr auto generic_address_space = integral_constant{}; - Rnun(p_block_src, p_block_dst, generic_address_space, generic_address_space); + Run(p_block_src, p_block_dst, generic_address_space, generic_address_space); } template diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp index 2272ab017..c434e82f0 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp @@ -564,7 +564,7 @@ struct BlockwiseGenericTensorSliceCopy_v2_deprecated constexpr auto generic_address_space = integral_constant{}; - Rnun(p_block_src, p_block_dst, generic_address_space, generic_address_space); + Run(p_block_src, p_block_dst, generic_address_space, generic_address_space); } template