From 4b570a7e05fe3456ddbbb3270a244b3f55225363 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 1 Oct 2019 12:20:23 -0500 Subject: [PATCH 01/20] refactor --- ..._v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 16 +++++++-------- ...chw_kcyx_nkhw_padded_lds_double_buffer.hpp | 20 +++++++++---------- ..._v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp | 18 ++++++++--------- ...chw_kcyx_nkhw_padded_lds_double_buffer.hpp | 20 +++++++++---------- .../blockwise_generic_tensor_slice_copy.hpp | 16 +++++++-------- ...e_generic_tensor_slice_copy_deprecated.hpp | 16 +++++++-------- .../threadwise_generic_tensor_slice_copy.hpp | 20 +++++++++---------- ...e_generic_tensor_slice_copy_deprecated.hpp | 8 ++++---- .../include/utility/config_amd.hpp.in | 6 +++--- .../include/utility/config_nvidia.hpp.in | 6 +++--- driver/src/driver.cpp | 6 +++--- 11 files changed, 76 insertions(+), 76 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp index 204b7ab86..ea1412064 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -265,9 +265,9 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer // LDS double buffer: preload data into LDS { - blockwise_in_copy.template Run( - p_in_global, p_in_block_double); - blockwise_wei_copy.template Run( + blockwise_in_copy.template Run(p_in_global, + p_in_block_double); + blockwise_wei_copy.template Run( p_wei_global, p_wei_block_double); } @@ -300,10 +300,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer // LDS doubel buffer: load next data from device mem blockwise_in_copy - .template RunLoadThreadBuffer( + .template RunLoadThreadBuffer( p_in_global, p_in_thread_buffer); blockwise_wei_copy - .template RunLoadThreadBuffer( + .template RunLoadThreadBuffer( p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data @@ -327,9 +327,9 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( + blockwise_in_copy.template RunLoadThreadBuffer( p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( + blockwise_wei_copy.template RunLoadThreadBuffer( p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data @@ -398,7 +398,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer 0, b_thread_data_on_global, 0}) - .template Run( + .template Run( p_out_thread, p_out_global); } } diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp index faf876450..a5a753158 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp @@ -281,9 +281,9 @@ struct 
GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf // LDS double buffer: preload data into LDS { - blockwise_in_copy.template Run( - p_in_global, p_in_block_double); - blockwise_wei_copy.template Run( + blockwise_in_copy.template Run(p_in_global, + p_in_block_double); + blockwise_wei_copy.template Run( p_wei_global, p_wei_block_double); } @@ -316,10 +316,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf // LDS doubel buffer: load next data from device mem blockwise_in_copy - .template RunLoadThreadBuffer( + .template RunLoadThreadBuffer( p_in_global, p_in_thread_buffer); blockwise_wei_copy - .template RunLoadThreadBuffer( + .template RunLoadThreadBuffer( p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data @@ -343,9 +343,9 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( + blockwise_in_copy.template RunLoadThreadBuffer( p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( + blockwise_wei_copy.template RunLoadThreadBuffer( p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data @@ -427,12 +427,12 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf b_thread_data_on_global, 0}) #if 1 - .template Run + .template Run #else // tweaking .template Run_optimized_dst_address_calculation + AddressSpace_t::generic, + AddressSpace_t::global> #endif (p_out_thread, p_out_global); } diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp index bc9a7c8be..55ca61926 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -251,10 +251,10 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer // LDS double buffer: preload data into LDS { - blockwise_in_copy.template Run(p_in_global, - p_in_block_double); - blockwise_wei_copy.template Run(p_wei_global, - p_wei_block_double); + blockwise_in_copy.template Run(p_in_global, + p_in_block_double); + blockwise_wei_copy.template Run(p_wei_global, + p_wei_block_double); } // LDS double buffer: main body @@ -285,9 +285,9 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( + blockwise_in_copy.template RunLoadThreadBuffer( p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( + blockwise_wei_copy.template RunLoadThreadBuffer( p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data @@ -311,9 +311,9 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( + blockwise_in_copy.template RunLoadThreadBuffer( p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( + blockwise_wei_copy.template RunLoadThreadBuffer( p_wei_global, p_wei_thread_buffer); // LDS double 
buffer: GEMM on current data @@ -391,7 +391,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat) { threadwise_out_copy - .template Run( + .template Run( p_out_thread, p_out_global); threadwise_out_copy.MoveSrcSliceWindow(Sequence<0, 0, GemmNPerThreadSubC>{}, True); diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp index bee553f62..d39e11de2 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp @@ -255,9 +255,9 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf // LDS double buffer: preload data into LDS { - blockwise_in_copy.template Run( - p_in_global, p_in_block_double); - blockwise_wei_copy.template Run( + blockwise_in_copy.template Run(p_in_global, + p_in_block_double); + blockwise_wei_copy.template Run( p_wei_global, p_wei_block_double); } @@ -290,10 +290,10 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf // LDS doubel buffer: load next data from device mem blockwise_in_copy - .template RunLoadThreadBuffer( + .template RunLoadThreadBuffer( p_in_global, p_in_thread_buffer); blockwise_wei_copy - .template RunLoadThreadBuffer( + .template RunLoadThreadBuffer( p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data @@ -317,9 +317,9 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( + blockwise_in_copy.template RunLoadThreadBuffer( p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( + blockwise_wei_copy.template RunLoadThreadBuffer( p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data @@ -390,12 +390,12 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf b_thread_data_on_global / B1, b_thread_data_on_global % B1}) #if 1 - .template Run + .template Run #else // tweaking .template Run_optimized_dst_address_calculation + AddressSpace_t::generic, + AddressSpace_t::global> #endif (p_out_thread, p_out_global); } diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index 5e4ee81d2..69e98d4c8 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -68,8 +68,8 @@ struct BlockwiseGenericTensorSliceCopy_v4 template + AddressSpace_t BlockSrcAddressSpace = AddressSpace_t::generic, + AddressSpace_t ThreadBufferAddressSpace = AddressSpace_t::generic> __device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src, ThreadBufferData* p_thread_buffer) const { @@ -89,8 +89,8 @@ struct BlockwiseGenericTensorSliceCopy_v4 template + AddressSpace_t ThreadBufferAddressSpace = AddressSpace_t::generic, + AddressSpace_t BlockDstAddressSpace = AddressSpace_t::generic> __device__ void RunStoreThreadBuffer(const 
ThreadBufferData* p_thread_buffer, BlockDstData* p_block_dst) const { @@ -110,8 +110,8 @@ struct BlockwiseGenericTensorSliceCopy_v4 template + AddressSpace_t BlockSrcAddressSpace = AddressSpace_t::generic, + AddressSpace_t BlockDstAddressSpace = AddressSpace_t::generic> __device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst) const { BlockSrcData p_thread_buffer[GetThreadBufferSize()]; @@ -119,12 +119,12 @@ struct BlockwiseGenericTensorSliceCopy_v4 RunLoadThreadBuffer(p_block_src, p_thread_buffer); + AddressSpace_t::generic>(p_block_src, p_thread_buffer); // if there is type conversion, it's done during store RunStoreThreadBuffer(p_thread_buffer, p_block_dst); } diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp index 9776b5413..881a88771 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp @@ -480,8 +480,8 @@ struct BlockwiseGenericTensorSliceCopy_v2 template + AddressSpace_t BlockSrcAddressSpace = AddressSpace_t::generic, + AddressSpace_t ThreadBufferAddressSpace = AddressSpace_t::generic> __device__ void RunLoadThreadBuffer(const SrcData* p_block_src, DstData* p_thread_buffer) const { mThreadwiseLoad @@ -491,8 +491,8 @@ struct BlockwiseGenericTensorSliceCopy_v2 template + AddressSpace_t ThreadBufferAddressSpace = AddressSpace_t::generic, + AddressSpace_t BlockDstAddressSpace = AddressSpace_t::generic> __device__ void RunStoreThreadBuffer(const SrcData* p_thread_buffer, DstData* p_block_dst) const { mThreadwiseStore @@ -502,17 +502,17 @@ struct BlockwiseGenericTensorSliceCopy_v2 template + AddressSpace_t BlockSrcAddressSpace = AddressSpace_t::generic, + AddressSpace_t BlockDstAddressSpace = AddressSpace_t::generic> __device__ void Run(const SrcData* p_block_src, DstData* p_block_dst) const { SrcData p_thread_buffer[GetThreadBufferSize()]; - RunLoadThreadBuffer( + RunLoadThreadBuffer( p_block_src, p_thread_buffer); // if there is type conversion, it's done during store - RunStoreThreadBuffer( + RunStoreThreadBuffer( p_thread_buffer, p_block_dst); } diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index c79089d31..0bd147d1b 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -76,8 +76,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // Will do padding check on dst data: No write if dst data is in paddin area. 
template + AddressSpace_t SrcAddressSpace = AddressSpace_t::generic, + AddressSpace_t DstAddressSpace = AddressSpace_t::generic> __device__ void Run(const SrcData* p_src, DstData* p_dst) const { using src_vector_t = typename vector_type::MemoryType; @@ -126,7 +126,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // the same padding situation if(src_coord.IsUpperIndexMappedToValidOffset()) { - static_if{}([&](auto) { + static_if{}([&](auto) { #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE *reinterpret_cast(&p_src_long_vector[buffer_offset]) = __buffer_load( @@ -167,7 +167,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // the same padding situation if(dst_coord.IsUpperIndexMappedToValidOffset()) { - static_if{}([&](auto) { + static_if{}([&](auto) { #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE __buffer_store( *reinterpret_cast(&p_dst_long_vector[buffer_offset]), @@ -204,8 +204,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // This version is optimized for address calculation of src tensor template + AddressSpace_t SrcAddressSpace = AddressSpace_t::generic, + AddressSpace_t DstAddressSpace = AddressSpace_t::generic> __device__ void Run_optimized_src_address_calculation(const SrcData* p_src, DstData* p_dst) const { @@ -302,7 +302,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // the src vector has the same padding situation if(src_coord.IsUpperIndexMappedToValidOffset()) { - static_if{}([&](auto) { + static_if{}([&](auto) { #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE *reinterpret_cast(&p_src_long_vector[buffer_offset]) = __buffer_load( @@ -362,8 +362,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // This version is optimized for address calculation of dst tensor template + AddressSpace_t SrcAddressSpace = AddressSpace_t::generic, + AddressSpace_t DstAddressSpace = AddressSpace_t::generic> __device__ void Run_optimized_dst_address_calculation(const SrcData* p_src, DstData* p_dst) const { @@ -491,7 +491,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // the dst vector has the same padding situation if(dst_coord.IsUpperIndexMappedToValidOffset()) { - static_if{}([&](auto) { + static_if{}([&](auto) { #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE __buffer_store( *reinterpret_cast(&p_dst_long_vector[buffer_offset]), diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp index c271c6553..78684abe9 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp @@ -539,8 +539,8 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 template + AddressSpace_t SrcAddressSpace = AddressSpace_t::generic, + AddressSpace_t DstAddressSpace = AddressSpace_t::generic> __device__ void Run(const SrcData* p_src, DstData* p_dst) const { constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{}); @@ -613,7 +613,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 // 2. src_normal_offset must be calculatd at compile time (guaranteed by // algorithm) // 3. 
src_merged_offset can be runtime value (no assumption imposed) - static_if{}([&](auto) { + static_if{}([&](auto) { #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE vector_data = __buffer_load( p_src, src_merged_offset, src_normal_offset); @@ -722,7 +722,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 // 2. dst_normal_offset must be calculatd at compile time (guaranteed by // algorithm) // 3. dst_merged_offset can be runtime value (no assumption imposed) - static_if{}([&](auto) { + static_if{}([&](auto) { #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE __buffer_store( vector_data, p_dst, dst_merged_offset, dst_normal_offset); diff --git a/composable_kernel/include/utility/config_amd.hpp.in b/composable_kernel/include/utility/config_amd.hpp.in index 437ed3ee8..e603ffcf0 100644 --- a/composable_kernel/include/utility/config_amd.hpp.in +++ b/composable_kernel/include/utility/config_amd.hpp.in @@ -16,10 +16,10 @@ namespace ck { -enum address_space_t +enum AddressSpace_t { - generic = 0, - global = 3 + generic, + global }; #if CK_UNSIGNED_INDEX_TYPE diff --git a/composable_kernel/include/utility/config_nvidia.hpp.in b/composable_kernel/include/utility/config_nvidia.hpp.in index 9afce0298..67cd93136 100644 --- a/composable_kernel/include/utility/config_nvidia.hpp.in +++ b/composable_kernel/include/utility/config_nvidia.hpp.in @@ -18,10 +18,10 @@ namespace ck { -enum address_space_t +enum AddressSpace_t { - generic = 0, - global = generic + generic, + global = generic }; #if CK_UNSIGNED_INDEX_TYPE diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index ab5b8826a..4319c4f7d 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -295,7 +295,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>; -#elif 1 +#elif 0 // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81% constexpr index_t N = 128; @@ -341,7 +341,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<3, 0>; using RightPads = Sequence<3, 0>; -#elif 0 +#elif 1 // 1x7 filter, 0x3 pad, 17x17 input constexpr index_t N = 128; constexpr index_t C = 128; @@ -438,7 +438,7 @@ int main(int argc, char* argv[]) #elif 0 device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw( (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); -#elif 1 +#elif 0 device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc, in_nchw, wei_kcyx_desc, From 6559b0c0ec41ad90fbb0e6358e7a133ff3b8f630 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Wed, 2 Oct 2019 23:47:17 -0500 Subject: [PATCH 02/20] refactored fp32 blockwise and threadwise gemm --- .../ConstantMatrixDescriptor.hpp | 5 + .../tensor_operation/blockwise_gemm.hpp | 406 +++++------ .../tensor_operation/threadwise_gemm.hpp | 181 +++-- .../include/utility/amd_inline_asm.hpp | 678 +----------------- .../include/utility/bfloat16_dev.hpp | 125 ++++ .../include/utility/config_amd.hpp.in | 56 +- composable_kernel/include/utility/math.hpp | 62 ++ 7 files changed, 555 insertions(+), 958 deletions(-) create mode 100644 composable_kernel/include/utility/bfloat16_dev.hpp diff --git a/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp b/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp index f2f842e11..ada40e8ba 100644 --- a/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp +++ b/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp 
@@ -32,6 +32,11 @@ struct ConstantMatrixDescriptor return irow * RowStride_ + icol; } + __host__ __device__ static index_t CalculateOffset(index_t irow, index_t icol) + { + return GetOffsetFromMultiIndex(irow, icol); + } + template __host__ __device__ static constexpr auto MakeSubMatrixDescriptor(Number, Number) diff --git a/composable_kernel/include/tensor_operation/blockwise_gemm.hpp b/composable_kernel/include/tensor_operation/blockwise_gemm.hpp index 819ecf0c4..71245a7a9 100644 --- a/composable_kernel/include/tensor_operation/blockwise_gemm.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_gemm.hpp @@ -5,19 +5,15 @@ #include "ConstantMatrixDescriptor.hpp" #include "threadwise_gemm.hpp" -#ifndef CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM -#define CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM 1 -#endif - namespace ck { // if following number are power of 2, index calculation shall be greatly reduced: // MPerThreadSubC, NPerThreadSubC, MLevel0ThreadCluster, NLevel0ThreadCluster, // MLevel1ThreadCluster, NLevel1ThreadCluster template - __device__ void Run_amd_asm(const FloatA* __restrict__ p_a_block, - const FloatB* __restrict__ p_b_block, - FloatC* __restrict__ p_c_thread) const + template + __device__ void + Run_naive(const FloatA* p_a_block, const FloatB* p_b_block, FloatC* p_c_thread) const { + constexpr auto True = integral_constant{}; + constexpr auto False = integral_constant{}; + constexpr auto a_block_mtx = BlockMatrixA{}; constexpr auto b_block_mtx = BlockMatrixB{}; constexpr auto c_thread_mtx = ThreadMatrixC{}; - constexpr index_t M = a_block_mtx.NCol(); - constexpr index_t N = b_block_mtx.NCol(); constexpr index_t K = a_block_mtx.NRow(); constexpr index_t MPerThread = c_thread_mtx.NRow(); constexpr index_t NPerThread = c_thread_mtx.NCol(); - // thread A, B for GEMM - constexpr auto a_thread_mtx = - make_ConstantMatrixDescriptor_packed(Number{}, Number{}); - - constexpr auto b_thread_mtx = - make_ConstantMatrixDescriptor_packed(Number{}, Number{}); - - FloatA p_a_thread[a_thread_mtx.GetElementSpace()]; - FloatB p_b_thread[b_thread_mtx.GetElementSpace()]; - constexpr index_t MPerLevel1Cluster = MPerThreadSubC * MLevel0ThreadCluster * MLevel1ThreadCluster; constexpr index_t NPerLevel1Cluster = NPerThreadSubC * NLevel0ThreadCluster * NLevel1ThreadCluster; - // assertion for inline asm - static_assert(is_same{} && is_same{} && - is_same{}, - "Run_amd_asm only deal with float"); - - static_assert(MPerThreadSubC == 4 && NPerThreadSubC == 4 && KPerThreadLoop == 1 && - MPerThread == 8 && NPerThread == 8, - "Run_amd_asm cannot deal with this GEMM shape yet"); - - static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_amd_asm only do float4 read"); - - using Float4 = vector_type::MemoryType; - - Float4* reg_a = reinterpret_cast(p_a_thread); - Float4* reg_b = reinterpret_cast(p_b_thread); - Float4* reg_c = reinterpret_cast(p_c_thread); - - reg_a[0] = *reinterpret_cast(&p_a_block[mMyThreadOffsetA]); - reg_b[0] = *reinterpret_cast(&p_b_block[mMyThreadOffsetB]); - reg_b[1] = - *reinterpret_cast(&p_b_block[mMyThreadOffsetB + NPerLevel1Cluster]); - reg_a[1] = - *reinterpret_cast(&p_a_block[mMyThreadOffsetA + MPerLevel1Cluster]); - outerProduct4x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2], reg_c[4], reg_c[6]); - outerProduct4x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3], reg_c[5], reg_c[7]); -#pragma unroll - for(index_t k = 1; k < K; ++k) - { - reg_a[0] = *reinterpret_cast(&p_a_block[mMyThreadOffsetA + k * M]); - outerProduct4x4(reg_a[1], reg_b[0], reg_c[8], reg_c[10], reg_c[12], reg_c[14]); - 
reg_b[0] = *reinterpret_cast(&p_b_block[mMyThreadOffsetB + k * N]); - outerProduct4x4(reg_a[1], reg_b[1], reg_c[9], reg_c[11], reg_c[13], reg_c[15]); - reg_b[1] = *reinterpret_cast( - &p_b_block[mMyThreadOffsetB + k * N + NPerLevel1Cluster]); - reg_a[1] = *reinterpret_cast( - &p_a_block[mMyThreadOffsetA + k * M + MPerLevel1Cluster]); - outerProduct4x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2], reg_c[4], reg_c[6]); - outerProduct4x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3], reg_c[5], reg_c[7]); - } - outerProduct4x4(reg_a[1], reg_b[0], reg_c[8], reg_c[10], reg_c[12], reg_c[14]); - outerProduct4x4(reg_a[1], reg_b[1], reg_c[9], reg_c[11], reg_c[13], reg_c[15]); - } - - __device__ void Run_amd_asm_v2(const float* __restrict__ p_a_block, - const float* __restrict__ p_b_block, - float* __restrict__ p_c_thread) const - { - constexpr auto a_block_mtx = BlockMatrixA{}; - constexpr auto b_block_mtx = BlockMatrixB{}; - constexpr auto c_thread_mtx = ThreadMatrixC{}; - - constexpr index_t M = a_block_mtx.NCol(); - constexpr index_t N = b_block_mtx.NCol(); - constexpr index_t K = a_block_mtx.NRow(); - - constexpr index_t MPerThread = c_thread_mtx.NRow(); - constexpr index_t NPerThread = c_thread_mtx.NCol(); + constexpr index_t MRepeat = MPerThread / MPerThreadSubC; + constexpr index_t NRepeat = NPerThread / NPerThreadSubC; // thread A, B for GEMM constexpr auto a_thread_mtx = @@ -214,110 +144,65 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 constexpr auto b_thread_mtx = make_ConstantMatrixDescriptor_packed(Number{}, Number{}); - float p_a_thread[a_thread_mtx.GetElementSpace()]; - float p_b_thread[b_thread_mtx.GetElementSpace()]; - - constexpr index_t MThreadCluster = MLevel0ThreadCluster * MLevel1ThreadCluster; - constexpr index_t NThreadCluster = NLevel0ThreadCluster * NLevel1ThreadCluster; - - constexpr index_t MDataCluster = M / MPerThreadSubC; - constexpr index_t NDataCluster = N / NPerThreadSubC; - - constexpr index_t MRepeat = MDataCluster / MThreadCluster; - constexpr index_t NRepeat = NDataCluster / NThreadCluster; + // thread A-sub, B-sub for copy + constexpr auto a_thread_sub_mtx = make_ConstantMatrixDescriptor( + Number{}, Number{}, Number{}); - // assertion for inline asm - static_assert((MPerThreadSubC == 4 && NPerThreadSubC == 4 && MRepeat == 2 && NRepeat == 2 && - KPerThreadLoop == 1) || - (MPerThreadSubC == 2 && NPerThreadSubC == 4 && MRepeat == 2 && - NRepeat == 2 && KPerThreadLoop == 1), - "Run_amd_asm cannot deal with this GEMM shape yet"); + constexpr auto b_thread_sub_mtx = make_ConstantMatrixDescriptor( + Number{}, Number{}, Number{}); - static_assert(DataPerReadA == MPerThreadSubC && DataPerReadB == NPerThreadSubC, - "wrong! 
Run_amd_asm doesn't support this config"); + FloatA p_a_thread[a_thread_mtx.GetElementSpace()]; + FloatB p_b_thread[b_thread_mtx.GetElementSpace()]; - if(MPerThreadSubC == 4 && NPerThreadSubC == 4 && MRepeat == 2 && NRepeat == 2 && - KPerThreadLoop == 1) + constexpr auto a_thread_copy = ThreadwiseMatrixSliceCopy{}; + + constexpr auto b_thread_copy = ThreadwiseMatrixSliceCopy{}; + + constexpr auto threadwise_gemm = + ThreadwiseGemmTransANormalBNormalC{}; +#pragma unroll + // loop over k + for(index_t k_begin = 0; k_begin < K; k_begin += KPerThreadLoop) { - using float4_type = vector_type::MemoryType; - - float4_type* reg_a = reinterpret_cast(p_a_thread); - float4_type* reg_b = reinterpret_cast(p_b_thread); - float4_type* reg_c = reinterpret_cast(p_c_thread); - - const float4_type* p_a = - reinterpret_cast(&p_a_block[mMyThreadOffsetA]); - const float4_type* p_b = - reinterpret_cast(&p_b_block[mMyThreadOffsetB]); - - reg_a[0] = p_a[0]; - reg_b[0] = p_b[0]; - reg_b[1] = p_b[NThreadCluster]; - reg_a[1] = p_a[MThreadCluster]; - outerProduct4x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2], reg_c[4], reg_c[6]); - outerProduct4x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3], reg_c[5], reg_c[7]); #pragma unroll - for(index_t k = 1; k < K; ++k) + // read A + for(index_t m_repeat = 0; m_repeat < MRepeat; ++m_repeat) { - reg_a[0] = p_a[k * MDataCluster]; - outerProduct4x4(reg_a[1], reg_b[0], reg_c[8], reg_c[10], reg_c[12], reg_c[14]); - reg_b[0] = p_b[k * NDataCluster]; - outerProduct4x4(reg_a[1], reg_b[1], reg_c[9], reg_c[11], reg_c[13], reg_c[15]); - reg_b[1] = p_b[k * NDataCluster + NThreadCluster]; - reg_a[1] = p_a[k * MDataCluster + MThreadCluster]; - outerProduct4x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2], reg_c[4], reg_c[6]); - outerProduct4x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3], reg_c[5], reg_c[7]); + a_thread_copy.Run( + p_a_block + a_block_mtx.CalculateOffset(k_begin, m_repeat * MPerLevel1Cluster) + + mMyThreadOffsetA, + p_a_thread + a_thread_mtx.CalculateOffset(0, m_repeat * MPerThreadSubC)); } - outerProduct4x4(reg_a[1], reg_b[0], reg_c[8], reg_c[10], reg_c[12], reg_c[14]); - outerProduct4x4(reg_a[1], reg_b[1], reg_c[9], reg_c[11], reg_c[13], reg_c[15]); - } - else if(MPerThreadSubC == 2 && NPerThreadSubC == 4 && MRepeat == 2 && NRepeat == 2 && - KPerThreadLoop == 1) - { - using float2_type = vector_type::MemoryType; - using float4_type = vector_type::MemoryType; - - float2_type* reg_a = reinterpret_cast(p_a_thread); - float4_type* reg_b = reinterpret_cast(p_b_thread); - float4_type* reg_c = reinterpret_cast(p_c_thread); - - const float2_type* p_a = - reinterpret_cast(&p_a_block[mMyThreadOffsetA]); - const float4_type* p_b = - reinterpret_cast(&p_b_block[mMyThreadOffsetB]); - - reg_a[0] = p_a[0]; - reg_b[0] = p_b[0]; - reg_b[1] = p_b[NThreadCluster]; - reg_a[1] = p_a[MThreadCluster]; - outerProduct2x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2]); - outerProduct2x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3]); + #pragma unroll - for(index_t k = 1; k < K; ++k) + // read B + for(index_t n_repeat = 0; n_repeat < NRepeat; ++n_repeat) { - reg_a[0] = p_a[k * MDataCluster]; - outerProduct2x4(reg_a[1], reg_b[0], reg_c[4], reg_c[6]); - reg_b[0] = p_b[k * NDataCluster]; - outerProduct2x4(reg_a[1], reg_b[1], reg_c[5], reg_c[7]); - reg_b[1] = p_b[k * NDataCluster + NThreadCluster]; - reg_a[1] = p_a[k * MDataCluster + MThreadCluster]; - outerProduct2x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2]); - outerProduct2x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3]); + b_thread_copy.Run( + p_b_block + b_block_mtx.CalculateOffset(k_begin, 
n_repeat * NPerLevel1Cluster) + + mMyThreadOffsetB, + p_b_thread + b_thread_mtx.CalculateOffset(0, n_repeat * NPerThreadSubC)); } - outerProduct2x4(reg_a[1], reg_b[0], reg_c[4], reg_c[6]); - outerProduct2x4(reg_a[1], reg_b[1], reg_c[5], reg_c[7]); + + // C += A * B + threadwise_gemm.Run(p_a_thread, p_b_thread, p_c_thread); } } -#endif - template - __device__ void Run_source(const FloatA* const __restrict__ p_a_block, - const FloatB* const __restrict__ p_b_block, - FloatC* const __restrict__ p_c_thread) const + template + __device__ void + Run_pipelined_2x2(const FloatA* p_a_block, const FloatB* p_b_block, FloatC* p_c_thread) const { - constexpr auto True = integral_constant{}; - constexpr auto False = integral_constant{}; - constexpr auto a_block_mtx = BlockMatrixA{}; constexpr auto b_block_mtx = BlockMatrixB{}; constexpr auto c_thread_mtx = ThreadMatrixC{}; @@ -327,88 +212,143 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 constexpr index_t MPerThread = c_thread_mtx.NRow(); constexpr index_t NPerThread = c_thread_mtx.NCol(); - // thread A, B for GEMM + constexpr index_t MPerLevel1Cluster = + MPerThreadSubC * MLevel0ThreadCluster * MLevel1ThreadCluster; + constexpr index_t NPerLevel1Cluster = + NPerThreadSubC * NLevel0ThreadCluster * NLevel1ThreadCluster; + + constexpr index_t MRepeat = MPerThread / MPerThreadSubC; + constexpr index_t NRepeat = NPerThread / NPerThreadSubC; + + static_assert(MRepeat == 2 && NRepeat == 2, + "wrong! inline asm cannot deal with this GEMM config yet"); + + // thread A, B constexpr auto a_thread_mtx = make_ConstantMatrixDescriptor_packed(Number{}, Number{}); - constexpr auto b_thread_mtx = make_ConstantMatrixDescriptor_packed(Number{}, Number{}); - // thread A-sub, B-sub for copy - constexpr auto a_thread_sub_mtx = make_ConstantMatrixDescriptor( - Number{}, Number{}, Number{}); + // thread A-sub, B-sub + constexpr auto a_thread_sub_mtx = a_thread_mtx.MakeSubMatrixDescriptor( + Number{}, Number{}); + constexpr auto b_thread_sub_mtx = b_thread_mtx.MakeSubMatrixDescriptor( + Number{}, Number{}); - constexpr auto b_thread_sub_mtx = make_ConstantMatrixDescriptor( - Number{}, Number{}, Number{}); + // thread C-sub + constexpr auto c_thread_sub_mtx = ThreadMatrixC::MakeSubMatrixDescriptor( + Number{}, Number{}); FloatA p_a_thread[a_thread_mtx.GetElementSpace()]; FloatB p_b_thread[b_thread_mtx.GetElementSpace()]; - constexpr index_t MPerLevel1Cluster = - MPerThreadSubC * MLevel0ThreadCluster * MLevel1ThreadCluster; - constexpr index_t NPerLevel1Cluster = - NPerThreadSubC * NLevel0ThreadCluster * NLevel1ThreadCluster; + constexpr auto a_thread_copy = ThreadwiseMatrixSliceCopy{}; - constexpr index_t MRepeat = MPerThread / MPerThreadSubC; - constexpr index_t NRepeat = NPerThread / NPerThreadSubC; + constexpr auto b_thread_copy = ThreadwiseMatrixSliceCopy{}; -#pragma unroll - // loop over k - for(index_t k_begin = 0; k_begin < K; k_begin += KPerThreadLoop) - { -#pragma unroll - // copy A-sub to form A - for(index_t m_repeat = 0; m_repeat < MRepeat; ++m_repeat) - { - threadwise_matrix_copy( - a_block_mtx, - p_a_block + - a_block_mtx.GetOffsetFromMultiIndex(k_begin, m_repeat * MPerLevel1Cluster) + - mMyThreadOffsetA, - a_thread_mtx, - p_a_thread + a_thread_mtx.GetOffsetFromMultiIndex(0, m_repeat * MPerThreadSubC), - a_thread_sub_mtx.GetLengths(), - Number{}); - } + constexpr auto threadwise_gemm = + ThreadwiseGemmTransANormalBNormalC{}; + + const FloatA* p_a_block_off = p_a_block + mMyThreadOffsetA; + const FloatB* p_b_block_off = p_b_block + 
mMyThreadOffsetB; + + // read A_sub_0 + a_thread_copy.Run(p_a_block_off, p_a_thread); + + // read B_sub_0 + b_thread_copy.Run(p_b_block_off, p_b_thread); + + // read B_sub_1 + b_thread_copy.Run(p_b_block_off + b_block_mtx.CalculateOffset(0, NPerLevel1Cluster), + p_b_thread + b_thread_mtx.CalculateOffset(0, NPerThreadSubC)); + + // read A_sub_1 + a_thread_copy.Run(p_a_block_off + a_block_mtx.CalculateOffset(0, MPerLevel1Cluster), + p_a_thread + a_thread_mtx.CalculateOffset(0, MPerThreadSubC)); + + // C_sub_00 += transpose(A_sub_0) * B_sub_0 + threadwise_gemm.Run(p_a_thread, p_b_thread, p_c_thread); + + // C_sub_01 += transpose(A_sub_0) * B_sub_1 + threadwise_gemm.Run(p_a_thread, + p_b_thread + b_thread_mtx.CalculateOffset(0, NPerThreadSubC), + p_c_thread + ThreadMatrixC::CalculateOffset(0, NPerThreadSubC)); #pragma unroll - // copy B-sub to form B - for(index_t n_repeat = 0; n_repeat < NRepeat; ++n_repeat) - { - threadwise_matrix_copy( - b_block_mtx, - p_b_block + - b_block_mtx.GetOffsetFromMultiIndex(k_begin, n_repeat * NPerLevel1Cluster) + - mMyThreadOffsetB, - b_thread_mtx, - p_b_thread + b_thread_mtx.GetOffsetFromMultiIndex(0, n_repeat * NPerThreadSubC), - b_thread_sub_mtx.GetLengths(), - Number{}); - } + // loop over rest of k + for(index_t k = KPerThreadLoop; k < K; k += KPerThreadLoop) + { + // read A_sub_0 + a_thread_copy.Run(p_a_block_off + a_block_mtx.CalculateOffset(k, 0), p_a_thread); + + // C_sub_10 += transpose(A_sub_1) * B_sub_0 + threadwise_gemm.Run(p_a_thread + a_thread_mtx.CalculateOffset(0, MPerThreadSubC), + p_b_thread, + p_c_thread + ThreadMatrixC::CalculateOffset(MPerThreadSubC, 0)); + + // read B_sub_0 + b_thread_copy.Run(p_b_block_off + b_block_mtx.CalculateOffset(k, 0), p_b_thread); + + // C_sub_11 += transpose(A_sub_1) * B_sub_1 + threadwise_gemm.Run(p_a_thread + a_thread_mtx.CalculateOffset(0, MPerThreadSubC), + p_b_thread + b_thread_mtx.CalculateOffset(0, NPerThreadSubC), + p_c_thread + + ThreadMatrixC::CalculateOffset(MPerThreadSubC, NPerThreadSubC)); + + // read B_sub_1 + b_thread_copy.Run(p_b_block_off + b_block_mtx.CalculateOffset(k, NPerLevel1Cluster), + p_b_thread + b_thread_mtx.CalculateOffset(0, NPerThreadSubC)); + + // read A_sub_1 + a_thread_copy.Run(p_a_block_off + a_block_mtx.CalculateOffset(k, MPerLevel1Cluster), + p_a_thread + a_thread_mtx.CalculateOffset(0, MPerThreadSubC)); + + // C_sub_00 += transpose(A_sub_0) * B_sub_0 + threadwise_gemm.Run(p_a_thread, p_b_thread, p_c_thread); + + // C_sub_01 += transpose(A_sub_0) * B_sub_1 + threadwise_gemm.Run(p_a_thread, + p_b_thread + b_thread_mtx.CalculateOffset(0, NPerThreadSubC), + p_c_thread + ThreadMatrixC::CalculateOffset(0, NPerThreadSubC)); + } - // C = A * B - threadwise_gemm(a_thread_mtx, - True, - p_a_thread, - b_thread_mtx, - False, + // C_sub_10 += transpose(A_sub_1) * B_sub_0 + threadwise_gemm.Run(p_a_thread + a_thread_mtx.CalculateOffset(0, MPerThreadSubC), p_b_thread, - c_thread_mtx, - False, - p_c_thread); - } - } + p_c_thread + ThreadMatrixC::CalculateOffset(MPerThreadSubC, 0)); - template - __device__ void Run(const FloatA* __restrict__ p_a_block, - const FloatB* __restrict__ p_b_block, - FloatC* __restrict__ p_c_thread) const + // C_sub_11 += transpose(A_sub_1) * B_sub_1 + threadwise_gemm.Run(p_a_thread + a_thread_mtx.CalculateOffset(0, MPerThreadSubC), + p_b_thread + b_thread_mtx.CalculateOffset(0, NPerThreadSubC), + p_c_thread + + ThreadMatrixC::CalculateOffset(MPerThreadSubC, NPerThreadSubC)); + } + template + __device__ void Run(const FloatA* p_a_block, const FloatB* p_b_block, 
FloatC* p_c_thread) const { -#if CK_USE_AMD_INLINE_ASM && CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM - Run_amd_asm_v2(p_a_block, p_b_block, p_c_thread); +#if CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE + constexpr index_t MPerThread = ThreadMatrixC::NRow(); + constexpr index_t NPerThread = ThreadMatrixC::NCol(); + + constexpr index_t MRepeat = MPerThread / MPerThreadSubC; + constexpr index_t NRepeat = NPerThread / NPerThreadSubC; + + static_if{}([&](auto) { + Run_pipelined_2x2(p_a_block, p_b_block, p_c_thread); + }).Else([&](auto) { Run_naive(p_a_block, p_b_block, p_c_thread); }); #else - Run_source(p_a_block, p_b_block, p_c_thread); + Run_naive(p_a_block, p_b_block, p_c_thread); #endif } }; diff --git a/composable_kernel/include/tensor_operation/threadwise_gemm.hpp b/composable_kernel/include/tensor_operation/threadwise_gemm.hpp index fb1540a98..7fe069a88 100644 --- a/composable_kernel/include/tensor_operation/threadwise_gemm.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_gemm.hpp @@ -3,102 +3,157 @@ #include "common_header.hpp" #include "ConstantMatrixDescriptor.hpp" +#include "math.hpp" namespace ck { -template +template __device__ void threadwise_matrix_set_zero(Matrix, Float* __restrict__ p_thread) { for(index_t i = 0; i < Matrix::NRow(); ++i) { for(index_t j = 0; j < Matrix::NCol(); ++j) { - const index_t id = Matrix::GetOffsetFromMultiIndex(i, j); + const index_t id = Matrix::CalculateOffset(i, j); p_thread[id] = Float(0); } } } -template -__device__ void threadwise_matrix_copy(SrcMatrix, - const Float* __restrict__ p_src, - DstMatrix, - Float* __restrict__ p_dst, - Sequence, - Number) +template +struct ThreadwiseMatrixSliceCopy { - static_assert(NCol % DataPerRead == 0, "wrong! should be NCol % == DataPerRead == 0"); - - using vector_t = typename vector_type::MemoryType; - - constexpr auto src_mtx = SrcMatrix{}; - constexpr auto dst_mtx = DstMatrix{}; + __device__ constexpr ThreadwiseMatrixSliceCopy() + { + static_assert(SrcMatrix::RowStride() % DataPerAccess == 0 && + DstMatrix::RowStride() % DataPerAccess == 0, + "wrong! wrong alignment"); + static_assert(NSliceCol % DataPerAccess == 0, + "wrong! 
should be NSliceCol % DataPerAccess == 0"); + } - for(index_t i = 0; i < NRow; ++i) + template + __device__ static void Run(const Data* p_src, Data* p_dst) { - for(index_t j = 0; j < NCol; j += DataPerRead) + using vector_t = typename vector_type::MemoryType; + + for(index_t i = 0; i < NSliceRow; ++i) { - const index_t src_index = src_mtx.GetOffsetFromMultiIndex(i, j); - const index_t dst_index = dst_mtx.GetOffsetFromMultiIndex(i, j); + for(index_t j = 0; j < NSliceCol; j += DataPerAccess) + { + const index_t src_index = SrcMatrix::CalculateOffset(i, j); + const index_t dst_index = DstMatrix::CalculateOffset(i, j); - *reinterpret_cast(&p_dst[dst_index]) = - *reinterpret_cast(&p_src[src_index]); + *reinterpret_cast(&p_dst[dst_index]) = + *reinterpret_cast(&p_src[src_index]); + } } } -} +}; -template -__device__ void threadwise_gemm(MatrixA, - integral_constant, - const FloatA* __restrict__ p_a_thread, - MatrixB, - integral_constant, - const FloatB* __restrict__ p_b_thread, - MatrixC, - integral_constant, - FloatC* __restrict__ p_c_thread) +// C += transpose(A) * B +// Element of matrix can be vectorized data +template +struct ThreadwiseGemmTransANormalBNormalC { - static_if{}([&](auto) { - constexpr auto a_mtx = MatrixA{}; - constexpr auto b_mtx = MatrixB{}; - constexpr auto c_mtx = MatrixC{}; + __device__ constexpr ThreadwiseGemmTransANormalBNormalC() + { + static_assert(MatrixA::NRow() == MatrixB::NRow() && MatrixA::NCol() == MatrixC::NRow() && + MatrixB::NCol() == MatrixC::NCol(), + "wrong!"); + } - constexpr index_t M = c_mtx.NRow(); - constexpr index_t N = c_mtx.NCol(); - constexpr index_t K = a_mtx.NRow(); // A is transposed + template + __device__ static void Run_source(const FloatA* p_a, const FloatB* p_b, FloatC* p_c) + { + constexpr index_t M = MatrixC::NRow(); + constexpr index_t N = MatrixC::NCol(); + constexpr index_t K = MatrixA::NRow(); // A is transposed for(index_t k = 0; k < K; ++k) { - for(index_t i = 0; i < M; ++i) + for(index_t m = 0; m < M; ++m) { - for(index_t j = 0; j < N; ++j) + for(index_t n = 0; n < N; ++n) { - const index_t aindex = a_mtx.GetOffsetFromMultiIndex(k, i); // A is transposed - const index_t bindex = b_mtx.GetOffsetFromMultiIndex(k, j); - const index_t cindex = c_mtx.GetOffsetFromMultiIndex(i, j); + const index_t aindex = MatrixA::CalculateOffset(k, m); // A is transposed + const index_t bindex = MatrixB::CalculateOffset(k, n); + const index_t cindex = MatrixC::CalculateOffset(m, n); - p_c_thread[cindex] += p_a_thread[aindex] * p_b_thread[bindex]; + p_c[cindex] += + math::inner_product_with_conversion{}(p_a[aindex], p_b[bindex]); } } } - }).Else([&](auto fwd) { - // not implemented - static_assert(fwd(false), "wrong! support for this config is not implemented"); - }); -} + } + +#if CK_USE_AMD_INLINE_ASM + template + __device__ static void Run_amd_asm(const FloatA* p_a, const FloatB* p_b, FloatC* p_c) + { + constexpr index_t M = MatrixC::NRow(); + constexpr index_t N = MatrixC::NCol(); + constexpr index_t K = MatrixA::NRow(); // A is transposed + + static_assert(N == 4 || N == 2, "wrong! 
not supported by asm yet"); + + for(index_t k = 0; k < K; ++k) + { + for(index_t m = 0; m < M; ++m) + { + const index_t aindex = MatrixA::CalculateOffset(k, m); // A is transposed + + static_if{}([&](auto) { + const index_t bindex_0 = MatrixB::CalculateOffset(k, 0); + const index_t bindex_1 = MatrixB::CalculateOffset(k, 1); + + const index_t cindex_0 = MatrixC::CalculateOffset(m, 0); + const index_t cindex_1 = MatrixC::CalculateOffset(m, 1); + + __outer_product_1x2( + p_a[aindex], p_b[bindex_0], p_b[bindex_1], p_c[cindex_0], p_c[cindex_1]); + }); + + static_if{}([&](auto) { + const index_t bindex_0 = MatrixB::CalculateOffset(k, 0); + const index_t bindex_1 = MatrixB::CalculateOffset(k, 1); + const index_t bindex_2 = MatrixB::CalculateOffset(k, 2); + const index_t bindex_3 = MatrixB::CalculateOffset(k, 3); + + const index_t cindex_0 = MatrixC::CalculateOffset(m, 0); + const index_t cindex_1 = MatrixC::CalculateOffset(m, 1); + const index_t cindex_2 = MatrixC::CalculateOffset(m, 2); + const index_t cindex_3 = MatrixC::CalculateOffset(m, 3); + + __outer_product_1x4(p_a[aindex], + p_b[bindex_0], + p_b[bindex_1], + p_b[bindex_2], + p_b[bindex_3], + p_c[cindex_0], + p_c[cindex_1], + p_c[cindex_2], + p_c[cindex_3]); + }); + } + } + } +#endif + + template + __device__ static void Run(const FloatA* p_a, const FloatB* p_b, FloatC* p_c) + { +#if CK_USE_AMD_INLINE_ASM && CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM + Run_amd_asm(p_a, p_b, p_c); +#else + Run_source(p_a, p_b, p_c); +#endif + } +}; } // namespace ck #endif diff --git a/composable_kernel/include/utility/amd_inline_asm.hpp b/composable_kernel/include/utility/amd_inline_asm.hpp index 0a17b4bd3..2d175852e 100644 --- a/composable_kernel/include/utility/amd_inline_asm.hpp +++ b/composable_kernel/include/utility/amd_inline_asm.hpp @@ -3,82 +3,32 @@ #include "vector_type.hpp" +// disable inline asm due to the compiler issue: SWDEV-202749 +#define WORKAROUND_SWDEV_202749 1 + namespace ck { // cast a pointer of LDS to its address extern "C" __attribute__((address_space(3))) __device__ void* __to_local(void* p); -__device__ void vmcnt(index_t cnt) -{ - if(cnt == 0) - { - asm volatile("\n \ - s_waitcnt vmcnt(0) \n \ - " ::); - } - else if(cnt == 1) - { - asm volatile("\n \ - s_waitcnt vmcnt(1) \n \ - " ::); - } - else if(cnt == 2) - { - asm volatile("\n \ - s_waitcnt vmcnt(2) \n \ - " ::); - } - else if(cnt == 4) - { - asm volatile("\n \ - s_waitcnt vmcnt(2) \n \ - " ::); - } - else - { - assert(false); - } -} - -__device__ void lgkmcnt(index_t cnt) +__device__ void __outer_product_1x2(float a, float b0, float b1, float& c0, float& c1) { - if(cnt == 0) - { - asm volatile("\n \ - s_waitcnt lgkmcnt(0) \n \ - " ::); - } - else if(cnt == 1) - { - asm volatile("\n \ - s_waitcnt lgkmcnt(1) \n \ - " ::); - } - else if(cnt == 2) - { - asm volatile("\n \ - s_waitcnt lgkmcnt(2) \n \ - " ::); - } - else if(cnt == 3) - { - asm volatile("\n \ - s_waitcnt lgkmcnt(3) \n \ - " ::); - } - else if(cnt == 4) - { - asm volatile("\n \ - s_waitcnt lgkmcnt(4) \n \ - " ::); - } - else - { - assert(false); - } +///\to-do: enable the inline asm after the compiler fix +#if WORKAROUND_SWDEV_202749 + c0 += a * b0; + c1 += a * b1; +#else + asm volatile("\n \ + v_mac_f32 %0, %2, %3 \n \ + v_mac_f32 %1, %2, %4 \n \ + " + : "=v"(c0), "=v"(c1) + : "v"(a), "v"(b0), "v"(b1), "0"(c0), "1"(c1)); +#endif } -__device__ void outerProduct1x4(const float* a, const float* b, float* c) +__device__ void __outer_product_1x4( + float a, float b0, float b1, float b2, float b3, float& c0, float& c1, 
float& c2, float& c3) { asm volatile("\n \ v_mac_f32 %0, %4, %5 \n \ @@ -86,596 +36,8 @@ __device__ void outerProduct1x4(const float* a, const float* b, float* c) v_mac_f32 %2, %4, %7 \n \ v_mac_f32 %3, %4, %8 \n \ " - : "=v"(c[0]), "=v"(c[1]), "=v"(c[2]), "=v"(c[3]) - : "v"(a[0]), - "v"(b[0]), - "v"(b[1]), - "v"(b[2]), - "v"(b[3]), - "0"(c[0]), - "1"(c[1]), - "2"(c[2]), - "3"(c[3])); -} - -__device__ void outerProduct1x4(const float& a, - const vector_type::MemoryType& b, - vector_type::MemoryType& c) -{ - outerProduct1x4(&a, reinterpret_cast(&b), reinterpret_cast(&c)); -} - -__device__ void outerProduct2x4(const vector_type::MemoryType& a, - const vector_type::MemoryType& b, - vector_type::MemoryType& c0, - vector_type::MemoryType& c1) -{ - outerProduct1x4(a.x, b, c0); - outerProduct1x4(a.y, b, c1); -} - -__device__ void outerProduct4x4(const vector_type::MemoryType& a, - const vector_type::MemoryType& b, - vector_type::MemoryType& c0, - vector_type::MemoryType& c1, - vector_type::MemoryType& c2, - vector_type::MemoryType& c3) -{ - outerProduct1x4(a.x, b, c0); - outerProduct1x4(a.y, b, c1); - outerProduct1x4(a.z, b, c2); - outerProduct1x4(a.w, b, c3); -} - -__device__ void outerProduct8x8(const vector_type::MemoryType* a, - const vector_type::MemoryType* b, - vector_type::MemoryType* c) -{ - outerProduct4x4(a[0], b[0], c[0], c[2], c[4], c[6]); - outerProduct4x4(a[0], b[1], c[1], c[3], c[5], c[7]); - outerProduct4x4(a[1], b[0], c[8], c[10], c[12], c[14]); - outerProduct4x4(a[1], b[1], c[9], c[11], c[13], c[15]); -} - -__device__ void ds_read_b128(vector_type::MemoryType& r, void* lds, index_t offset = 0) -{ - if(offset == 0) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:0\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 64) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:64\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 128) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:128\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 192) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:192\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 256) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:256\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 320) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:320\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 384) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:384\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 448) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:448\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 512) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:512\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 576) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:576\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 640) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:640\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 704) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:704\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 768) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:768\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 832) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:832\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 896) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:896\n \ - " - : "=v"(r) - : 
"v"(__to_local(lds))); - } - if(offset == 960) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:960\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 1024) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:1024\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 1088) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:1088\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 1152) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:1152\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 1216) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:1216\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 1280) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:1280\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 1344) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:1344\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 1408) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:1408\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 1472) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:1472\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 1536) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:1536\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 1600) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:1600\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 1664) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:1664\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 1728) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:1728\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 1792) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:1792\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 1856) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:1856\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 1920) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:1920\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 1984) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:1984\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 2048) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:2048\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 2112) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:2112\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 2176) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:2176\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 2240) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:2240\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 2304) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:2304\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 2368) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:2368\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 2432) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:2432\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 2496) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:2496\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 2560) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:2560\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 2624) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:2624\n \ - " - : "=v"(r) - : 
"v"(__to_local(lds))); - } - if(offset == 2688) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:2688\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 2752) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:2752\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 2816) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:2816\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 2880) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:2880\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 2944) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:2944\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 3008) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:3008\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 3072) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:3072\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 3136) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:3136\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 3200) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:3200\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 3264) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:3264\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 3328) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:3328\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 3392) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:3392\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 3456) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:3456\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 3520) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:3520\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 3584) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:3584\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 3648) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:3648\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 3712) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:3712\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 3776) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:3776\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 3840) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:3840\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 3904) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:3904\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 3968) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:3968\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 4032) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:4032\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } - if(offset == 4096) - { - asm volatile("\n \ - ds_read_b128 %0, %1 offset:4096\n \ - " - : "=v"(r) - : "v"(__to_local(lds))); - } -} - -__device__ void -ds_write_b128(const vector_type::MemoryType& r, void* lds, index_t offset = 0) -{ - if(offset == 0) - { - asm volatile("\n \ - ds_write_b128 %0, %1 \n \ - " - : - : "v"(__to_local(lds)), "v"(r)); - } - else - { - assert(false); - } + : "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3) + : "v"(a), "v"(b0), "v"(b1), "v"(b2), "v"(b3), "0"(c0), "1"(c1), "2"(c2), "3"(c3)); } } // namespace ck diff --git a/composable_kernel/include/utility/bfloat16_dev.hpp 
b/composable_kernel/include/utility/bfloat16_dev.hpp new file mode 100644 index 000000000..52d00346c --- /dev/null +++ b/composable_kernel/include/utility/bfloat16_dev.hpp @@ -0,0 +1,125 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2019 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef BFLOAT16_DEVICE_HPP +#define BFLOAT16_DEVICE_HPP + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __HIP_PLATFORM_HCC__ +#define EXECUTION_SPECIFIER __device__ +#else +#define EXECUTION_SPECIFIER +#endif // MIOPEN_BACKEND_HIP + +typedef union +{ + uint u32; + ushort2 ushortx2; + +// Composable kernels are written in HIP language. The language doesnt support +// ushort2.hi or ushort2.low. +#ifdef __HIP_PLATFORM_HCC__ + ushort ushortvec[2]; +#endif // MIOPEN_BACKEND_HIP + float f32; +} cvt_bf16_fp32_t; + +EXECUTION_SPECIFIER float bfloat16_to_float(ushort src_val) +{ + cvt_bf16_fp32_t target_val; + +#ifdef __HIP_PLATFORM_HCC__ + target_val.ushortx2 = make_ushort2(0, src_val); +#else + target_val.ushortx2 = (ushort2)(0, src_val); +#endif + + return target_val.f32; +} + +EXECUTION_SPECIFIER ushort float_to_bfloat16(float src_val) +{ + cvt_bf16_fp32_t target_val; + target_val.f32 = src_val; + // BF16 round and NaN preservation code matches + // https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/library/include/rocblas_bfloat16.h + if((~target_val.u32 & 0x7f800000) == 0) // Inf or NaN + { + // When all of the exponent bits are 1, the value is Inf or NaN. + // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero + // mantissa bit. Quiet NaN is indicated by the most significant mantissa + // bit being 1. Signaling NaN is indicated by the most significant + // mantissa bit being 0 but some other bit(s) being 1. If any of the + // lower 16 bits of the mantissa are 1, we set the least significant bit + // of the bfloat16 mantissa, in order to preserve signaling NaN in case + // the bloat16's mantissa bits are all 0. + if((target_val.u32 & 0xffff) != 0) + { + target_val.u32 |= 0x10000; // Preserve signaling NaN + } + } + else + { +#ifdef MIOPEN_USE_RNE_BFLOAT16 +// When the exponent bits are not all 1s, then the value is zero, normal, +// or subnormal. 
We round the bfloat16 mantissa up by adding 0x7FFF, plus +// 1 if the least significant bit of the bfloat16 mantissa is 1 (odd). +// This causes the bfloat16's mantissa to be incremented by 1 if the 16 +// least significant bits of the float mantissa are greater than 0x8000, +// or if they are equal to 0x8000 and the least significant bit of the +// bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when +// the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already +// has the value 0x7f, then incrementing it causes it to become 0x00 and +// the exponent is incremented by one, which is the next higher FP value +// to the unrounded bfloat16 value. When the bfloat16 value is subnormal +// with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up +// to a normal value with an exponent of 0x01 and a mantissa of 0x00. +// When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F, +// incrementing it causes it to become an exponent of 0xFF and a mantissa +// of 0x00, which is Inf, the next higher value to the unrounded value. +#ifdef __HIP_PLATFORM_HCC__ + target_val.u32 += (0x7fff + (target_val.ushortvec[1] & 1)); +#else + target_val.u32 += + (0x7fff + (target_val.ushortx2.hi & 1)); // Round to nearest, round to even +#endif // MIOPEN_BACKEND_HIP +#endif // MIOPEN_USE_RNE_BFLOAT16 + } + +#ifdef __HIP_PLATFORM_HCC__ + return target_val.ushortvec[1]; +#else + return target_val.ushortx2.hi; +#endif // MIOPEN_BACKEND_HIP +} + +#ifdef __cplusplus +} +#endif + +#endif // BFLOAT16_DEVICE_HPP diff --git a/composable_kernel/include/utility/config_amd.hpp.in b/composable_kernel/include/utility/config_amd.hpp.in index e603ffcf0..971d1b35b 100644 --- a/composable_kernel/include/utility/config_amd.hpp.in +++ b/composable_kernel/include/utility/config_amd.hpp.in @@ -3,12 +3,24 @@ #include "hip/hip_runtime.h" #include "hip/hip_fp16.h" +#include "bfloat16_dev.hpp" +// index type: unsigned or signed #define CK_UNSIGNED_INDEX_TYPE 0 + +// device backend #define CK_DEVICE_BACKEND_AMD 1 -#define CK_USE_AMD_INTRINSIC 1 + +// AMD inline asm #define CK_USE_AMD_INLINE_ASM 1 +#define CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM 1 + +// AMD intrinsic +#define CK_USE_AMD_INTRINSIC 1 #define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1 + +// experimental implementation +#define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 1 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0 @@ -32,20 +44,56 @@ using index_t = int32_t; // instruction typedef float float2_t __attribute__((ext_vector_type(2))); typedef float float4_t __attribute__((ext_vector_type(4))); +typedef float float32_t __attribute__((ext_vector_type(32))); typedef int32_t int32x4_t __attribute__((ext_vector_type(4))); +// half +typedef half2 half2_t; + +typedef struct +{ + // TODO: why not use "half scalar[4]"? + half2_t scalar[2]; +} half4_t; + +// bfloat16: use ushort +typedef struct +{ + ushort scalar[2]; +} ushort2_t; + +typedef struct +{ + // TODO: why not use "ushort scalar[4]"? 
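The rounding rule described in the comment above can be checked in isolation. Below is a minimal host-side sketch, not part of this patch (the function name and the standalone harness are assumptions), of the same add-0x7fff-plus-odd-bit trick that float_to_bfloat16 uses:

#include <cstdint>
#include <cstdio>
#include <cstring>

static uint16_t float_to_bf16_rne(float f)
{
    uint32_t u;
    std::memcpy(&u, &f, sizeof(u));
    if((~u & 0x7f800000) == 0) // all exponent bits set: Inf or NaN
    {
        if((u & 0xffff) != 0)
            u |= 0x10000; // keep a nonzero mantissa bit to preserve signaling NaN
    }
    else // zero, normal or subnormal: round to nearest, ties to even
    {
        u += 0x7fff + ((u >> 16) & 1);
    }
    return static_cast<uint16_t>(u >> 16);
}

int main()
{
    // 1.0f + 2^-8 sits exactly halfway between two bf16 values;
    // the tie goes to the even mantissa, so the result stays 1.0f (0x3f80).
    std::printf("0x%04x\n", float_to_bf16_rne(1.00390625f));
    return 0;
}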
+ ushort2_t scalar[2]; +} ushort4_t; + // data type conversion -template +template struct type_convert { - template - __device__ T operator()(const X& x) const + template + __device__ T operator()(X x) const { return static_cast(x); } }; +template <> +template <> +__device__ float type_convert::operator()(ushort x) const +{ + return bfloat16_to_float(x); +} + +template <> +template <> +__device__ ushort type_convert::operator()(float x) const +{ + return float_to_bfloat16(x); +} + } // namespace ck #endif diff --git a/composable_kernel/include/utility/math.hpp b/composable_kernel/include/utility/math.hpp index ba70e7ab2..f6c41cc52 100644 --- a/composable_kernel/include/utility/math.hpp +++ b/composable_kernel/include/utility/math.hpp @@ -117,6 +117,68 @@ struct less __host__ __device__ constexpr bool operator()(T x, T y) const { return x < y; } }; +template +struct inner_product_with_conversion +{ + static constexpr auto convert = type_convert(); + + __device__ T operator()(float a, float b) const { return convert(a) * convert(b); } + + __device__ T operator()(half2_t a, half2_t b) const + { + const half* p_a_half = reinterpret_cast(&a); + const half* p_b_half = reinterpret_cast(&b); + + T acc = 0; + for(index_t v = 0; v < 2; ++v) + { + acc += convert(p_a_half[v]) * convert(p_b_half[v]); + } + + return acc; + } + + __device__ T operator()(half4_t a, half4_t b) const + { + const half* p_a_half = reinterpret_cast(&a); + const half* p_b_half = reinterpret_cast(&b); + + T acc = 0; + for(index_t v = 0; v < 4; ++v) + { + acc += convert(p_a_half[v]) * convert(p_b_half[v]); + } + return acc; + } + + __device__ T operator()(ushort2_t a, ushort2_t b) const + { + const ushort* p_a_bfloat16 = reinterpret_cast(&a); + const ushort* p_b_bfloat16 = reinterpret_cast(&b); + + T acc = 0; + for(index_t v = 0; v < 2; ++v) + { + acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]); + } + + return acc; + } + + __device__ T operator()(ushort4_t a, ushort4_t b) const + { + const ushort* p_a_bfloat16 = reinterpret_cast(&a); + const ushort* p_b_bfloat16 = reinterpret_cast(&b); + + T acc = 0; + for(index_t v = 0; v < 4; ++v) + { + acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]); + } + return acc; + } +}; + } // namespace math } // namspace ck From af1cb272cfb76e075a9ca2d39cda26fb1b936869 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 3 Oct 2019 17:53:41 -0500 Subject: [PATCH 03/20] clean up --- .../include/tensor_operation/blockwise_gemm.hpp | 7 +++---- .../include/tensor_operation/threadwise_gemm.hpp | 2 +- composable_kernel/include/utility/config_amd.hpp.in | 3 ++- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/composable_kernel/include/tensor_operation/blockwise_gemm.hpp b/composable_kernel/include/tensor_operation/blockwise_gemm.hpp index 71245a7a9..cd04c4550 100644 --- a/composable_kernel/include/tensor_operation/blockwise_gemm.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_gemm.hpp @@ -7,7 +7,9 @@ namespace ck { -// if following number are power of 2, index calculation shall be greatly reduced: +// blockwise GEMM: C += transpose(A) * B +// A and B are visable to the whole block, C is distributed among each thread +// If following number are power of 2, index calculation shall be greatly reduced: // MPerThreadSubC, NPerThreadSubC, MLevel0ThreadCluster, NLevel0ThreadCluster, // MLevel1ThreadCluster, NLevel1ThreadCluster template {}; - constexpr auto False = integral_constant{}; - constexpr auto a_block_mtx = BlockMatrixA{}; constexpr auto b_block_mtx = 
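inner_product_with_conversion widens every packed lane to the accumulator type before the multiply-add, so mixed-precision inputs always accumulate in fp32. A self-contained sketch of that behavior for bf16-style inputs follows; the helper name and the hard-coded encodings are illustrative, not taken from the patch:

#include <cstdint>
#include <cstdio>
#include <cstring>

static float bf16_to_float(uint16_t x)
{
    uint32_t u = static_cast<uint32_t>(x) << 16; // bf16 is the high half of an fp32
    float f;
    std::memcpy(&f, &u, sizeof(f));
    return f;
}

int main()
{
    const uint16_t a[4] = {0x3f80, 0x4000, 0x4040, 0x4080}; // 1, 2, 3, 4 in bf16
    const uint16_t b[4] = {0x4080, 0x4040, 0x4000, 0x3f80}; // 4, 3, 2, 1 in bf16

    float acc = 0.0f;
    for(int v = 0; v < 4; ++v)
        acc += bf16_to_float(a[v]) * bf16_to_float(b[v]); // convert first, then fma in fp32

    std::printf("%f\n", acc); // 1*4 + 2*3 + 3*2 + 4*1 = 20
    return 0;
}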
BlockMatrixB{}; constexpr auto c_thread_mtx = ThreadMatrixC{}; diff --git a/composable_kernel/include/tensor_operation/threadwise_gemm.hpp b/composable_kernel/include/tensor_operation/threadwise_gemm.hpp index 7fe069a88..503eb9522 100644 --- a/composable_kernel/include/tensor_operation/threadwise_gemm.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_gemm.hpp @@ -99,7 +99,7 @@ struct ThreadwiseGemmTransANormalBNormalC constexpr index_t N = MatrixC::NCol(); constexpr index_t K = MatrixA::NRow(); // A is transposed - static_assert(N == 4 || N == 2, "wrong! not supported by asm yet"); + static_assert(N == 4 || N == 2, "wrong! this config not supported by asm yet"); for(index_t k = 0; k < K; ++k) { diff --git a/composable_kernel/include/utility/config_amd.hpp.in b/composable_kernel/include/utility/config_amd.hpp.in index 971d1b35b..799d5f8a9 100644 --- a/composable_kernel/include/utility/config_amd.hpp.in +++ b/composable_kernel/include/utility/config_amd.hpp.in @@ -41,7 +41,8 @@ using index_t = int32_t; #endif // For some reason, HIP compiler need this definition to generate optimal load and store -// instruction +// instruction +// float typedef float float2_t __attribute__((ext_vector_type(2))); typedef float float4_t __attribute__((ext_vector_type(4))); typedef float float32_t __attribute__((ext_vector_type(32))); From a0806d0e6b43a24e4847f976ffb5492766980628 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 3 Oct 2019 18:41:41 -0500 Subject: [PATCH 04/20] miopen integration --- ..._v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 24 +- ...chw_kcyx_nkhw_padded_lds_double_buffer.hpp | 28 +- ..._v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp | 20 +- ...chw_kcyx_nkhw_padded_lds_double_buffer.hpp | 28 +- .../tensor_operation/blockwise_gemm.hpp | 7 - .../blockwise_generic_tensor_slice_copy.hpp | 16 +- ...e_generic_tensor_slice_copy_deprecated.hpp | 20 +- .../tensor_operation/threadwise_gemm.hpp | 13 +- .../threadwise_generic_tensor_slice_copy.hpp | 32 +-- ...e_generic_tensor_slice_copy_deprecated.hpp | 261 +----------------- .../include/utility/amd_inline_asm.hpp | 96 ++++++- .../include/utility/config_amd.hpp.in | 64 +++-- .../include/utility/config_nvidia.hpp.in | 4 +- 13 files changed, 222 insertions(+), 391 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp index ea1412064..53366f79d 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -265,10 +265,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer // LDS double buffer: preload data into LDS { - blockwise_in_copy.template Run(p_in_global, - p_in_block_double); - blockwise_wei_copy.template Run( - p_wei_global, p_wei_block_double); + blockwise_in_copy.template Run(p_in_global, + p_in_block_double); + blockwise_wei_copy.template Run(p_wei_global, + p_wei_block_double); } // LDS double buffer: main body @@ -299,12 +299,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy - .template RunLoadThreadBuffer( - p_in_global, p_in_thread_buffer); - blockwise_wei_copy - .template 
RunLoadThreadBuffer( - p_wei_global, p_wei_thread_buffer); + blockwise_in_copy.template RunLoadThreadBuffer( + p_in_global, p_in_thread_buffer); + blockwise_wei_copy.template RunLoadThreadBuffer( + p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread); @@ -327,9 +325,9 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( + blockwise_in_copy.template RunLoadThreadBuffer( p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( + blockwise_wei_copy.template RunLoadThreadBuffer( p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data @@ -398,7 +396,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer 0, b_thread_data_on_global, 0}) - .template Run( + .template Run( p_out_thread, p_out_global); } } diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp index a5a753158..d5d1e496b 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp @@ -281,10 +281,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf // LDS double buffer: preload data into LDS { - blockwise_in_copy.template Run(p_in_global, - p_in_block_double); - blockwise_wei_copy.template Run( - p_wei_global, p_wei_block_double); + blockwise_in_copy.template Run(p_in_global, + p_in_block_double); + blockwise_wei_copy.template Run(p_wei_global, + p_wei_block_double); } // LDS double buffer: main body @@ -315,12 +315,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy - .template RunLoadThreadBuffer( - p_in_global, p_in_thread_buffer); - blockwise_wei_copy - .template RunLoadThreadBuffer( - p_wei_global, p_wei_thread_buffer); + blockwise_in_copy.template RunLoadThreadBuffer( + p_in_global, p_in_thread_buffer); + blockwise_wei_copy.template RunLoadThreadBuffer( + p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread); @@ -343,9 +341,9 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( + blockwise_in_copy.template RunLoadThreadBuffer( p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( + blockwise_wei_copy.template RunLoadThreadBuffer( p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data @@ -427,12 +425,12 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf b_thread_data_on_global, 0}) #if 1 - .template Run + .template Run #else // tweaking .template Run_optimized_dst_address_calculation + AddressSpace::generic, + AddressSpace::global> #endif (p_out_thread, p_out_global); } diff --git 
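The hunks above keep the LDS double-buffering schedule unchanged and only rename the copy entry points. A schematic host-side sketch of that schedule, with placeholder function names rather than the kernel's API: while the GEMM consumes one LDS buffer, the next tile is prefetched into registers and then committed to the other buffer.

#include <cstdio>

static void load_tile_to_registers(int tile) { std::printf("load tile %d -> regs\n", tile); }
static void store_registers_to_lds(int buf)  { std::printf("store regs -> LDS[%d]\n", buf); }
static void gemm_on_lds(int buf)             { std::printf("gemm on LDS[%d]\n", buf); }

int main()
{
    const int num_tiles = 4;

    // preload: tile 0 goes straight into LDS buffer 0
    load_tile_to_registers(0);
    store_registers_to_lds(0);

    for(int t = 1; t < num_tiles; ++t)
    {
        const int cur = (t - 1) % 2; // buffer the GEMM consumes this iteration
        const int nxt = t % 2;       // buffer being refilled for the next one

        load_tile_to_registers(t);   // global -> registers (RunLoadThreadBuffer)
        gemm_on_lds(cur);            // compute on the current buffer
        store_registers_to_lds(nxt); // registers -> the other LDS buffer
        // the real kernel issues __syncthreads() before a buffer is reused
    }

    gemm_on_lds((num_tiles - 1) % 2); // tail: GEMM on the last filled buffer
    return 0;
}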
a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp index 55ca61926..39a28e391 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -251,10 +251,10 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer // LDS double buffer: preload data into LDS { - blockwise_in_copy.template Run(p_in_global, - p_in_block_double); - blockwise_wei_copy.template Run(p_wei_global, - p_wei_block_double); + blockwise_in_copy.template Run(p_in_global, + p_in_block_double); + blockwise_wei_copy.template Run(p_wei_global, + p_wei_block_double); } // LDS double buffer: main body @@ -285,9 +285,9 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( + blockwise_in_copy.template RunLoadThreadBuffer( p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( + blockwise_wei_copy.template RunLoadThreadBuffer( p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data @@ -311,9 +311,9 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( + blockwise_in_copy.template RunLoadThreadBuffer( p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( + blockwise_wei_copy.template RunLoadThreadBuffer( p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data @@ -391,8 +391,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat) { threadwise_out_copy - .template Run( - p_out_thread, p_out_global); + .template Run(p_out_thread, + p_out_global); threadwise_out_copy.MoveSrcSliceWindow(Sequence<0, 0, GemmNPerThreadSubC>{}, True); threadwise_out_copy.MoveDstSliceWindow(Sequence<0, 0, B1>{}, True); diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp index d39e11de2..e93258682 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp @@ -255,10 +255,10 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf // LDS double buffer: preload data into LDS { - blockwise_in_copy.template Run(p_in_global, - p_in_block_double); - blockwise_wei_copy.template Run( - p_wei_global, p_wei_block_double); + blockwise_in_copy.template Run(p_in_global, + p_in_block_double); + blockwise_wei_copy.template Run(p_wei_global, + p_wei_block_double); } // LDS double buffer: main body @@ -289,12 +289,10 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf __syncthreads(); // LDS doubel buffer: load 
next data from device mem - blockwise_in_copy - .template RunLoadThreadBuffer( - p_in_global, p_in_thread_buffer); - blockwise_wei_copy - .template RunLoadThreadBuffer( - p_wei_global, p_wei_thread_buffer); + blockwise_in_copy.template RunLoadThreadBuffer( + p_in_global, p_in_thread_buffer); + blockwise_wei_copy.template RunLoadThreadBuffer( + p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread); @@ -317,9 +315,9 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( + blockwise_in_copy.template RunLoadThreadBuffer( p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( + blockwise_wei_copy.template RunLoadThreadBuffer( p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data @@ -390,12 +388,12 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf b_thread_data_on_global / B1, b_thread_data_on_global % B1}) #if 1 - .template Run + .template Run #else // tweaking .template Run_optimized_dst_address_calculation + AddressSpace::generic, + AddressSpace::global> #endif (p_out_thread, p_out_global); } diff --git a/composable_kernel/include/tensor_operation/blockwise_gemm.hpp b/composable_kernel/include/tensor_operation/blockwise_gemm.hpp index cd04c4550..1c7bb92f6 100644 --- a/composable_kernel/include/tensor_operation/blockwise_gemm.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_gemm.hpp @@ -143,13 +143,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 constexpr auto b_thread_mtx = make_ConstantMatrixDescriptor_packed(Number{}, Number{}); - // thread A-sub, B-sub for copy - constexpr auto a_thread_sub_mtx = make_ConstantMatrixDescriptor( - Number{}, Number{}, Number{}); - - constexpr auto b_thread_sub_mtx = make_ConstantMatrixDescriptor( - Number{}, Number{}, Number{}); - FloatA p_a_thread[a_thread_mtx.GetElementSpace()]; FloatB p_b_thread[b_thread_mtx.GetElementSpace()]; diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index 69e98d4c8..15faeaebf 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -68,8 +68,8 @@ struct BlockwiseGenericTensorSliceCopy_v4 template + AddressSpace BlockSrcAddressSpace = AddressSpace::generic, + AddressSpace ThreadBufferAddressSpace = AddressSpace::generic> __device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src, ThreadBufferData* p_thread_buffer) const { @@ -89,8 +89,8 @@ struct BlockwiseGenericTensorSliceCopy_v4 template + AddressSpace ThreadBufferAddressSpace = AddressSpace::generic, + AddressSpace BlockDstAddressSpace = AddressSpace::generic> __device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer, BlockDstData* p_block_dst) const { @@ -110,8 +110,8 @@ struct BlockwiseGenericTensorSliceCopy_v4 template + AddressSpace BlockSrcAddressSpace = AddressSpace::generic, + AddressSpace BlockDstAddressSpace = AddressSpace::generic> __device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst) const { BlockSrcData p_thread_buffer[GetThreadBufferSize()]; @@ -119,12 +119,12 @@ struct 
BlockwiseGenericTensorSliceCopy_v4 RunLoadThreadBuffer(p_block_src, p_thread_buffer); + AddressSpace::generic>(p_block_src, p_thread_buffer); // if there is type conversion, it's done during store RunStoreThreadBuffer(p_thread_buffer, p_block_dst); } diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp index 881a88771..ca3902039 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp @@ -7,10 +7,6 @@ #include "tensor_coordinate_deprecated.hpp" #include "threadwise_generic_tensor_slice_copy_deprecated.hpp" -#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 -#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1 -#endif - namespace ck { // Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor @@ -480,8 +476,8 @@ struct BlockwiseGenericTensorSliceCopy_v2 template + AddressSpace BlockSrcAddressSpace = AddressSpace::generic, + AddressSpace ThreadBufferAddressSpace = AddressSpace::generic> __device__ void RunLoadThreadBuffer(const SrcData* p_block_src, DstData* p_thread_buffer) const { mThreadwiseLoad @@ -491,8 +487,8 @@ struct BlockwiseGenericTensorSliceCopy_v2 template + AddressSpace ThreadBufferAddressSpace = AddressSpace::generic, + AddressSpace BlockDstAddressSpace = AddressSpace::generic> __device__ void RunStoreThreadBuffer(const SrcData* p_thread_buffer, DstData* p_block_dst) const { mThreadwiseStore @@ -502,17 +498,17 @@ struct BlockwiseGenericTensorSliceCopy_v2 template + AddressSpace BlockSrcAddressSpace = AddressSpace::generic, + AddressSpace BlockDstAddressSpace = AddressSpace::generic> __device__ void Run(const SrcData* p_block_src, DstData* p_block_dst) const { SrcData p_thread_buffer[GetThreadBufferSize()]; - RunLoadThreadBuffer( + RunLoadThreadBuffer( p_block_src, p_thread_buffer); // if there is type conversion, it's done during store - RunStoreThreadBuffer( + RunStoreThreadBuffer( p_thread_buffer, p_block_dst); } diff --git a/composable_kernel/include/tensor_operation/threadwise_gemm.hpp b/composable_kernel/include/tensor_operation/threadwise_gemm.hpp index 503eb9522..0619aaf15 100644 --- a/composable_kernel/include/tensor_operation/threadwise_gemm.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_gemm.hpp @@ -91,7 +91,7 @@ struct ThreadwiseGemmTransANormalBNormalC } } -#if CK_USE_AMD_INLINE_ASM +#if CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM template __device__ static void Run_amd_asm(const FloatA* p_a, const FloatB* p_b, FloatC* p_c) { @@ -147,8 +147,15 @@ struct ThreadwiseGemmTransANormalBNormalC template __device__ static void Run(const FloatA* p_a, const FloatB* p_b, FloatC* p_c) { -#if CK_USE_AMD_INLINE_ASM && CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM - Run_amd_asm(p_a, p_b, p_c); +#if CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM + constexpr bool has_amd_asm = is_same{} && + ((is_same{} && is_same{}) || + (is_same{} && is_same{}) || + (is_same{} && is_same{})); + + static_if{}([&](auto fwd) { + Run_amd_asm(p_a, p_b, fwd(p_c)); + }).Else([&](auto) { Run_source(p_a, p_b, p_c); }); #else Run_source(p_a, p_b, p_c); #endif diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp 
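Run above is simply RunLoadThreadBuffer followed by RunStoreThreadBuffer, with any type conversion deferred to the store. A minimal sketch of that staging in ordinary host code, with illustrative names only:

#include <cstdio>

template <typename SrcData, typename DstData, int N>
static void run_copy(const SrcData* p_src, DstData* p_dst)
{
    SrcData thread_buffer[N];

    for(int i = 0; i < N; ++i) // RunLoadThreadBuffer: stage in the source type, no conversion yet
        thread_buffer[i] = p_src[i];

    for(int i = 0; i < N; ++i) // RunStoreThreadBuffer: convert while writing out
        p_dst[i] = static_cast<DstData>(thread_buffer[i]);
}

int main()
{
    float src[4] = {1.25f, 2.5f, 3.75f, 5.0f};
    int   dst[4] = {};
    run_copy<float, int, 4>(src, dst);
    std::printf("%d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]); // 1 2 3 5
    return 0;
}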
b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index 0bd147d1b..6a61c2c05 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -10,8 +10,8 @@ #define CK_USE_AMD_INTRINSIC 1 #endif -#ifndef CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE -#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1 +#ifndef CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC +#define CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC 1 #endif namespace ck { @@ -76,8 +76,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // Will do padding check on dst data: No write if dst data is in paddin area. template + AddressSpace SrcAddressSpace = AddressSpace::generic, + AddressSpace DstAddressSpace = AddressSpace::generic> __device__ void Run(const SrcData* p_src, DstData* p_dst) const { using src_vector_t = typename vector_type::MemoryType; @@ -126,8 +126,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // the same padding situation if(src_coord.IsUpperIndexMappedToValidOffset()) { - static_if{}([&](auto) { -#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE + static_if{}([&](auto) { +#if CK_USE_AMD_INTRINSIC && CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC *reinterpret_cast(&p_src_long_vector[buffer_offset]) = __buffer_load( p_src, src_coord.GetOffset(), 0); @@ -167,8 +167,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // the same padding situation if(dst_coord.IsUpperIndexMappedToValidOffset()) { - static_if{}([&](auto) { -#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE + static_if{}([&](auto) { +#if CK_USE_AMD_INTRINSIC && CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC __buffer_store( *reinterpret_cast(&p_dst_long_vector[buffer_offset]), p_dst, @@ -204,8 +204,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // This version is optimized for address calculation of src tensor template + AddressSpace SrcAddressSpace = AddressSpace::generic, + AddressSpace DstAddressSpace = AddressSpace::generic> __device__ void Run_optimized_src_address_calculation(const SrcData* p_src, DstData* p_dst) const { @@ -302,8 +302,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // the src vector has the same padding situation if(src_coord.IsUpperIndexMappedToValidOffset()) { - static_if{}([&](auto) { -#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE + static_if{}([&](auto) { +#if CK_USE_AMD_INTRINSIC && CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC *reinterpret_cast(&p_src_long_vector[buffer_offset]) = __buffer_load( p_src, src_nonlinear_coord.GetOffset(), src_linear_offset); @@ -362,8 +362,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // This version is optimized for address calculation of dst tensor template + AddressSpace SrcAddressSpace = AddressSpace::generic, + AddressSpace DstAddressSpace = AddressSpace::generic> __device__ void Run_optimized_dst_address_calculation(const SrcData* p_src, DstData* p_dst) const { @@ -491,8 +491,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // the dst vector has the same padding situation if(dst_coord.IsUpperIndexMappedToValidOffset()) { - static_if{}([&](auto) { -#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE + static_if{}([&](auto) { +#if CK_USE_AMD_INTRINSIC && CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC __buffer_store( *reinterpret_cast(&p_dst_long_vector[buffer_offset]), p_dst, diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp 
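The SrcAddressSpace/DstAddressSpace parameters select, at compile time, between the AMD buffer intrinsics and plain pointer addressing. A rough host-side sketch of that dispatch, assuming an if-constexpr stand-in for static_if and an ordinary dereference in place of the __buffer_load intrinsic:

#include <cstdio>

enum class AddressSpace { generic, global };

template <AddressSpace SrcAddressSpace, typename T>
static T load_scalar(const T* p_src, int offset)
{
    if constexpr(SrcAddressSpace == AddressSpace::global)
    {
        // the device code would go through the __buffer_load intrinsic here
        return p_src[offset];
    }
    else
    {
        // generic / LDS path: ordinary addressing
        return p_src[offset];
    }
}

int main()
{
    float data[4] = {0.5f, 1.5f, 2.5f, 3.5f};
    std::printf("%f %f\n",
                load_scalar<AddressSpace::global>(data, 1),
                load_scalar<AddressSpace::generic>(data, 3));
    return 0;
}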
b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp index 78684abe9..9f6133f8d 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp @@ -6,257 +6,8 @@ #include "ConstantMergedTensorDescriptor.hpp" #include "tensor_coordinate_deprecated.hpp" -#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 -#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0 -#endif - -#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 -#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0 -#endif - -#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 -#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0 -#endif - -#ifndef CK_USE_AMD_INTRINSIC -#define CK_USE_AMD_INTRINSIC 1 -#endif - -#ifndef CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE -#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1 -#endif - namespace ck { -// This threadwise copy allow vector access of src and dst. -// It allows the dimensions of vector access to be different on src and dst. -// It also allows the vector size to be different on src and dst. -// It also allows order of access to be different on src and dst. -// It use register as buffer to hold all data moving from src to dst. -// It is designed for copying small amount of data, and src and dst are -// device memory or LDS. -// When copying large amout of data, let's hope compiler will reduce register -// used for the buffer. -template -struct ThreadwiseGenericTensorSliceCopy_v1r1 -{ - static constexpr index_t nDim = SliceLengths::GetSize(); - - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r1( - Array src_slice_origin, Array dst_slice_origin) - : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin) - { - static_assert(nDim == SrcDesc::GetNumOfDimension() && - nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() && - nDim == SrcDimAccessOrder::GetSize() && - nDim == DstDimAccessOrder::GetSize(), - "wrong! # of dimensions not the same"); - - static_assert(is_valid_sequence_map::value && - is_valid_sequence_map::value, - "wrong! map is not valid"); - - static_assert(SliceLengths{}[SrcVectorAccessDim] % SrcDataPerAccess == 0 && - SliceLengths{}[DstVectorAccessDim] % DstDataPerAccess == 0, - "wrong! cannot evenly divide"); - - // check vectorized memory access - constexpr auto src_vector_access_dim = Number{}; - constexpr auto dst_vector_access_dim = Number{}; - - static_if{}( - [&](auto fwd) { - static_assert( - (fwd(SrcDesc{}).GetStride(src_vector_access_dim) == 1 || SrcDataPerAccess == 1), - "wrong! vectorized access is allowed only if stride == 1"); - }) - .Else([&](auto fwd) { - static_assert( - (fwd(SrcDesc{}).GetLastOriginalDimensionStride(src_vector_access_dim) == 1 || - SrcDataPerAccess == 1), - "wrong! vectorized access is allowed only if stride == 1"); - }); - - static_if{}( - [&](auto fwd) { - static_assert( - (fwd(DstDesc{}).GetStride(dst_vector_access_dim) == 1 || DstDataPerAccess == 1), - "wrong! vectorized access is allowed only if stride == 1"); - }) - .Else([&](auto fwd) { - static_assert( - (fwd(DstDesc{}).GetLastOriginalDimensionStride(dst_vector_access_dim) == 1 || - DstDataPerAccess == 1), - "wrong! 
vectorized access is allowed only if stride == 1"); - }); - } - - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r1() - : ThreadwiseGenericTensorSliceCopy_v1r1(make_zero_array(), - make_zero_array()) - { - } - - __device__ void SetSrcSliceOrigin(Array src_slice_origin) - { - mSrcSliceOrigin = src_slice_origin; - } - - __device__ void SetDstSliceOrigin(Array dst_slice_origin) - { - mDstSliceOrigin = dst_slice_origin; - } - - template - __device__ void Run(const TData* p_src, TData* p_dst) const - { - constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{}); - - TData p_buffer_[buffer_desc.GetElementSpace()]; - TData* p_buffer = p_buffer_; - - // copy data from src into buffer - { - using vector_t = typename vector_type::MemoryType; - - constexpr auto src_vector_access_dim = Number{}; - constexpr auto src_data_per_access = Number{}; - - constexpr auto src_access_lengths = SliceLengths::Modify( - src_vector_access_dim, - SliceLengths::Get(src_vector_access_dim) / src_data_per_access); - -#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 - static_ford{}([&](auto src_access_id) { - constexpr auto src_data_begin_id = src_access_id.Modify( - src_vector_access_dim, - src_access_id[src_vector_access_dim] * src_data_per_access); - - const index_t src_offset = - SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_begin_id); - - // load vector from src - const vector_t vector_data = *reinterpret_cast(&p_src[src_offset]); - - // unpack vector into buffer - static_for<0, SrcDataPerAccess, 1>{}([&](auto i) { - constexpr auto scalar_id = - typename uniform_sequence_gen::type{}.Modify(src_vector_access_dim, - i); - - constexpr index_t buffer_offset = - buffer_desc.GetOffsetFromMultiIndex(src_data_begin_id + scalar_id); - - p_buffer[buffer_offset] = reinterpret_cast(&vector_data)[i]; - }); - }); -#else - ford{}([&](auto src_access_id) { - auto src_data_begin_id = src_access_id; - src_data_begin_id(src_vector_access_dim) = - src_access_id[src_vector_access_dim] * src_data_per_access; - - const index_t src_offset = - SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_begin_id); - - // load vector from src - const vector_t vector_data = *reinterpret_cast(&p_src[src_offset]); - - // unpack vector into buffer - for(index_t i = 0; i < SrcDataPerAccess; ++i) - { - auto scalar_id = make_zero_array(); - scalar_id(src_vector_access_dim) = i; - - const index_t buffer_offset = - buffer_desc.GetOffsetFromMultiIndex(src_data_begin_id + scalar_id); - - p_buffer[buffer_offset] = reinterpret_cast(&vector_data)[i]; - } - }); -#endif - } - - // copy data from buffer to dst - { - using vector_t = typename vector_type::MemoryType; - - constexpr auto dst_vector_access_dim = Number{}; - constexpr auto dst_data_per_access = Number{}; - - constexpr auto dst_access_lengths = SliceLengths::Modify( - dst_vector_access_dim, - SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access); - -#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 - static_ford{}([&](auto dst_access_id) { - constexpr auto dst_data_begin_id = dst_access_id.Modify( - dst_vector_access_dim, - dst_access_id[dst_vector_access_dim] * dst_data_per_access); - - vector_t vector_data{}; - - // pack vector from buffer - static_for<0, DstDataPerAccess, 1>{}([&](auto i) { - constexpr auto scalar_id = - typename uniform_sequence_gen::type{}.Modify(dst_vector_access_dim, - i); - - constexpr index_t buffer_offset = - 
buffer_desc.GetOffsetFromMultiIndex(dst_data_begin_id + scalar_id); - - reinterpret_cast(&vector_data)[i] = p_buffer[buffer_offset]; - }); - - const index_t dst_offset = - DstDesc::GetOffsetFromMultiIndex(mDstSliceOrigin + dst_data_begin_id); - - // store vector into dst - *reinterpret_cast(&p_dst[dst_offset]) = vector_data; - }); -#else - ford{}([&](auto dst_access_id) { - auto dst_data_begin_id = dst_access_id; - dst_data_begin_id(dst_vector_access_dim) = - dst_access_id[dst_vector_access_dim] * dst_data_per_access; - - vector_t vector_data{}; - - // pack vector from buffer - for(index_t i = 0; i < DstDataPerAccess; ++i) - { - auto scalar_id = make_zero_array(); - scalar_id(dst_vector_access_dim) = i; - - const index_t buffer_offset = - buffer_desc.GetOffsetFromMultiIndex(dst_data_begin_id + scalar_id); - - reinterpret_cast(&vector_data)[i] = p_buffer[buffer_offset]; - } - - const index_t dst_offset = - DstDesc::GetOffsetFromMultiIndex(mDstSliceOrigin + dst_data_begin_id); - - // store vector into dst - *reinterpret_cast(&p_dst[dst_offset]) = vector_data; - }); -#endif - } - } - - private: - Array mSrcSliceOrigin; - Array mDstSliceOrigin; -}; - // This threadwise copy allow vector access of src and dst. // It allows the vector size to be different on src and dst. // The dimensions of vector access should be the same on src and dst. @@ -539,8 +290,8 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 template + AddressSpace SrcAddressSpace = AddressSpace::generic, + AddressSpace DstAddressSpace = AddressSpace::generic> __device__ void Run(const SrcData* p_src, DstData* p_dst) const { constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{}); @@ -613,8 +364,8 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 // 2. src_normal_offset must be calculatd at compile time (guaranteed by // algorithm) // 3. src_merged_offset can be runtime value (no assumption imposed) - static_if{}([&](auto) { -#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE + static_if{}([&](auto) { +#if CK_USE_AMD_INTRINSIC && CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC vector_data = __buffer_load( p_src, src_merged_offset, src_normal_offset); #else @@ -722,8 +473,8 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 // 2. dst_normal_offset must be calculatd at compile time (guaranteed by // algorithm) // 3. 
dst_merged_offset can be runtime value (no assumption imposed) - static_if{}([&](auto) { -#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE + static_if{}([&](auto) { +#if CK_USE_AMD_INTRINSIC && CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC __buffer_store( vector_data, p_dst, dst_merged_offset, dst_normal_offset); #else diff --git a/composable_kernel/include/utility/amd_inline_asm.hpp b/composable_kernel/include/utility/amd_inline_asm.hpp index 2d175852e..c764b27d2 100644 --- a/composable_kernel/include/utility/amd_inline_asm.hpp +++ b/composable_kernel/include/utility/amd_inline_asm.hpp @@ -3,18 +3,14 @@ #include "vector_type.hpp" -// disable inline asm due to the compiler issue: SWDEV-202749 -#define WORKAROUND_SWDEV_202749 1 - namespace ck { -// cast a pointer of LDS to its address -extern "C" __attribute__((address_space(3))) __device__ void* __to_local(void* p); - +// outer-product: c[i,j] += inner_product(a[i], b[j]) __device__ void __outer_product_1x2(float a, float b0, float b1, float& c0, float& c1) { +// disable inline asm due to the compiler issue: SWDEV-202749 ///\to-do: enable the inline asm after the compiler fix -#if WORKAROUND_SWDEV_202749 +#if CK_WORKAROUND_SWDEV_202749 c0 += a * b0; c1 += a * b1; #else @@ -27,6 +23,7 @@ __device__ void __outer_product_1x2(float a, float b0, float b1, float& c0, floa #endif } +// outer-product: c[i,j] += inner_product(a[i], b[j]) __device__ void __outer_product_1x4( float a, float b0, float b1, float b2, float b3, float& c0, float& c1, float& c2, float& c3) { @@ -40,5 +37,90 @@ __device__ void __outer_product_1x4( : "v"(a), "v"(b0), "v"(b1), "v"(b2), "v"(b3), "0"(c0), "1"(c1), "2"(c2), "3"(c3)); } +// outer-product: c[i,j] += inner_product(a[i], b[j]) +__device__ void __outer_product_1x2(half2_t a, half2_t b0, half2_t b1, float& c0, float& c1) +{ + asm volatile("\n \ + v_dot2_f32_f16 %0, %2, %3 %0\n \ + v_dot2_f32_f16 %1, %2, %4 %1\n \ + " + : "=v"(c0), "=v"(c1) // Dest registers + : "v"(a), // 1st Src register for 1 half2 registers + "v"(b0), // 2nd Src register + "v"(b1), + "0"(c0), // 3rd Src register + "1"(c1)); +} + +// outer-product: c[i,j] += inner_product(a[i], b[j]) +__device__ void __outer_product_1x2(half4_t a, half4_t b0, half4_t b1, float& c0, float& c1) +{ + const half2_t* p_a_half2 = reinterpret_cast(&a); + const half2_t* p_b0_half2 = reinterpret_cast(&b0); + const half2_t* p_b1_half2 = reinterpret_cast(&b1); + + // do dot2 two times + asm volatile("\n \ + v_dot2_f32_f16 %0, %2, %4 %0\n \ + v_dot2_f32_f16 %1, %2, %6 %1\n \ + v_dot2_f32_f16 %0, %3, %5 %0\n \ + v_dot2_f32_f16 %1, %3, %7 %1\n \ + " + : "=v"(c0), "=v"(c1) // Dest registers + : "v"(p_a_half2[0]), + "v"(p_a_half2[1]), // 1st Src registers for 2 half2 registers + "v"(p_b0_half2[0]), + "v"(p_b0_half2[1]), + "v"(p_b1_half2[0]), + "v"(p_b1_half2[1]), // 2nd Src registers for 2 half2 registers + "0"(c0), + "1"(c1)); // 3rd Src Acc registers for 2 half2 registers +} + +// outer-product: c[i,j] += inner_product(a[i], b[j]) +__device__ void __outer_product_1x4(half4_t a, + half4_t b0, + half4_t b1, + half4_t b2, + half4_t b3, + float& c0, + float& c1, + float& c2, + float& c3) +{ + const half2_t* p_a_half2 = reinterpret_cast(&a); + const half2_t* p_b0_half2 = reinterpret_cast(&b0); + const half2_t* p_b1_half2 = reinterpret_cast(&b1); + const half2_t* p_b2_half2 = reinterpret_cast(&b2); + const half2_t* p_b3_half2 = reinterpret_cast(&b3); + + // do dot2 two times + asm volatile("\n \ + v_dot2_f32_f16 %0, %4, %6 %0\n \ + v_dot2_f32_f16 %1, %4, %8 %1\n \ + 
v_dot2_f32_f16 %2, %4, %10 %2\n \ + v_dot2_f32_f16 %3, %4, %12 %3\n \ + v_dot2_f32_f16 %0, %5, %7 %0\n \ + v_dot2_f32_f16 %1, %5, %9 %1\n \ + v_dot2_f32_f16 %2, %5, %11 %2\n \ + v_dot2_f32_f16 %3, %5, %13 %3\n \ + " + : "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3) // Dest registers + : "v"(p_a_half2[0]), + "v"(p_a_half2[1]), // 1st Src registers for 2 half2 registers + "v"(p_b0_half2[0]), + "v"(p_b0_half2[1]), + "v"(p_b1_half2[0]), + "v"(p_b1_half2[1]), // 2nd Src registers for 2 half2 registers + "v"(p_b2_half2[0]), + "v"(p_b2_half2[1]), + "v"(p_b3_half2[0]), + "v"(p_b3_half2[1]), // 2nd Src registers for 2 half2 registers + "0"(c0), + "1"(c1), + "2"(c2), + "3"(c3)); // 3rd Src Acc registers for 2 half2 registers +} + } // namespace ck #endif diff --git a/composable_kernel/include/utility/config_amd.hpp.in b/composable_kernel/include/utility/config_amd.hpp.in index 799d5f8a9..fe82ba992 100644 --- a/composable_kernel/include/utility/config_amd.hpp.in +++ b/composable_kernel/include/utility/config_amd.hpp.in @@ -12,12 +12,31 @@ #define CK_DEVICE_BACKEND_AMD 1 // AMD inline asm +#ifndef CK_USE_AMD_INLINE_ASM #define CK_USE_AMD_INLINE_ASM 1 +#endif + +#ifndef CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM #define CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM 1 +#endif + +// AMD XDLOPS +#ifndef CK_USE_AMD_XDLOPS +#define CK_USE_AMD_XDLOPS 1 +#endif -// AMD intrinsic +#ifndef CK_USE_AMD_XDLOPS_INLINE_ASM +#define CK_USE_AMD_XDLOPS_INLINE_ASM 1 +#endif + +// AMD llvm intrinsic +#ifndef CK_USE_AMD_INTRINSIC #define CK_USE_AMD_INTRINSIC 1 -#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1 +#endif + +#ifndef CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC +#define CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC 1 +#endif // experimental implementation #define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 1 @@ -26,9 +45,12 @@ #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0 +// workaround +#define CK_WORKAROUND_SWDEV_202749 1 + namespace ck { -enum AddressSpace_t +enum AddressSpace { generic, global @@ -40,41 +62,28 @@ using index_t = uint32_t; using index_t = int32_t; #endif -// For some reason, HIP compiler need this definition to generate optimal load and store -// instruction +// For some reason, HIP compiler need this definition to generate optimal ISA // float typedef float float2_t __attribute__((ext_vector_type(2))); typedef float float4_t __attribute__((ext_vector_type(4))); typedef float float32_t __attribute__((ext_vector_type(32))); -typedef int32_t int32x4_t __attribute__((ext_vector_type(4))); +// float16 +typedef _Float16 half2_t __attribute__((ext_vector_type(2))); +typedef _Float16 half4_t __attribute__((ext_vector_type(4))); -// half -typedef half2 half2_t; +// bfloat16 +typedef ushort ushort2_t __attribute__((ext_vector_type(2))); +typedef ushort ushort4_t __attribute__((ext_vector_type(4))); -typedef struct -{ - // TODO: why not use "half scalar[4]"? - half2_t scalar[2]; -} half4_t; - -// bfloat16: use ushort -typedef struct -{ - ushort scalar[2]; -} ushort2_t; - -typedef struct -{ - // TODO: why not use "ushort scalar[4]"? 
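The inline asm above realizes each 4-wide fp16 dot product as two v_dot2_f32_f16 instructions per accumulator. A plain reference sketch of the math __outer_product_1x4 computes; float stands in for half so the sketch runs on the host:

#include <cstdio>

static void outer_product_1x4_ref(const float (&a)[4],
                                  const float (&b0)[4],
                                  const float (&b1)[4],
                                  const float (&b2)[4],
                                  const float (&b3)[4],
                                  float& c0,
                                  float& c1,
                                  float& c2,
                                  float& c3)
{
    const float* bs[4] = {b0, b1, b2, b3};
    float* cs[4]       = {&c0, &c1, &c2, &c3};

    for(int j = 0; j < 4; ++j)
        for(int k = 0; k < 4; ++k)
            *cs[j] += a[k] * bs[j][k]; // c[j] += inner_product(a, b[j])
}

int main()
{
    float a[4]  = {1, 2, 3, 4};
    float b0[4] = {1, 0, 0, 0}, b1[4] = {0, 1, 0, 0}, b2[4] = {0, 0, 1, 0}, b3[4] = {0, 0, 0, 1};
    float c0 = 0, c1 = 0, c2 = 0, c3 = 0;

    outer_product_1x4_ref(a, b0, b1, b2, b3, c0, c1, c2, c3);
    std::printf("%g %g %g %g\n", c0, c1, c2, c3); // 1 2 3 4
    return 0;
}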
- ushort2_t scalar[2]; -} ushort4_t; +// int +typedef int32_t int32x4_t __attribute__((ext_vector_type(4))); // data type conversion -template +template struct type_convert { - template + template __device__ T operator()(X x) const { return static_cast(x); @@ -96,5 +105,4 @@ __device__ ushort type_convert::operator()(float x) const } } // namespace ck - #endif diff --git a/composable_kernel/include/utility/config_nvidia.hpp.in b/composable_kernel/include/utility/config_nvidia.hpp.in index 67cd93136..2eea4a867 100644 --- a/composable_kernel/include/utility/config_nvidia.hpp.in +++ b/composable_kernel/include/utility/config_nvidia.hpp.in @@ -10,7 +10,7 @@ #define CK_DEVICE_BACKEND_NVIDIA 1 #define CK_USE_AMD_INTRINSIC 0 #define CK_USE_AMD_INLINE_ASM 0 -#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 0 +#define CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0 @@ -18,7 +18,7 @@ namespace ck { -enum AddressSpace_t +enum AddressSpace { generic, global = generic From db268ae0c88982dff4ac6bebb594f3c6463e3824 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 3 Oct 2019 19:07:17 -0500 Subject: [PATCH 05/20] mark deprecated code --- ...e_convolution_direct_v2_nchw_kcyx_nkhw.hpp | 2 +- ...tion_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp | 2 +- ...tion_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp | 2 +- ...tion_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp | 92 ++++++++++--------- ..._v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp | 44 ++++----- ...plicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp | 2 +- ...tion_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp | 2 +- ..._v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp | 2 +- ...lution_implicit_gemm_v2_chwn_cyxk_khwn.hpp | 2 +- ...mm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp | 2 +- ...lution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp | 33 ++++--- ...mm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp | 33 ++++--- ..._v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 61 ++++++------ ..._v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp | 30 +++--- ..._v4r3_nchw_kcyx_nkhw_lds_double_buffer.hpp | 30 +++--- ..._v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp | 83 +++++++++-------- .../ConstantMatrixDescriptor.hpp | 7 +- ...tantMergedTensorDescriptor_deprecated.hpp} | 18 ++-- ...> ConstantTensorDescriptor_deprecated.hpp} | 57 ++++++------ .../tensor_coordinate_deprecated.hpp | 18 ++-- ...e_generic_tensor_slice_copy_deprecated.hpp | 89 +++++++++--------- .../threadwise_direct_convolution.hpp | 2 +- .../threadwise_generic_tensor_op.hpp | 4 +- ...e_generic_tensor_slice_copy_deprecated.hpp | 26 +++--- .../include/utility/amd_intrinsic.hpp | 50 +++++----- driver/include/conv_common.hpp | 2 +- driver/include/host_conv.hpp | 2 +- driver/src/driver.cpp | 2 +- 28 files changed, 351 insertions(+), 348 deletions(-) rename composable_kernel/include/tensor_description/{ConstantMergedTensorDescriptor.hpp => ConstantMergedTensorDescriptor_deprecated.hpp} (92%) rename composable_kernel/include/tensor_description/{ConstantTensorDescriptor.hpp => ConstantTensorDescriptor_deprecated.hpp} (90%) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp index 495835384..aae74b613 100644 --- 
a/composable_kernel/include/kernel_algorithm/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp @@ -2,7 +2,7 @@ #define CK_GRIDWISE_CONVOLUTION_DIRECT_V2_NCHW_KCYX_NKHW #include "common_header.hpp" -#include "ConstantTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" #include "blockwise_2d_tensor_op.hpp" #include "blockwise_4d_tensor_op.hpp" #include "threadwise_tensor_slice_copy.hpp" diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp index ce6965ec6..d33a4adf9 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp @@ -2,7 +2,7 @@ #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R1_CHWN_CYXK_KHWN #include "common_header.hpp" -#include "ConstantTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" #include "ConstantMatrixDescriptor.hpp" #include "blockwise_4d_tensor_op.hpp" #include "blockwise_2d_tensor_op.hpp" diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp index 23c1be527..6975b1e24 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp @@ -2,7 +2,7 @@ #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R2_CHWN_CYXK_KHWN #include "common_header.hpp" -#include "ConstantTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" #include "ConstantMatrixDescriptor.hpp" #include "blockwise_2d_tensor_op.hpp" #include "blockwise_3d_tensor_op.hpp" diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp index dd3cd21c6..def4ae086 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp @@ -2,7 +2,7 @@ #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_CHWN_CYXK_KHWN_HPP #include "common_header.hpp" -#include "ConstantTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" #include "ConstantMatrixDescriptor.hpp" #include "blockwise_generic_tensor_slice_copy.hpp" #include "threadwise_generic_tensor_slice_copy.hpp" @@ -125,38 +125,38 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn // blockwise copy // input: format is [C, Hi, Wi, N] - auto blockwise_in_copy = - BlockwiseGenericTensorSliceCopy_v1, - Sequence<0, 1, 2, 3>, - Sequence<0, 1, 2, 3>, - 3, - 3, - InBlockCopyDataPerAccess_N, - InBlockCopyDataPerAccess_N>({0, 0, 0, 0}, - {0, 0, 0, 0}); + auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1_deprecated< + BlockSize, + decltype(in_c_h_w_n_global_desc), + decltype(in_c_h_w_n_block_desc), + decltype(in_c_h_w_n_block_desc.GetLengths()), + InBlockCopySubLengths_CHWN, + InBlockCopyClusterLengths_CHWN, + Sequence<0, 1, 2, 3>, + 
Sequence<0, 1, 2, 3>, + Sequence<0, 1, 2, 3>, + 3, + 3, + InBlockCopyDataPerAccess_N, + InBlockCopyDataPerAccess_N>({0, 0, 0, 0}, {0, 0, 0, 0}); // blockwise wei copy // format is [CPerBlock, X * KPerBlock] const auto blockwise_wei_copy = - BlockwiseGenericTensorSliceCopy_v1, - Sequence<0, 1>, - Sequence<0, 1>, - 1, - 1, - WeiBlockCopyDataPerAccess_K, - WeiBlockCopyDataPerAccess_K>({0, 0}, {0, 0}); + BlockwiseGenericTensorSliceCopy_v1_deprecated, + Sequence<0, 1>, + Sequence<0, 1>, + 1, + 1, + WeiBlockCopyDataPerAccess_K, + WeiBlockCopyDataPerAccess_K>({0, 0}, + {0, 0}); // a series of blockwise batched GEMM // C_matrix += transpose(A_matrix) * B_matrix @@ -318,14 +318,15 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn n_block_data_begin + n_thread_data_begin); #if 1 - ThreadwiseGenericTensorSliceCopy_v1r2::type, - 9, - OutThreadCopyDataPerAccess_N, - OutThreadCopyDataPerAccess_N>( - make_zero_array(), make_zero_array()) + ThreadwiseGenericTensorSliceCopy_v1r2_deprecated< + decltype(out_10d_thread_desc), + decltype(out_10d_global_desc), + decltype(out_10d_thread_desc.GetLengths()), + arithmetic_sequence_gen<0, 10, 1>::type, + 9, + OutThreadCopyDataPerAccess_N, + OutThreadCopyDataPerAccess_N>(make_zero_array(), + make_zero_array()) .Run(p_out_thread, p_out_thread_on_global); #elif 0 ThreadwiseGenericTensorSliceCopy_v1r1::type, - 9, - OutThreadCopyDataPerAccess_N, - OutThreadCopyDataPerAccess_N>( - make_zero_array(), make_zero_array()) + ThreadwiseGenericTensorSliceCopy_v1r2_deprecated< + decltype(out_10d_thread_desc), + decltype(out_10d_global_desc), + decltype(out_10d_thread_desc.GetLengths()), + arithmetic_sequence_gen<0, 10, 1>::type, + 9, + OutThreadCopyDataPerAccess_N, + OutThreadCopyDataPerAccess_N>(make_zero_array(), + make_zero_array()) .Run(p_out_thread, p_out_thread_on_global); #elif 0 ThreadwiseGenericTensorSliceCopy_v1r1::type, - 9, - OutThreadCopyDataPerAccess_N, - OutThreadCopyDataPerAccess_N>( - make_zero_array(), make_zero_array()) + ThreadwiseGenericTensorSliceCopy_v1r2_deprecated< + decltype(out_10d_thread_desc), + decltype(out_10d_global_desc), + decltype(out_10d_thread_desc.GetLengths()), + arithmetic_sequence_gen<0, 10, 1>::type, + 9, + OutThreadCopyDataPerAccess_N, + OutThreadCopyDataPerAccess_N>(make_zero_array(), + make_zero_array()) .Run(p_out_thread, p_out_thread_on_global); #elif 0 ThreadwiseGenericTensorSliceCopy_v1r1::type, - 9, - OutThreadCopyDataPerAccess_N, - OutThreadCopyDataPerAccess_N>( - make_zero_array(), make_zero_array()) + ThreadwiseGenericTensorSliceCopy_v1r2_deprecated< + decltype(out_10d_thread_desc), + decltype(out_10d_global_desc), + decltype(out_10d_thread_desc.GetLengths()), + arithmetic_sequence_gen<0, 10, 1>::type, + 9, + OutThreadCopyDataPerAccess_N, + OutThreadCopyDataPerAccess_N>(make_zero_array(), + make_zero_array()) .Run(p_out_thread, p_out_thread_on_global); #elif 0 ThreadwiseGenericTensorSliceCopy_v1r1, // thread_arrange_order [C, K] - Sequence<0, 1>, // src_access_order [C, K] - Sequence<0, 1>, // dst_access_order [C, K] - WeiBlockCopyDataPerAccess_K, - WeiBlockCopyDataPerAccess_K>( - {0, k_block_data_on_global}, {0, 0}); + auto blockwise_wei_copy = BlockwiseGenericTensorSliceCopy_v1_deprecated< + BlockSize, + Float, + decltype(wei_c_k_global_desc), + decltype(wei_c_k_block_desc), + decltype(wei_c_k_block_desc.GetLengths()), + WeiBlockCopySubLengths_C_K, + WeiBlockCopyClusterLengths_C_K, + Sequence<0, 1>, // thread_arrange_order [C, K] + Sequence<0, 1>, // src_access_order [C, K] + Sequence<0, 1>, // dst_access_order 
[C, K] + WeiBlockCopyDataPerAccess_K, + WeiBlockCopyDataPerAccess_K>({0, k_block_data_on_global}, {0, 0}); // GEMM definition // c_mtx += transpose(a_mtx) * b_mtx diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp index 25d73df49..2a08be324 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp @@ -2,8 +2,8 @@ #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V3_NCHW_CYXK_NKHW_LDS_DOUBLE_BUFFER #include "common_header.hpp" -#include "ConstantTensorDescriptor.hpp" -#include "ConstantMergedTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" +#include "ConstantMergedTensorDescriptor_deprecated.hpp" #include "ConstantMatrixDescriptor.hpp" #include "blockwise_generic_tensor_slice_copy.hpp" #include "blockwise_gemm.hpp" @@ -125,7 +125,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw_lds_double_buffer // input blockwise copy // slice a merged tensor, reorder and copy to a normal tensor // this copy operator already has blockwise offset built-in - const auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1< + const auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1_deprecated< BlockSize, Float, decltype(in_c_n1_b_n2_global_merged_desc), @@ -152,20 +152,19 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw_lds_double_buffer // operator for blockwise copy of weight into LDS // slice a tensor, and copy it into another tensor // this copy operator already have blockwise offset built-in - const auto blockwise_wei_copy = - BlockwiseGenericTensorSliceCopy_v1, // thread_arrange_order [C, K] - Sequence<0, 1>, // src_access_order [C, K] - Sequence<0, 1>, // dst_access_order [C, K] - WeiBlockCopyDataPerAccess_K, - WeiBlockCopyDataPerAccess_K>( - {0, k_block_data_on_global}, {0, 0}); + const auto blockwise_wei_copy = BlockwiseGenericTensorSliceCopy_v1_deprecated< + BlockSize, + Float, + decltype(wei_c_k_global_desc), + decltype(wei_c_k_block_desc), + decltype(wei_c_k_block_desc.GetLengths()), + WeiBlockCopySubLengths_C_K, + WeiBlockCopyClusterLengths_C_K, + Sequence<0, 1>, // thread_arrange_order [C, K] + Sequence<0, 1>, // src_access_order [C, K] + Sequence<0, 1>, // dst_access_order [C, K] + WeiBlockCopyDataPerAccess_K, + WeiBlockCopyDataPerAccess_K>({0, k_block_data_on_global}, {0, 0}); // GEMM definition // c_mtx += transpose(a_mtx) * b_mtx diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp index 53366f79d..1b6c87717 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -2,8 +2,8 @@ #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R1_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_HPP #include "common_header.hpp" -#include "ConstantTensorDescriptor.hpp" -#include "ConstantMergedTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" +#include 
"ConstantMergedTensorDescriptor_deprecated.hpp" #include "ConstantMatrixDescriptor.hpp" #include "blockwise_generic_tensor_slice_copy_deprecated.hpp" #include "blockwise_gemm.hpp" @@ -157,21 +157,20 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer // input blockwise copy // slice a merged tensor, reorder and copy to a normal tensor // this copy operator already has blockwise offset built-in - auto blockwise_in_copy = - BlockwiseGenericTensorSliceCopy_v2( - {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0}); + auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v2_deprecated< + BlockSize, + decltype(in_e_n1_b_n2_global_merged_desc), + decltype(in_e_n1_b_n2_block_desc), + decltype(in_e_n1_b_n2_block_desc.GetLengths()), + InBlockCopySubLengths_E_N1_B_N2, + InBlockCopyClusterLengths_E_N1_B_N2, + InBlockCopyThreadClusterArrangeOrder, + InBlockCopySrcAccessOrder, + InBlockCopyDstAccessOrder, + 2, + 3, + InBlockCopySrcDataPerRead_B, + InBlockCopyDstDataPerWrite_N2>({0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0}); // weight tensor // tensor descriptor in device memory, src of blockwise copy @@ -188,19 +187,19 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer // slice a tensor, and copy it into another tensor // this copy operator already have blockwise offset built-in auto blockwise_wei_copy = - BlockwiseGenericTensorSliceCopy_v2( + BlockwiseGenericTensorSliceCopy_v2_deprecated( {0, k_block_data_on_global}, {0, 0}); // GEMM definition @@ -381,7 +380,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer const index_t b_thread_data_on_global = b_block_data_on_global + c_thread_mtx_on_block.col / N2; - ThreadwiseGenericTensorSliceCopy_v2r1< + ThreadwiseGenericTensorSliceCopy_v2r1_deprecated< decltype(out_k0_k1_n1_b_n2_thread_mem_desc), decltype(out_k0_k1_n1_b_n2_global_merged_desc), decltype(out_k0_k1_n1_b_n2_thread_mem_desc.GetLengths()), diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp index bedaa0cad..3fe68ca3a 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -2,8 +2,8 @@ #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R2_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER #include "common_header.hpp" -#include "ConstantTensorDescriptor.hpp" -#include "ConstantMergedTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" +#include "ConstantMergedTensorDescriptor_deprecated.hpp" #include "ConstantMatrixDescriptor.hpp" #include "blockwise_generic_tensor_slice_copy.hpp" #include "blockwise_gemm.hpp" @@ -166,7 +166,7 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer // input blockwise copy // slice a merged tensor, reorder and copy to a normal tensor // this copy operator already has blockwise offset built-in - auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1< + auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1_deprecated< BlockSize, Float, decltype(in_e_n0_ho0_wo0_b_n2_ho2_wo2_global_merged_desc), @@ -196,18 +196,18 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer // slice a tensor, and copy it into another tensor // 
this copy operator already have blockwise offset built-in auto blockwise_wei_copy = - BlockwiseGenericTensorSliceCopy_v1( + BlockwiseGenericTensorSliceCopy_v1_deprecated( {0, k_block_data_on_global}, {0, 0}); // GEMM definition diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer.hpp index c894f69bd..bc50bf19c 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -2,8 +2,8 @@ #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R3_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER #include "common_header.hpp" -#include "ConstantTensorDescriptor.hpp" -#include "ConstantMergedTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" +#include "ConstantMergedTensorDescriptor_deprecated.hpp" #include "ConstantMatrixDescriptor.hpp" #include "blockwise_generic_tensor_slice_copy.hpp" #include "blockwise_gemm.hpp" @@ -165,7 +165,7 @@ struct GridwiseConvolutionImplicitGemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer // input blockwise copy // slice a merged tensor, reorder and copy to a normal tensor // this copy operator already has blockwise offset built-in - auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1< + auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1_deprecated< BlockSize, Float, decltype(in_e_n1_ho1_wo1_b_n2_ho2_wo2_global_merged_desc), @@ -195,18 +195,18 @@ struct GridwiseConvolutionImplicitGemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer // slice a tensor, and copy it into another tensor // this copy operator already have blockwise offset built-in auto blockwise_wei_copy = - BlockwiseGenericTensorSliceCopy_v1( + BlockwiseGenericTensorSliceCopy_v1_deprecated( {0, k_block_data_on_global}, {0, 0}); #if 0 diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp index 39a28e391..e741a83c4 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -2,8 +2,8 @@ #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R4_NCHW_KCYX_NKHW_HPP_LDS_DOUBLE_BUFFER_HPP #include "common_header.hpp" -#include "ConstantTensorDescriptor.hpp" -#include "ConstantMergedTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" +#include "ConstantMergedTensorDescriptor_deprecated.hpp" #include "ConstantMatrixDescriptor.hpp" #include "blockwise_generic_tensor_slice_copy_deprecated.hpp" #include "blockwise_gemm.hpp" @@ -133,19 +133,19 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer // slice a merged tensor, reorder and copy to a normal tensor // this copy operator already has blockwise offset built-in auto blockwise_in_copy = - BlockwiseGenericTensorSliceCopy_v2( + BlockwiseGenericTensorSliceCopy_v2_deprecated( {0, b_block_data_on_global}, {0, 0}); // weight tensor @@ -169,19 +169,19 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer // slice a 
tensor, and copy it into another tensor // this copy operator already have blockwise offset built-in auto blockwise_wei_copy = - BlockwiseGenericTensorSliceCopy_v2( + BlockwiseGenericTensorSliceCopy_v2_deprecated( {0, k_block_data_on_global}, {0, 0}); // GEMM definition @@ -373,20 +373,19 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer using OutThreadCopySliceLengths = Sequence; - auto threadwise_out_copy = - ThreadwiseGenericTensorSliceCopy_v2r1::type, - arithmetic_sequence_gen<0, 3, 1>::type, - 2, - 2, - OutThreadCopyDataPerAccess_B, - OutThreadCopyDataPerAccess_B>( - {0, 0, 0}, - {k_thread_data_on_global / K1, - k_thread_data_on_global % K1, - b_thread_data_on_global}); + auto threadwise_out_copy = ThreadwiseGenericTensorSliceCopy_v2r1_deprecated< + decltype(out_k0_k1_b_thread_desc), + decltype(out_k0_k1_b_global_desc), + OutThreadCopySliceLengths, + arithmetic_sequence_gen<0, 3, 1>::type, + arithmetic_sequence_gen<0, 3, 1>::type, + 2, + 2, + OutThreadCopyDataPerAccess_B, + OutThreadCopyDataPerAccess_B>({0, 0, 0}, + {k_thread_data_on_global / K1, + k_thread_data_on_global % K1, + b_thread_data_on_global}); for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat) { diff --git a/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp b/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp index ada40e8ba..0ebd9dc4a 100644 --- a/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp +++ b/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp @@ -2,7 +2,7 @@ #define CK_CONSTANT_MATRIX_DESCRIPTOR_HPP #include "common_header.hpp" -#include "ConstantTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" #include "tensor_descriptor.hpp" namespace ck { @@ -59,9 +59,10 @@ __host__ __device__ constexpr auto } template -__host__ __device__ constexpr auto make_ConstantMatrixDescriptor(ConstantTensorDescriptor) +__host__ __device__ constexpr auto + make_ConstantMatrixDescriptor(ConstantTensorDescriptor_deprecated) { - using TDesc = ConstantTensorDescriptor; + using TDesc = ConstantTensorDescriptor_deprecated; static_assert(TDesc::GetNumOfDimension() == 2, "wrong"); static_assert(TDesc::GetStrides()[1] == 1, "wrong"); return ConstantMatrixDescriptor +// OriginalTensorDesc : ConstantTensorDescriptor_deprecated<...> // it's the tensor whose dimensions are to be merged // OriginalDimMergeSeqs : Sequence<...>... // each is a sequence of original dimensions (of OriginalTensorDesc) to be merged template -struct ConstantMergedTensorDescriptor +struct ConstantMergedTensorDescriptor_deprecated { - using Type = ConstantMergedTensorDescriptor; + using Type = ConstantMergedTensorDescriptor_deprecated; static constexpr auto mOriginalDimMergeSeqs = std::tuple{}; static constexpr index_t nDim = sizeof...(OriginalDimMergeSeqs); static constexpr index_t nOriginalDim = OriginalTensorDesc::GetNumOfDimension(); - __host__ __device__ constexpr ConstantMergedTensorDescriptor() + __host__ __device__ constexpr ConstantMergedTensorDescriptor_deprecated() { static_assert(nDim <= nOriginalDim, "wrong!"); @@ -189,7 +189,7 @@ struct ConstantMergedTensorDescriptor { constexpr auto lengths = GetLengths(); constexpr auto strides = calculate_tensor_strides_packed(lengths); - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor_deprecated{}; } }; @@ -197,7 +197,7 @@ template __host__ __device__ constexpr auto make_ConstantMergedTensorDescriptor(OriginalTensorDesc, OriginalDimMergeSeqs...) 
{ - return ConstantMergedTensorDescriptor{}; + return ConstantMergedTensorDescriptor_deprecated{}; } template diff --git a/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp b/composable_kernel/include/tensor_description/ConstantTensorDescriptor_deprecated.hpp similarity index 90% rename from composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp rename to composable_kernel/include/tensor_description/ConstantTensorDescriptor_deprecated.hpp index 6dfbe5f79..d14696414 100644 --- a/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp +++ b/composable_kernel/include/tensor_description/ConstantTensorDescriptor_deprecated.hpp @@ -1,5 +1,5 @@ -#ifndef CK_CONSTANT_TENSOR_DESCRIPTOR_HPP -#define CK_CONSTANT_TENSOR_DESCRIPTOR_HPP +#ifndef CK_CONSTANT_TENSOR_DESCRIPTOR_DEPRECATED_HPP +#define CK_CONSTANT_TENSOR_DESCRIPTOR_DEPRECATED_HPP #include "common_header.hpp" @@ -24,13 +24,13 @@ __host__ __device__ constexpr auto calculate_tensor_strides_aligned_old(Lengths, } template -struct ConstantTensorDescriptor +struct ConstantTensorDescriptor_deprecated { - using Type = ConstantTensorDescriptor; + using Type = ConstantTensorDescriptor_deprecated; static constexpr index_t nDim = Lengths::GetSize(); - __host__ __device__ constexpr ConstantTensorDescriptor() + __host__ __device__ constexpr ConstantTensorDescriptor_deprecated() { static_assert(Lengths::GetSize() == Strides::GetSize(), "nDim not consistent"); } @@ -284,7 +284,7 @@ struct ConstantTensorDescriptor using extract_lengths = decltype(Lengths::Extract(extract_dims...)); using extract_strides = decltype(Strides::Extract(extract_dims...)); - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor_deprecated{}; } template @@ -294,13 +294,13 @@ struct ConstantTensorDescriptor } template - __host__ __device__ static constexpr auto Embed(ConstantTensorDescriptor) + __host__ __device__ static constexpr auto Embed(ConstantTensorDescriptor_deprecated) { - using leaf_tensor = ConstantTensorDescriptor; + using leaf_tensor = ConstantTensorDescriptor_deprecated; - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor_deprecated< + decltype(GetLengths().PushBack(leaf_tensor::GetLengths())), + decltype(GetStrides().PushBack(leaf_tensor::GetStrides()))>{}; } template @@ -351,7 +351,7 @@ struct ConstantTensorDescriptor using vectorized_strides = decltype((Strides{} / Number{}).Modify(Number{}, Number<1>{})); - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor_deprecated{}; } template @@ -359,7 +359,7 @@ struct ConstantTensorDescriptor { using slice_lengths = decltype(Lengths::Modify(Number{}, Number{})); - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor_deprecated{}; } template @@ -367,7 +367,7 @@ struct ConstantTensorDescriptor { static_assert(slice_lengths.GetSize() == nDim, "wrong!"); - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor_deprecated{}; } template @@ -379,7 +379,7 @@ struct ConstantTensorDescriptor using new_lengths = decltype(Lengths::Modify(Number{}, Number{})); using new_strides = decltype(Strides::Modify(Number{}, Number{})); - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor_deprecated{}; } template @@ -418,7 +418,7 @@ struct ConstantTensorDescriptor constexpr auto new_strides = GetStrides().Extract(left).PushBack(fold_strides).PushBack(GetStrides().Extract(right)); - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor_deprecated{}; } template @@ -462,27 
+462,29 @@ struct ConstantTensorDescriptor .PushBack(Number{}) .PushBack(GetStrides().Extract(right)); - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor_deprecated{}; } __host__ __device__ static constexpr auto Pack() { using packed_strides = decltype(calculate_tensor_strides_packed_old(Lengths{})); - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor_deprecated{}; } template __host__ __device__ static constexpr auto ReorderGivenNew2Old(MapNew2Old) { - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor_deprecated< + decltype(Lengths::ReorderGivenNew2Old(MapNew2Old{})), + decltype(Strides::ReorderGivenNew2Old(MapNew2Old{}))>{}; } template __host__ __device__ static constexpr auto ReorderGivenOld2New(MapOld2New) { - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor_deprecated< + decltype(Lengths::ReorderGivenOld2New(MapOld2New{})), + decltype(Strides::ReorderGivenOld2New(MapOld2New{}))>{}; } }; @@ -490,26 +492,25 @@ template __host__ __device__ constexpr auto make_ConstantTensorDescriptor_packed(Lengths) { using Strides = decltype(calculate_tensor_strides_packed_old(Lengths{})); - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor_deprecated{}; } template __host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Strides) { - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor_deprecated{}; } template __host__ __device__ constexpr auto make_ConstantTensorDescriptor_aligned(Lengths, Number) { using Strides = decltype(calculate_tensor_strides_aligned_old(Lengths{}, Number{})); - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor_deprecated{}; } template -__host__ __device__ void -print_ConstantTensorDescriptor(const char* s, - ConstantTensorDescriptor, Sequence>) +__host__ __device__ void print_ConstantTensorDescriptor( + const char* s, ConstantTensorDescriptor_deprecated, Sequence>) { constexpr index_t ndim = sizeof...(Lengths); diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp index 46e551ddd..aaddc1251 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp @@ -2,12 +2,12 @@ #define CK_TENSOR_COORDINATE_DEPRECATED_HPP #include "common_header.hpp" -#include "ConstantTensorDescriptor.hpp" -#include "ConstantMergedTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" +#include "ConstantMergedTensorDescriptor_deprecated.hpp" namespace ck { -// TensorDesc is ConstantTensorDescriptor +// TensorDesc is ConstantTensorDescriptor_deprecated template struct NormalTensorCoordinate_deprecated { @@ -95,7 +95,7 @@ struct NormalTensorCoordinate_deprecated index_t mOffset; }; -// TensorDesc is ConstantMergedTensorDescriptor +// TensorDesc is ConstantMergedTensorDescriptor_deprecated template struct MergedTensorCoordinate { @@ -311,7 +311,7 @@ struct MergedTensorCoordinate // dimensions, and those merged dimensions, that would never be involved in index // arithmetic after construction of TensorCoordinate. 
// TODO: refactor TensorCoordinate, after introducing the concept of "dimensions" - // and simplify implementation of ConstantMergedTensorDescriptor, so we don't need to + // and simplify implementation of ConstantMergedTensorDescriptor_deprecated, so we don't need to // count on compiler to optimize away those register memory for us Array mOriginalIndex; Array mPartialOffsets; @@ -326,16 +326,16 @@ struct TensorCoordinate_deprecated private: template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantTensorDescriptor) + MakeDummyTensorCoordinate(ConstantTensorDescriptor_deprecated) { - return NormalTensorCoordinate_deprecated>(); + return NormalTensorCoordinate_deprecated>(); } template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor) + MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor_deprecated) { - return MergedTensorCoordinate>(); + return MergedTensorCoordinate>(); } public: diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp index ca3902039..c922384a9 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp @@ -2,8 +2,8 @@ #define CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP #include "common_header.hpp" -#include "ConstantTensorDescriptor.hpp" -#include "ConstantMergedTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" +#include "ConstantMergedTensorDescriptor_deprecated.hpp" #include "tensor_coordinate_deprecated.hpp" #include "threadwise_generic_tensor_slice_copy_deprecated.hpp" @@ -16,7 +16,7 @@ namespace ck { // that, on a merged dimension that constains multiple original dimensions, the length of // the last original dimension need to be evenly dividable by its sub-lengths. Also, the // repeat-length on the merged dimension need to be 1. These sanity checks are performed -// in constructor of BlockwiseGenericTensorSliceCopy_v1 +// in constructor of BlockwiseGenericTensorSliceCopy_v1_deprecated template -struct BlockwiseGenericTensorSliceCopy_v1 +struct BlockwiseGenericTensorSliceCopy_v1_deprecated { static constexpr index_t nDim = SrcDesc::GetNumOfDimension(); @@ -58,7 +58,8 @@ struct BlockwiseGenericTensorSliceCopy_v1 Array mThreadSrcOriginalMultiId; Array mThreadDstOriginalMultiId; - __device__ BlockwiseGenericTensorSliceCopy_v1(Array src_block_data_id_begin, + __device__ + BlockwiseGenericTensorSliceCopy_v1_deprecated(Array src_block_data_id_begin, Array dst_block_data_id_begin) { // check NDim consistency @@ -240,15 +241,15 @@ struct BlockwiseGenericTensorSliceCopy_v1 // that constains multiple original dimensions, the length of the last original // dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on // the merged dimension need to be 1. 
These sanity checks are performed in constructor - // of BlockwiseGenericTensorSliceCopy_v1 - ThreadwiseGenericTensorSliceCopy_v1r2(make_zero_array(), - make_zero_array()) + // of BlockwiseGenericTensorSliceCopy_v1_deprecated + ThreadwiseGenericTensorSliceCopy_v1r2_deprecated(make_zero_array(), + make_zero_array()) .Run(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset); }); } @@ -295,14 +296,14 @@ struct BlockwiseGenericTensorSliceCopy_v1 // that constains multiple original dimensions, the length of the last original // dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on // the merged dimension need to be 1. These sanity checks are performed in constructor - // of BlockwiseGenericTensorSliceCopy_v1 - ThreadwiseGenericTensorSliceCopy_v1r2( + // of BlockwiseGenericTensorSliceCopy_v1_deprecated + ThreadwiseGenericTensorSliceCopy_v1r2_deprecated( make_zero_array(), make_zero_array()) .Run(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset); }); @@ -428,14 +429,14 @@ template -struct BlockwiseGenericTensorSliceCopy_v2 +struct BlockwiseGenericTensorSliceCopy_v2_deprecated { static constexpr index_t nDim = SrcDesc::GetNumOfDimension(); using Index = MultiIndex; - __device__ constexpr BlockwiseGenericTensorSliceCopy_v2(const Index& src_block_slice_origin, - const Index& dst_block_slice_origin) + __device__ constexpr BlockwiseGenericTensorSliceCopy_v2_deprecated( + const Index& src_block_slice_origin, const Index& dst_block_slice_origin) { static_assert( nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() && @@ -529,25 +530,25 @@ struct BlockwiseGenericTensorSliceCopy_v2 private: using ThreadBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{})); - using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v2r1; - - using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v2r1; + using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v2r1_deprecated; + + using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v2r1_deprecated; ThreadwiseLoad mThreadwiseLoad; ThreadwiseStore mThreadwiseStore; diff --git a/composable_kernel/include/tensor_operation/threadwise_direct_convolution.hpp b/composable_kernel/include/tensor_operation/threadwise_direct_convolution.hpp index 3e84cbd8b..bae080b04 100644 --- a/composable_kernel/include/tensor_operation/threadwise_direct_convolution.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_direct_convolution.hpp @@ -2,7 +2,7 @@ #define CK_THREADWISE_DIRECT_CONVOLUTION_HPP #include "common_header.hpp" -#include "ConstantTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" #include "threadwise_tensor_slice_copy.hpp" namespace ck { diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_op.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_op.hpp index c0b4e8939..8b83b68c7 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_op.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_op.hpp @@ -2,8 +2,8 @@ #define CK_THREADWISE_GENERIC_TENSOR_OP_HPP #include "common_header.hpp" -#include "ConstantTensorDescriptor.hpp" -#include "ConstantMergedTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" +#include "ConstantMergedTensorDescriptor_deprecated.hpp" namespace ck { template diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp 
b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp index 9f6133f8d..0310addd3 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp @@ -2,8 +2,8 @@ #define CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP #include "common_header.hpp" -#include "ConstantTensorDescriptor.hpp" -#include "ConstantMergedTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" +#include "ConstantMergedTensorDescriptor_deprecated.hpp" #include "tensor_coordinate_deprecated.hpp" namespace ck { @@ -21,11 +21,11 @@ template -struct ThreadwiseGenericTensorSliceCopy_v1r2 +struct ThreadwiseGenericTensorSliceCopy_v1r2_deprecated { static constexpr index_t nDim = SliceLengths::GetSize(); - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2( + __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2_deprecated( Array src_slice_origin, Array dst_slice_origin) : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin) { @@ -64,9 +64,9 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2 }); } - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2() - : ThreadwiseGenericTensorSliceCopy_v1r2(make_zero_array(), - make_zero_array()) + __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2_deprecated() + : ThreadwiseGenericTensorSliceCopy_v1r2_deprecated(make_zero_array(), + make_zero_array()) { } @@ -204,7 +204,7 @@ template -struct ThreadwiseGenericTensorSliceCopy_v2r1 +struct ThreadwiseGenericTensorSliceCopy_v2r1_deprecated { static constexpr index_t nDim = SliceLengths::GetSize(); @@ -213,8 +213,8 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 using SrcCoordinate = typename TensorCoordinate_deprecated::type; using DstCoordinate = typename TensorCoordinate_deprecated::type; - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1(const Index& src_slice_origin, - const Index& dst_slice_origin) + __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1_deprecated( + const Index& src_slice_origin, const Index& dst_slice_origin) : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin) { static_assert(nDim == SrcDesc::GetNumOfDimension() && @@ -262,9 +262,9 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 }); } - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1() - : ThreadwiseGenericTensorSliceCopy_v2r1(make_zero_array(), - make_zero_array()) + __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1_deprecated() + : ThreadwiseGenericTensorSliceCopy_v2r1_deprecated(make_zero_array(), + make_zero_array()) { } diff --git a/composable_kernel/include/utility/amd_intrinsic.hpp b/composable_kernel/include/utility/amd_intrinsic.hpp index 193a55bc7..0927b2c62 100644 --- a/composable_kernel/include/utility/amd_intrinsic.hpp +++ b/composable_kernel/include/utility/amd_intrinsic.hpp @@ -11,19 +11,17 @@ __device__ float __llvm_amdgcn_buffer_load(int32x4_t rsrc, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load"); -__device__ vector_type::MemoryType -__llvm_amdgcn_buffer_loadx2(int32x4_t rsrc, - uint32_t vindex, - uint32_t offset, - bool glc, - bool slc) __asm("llvm.amdgcn.buffer.load.dwordx2"); - -__device__ vector_type::MemoryType -__llvm_amdgcn_buffer_loadx4(int32x4_t rsrc, - uint32_t vindex, - uint32_t offset, - bool glc, - bool slc) __asm("llvm.amdgcn.buffer.load.dwordx4"); +__device__ float2_t __llvm_amdgcn_buffer_loadx2(int32x4_t rsrc, + uint32_t vindex, + 
uint32_t offset, + bool glc, + bool slc) __asm("llvm.amdgcn.buffer.load.dwordx2"); + +__device__ float4_t __llvm_amdgcn_buffer_loadx4(int32x4_t rsrc, + uint32_t vindex, + uint32_t offset, + bool glc, + bool slc) __asm("llvm.amdgcn.buffer.load.dwordx4"); __device__ void __llvm_amdgcn_buffer_store(float vdata, int32x4_t rsrc, @@ -32,14 +30,14 @@ __device__ void __llvm_amdgcn_buffer_store(float vdata, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store"); -__device__ void __llvm_amdgcn_buffer_storex2(vector_type::MemoryType vdata, +__device__ void __llvm_amdgcn_buffer_storex2(float2_t vdata, int32x4_t rsrc, uint32_t vindex, uint32_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store.dwordx2"); -__device__ void __llvm_amdgcn_buffer_storex4(vector_type::MemoryType vdata, +__device__ void __llvm_amdgcn_buffer_storex4(float4_t vdata, int32x4_t rsrc, uint32_t vindex, uint32_t offset, @@ -106,11 +104,12 @@ __device__ float __buffer_load(const float* p_src_block, } template <> -__device__ vector_type::MemoryType __buffer_load( - const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset) +__device__ float2_t __buffer_load(const float* p_src_block, + uint32_t src_thread_data_offset, + uint32_t src_const_data_offset) { #if 0 - vector_type::MemoryType dst; + float2_t dst; uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float); @@ -132,7 +131,7 @@ __device__ vector_type::MemoryType __buffer_load( return dst; #else - vector_type::MemoryType dst; + float2_t dst; uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float); @@ -153,11 +152,12 @@ __device__ vector_type::MemoryType __buffer_load( } template <> -__device__ vector_type::MemoryType __buffer_load( - const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset) +__device__ float4_t __buffer_load(const float* p_src_block, + uint32_t src_thread_data_offset, + uint32_t src_const_data_offset) { #if 0 - vector_type::MemoryType dst; + float4_t dst; uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float); @@ -179,7 +179,7 @@ __device__ vector_type::MemoryType __buffer_load( return dst; #elif 1 - vector_type::MemoryType dst; + float4_t dst; uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float); @@ -243,7 +243,7 @@ __device__ void __buffer_store(const float& src, } template <> -__device__ void __buffer_store(const vector_type::MemoryType& src, +__device__ void __buffer_store(const float2_t& src, float* p_dst_block, uint32_t dst_thread_data_offset, uint32_t dst_const_data_offset) @@ -286,7 +286,7 @@ __device__ void __buffer_store(const vector_type::MemoryType } template <> -__device__ void __buffer_store(const vector_type::MemoryType& src, +__device__ void __buffer_store(const float4_t& src, float* p_dst_block, uint32_t dst_thread_data_offset, uint32_t dst_const_data_offset) diff --git a/driver/include/conv_common.hpp b/driver/include/conv_common.hpp index 636e22290..f37645df2 100644 --- a/driver/include/conv_common.hpp +++ b/driver/include/conv_common.hpp @@ -1,7 +1,7 @@ #ifndef CONV_COMMON_HPP #define CONV_COMMON_HPP -#include "ConstantTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" // this is ugly, only for 4d 
template diff --git a/driver/include/host_conv.hpp b/driver/include/host_conv.hpp index 9ca7fc10d..880fd5efe 100644 --- a/driver/include/host_conv.hpp +++ b/driver/include/host_conv.hpp @@ -1,7 +1,7 @@ #pragma once #include "tensor.hpp" #include "common_header.hpp" -#include "ConstantTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" // this is ugly, only for 4d template diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index 4319c4f7d..9d47b96f9 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -4,7 +4,7 @@ #include #include #include "config.hpp" -#include "ConstantTensorDescriptor.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" #include "device.hpp" #include "conv_common.hpp" #include "host_conv.hpp" From abe9c0bf7b860fa2eb82283a69e1d417cdd46495 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 3 Oct 2019 19:13:14 -0500 Subject: [PATCH 06/20] change data type of offset arguments of __buffer_load and __buffer_store from uint32_t to index_t --- .../include/utility/amd_intrinsic.hpp | 104 +++++++++--------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/composable_kernel/include/utility/amd_intrinsic.hpp b/composable_kernel/include/utility/amd_intrinsic.hpp index 0927b2c62..a5bbd8782 100644 --- a/composable_kernel/include/utility/amd_intrinsic.hpp +++ b/composable_kernel/include/utility/amd_intrinsic.hpp @@ -6,65 +6,65 @@ namespace ck { __device__ float __llvm_amdgcn_buffer_load(int32x4_t rsrc, - uint32_t vindex, - uint32_t offset, + index_t vindex, + index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load"); __device__ float2_t __llvm_amdgcn_buffer_loadx2(int32x4_t rsrc, - uint32_t vindex, - uint32_t offset, + index_t vindex, + index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.dwordx2"); __device__ float4_t __llvm_amdgcn_buffer_loadx4(int32x4_t rsrc, - uint32_t vindex, - uint32_t offset, + index_t vindex, + index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.dwordx4"); __device__ void __llvm_amdgcn_buffer_store(float vdata, int32x4_t rsrc, - uint32_t vindex, - uint32_t offset, + index_t vindex, + index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store"); __device__ void __llvm_amdgcn_buffer_storex2(float2_t vdata, int32x4_t rsrc, - uint32_t vindex, - uint32_t offset, + index_t vindex, + index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store.dwordx2"); __device__ void __llvm_amdgcn_buffer_storex4(float4_t vdata, int32x4_t rsrc, - uint32_t vindex, - uint32_t offset, + index_t vindex, + index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store.dwordx4"); // buffer_load and buffer_store template -__device__ typename vector_type::MemoryType __buffer_load( - const T* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset); +__device__ typename vector_type::MemoryType +__buffer_load(const T* p_src_block, index_t src_thread_data_offset, index_t src_const_data_offset); template __device__ void __buffer_store(const typename vector_type::MemoryType& src, T* p_dst_block, - uint32_t dst_thread_data_offset, - uint32_t dst_const_data_offset); + index_t dst_thread_data_offset, + index_t dst_const_data_offset); template <> __device__ float __buffer_load(const float* p_src_block, - uint32_t src_thread_data_offset, - uint32_t src_const_data_offset) + index_t src_thread_data_offset, + index_t src_const_data_offset) { #if 0 float dst; - uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); - uint32_t 
src_const_addr_offset = src_const_data_offset * sizeof(float); + index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); + index_t src_const_addr_offset = src_const_data_offset * sizeof(float); int32x4_t src_block_setting{0}; // fill in byte 0 - 1 @@ -85,8 +85,8 @@ __device__ float __buffer_load(const float* p_src_block, #else float dst; - uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); - uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float); + index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); + index_t src_const_addr_offset = src_const_data_offset * sizeof(float); int32x4_t src_block_setting{0}; // fill in byte 0 - 1 @@ -105,14 +105,14 @@ __device__ float __buffer_load(const float* p_src_block, template <> __device__ float2_t __buffer_load(const float* p_src_block, - uint32_t src_thread_data_offset, - uint32_t src_const_data_offset) + index_t src_thread_data_offset, + index_t src_const_data_offset) { #if 0 float2_t dst; - uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); - uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float); + index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); + index_t src_const_addr_offset = src_const_data_offset * sizeof(float); int32x4_t src_block_setting{0}; // fill in byte 0 - 1 @@ -133,8 +133,8 @@ __device__ float2_t __buffer_load(const float* p_src_block, #else float2_t dst; - uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); - uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float); + index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); + index_t src_const_addr_offset = src_const_data_offset * sizeof(float); int32x4_t src_block_setting{0}; // fill in byte 0 - 1 @@ -153,14 +153,14 @@ __device__ float2_t __buffer_load(const float* p_src_block, template <> __device__ float4_t __buffer_load(const float* p_src_block, - uint32_t src_thread_data_offset, - uint32_t src_const_data_offset) + index_t src_thread_data_offset, + index_t src_const_data_offset) { #if 0 float4_t dst; - uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); - uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float); + index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); + index_t src_const_addr_offset = src_const_data_offset * sizeof(float); int32x4_t src_block_setting{0}; // fill in byte 0 - 1 @@ -181,8 +181,8 @@ __device__ float4_t __buffer_load(const float* p_src_block, #elif 1 float4_t dst; - uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); - uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float); + index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); + index_t src_const_addr_offset = src_const_data_offset * sizeof(float); int32x4_t src_block_setting{0}; // fill in byte 0 - 1 @@ -202,12 +202,12 @@ __device__ float4_t __buffer_load(const float* p_src_block, template <> __device__ void __buffer_store(const float& src, float* p_dst_block, - uint32_t dst_thread_data_offset, - uint32_t dst_const_data_offset) + index_t dst_thread_data_offset, + index_t dst_const_data_offset) { #if 0 - uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); - uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); + index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); + index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); int32x4_t 
dst_block_setting{0}; // fill in byte 0 - 1 @@ -226,8 +226,8 @@ __device__ void __buffer_store(const float& src, "v"(dst_thread_addr_offset), "s"(dst_const_addr_offset)); #else - uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); - uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); + index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); + index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); int32x4_t dst_block_setting{0}; // fill in byte 0 - 1 @@ -245,12 +245,12 @@ __device__ void __buffer_store(const float& src, template <> __device__ void __buffer_store(const float2_t& src, float* p_dst_block, - uint32_t dst_thread_data_offset, - uint32_t dst_const_data_offset) + index_t dst_thread_data_offset, + index_t dst_const_data_offset) { #if 0 - uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); - uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); + index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); + index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); int32x4_t dst_block_setting{0}; // fill in byte 0 - 1 @@ -269,8 +269,8 @@ __device__ void __buffer_store(const float2_t& src, "v"(dst_thread_addr_offset), "s"(dst_const_addr_offset)); #else - uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); - uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); + index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); + index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); int32x4_t dst_block_setting{0}; // fill in byte 0 - 1 @@ -288,12 +288,12 @@ __device__ void __buffer_store(const float2_t& src, template <> __device__ void __buffer_store(const float4_t& src, float* p_dst_block, - uint32_t dst_thread_data_offset, - uint32_t dst_const_data_offset) + index_t dst_thread_data_offset, + index_t dst_const_data_offset) { #if 0 - uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); - uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); + index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); + index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); int32x4_t dst_block_setting{0}; // fill in byte 0 - 1 @@ -312,8 +312,8 @@ __device__ void __buffer_store(const float4_t& src, "v"(dst_thread_addr_offset), "s"(dst_const_addr_offset)); #else - uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); - uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); + index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); + index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); int32x4_t dst_block_setting{0}; // fill in byte 0 - 1 From f2a2c583744b0d183285b70c8edeec83f548df5b Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 3 Oct 2019 19:51:03 -0500 Subject: [PATCH 07/20] nvidia build --- CMakeLists.txt | 6 +- .../tensor_operation/threadwise_gemm.hpp | 2 +- .../include/utility/common_header.hpp | 1 + .../{config_amd.hpp.in => config.amd.hpp.in} | 63 +++++++++++++++++++ ...fig_nvidia.hpp.in => config.nvidia.hpp.in} | 32 ++++------ .../include/utility/float_type.nvidia.hpp.in | 51 +++++++++++++++ composable_kernel/include/utility/math.hpp | 62 ------------------ 7 files changed, 132 insertions(+), 85 deletions(-) rename composable_kernel/include/utility/{config_amd.hpp.in => config.amd.hpp.in} (59%) rename composable_kernel/include/utility/{config_nvidia.hpp.in 
=> config.nvidia.hpp.in} (65%) create mode 100644 composable_kernel/include/utility/float_type.nvidia.hpp.in diff --git a/CMakeLists.txt b/CMakeLists.txt index 9798220ca..21e5dc682 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,9 +52,11 @@ include_directories(BEFORE ) if(DEVICE_BACKEND STREQUAL "AMD") - configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/config_amd.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/config.hpp") + configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/config.amd.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/config.hpp") + configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/float_type.amd.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/float_type.hpp") elseif(DEVICE_BACKEND STREQUAL "NVIDIA") - configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/config_nvidia.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/config.hpp") + configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/config.nvidia.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/config.hpp") + configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/float_type.nvidia.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/float_type.hpp") endif() add_subdirectory(driver) diff --git a/composable_kernel/include/tensor_operation/threadwise_gemm.hpp b/composable_kernel/include/tensor_operation/threadwise_gemm.hpp index 0619aaf15..00d81410e 100644 --- a/composable_kernel/include/tensor_operation/threadwise_gemm.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_gemm.hpp @@ -85,7 +85,7 @@ struct ThreadwiseGemmTransANormalBNormalC const index_t cindex = MatrixC::CalculateOffset(m, n); p_c[cindex] += - math::inner_product_with_conversion{}(p_a[aindex], p_b[bindex]); + inner_product_with_conversion{}(p_a[aindex], p_b[bindex]); } } } diff --git a/composable_kernel/include/utility/common_header.hpp b/composable_kernel/include/utility/common_header.hpp index ad6b26735..441eecae9 100644 --- a/composable_kernel/include/utility/common_header.hpp +++ b/composable_kernel/include/utility/common_header.hpp @@ -2,6 +2,7 @@ #define CK_COMMON_HEADER_HPP #include "config.hpp" +#include "float_type.hpp" #include "utility.hpp" #include "integral_constant.hpp" #include "number.hpp" diff --git a/composable_kernel/include/utility/config_amd.hpp.in b/composable_kernel/include/utility/config.amd.hpp.in similarity index 59% rename from composable_kernel/include/utility/config_amd.hpp.in rename to composable_kernel/include/utility/config.amd.hpp.in index fe82ba992..b3349e425 100644 --- a/composable_kernel/include/utility/config_amd.hpp.in +++ b/composable_kernel/include/utility/config.amd.hpp.in @@ -104,5 +104,68 @@ __device__ ushort type_convert::operator()(float x) const return float_to_bfloat16(x); } +template +struct inner_product_with_conversion +{ + static constexpr auto convert = type_convert(); + + __device__ T operator()(float a, float b) const { return convert(a) * convert(b); } + + __device__ T operator()(half2_t a, half2_t b) const + { + const half* p_a_half = reinterpret_cast(&a); + const half* p_b_half = reinterpret_cast(&b); + + T acc = 0; + for(index_t v = 0; v < 2; ++v) + { + acc += convert(p_a_half[v]) * convert(p_b_half[v]); + } + + return acc; + } + +#if CK_DEVICE_BACKEND_AMD + __device__ T operator()(half4_t a, half4_t b) const + { + const half* p_a_half = reinterpret_cast(&a); + const half* 
p_b_half = reinterpret_cast(&b); + + T acc = 0; + for(index_t v = 0; v < 4; ++v) + { + acc += convert(p_a_half[v]) * convert(p_b_half[v]); + } + return acc; + } + + __device__ T operator()(ushort2_t a, ushort2_t b) const + { + const ushort* p_a_bfloat16 = reinterpret_cast(&a); + const ushort* p_b_bfloat16 = reinterpret_cast(&b); + + T acc = 0; + for(index_t v = 0; v < 2; ++v) + { + acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]); + } + + return acc; + } + + __device__ T operator()(ushort4_t a, ushort4_t b) const + { + const ushort* p_a_bfloat16 = reinterpret_cast(&a); + const ushort* p_b_bfloat16 = reinterpret_cast(&b); + + T acc = 0; + for(index_t v = 0; v < 4; ++v) + { + acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]); + } + return acc; + } +#endif +}; } // namespace ck #endif diff --git a/composable_kernel/include/utility/config_nvidia.hpp.in b/composable_kernel/include/utility/config.nvidia.hpp.in similarity index 65% rename from composable_kernel/include/utility/config_nvidia.hpp.in rename to composable_kernel/include/utility/config.nvidia.hpp.in index 2eea4a867..6e9198893 100644 --- a/composable_kernel/include/utility/config_nvidia.hpp.in +++ b/composable_kernel/include/utility/config.nvidia.hpp.in @@ -6,11 +6,22 @@ #include "nvToolsExt.h" #include "helper_cuda.h" +// index type: unsigned or signed #define CK_UNSIGNED_INDEX_TYPE 0 + +// device backend #define CK_DEVICE_BACKEND_NVIDIA 1 -#define CK_USE_AMD_INTRINSIC 0 + +// disable AMD inline asm and intrinsic #define CK_USE_AMD_INLINE_ASM 0 +#define CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM 0 +#define CK_USE_AMD_XDLOPS 0 +#define CK_USE_AMD_XDLOPS_INLINE_ASM 0 +#define CK_USE_AMD_INTRINSIC 0 #define CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC 0 + +// experimental implementation +#define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0 @@ -30,24 +41,5 @@ using index_t = uint32_t; using index_t = int32_t; #endif -// For some reason, CUDA need this definition, otherwise -// compiler won't generate optimal load and store instruction, and -// kernel would produce wrong result, indicating the compiler fail to generate correct -// instruction, -using float2_t = float2; -using float4_t = float4; - -// data type conversion -template -struct type_convert -{ - template - __device__ T operator()(const X& x) const - { - return static_cast(x); - } -}; - } // namespace ck - #endif diff --git a/composable_kernel/include/utility/float_type.nvidia.hpp.in b/composable_kernel/include/utility/float_type.nvidia.hpp.in new file mode 100644 index 000000000..fbb93a437 --- /dev/null +++ b/composable_kernel/include/utility/float_type.nvidia.hpp.in @@ -0,0 +1,51 @@ +#ifndef CK_FLOAT_TYPE_NVIDIA_HPP +#define CK_FLOAT_TYPE_NVIDIA_HPP + +namespace ck { + +// For some reason, CUDA need this definition, otherwise +// compiler won't generate optimal load and store instruction, and +// kernel would produce wrong result, indicating the compiler fail to generate correct +// instruction, +// float +using float2_t = float2; +using float4_t = float4; + +// float16 +using half2_t = half2; + +// data type conversion +template +struct type_convert +{ + template + __device__ T operator()(const X& x) const + { + return static_cast(x); + } +}; + +template +struct inner_product_with_conversion +{ + static 
constexpr auto convert = type_convert(); + + __device__ T operator()(float a, float b) const { return convert(a) * convert(b); } + + __device__ T operator()(half2_t a, half2_t b) const + { + const half* p_a_half = reinterpret_cast(&a); + const half* p_b_half = reinterpret_cast(&b); + + T acc = 0; + for(index_t v = 0; v < 2; ++v) + { + acc += convert(p_a_half[v]) * convert(p_b_half[v]); + } + + return acc; + } +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/math.hpp b/composable_kernel/include/utility/math.hpp index f6c41cc52..ba70e7ab2 100644 --- a/composable_kernel/include/utility/math.hpp +++ b/composable_kernel/include/utility/math.hpp @@ -117,68 +117,6 @@ struct less __host__ __device__ constexpr bool operator()(T x, T y) const { return x < y; } }; -template -struct inner_product_with_conversion -{ - static constexpr auto convert = type_convert(); - - __device__ T operator()(float a, float b) const { return convert(a) * convert(b); } - - __device__ T operator()(half2_t a, half2_t b) const - { - const half* p_a_half = reinterpret_cast(&a); - const half* p_b_half = reinterpret_cast(&b); - - T acc = 0; - for(index_t v = 0; v < 2; ++v) - { - acc += convert(p_a_half[v]) * convert(p_b_half[v]); - } - - return acc; - } - - __device__ T operator()(half4_t a, half4_t b) const - { - const half* p_a_half = reinterpret_cast(&a); - const half* p_b_half = reinterpret_cast(&b); - - T acc = 0; - for(index_t v = 0; v < 4; ++v) - { - acc += convert(p_a_half[v]) * convert(p_b_half[v]); - } - return acc; - } - - __device__ T operator()(ushort2_t a, ushort2_t b) const - { - const ushort* p_a_bfloat16 = reinterpret_cast(&a); - const ushort* p_b_bfloat16 = reinterpret_cast(&b); - - T acc = 0; - for(index_t v = 0; v < 2; ++v) - { - acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]); - } - - return acc; - } - - __device__ T operator()(ushort4_t a, ushort4_t b) const - { - const ushort* p_a_bfloat16 = reinterpret_cast(&a); - const ushort* p_b_bfloat16 = reinterpret_cast(&b); - - T acc = 0; - for(index_t v = 0; v < 4; ++v) - { - acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]); - } - return acc; - } -}; - } // namespace math } // namspace ck From 434e4f2596afb307cf6e506898eac919c128090b Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 3 Oct 2019 20:08:05 -0500 Subject: [PATCH 08/20] amd build, reorganized files --- CMakeLists.txt | 1 + .../include/utility/config.amd.hpp.in | 104 +---------------- .../include/utility/float_type.amd.hpp.in | 110 ++++++++++++++++++ .../include}/bfloat16_dev.hpp | 0 4 files changed, 112 insertions(+), 103 deletions(-) create mode 100644 composable_kernel/include/utility/float_type.amd.hpp.in rename {composable_kernel/include/utility => external/include}/bfloat16_dev.hpp (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 21e5dc682..20fc8028f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,6 +47,7 @@ include_directories(BEFORE ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation ${PROJECT_SOURCE_DIR}/composable_kernel/include/kernel_algorithm + ${PROJECT_SOURCE_DIR}/external/include ${PROJECT_SOURCE_DIR}/driver/include ${PROJECT_BINARY_DIR}/composable_kernel/include/utility ) diff --git a/composable_kernel/include/utility/config.amd.hpp.in b/composable_kernel/include/utility/config.amd.hpp.in index b3349e425..1da362b81 100644 --- a/composable_kernel/include/utility/config.amd.hpp.in +++ b/composable_kernel/include/utility/config.amd.hpp.in 
@@ -62,110 +62,8 @@ using index_t = uint32_t; using index_t = int32_t; #endif -// For some reason, HIP compiler need this definition to generate optimal ISA -// float -typedef float float2_t __attribute__((ext_vector_type(2))); -typedef float float4_t __attribute__((ext_vector_type(4))); -typedef float float32_t __attribute__((ext_vector_type(32))); - -// float16 -typedef _Float16 half2_t __attribute__((ext_vector_type(2))); -typedef _Float16 half4_t __attribute__((ext_vector_type(4))); - -// bfloat16 -typedef ushort ushort2_t __attribute__((ext_vector_type(2))); -typedef ushort ushort4_t __attribute__((ext_vector_type(4))); - -// int +// int32x4_t use by buffer_load and buffer_store llvm intrinsic typedef int32_t int32x4_t __attribute__((ext_vector_type(4))); -// data type conversion -template -struct type_convert -{ - template - __device__ T operator()(X x) const - { - return static_cast(x); - } -}; - -template <> -template <> -__device__ float type_convert::operator()(ushort x) const -{ - return bfloat16_to_float(x); -} - -template <> -template <> -__device__ ushort type_convert::operator()(float x) const -{ - return float_to_bfloat16(x); -} - -template -struct inner_product_with_conversion -{ - static constexpr auto convert = type_convert(); - - __device__ T operator()(float a, float b) const { return convert(a) * convert(b); } - - __device__ T operator()(half2_t a, half2_t b) const - { - const half* p_a_half = reinterpret_cast(&a); - const half* p_b_half = reinterpret_cast(&b); - - T acc = 0; - for(index_t v = 0; v < 2; ++v) - { - acc += convert(p_a_half[v]) * convert(p_b_half[v]); - } - - return acc; - } - -#if CK_DEVICE_BACKEND_AMD - __device__ T operator()(half4_t a, half4_t b) const - { - const half* p_a_half = reinterpret_cast(&a); - const half* p_b_half = reinterpret_cast(&b); - - T acc = 0; - for(index_t v = 0; v < 4; ++v) - { - acc += convert(p_a_half[v]) * convert(p_b_half[v]); - } - return acc; - } - - __device__ T operator()(ushort2_t a, ushort2_t b) const - { - const ushort* p_a_bfloat16 = reinterpret_cast(&a); - const ushort* p_b_bfloat16 = reinterpret_cast(&b); - - T acc = 0; - for(index_t v = 0; v < 2; ++v) - { - acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]); - } - - return acc; - } - - __device__ T operator()(ushort4_t a, ushort4_t b) const - { - const ushort* p_a_bfloat16 = reinterpret_cast(&a); - const ushort* p_b_bfloat16 = reinterpret_cast(&b); - - T acc = 0; - for(index_t v = 0; v < 4; ++v) - { - acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]); - } - return acc; - } -#endif -}; } // namespace ck #endif diff --git a/composable_kernel/include/utility/float_type.amd.hpp.in b/composable_kernel/include/utility/float_type.amd.hpp.in new file mode 100644 index 000000000..337d12fa3 --- /dev/null +++ b/composable_kernel/include/utility/float_type.amd.hpp.in @@ -0,0 +1,110 @@ +#ifndef CK_FLOAT_TYPE_AMD_HPP +#define CK_FLOAT_TYPE_AMD_HPP + +#include "bfloat16_dev.hpp" + +namespace ck { + +// For some reason, HIP compiler need this definition to generate optimal ISA +// float +typedef float float2_t __attribute__((ext_vector_type(2))); +typedef float float4_t __attribute__((ext_vector_type(4))); +typedef float float32_t __attribute__((ext_vector_type(32))); + +// float16 +typedef _Float16 half2_t __attribute__((ext_vector_type(2))); +typedef _Float16 half4_t __attribute__((ext_vector_type(4))); + +// bfloat16 +typedef ushort ushort2_t __attribute__((ext_vector_type(2))); +typedef ushort ushort4_t __attribute__((ext_vector_type(4))); + +// data type 
conversion +template +struct type_convert +{ + template + __device__ T operator()(X x) const + { + return static_cast(x); + } +}; + +template <> +template <> +__device__ float type_convert::operator()(ushort x) const +{ + return bfloat16_to_float(x); +} + +template <> +template <> +__device__ ushort type_convert::operator()(float x) const +{ + return float_to_bfloat16(x); +} + +template +struct inner_product_with_conversion +{ + static constexpr auto convert = type_convert(); + + __device__ T operator()(float a, float b) const { return convert(a) * convert(b); } + + __device__ T operator()(half2_t a, half2_t b) const + { + const half* p_a_half = reinterpret_cast(&a); + const half* p_b_half = reinterpret_cast(&b); + + T acc = 0; + for(index_t v = 0; v < 2; ++v) + { + acc += convert(p_a_half[v]) * convert(p_b_half[v]); + } + + return acc; + } + + __device__ T operator()(half4_t a, half4_t b) const + { + const half* p_a_half = reinterpret_cast(&a); + const half* p_b_half = reinterpret_cast(&b); + + T acc = 0; + for(index_t v = 0; v < 4; ++v) + { + acc += convert(p_a_half[v]) * convert(p_b_half[v]); + } + return acc; + } + + __device__ T operator()(ushort2_t a, ushort2_t b) const + { + const ushort* p_a_bfloat16 = reinterpret_cast(&a); + const ushort* p_b_bfloat16 = reinterpret_cast(&b); + + T acc = 0; + for(index_t v = 0; v < 2; ++v) + { + acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]); + } + + return acc; + } + + __device__ T operator()(ushort4_t a, ushort4_t b) const + { + const ushort* p_a_bfloat16 = reinterpret_cast(&a); + const ushort* p_b_bfloat16 = reinterpret_cast(&b); + + T acc = 0; + for(index_t v = 0; v < 4; ++v) + { + acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]); + } + return acc; + } +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/bfloat16_dev.hpp b/external/include/bfloat16_dev.hpp similarity index 100% rename from composable_kernel/include/utility/bfloat16_dev.hpp rename to external/include/bfloat16_dev.hpp From e080041bd317b80f3be5eb522dac52edb6454917 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 3 Oct 2019 23:47:44 -0500 Subject: [PATCH 09/20] mark deprecated code --- ..._v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 276 +++++++++------- ...cyx_nkhw_lds_double_buffer_deprecated.hpp} | 282 +++++++--------- ..._v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp | 300 ++++++++--------- ...cyx_nkhw_lds_double_buffer_deprecated.hpp} | 302 +++++++++--------- .../include/utility/float_type.amd.hpp.in | 2 - ...tion_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp | 109 ++++--- ...t_gemm_v4r1_nchw_kcyx_nkhw_deprecated.hpp} | 131 ++++---- ...tion_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp | 14 +- ...t_gemm_v4r4_nchw_kcyx_nkhw_deprecated.hpp} | 36 +-- driver/src/driver.cpp | 56 ++-- 10 files changed, 753 insertions(+), 755 deletions(-) rename composable_kernel/include/kernel_algorithm/{gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp => gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp} (55%) rename composable_kernel/include/kernel_algorithm/{gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp => gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp} (56%) rename driver/include/{device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp => device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated.hpp} (76%) rename driver/include/{device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp => 
device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_deprecated.hpp} (92%) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp index 1b6c87717..724a042c9 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -2,24 +2,26 @@ #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R1_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_HPP #include "common_header.hpp" -#include "ConstantTensorDescriptor_deprecated.hpp" -#include "ConstantMergedTensorDescriptor_deprecated.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" #include "ConstantMatrixDescriptor.hpp" -#include "blockwise_generic_tensor_slice_copy_deprecated.hpp" +#include "blockwise_generic_tensor_slice_copy.hpp" +#include "threadwise_generic_tensor_slice_copy.hpp" #include "blockwise_gemm.hpp" -#include "threadwise_generic_tensor_slice_copy_deprecated.hpp" namespace ck { // define B = merge(N0, Ho, Wo) template struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer @@ -67,20 +69,21 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - constexpr auto I5 = Number<5>{}; constexpr auto True = integral_constant{}; - constexpr auto in_n_c_h_w_global_desc = InGlobalDesc{}; - constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{}; - constexpr auto out_n_k_h_w_global_desc = OutGlobalDesc{}; + constexpr auto in_n_c_hi_wi_global_desc = InGlobalDesc{}; + constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{}; + constexpr auto out_n_k_ho_wo_global_desc = OutGlobalDesc{}; - constexpr index_t N = in_n_c_h_w_global_desc.GetLength(I0); - constexpr index_t C = in_n_c_h_w_global_desc.GetLength(I1); + constexpr index_t N = in_n_c_hi_wi_global_desc.GetLength(I0); + constexpr index_t C = in_n_c_hi_wi_global_desc.GetLength(I1); + constexpr index_t Hi = in_n_c_hi_wi_global_desc.GetLength(I2); + constexpr index_t Wi = in_n_c_hi_wi_global_desc.GetLength(I3); - constexpr index_t K = out_n_k_h_w_global_desc.GetLength(I1); - constexpr index_t Ho = out_n_k_h_w_global_desc.GetLength(I2); - constexpr index_t Wo = out_n_k_h_w_global_desc.GetLength(I3); + constexpr index_t K = out_n_k_ho_wo_global_desc.GetLength(I1); + constexpr index_t Ho = out_n_k_ho_wo_global_desc.GetLength(I2); + constexpr index_t Wo = out_n_k_ho_wo_global_desc.GetLength(I3); constexpr index_t Y = wei_k_c_y_x_global_desc.GetLength(I2); constexpr index_t X = wei_k_c_y_x_global_desc.GetLength(I3); @@ -113,39 +116,43 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer constexpr index_t BBlockWork = B / BPerBlock; constexpr auto block_work_desc = - make_ConstantTensorDescriptor_packed(Sequence{}); + make_cluster_descriptor(Sequence{}); - const auto block_work_multi_id = - block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id()); + const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id()); - const index_t k_block_data_on_global = block_work_multi_id[0] * KPerBlock; - const index_t b_block_data_on_global = block_work_multi_id[1] * BPerBlock; + const index_t k_block_data_on_global = 
block_work_id[0] * KPerBlock; + const index_t b_block_data_on_global = block_work_id[1] * BPerBlock; // input tensor - // tensor descriptor in device memory [N0, N1, N2, Ho, Wo] - constexpr auto in_n0_n1_n2_h_w_global_desc = - in_n_c_h_w_global_desc.StridedSlice(I2, Number{}, Number{}) - .StridedSlice(I3, Number{}, Number{}) - .Fold(I0, Number{}, Number{}) - .Extract(Sequence<0, 1, 2, 4, 5>{}); - - // batch descritpor for device memory - constexpr auto in_c_y_x_global_desc = - in_n_c_h_w_global_desc.StridedSlice(I2, Number{}, Number{}) - .StridedSlice(I3, Number{}, Number{}) - .Extract(Sequence<1, 2, 3>{}); - - // merged tensor descriptor in device memory [E, N1, B, N2], src of blockwise copy - constexpr auto in_e_n1_b_n2_global_merged_desc = make_ConstantMergedTensorDescriptor( - in_c_y_x_global_desc.Embed(in_n0_n1_n2_h_w_global_desc), - Sequence<0, 1, 2>{}, - Sequence<4>{}, - Sequence<3, 6, 7>{}, - Sequence<5>{}); + // global memory + constexpr auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor( + in_n_c_hi_wi_global_desc, + make_tuple( + PassThrough{}, PassThrough{}, Pad, LeftPads, RightPads>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{})); + + constexpr auto in_n0_n1_n2_c_y_ho_x_wo_global_desc = transform_tensor_descriptor( + in_n_c_hip_wip_global_desc, + make_tuple(UnMerge>{}, + PassThrough{}, + Embed, Sequence>{}, + Embed, Sequence>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}, Sequence<4, 5>{}, Sequence<6, 7>{})); + + constexpr auto in_e_n1_b_n2_global_desc = transform_tensor_descriptor( + in_n0_n1_n2_c_y_ho_x_wo_global_desc, + make_tuple(Merge>{}, + PassThrough{}, + Merge>{}, + PassThrough{}), + make_tuple(Sequence<3, 4, 6>{}, Sequence<1>{}, Sequence<0, 5, 7>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); // memory layout descriptor in LDS [E, N1, B, N2], dst of blockwise copy // be careful of LDS alignment - constexpr auto in_e_n1_b_n2_block_desc = make_ConstantTensorDescriptor_aligned( + constexpr auto in_e_n1_b_n2_block_desc = make_native_tensor_descriptor_aligned( Sequence{}, Number{}); // this check is ad-hoc @@ -157,49 +164,56 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer // input blockwise copy // slice a merged tensor, reorder and copy to a normal tensor // this copy operator already has blockwise offset built-in - auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v2_deprecated< - BlockSize, - decltype(in_e_n1_b_n2_global_merged_desc), - decltype(in_e_n1_b_n2_block_desc), - decltype(in_e_n1_b_n2_block_desc.GetLengths()), - InBlockCopySubLengths_E_N1_B_N2, - InBlockCopyClusterLengths_E_N1_B_N2, - InBlockCopyThreadClusterArrangeOrder, - InBlockCopySrcAccessOrder, - InBlockCopyDstAccessOrder, - 2, - 3, - InBlockCopySrcDataPerRead_B, - InBlockCopyDstDataPerWrite_N2>({0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0}); + auto blockwise_in_copy = + BlockwiseGenericTensorSliceCopy_v4( + {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0}); // weight tensor // tensor descriptor in device memory, src of blockwise copy - constexpr auto wei_e_k_global_desc = - wei_k_c_y_x_global_desc.Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{}); + constexpr auto wei_e_k_global_desc = reorder_tensor_descriptor_given_upper2lower( + unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3), Sequence<1, 0>{}); // tensor descriptor in LDS, dst of 
blockwise copy // be careful of LDS alignment - constexpr auto wei_e_k_block_desc = make_ConstantTensorDescriptor_aligned( + constexpr auto wei_e_k_block_desc = make_native_tensor_descriptor_aligned( Sequence{}, Number{}); + // this check is ad-hoc + // TODO: need to properly implement tensor descriptor with multiple alignment + // requirements + static_assert(wei_e_k_block_desc.GetStride(I0) % GemmDataPerReadA == 0, + "GemmDataPerReadA alignment requirement is not satisfied"); + // operator for blockwise copy of weight into LDS // slice a tensor, and copy it into another tensor // this copy operator already have blockwise offset built-in auto blockwise_wei_copy = - BlockwiseGenericTensorSliceCopy_v2_deprecated( + BlockwiseGenericTensorSliceCopy_v4( {0, k_block_data_on_global}, {0, 0}); // GEMM definition @@ -210,8 +224,11 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer // register constexpr auto a_e_k_block_mtx_desc = make_ConstantMatrixDescriptor(wei_e_k_block_desc); - constexpr auto b_e_n1bn2_block_mtx_desc = - make_ConstantMatrixDescriptor(in_e_n1_b_n2_block_desc.Unfold(I1, I3)); + constexpr auto b_e_n1bn2_block_mtx_desc = make_ConstantMatrixDescriptor( + in_e_n1_b_n2_block_desc.GetLength(I0), + in_e_n1_b_n2_block_desc.GetLength(I1) * in_e_n1_b_n2_block_desc.GetLength(I2) * + in_e_n1_b_n2_block_desc.GetLength(I3), + in_e_n1_b_n2_block_desc.GetStride(I0)); // sanity check static_assert(KPerBlock % (GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster) == @@ -223,14 +240,14 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer // c_thread_mtx definition: this is a mess // TODO:: more elegent way of defining c_thread_mtx - constexpr auto c_k0k1_n1n2_thread_mtx_desc = make_ConstantMatrixDescriptor_packed( - Number{}, Number{}); + constexpr auto c_k0k2_n1n2_thread_mtx_desc = make_ConstantMatrixDescriptor_packed( + Number{}, Number{}); const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2< BlockSize, decltype(a_e_k_block_mtx_desc), decltype(b_e_n1bn2_block_mtx_desc), - decltype(c_k0k1_n1n2_thread_mtx_desc), + decltype(c_k0k2_n1n2_thread_mtx_desc), GemmMPerThreadSubC, GemmNPerThreadSubC, GemmMLevel0Cluster, @@ -257,10 +274,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer __shared__ Float p_wei_block_double[2 * wei_block_space]; // register allocation for output - Float p_out_thread[c_k0k1_n1n2_thread_mtx_desc.GetElementSpace()]; + Float p_out_thread[c_k0k2_n1n2_thread_mtx_desc.GetElementSpace()]; // zero out threadwise output - threadwise_matrix_set_zero(c_k0k1_n1n2_thread_mtx_desc, p_out_thread); + threadwise_matrix_set_zero(c_k0k2_n1n2_thread_mtx_desc, p_out_thread); // LDS double buffer: preload data into LDS { @@ -350,24 +367,38 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer // copy output: register to global memory { constexpr index_t K1 = GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster; + constexpr index_t K0 = K / K1; // define tensor descriptor for threadwise copy // output memory layout descriptor in register, src of threadwise copy - constexpr auto out_k0_k1_n1_b_n2_thread_mem_desc = make_ConstantTensorDescriptor_packed( + constexpr auto out_k0_k1_n1_b_n2_thread_desc = make_native_tensor_descriptor_packed( Sequence{}); // output memory layout descriptor in device memory - constexpr auto out_n0_n1_n2_k0_k1_h_w_global_mem_desc = - out_n_k_h_w_global_desc.Fold(I1, Number{}).Fold(I0, Number{}, Number{}); + 
constexpr auto out_n0_n1_n2_k0_k1_ho_wo_global_desc = transform_tensor_descriptor( + out_n_k_ho_wo_global_desc, + make_tuple(UnMerge>{}, + UnMerge>{}, + PassThrough{}, + PassThrough{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}, Sequence<6>{})); // output merged global tensor descriptor, dst of threadwise copy - constexpr auto out_k0_k1_n1_b_n2_global_merged_desc = - make_ConstantMergedTensorDescriptor(out_n0_n1_n2_k0_k1_h_w_global_mem_desc, - Sequence<3>{}, - Sequence<4>{}, - Sequence<1>{}, - Sequence<0, 5, 6>{}, - Sequence<2>{}); + constexpr auto out_k0_k1_n1_b_n2_global_desc = transform_tensor_descriptor( + out_n0_n1_n2_k0_k1_ho_wo_global_desc, + make_tuple(PassThrough{}, + PassThrough{}, + PassThrough{}, + Merge>{}, + PassThrough{}), + make_tuple(Sequence<3>{}, + Sequence<4>{}, + Sequence<1>{}, + Sequence<0, 5, 6>{}, + Sequence<2>{}), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); // calculate origin of thread output tensor on global memory // blockwise GEMM c matrix starting index @@ -380,26 +411,31 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer const index_t b_thread_data_on_global = b_block_data_on_global + c_thread_mtx_on_block.col / N2; - ThreadwiseGenericTensorSliceCopy_v2r1_deprecated< - decltype(out_k0_k1_n1_b_n2_thread_mem_desc), - decltype(out_k0_k1_n1_b_n2_global_merged_desc), - decltype(out_k0_k1_n1_b_n2_thread_mem_desc.GetLengths()), - arithmetic_sequence_gen<0, 5, 1>::type, - arithmetic_sequence_gen<0, 5, 1>::type, - 3, - 3, - 1, - 1>({0, 0, 0, 0, 0}, - {k_thread_data_on_global / K1, - k_thread_data_on_global % K1, - 0, - b_thread_data_on_global, - 0}) - .template Run( - p_out_thread, p_out_global); + ThreadwiseGenericTensorSliceCopy_v4r2::type, + 3, + 1, + 1>({0, 0, 0, 0, 0}, + {k_thread_data_on_global / K1, + k_thread_data_on_global % K1, + 0, + b_thread_data_on_global, + 0}) +#if 1 + .template Run +#else // tweaking + .template Run_optimized_dst_address_calculation +#endif + (p_out_thread, p_out_global); } } }; } // namespace ck -#endif // CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R1_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_HPP +#endif diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp similarity index 55% rename from composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp rename to composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp index d5d1e496b..267e8e0a6 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp @@ -1,27 +1,25 @@ -#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R1_NCHW_KCYX_NKHW_PADDED_LDS_DOUBLE_BUFFER_HPP -#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R1_NCHW_KCYX_NKHW_PADDED_LDS_DOUBLE_BUFFER_HPP +#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R1_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_DEPRECATED_HPP +#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R1_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_DEPRECATED_HPP 
#include "common_header.hpp" -#include "tensor_descriptor.hpp" -#include "tensor_descriptor_helper.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" +#include "ConstantMergedTensorDescriptor_deprecated.hpp" #include "ConstantMatrixDescriptor.hpp" -#include "blockwise_generic_tensor_slice_copy.hpp" -#include "threadwise_generic_tensor_slice_copy.hpp" +#include "blockwise_generic_tensor_slice_copy_deprecated.hpp" #include "blockwise_gemm.hpp" +#include "threadwise_generic_tensor_slice_copy_deprecated.hpp" namespace ck { // define B = merge(N0, Ho, Wo) template -struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer +struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated { __device__ void Run(const Float* const __restrict__ p_in_global, const Float* const __restrict__ p_wei_global, @@ -69,21 +67,20 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; + constexpr auto I5 = Number<5>{}; constexpr auto True = integral_constant{}; - constexpr auto in_n_c_hi_wi_global_desc = InGlobalDesc{}; - constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{}; - constexpr auto out_n_k_ho_wo_global_desc = OutGlobalDesc{}; + constexpr auto in_n_c_h_w_global_desc = InGlobalDesc{}; + constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{}; + constexpr auto out_n_k_h_w_global_desc = OutGlobalDesc{}; - constexpr index_t N = in_n_c_hi_wi_global_desc.GetLength(I0); - constexpr index_t C = in_n_c_hi_wi_global_desc.GetLength(I1); - constexpr index_t Hi = in_n_c_hi_wi_global_desc.GetLength(I2); - constexpr index_t Wi = in_n_c_hi_wi_global_desc.GetLength(I3); + constexpr index_t N = in_n_c_h_w_global_desc.GetLength(I0); + constexpr index_t C = in_n_c_h_w_global_desc.GetLength(I1); - constexpr index_t K = out_n_k_ho_wo_global_desc.GetLength(I1); - constexpr index_t Ho = out_n_k_ho_wo_global_desc.GetLength(I2); - constexpr index_t Wo = out_n_k_ho_wo_global_desc.GetLength(I3); + constexpr index_t K = out_n_k_h_w_global_desc.GetLength(I1); + constexpr index_t Ho = out_n_k_h_w_global_desc.GetLength(I2); + constexpr index_t Wo = out_n_k_h_w_global_desc.GetLength(I3); constexpr index_t Y = wei_k_c_y_x_global_desc.GetLength(I2); constexpr index_t X = wei_k_c_y_x_global_desc.GetLength(I3); @@ -116,43 +113,39 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf constexpr index_t BBlockWork = B / BPerBlock; constexpr auto block_work_desc = - make_cluster_descriptor(Sequence{}); + make_ConstantTensorDescriptor_packed(Sequence{}); - const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id()); + const auto block_work_multi_id = + block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id()); - const index_t k_block_data_on_global = block_work_id[0] * KPerBlock; - const index_t b_block_data_on_global = block_work_id[1] * BPerBlock; + const index_t k_block_data_on_global = block_work_multi_id[0] * KPerBlock; + const index_t b_block_data_on_global = block_work_multi_id[1] * BPerBlock; // input tensor - // global memory - constexpr auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor( - in_n_c_hi_wi_global_desc, - make_tuple( - PassThrough{}, PassThrough{}, Pad, LeftPads, RightPads>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{})); - - constexpr auto in_n0_n1_n2_c_y_ho_x_wo_global_desc = 
transform_tensor_descriptor( - in_n_c_hip_wip_global_desc, - make_tuple(UnMerge>{}, - PassThrough{}, - Embed, Sequence>{}, - Embed, Sequence>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), - make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}, Sequence<4, 5>{}, Sequence<6, 7>{})); - - constexpr auto in_e_n1_b_n2_global_desc = transform_tensor_descriptor( - in_n0_n1_n2_c_y_ho_x_wo_global_desc, - make_tuple(Merge>{}, - PassThrough{}, - Merge>{}, - PassThrough{}), - make_tuple(Sequence<3, 4, 6>{}, Sequence<1>{}, Sequence<0, 5, 7>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + // tensor descriptor in device memory [N0, N1, N2, Ho, Wo] + constexpr auto in_n0_n1_n2_h_w_global_desc = + in_n_c_h_w_global_desc.StridedSlice(I2, Number{}, Number{}) + .StridedSlice(I3, Number{}, Number{}) + .Fold(I0, Number{}, Number{}) + .Extract(Sequence<0, 1, 2, 4, 5>{}); + + // batch descritpor for device memory + constexpr auto in_c_y_x_global_desc = + in_n_c_h_w_global_desc.StridedSlice(I2, Number{}, Number{}) + .StridedSlice(I3, Number{}, Number{}) + .Extract(Sequence<1, 2, 3>{}); + + // merged tensor descriptor in device memory [E, N1, B, N2], src of blockwise copy + constexpr auto in_e_n1_b_n2_global_merged_desc = make_ConstantMergedTensorDescriptor( + in_c_y_x_global_desc.Embed(in_n0_n1_n2_h_w_global_desc), + Sequence<0, 1, 2>{}, + Sequence<4>{}, + Sequence<3, 6, 7>{}, + Sequence<5>{}); // memory layout descriptor in LDS [E, N1, B, N2], dst of blockwise copy // be careful of LDS alignment - constexpr auto in_e_n1_b_n2_block_desc = make_native_tensor_descriptor_aligned( + constexpr auto in_e_n1_b_n2_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); // this check is ad-hoc @@ -164,56 +157,49 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf // input blockwise copy // slice a merged tensor, reorder and copy to a normal tensor // this copy operator already has blockwise offset built-in - auto blockwise_in_copy = - BlockwiseGenericTensorSliceCopy_v4( - {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0}); + auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v2_deprecated< + BlockSize, + decltype(in_e_n1_b_n2_global_merged_desc), + decltype(in_e_n1_b_n2_block_desc), + decltype(in_e_n1_b_n2_block_desc.GetLengths()), + InBlockCopySubLengths_E_N1_B_N2, + InBlockCopyClusterLengths_E_N1_B_N2, + InBlockCopyThreadClusterArrangeOrder, + InBlockCopySrcAccessOrder, + InBlockCopyDstAccessOrder, + 2, + 3, + InBlockCopySrcDataPerRead_B, + InBlockCopyDstDataPerWrite_N2>({0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0}); // weight tensor // tensor descriptor in device memory, src of blockwise copy - constexpr auto wei_e_k_global_desc = reorder_tensor_descriptor_given_upper2lower( - unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3), Sequence<1, 0>{}); + constexpr auto wei_e_k_global_desc = + wei_k_c_y_x_global_desc.Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{}); // tensor descriptor in LDS, dst of blockwise copy // be careful of LDS alignment - constexpr auto wei_e_k_block_desc = make_native_tensor_descriptor_aligned( + constexpr auto wei_e_k_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); - // this check is ad-hoc - // TODO: need to properly implement tensor descriptor with multiple alignment - // requirements - static_assert(wei_e_k_block_desc.GetStride(I0) % GemmDataPerReadA == 0, - "GemmDataPerReadA alignment requirement is not satisfied"); - // operator 
for blockwise copy of weight into LDS // slice a tensor, and copy it into another tensor // this copy operator already have blockwise offset built-in auto blockwise_wei_copy = - BlockwiseGenericTensorSliceCopy_v4( + BlockwiseGenericTensorSliceCopy_v2_deprecated( {0, k_block_data_on_global}, {0, 0}); // GEMM definition @@ -224,11 +210,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf // register constexpr auto a_e_k_block_mtx_desc = make_ConstantMatrixDescriptor(wei_e_k_block_desc); - constexpr auto b_e_n1bn2_block_mtx_desc = make_ConstantMatrixDescriptor( - in_e_n1_b_n2_block_desc.GetLength(I0), - in_e_n1_b_n2_block_desc.GetLength(I1) * in_e_n1_b_n2_block_desc.GetLength(I2) * - in_e_n1_b_n2_block_desc.GetLength(I3), - in_e_n1_b_n2_block_desc.GetStride(I0)); + constexpr auto b_e_n1bn2_block_mtx_desc = + make_ConstantMatrixDescriptor(in_e_n1_b_n2_block_desc.Unfold(I1, I3)); // sanity check static_assert(KPerBlock % (GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster) == @@ -240,14 +223,14 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf // c_thread_mtx definition: this is a mess // TODO:: more elegent way of defining c_thread_mtx - constexpr auto c_k0k2_n1n2_thread_mtx_desc = make_ConstantMatrixDescriptor_packed( - Number{}, Number{}); + constexpr auto c_k0k1_n1n2_thread_mtx_desc = make_ConstantMatrixDescriptor_packed( + Number{}, Number{}); const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2< BlockSize, decltype(a_e_k_block_mtx_desc), decltype(b_e_n1bn2_block_mtx_desc), - decltype(c_k0k2_n1n2_thread_mtx_desc), + decltype(c_k0k1_n1n2_thread_mtx_desc), GemmMPerThreadSubC, GemmNPerThreadSubC, GemmMLevel0Cluster, @@ -274,10 +257,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf __shared__ Float p_wei_block_double[2 * wei_block_space]; // register allocation for output - Float p_out_thread[c_k0k2_n1n2_thread_mtx_desc.GetElementSpace()]; + Float p_out_thread[c_k0k1_n1n2_thread_mtx_desc.GetElementSpace()]; // zero out threadwise output - threadwise_matrix_set_zero(c_k0k2_n1n2_thread_mtx_desc, p_out_thread); + threadwise_matrix_set_zero(c_k0k1_n1n2_thread_mtx_desc, p_out_thread); // LDS double buffer: preload data into LDS { @@ -367,38 +350,24 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf // copy output: register to global memory { constexpr index_t K1 = GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster; - constexpr index_t K0 = K / K1; // define tensor descriptor for threadwise copy // output memory layout descriptor in register, src of threadwise copy - constexpr auto out_k0_k1_n1_b_n2_thread_desc = make_native_tensor_descriptor_packed( + constexpr auto out_k0_k1_n1_b_n2_thread_mem_desc = make_ConstantTensorDescriptor_packed( Sequence{}); // output memory layout descriptor in device memory - constexpr auto out_n0_n1_n2_k0_k1_ho_wo_global_desc = transform_tensor_descriptor( - out_n_k_ho_wo_global_desc, - make_tuple(UnMerge>{}, - UnMerge>{}, - PassThrough{}, - PassThrough{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), - make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}, Sequence<6>{})); + constexpr auto out_n0_n1_n2_k0_k1_h_w_global_mem_desc = + out_n_k_h_w_global_desc.Fold(I1, Number{}).Fold(I0, Number{}, Number{}); // output merged global tensor descriptor, dst of threadwise copy - constexpr auto out_k0_k1_n1_b_n2_global_desc = transform_tensor_descriptor( 
- out_n0_n1_n2_k0_k1_ho_wo_global_desc, - make_tuple(PassThrough{}, - PassThrough{}, - PassThrough{}, - Merge>{}, - PassThrough{}), - make_tuple(Sequence<3>{}, - Sequence<4>{}, - Sequence<1>{}, - Sequence<0, 5, 6>{}, - Sequence<2>{}), - make_tuple( - Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); + constexpr auto out_k0_k1_n1_b_n2_global_merged_desc = + make_ConstantMergedTensorDescriptor(out_n0_n1_n2_k0_k1_h_w_global_mem_desc, + Sequence<3>{}, + Sequence<4>{}, + Sequence<1>{}, + Sequence<0, 5, 6>{}, + Sequence<2>{}); // calculate origin of thread output tensor on global memory // blockwise GEMM c matrix starting index @@ -411,31 +380,26 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf const index_t b_thread_data_on_global = b_block_data_on_global + c_thread_mtx_on_block.col / N2; - ThreadwiseGenericTensorSliceCopy_v4r2::type, - 3, - 1, - 1>({0, 0, 0, 0, 0}, - {k_thread_data_on_global / K1, - k_thread_data_on_global % K1, - 0, - b_thread_data_on_global, - 0}) -#if 1 - .template Run -#else // tweaking - .template Run_optimized_dst_address_calculation -#endif - (p_out_thread, p_out_global); + ThreadwiseGenericTensorSliceCopy_v2r1_deprecated< + decltype(out_k0_k1_n1_b_n2_thread_mem_desc), + decltype(out_k0_k1_n1_b_n2_global_merged_desc), + decltype(out_k0_k1_n1_b_n2_thread_mem_desc.GetLengths()), + arithmetic_sequence_gen<0, 5, 1>::type, + arithmetic_sequence_gen<0, 5, 1>::type, + 3, + 3, + 1, + 1>({0, 0, 0, 0, 0}, + {k_thread_data_on_global / K1, + k_thread_data_on_global % K1, + 0, + b_thread_data_on_global, + 0}) + .template Run( + p_out_thread, p_out_global); } } }; } // namespace ck -#endif +#endif // CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R1_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_DEPRECATED_HPP diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp index e741a83c4..a547db7e3 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -1,25 +1,27 @@ -#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R4_NCHW_KCYX_NKHW_HPP_LDS_DOUBLE_BUFFER_HPP -#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R4_NCHW_KCYX_NKHW_HPP_LDS_DOUBLE_BUFFER_HPP +#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R4_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_HPP +#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R4_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_HPP #include "common_header.hpp" -#include "ConstantTensorDescriptor_deprecated.hpp" -#include "ConstantMergedTensorDescriptor_deprecated.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" #include "ConstantMatrixDescriptor.hpp" -#include "blockwise_generic_tensor_slice_copy_deprecated.hpp" +#include "blockwise_generic_tensor_slice_copy.hpp" +#include "threadwise_generic_tensor_slice_copy.hpp" #include "blockwise_gemm.hpp" -#include "threadwise_generic_tensor_slice_copy_deprecated.hpp" namespace ck { // B = merge(N, Ho, Wo) template @@ -56,23 +58,27 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - constexpr auto I5 = Number<5>{}; constexpr auto True = integral_constant{}; - 
constexpr auto in_n_c_h_w_global_desc = InGlobalDesc{}; - constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{}; - constexpr auto out_n_k_h_w_global_desc = OutGlobalDesc{}; + constexpr auto in_n_c_hi_wi_global_desc = + make_native_tensor_descriptor(InGlobalDesc::GetLengths(), InGlobalDesc::GetStrides()); + constexpr auto wei_k_c_y_x_global_desc = + make_native_tensor_descriptor(WeiGlobalDesc::GetLengths(), WeiGlobalDesc::GetStrides()); + constexpr auto out_n_k_ho_wo_global_desc = + make_native_tensor_descriptor(OutGlobalDesc::GetLengths(), OutGlobalDesc::GetStrides()); - constexpr index_t N = in_n_c_h_w_global_desc.GetLengths()[0]; - constexpr index_t C = in_n_c_h_w_global_desc.GetLengths()[1]; + constexpr index_t N = in_n_c_hi_wi_global_desc.GetLength(I0); + constexpr index_t C = in_n_c_hi_wi_global_desc.GetLength(I1); + constexpr index_t Hi = in_n_c_hi_wi_global_desc.GetLength(I2); + constexpr index_t Wi = in_n_c_hi_wi_global_desc.GetLength(I3); - constexpr index_t K = out_n_k_h_w_global_desc.GetLengths()[1]; - constexpr index_t Ho = out_n_k_h_w_global_desc.GetLengths()[2]; - constexpr index_t Wo = out_n_k_h_w_global_desc.GetLengths()[3]; + constexpr index_t K = out_n_k_ho_wo_global_desc.GetLength(I1); + constexpr index_t Ho = out_n_k_ho_wo_global_desc.GetLength(I2); + constexpr index_t Wo = out_n_k_ho_wo_global_desc.GetLength(I3); - constexpr index_t Y = wei_k_c_y_x_global_desc.GetLengths()[2]; - constexpr index_t X = wei_k_c_y_x_global_desc.GetLengths()[3]; + constexpr index_t Y = wei_k_c_y_x_global_desc.GetLength(I2); + constexpr index_t X = wei_k_c_y_x_global_desc.GetLength(I3); constexpr index_t ConvStrideH = ConvStrides{}[0]; constexpr index_t ConvStrideW = ConvStrides{}[1]; @@ -97,65 +103,67 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer constexpr index_t BBlockWork = B / BPerBlock; constexpr auto block_work_desc = - make_ConstantTensorDescriptor_packed(Sequence{}); + make_cluster_descriptor(Sequence{}); - const auto block_work_multi_id = - block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id()); + const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id()); - const index_t k_block_data_on_global = block_work_multi_id[0] * KPerBlock; - const index_t b_block_data_on_global = block_work_multi_id[1] * BPerBlock; + const index_t k_block_data_on_global = block_work_id[0] * KPerBlock; + const index_t b_block_data_on_global = block_work_id[1] * BPerBlock; // input tensor - // tensor descriptor in device memory [N, Ho, Wo] - constexpr auto in_n_ho_wo_global_desc = - in_n_c_h_w_global_desc.Extract(I0, I2, I3) - .StridedSlice(I1, Number{}, Number{}) - .StridedSlice(I2, Number{}, Number{}); - - // batch descritpor for device memory - constexpr auto in_c_y_x_global_desc = - in_n_c_h_w_global_desc.StridedSlice(I2, Number{}, Number{}) - .StridedSlice(I3, Number{}, Number{}) - .Extract(Sequence<1, 2, 3>{}); - - // merged tensor descriptor in device memory [E, B], src of blockwise copy - constexpr auto in_e_b_global_desc = - make_ConstantMergedTensorDescriptor(in_c_y_x_global_desc.Embed(in_n_ho_wo_global_desc), - Sequence<0, 1, 2>{}, - Sequence<3, 4, 5>{}); - - // memory layout descriptor in LDS [E, B], dst of blockwise copy + // global mem + constexpr auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor( + in_n_c_hi_wi_global_desc, + make_tuple( + PassThrough{}, PassThrough{}, Pad, LeftPads, RightPads>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 
3>{})); + + constexpr auto in_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor( + in_n_c_hip_wip_global_desc, + make_tuple(PassThrough{}, + PassThrough{}, + Embed, Sequence>{}, + Embed, Sequence>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); + + constexpr auto in_e_b_global_desc = transform_tensor_descriptor( + in_n_c_y_ho_x_wo_global_desc, + make_tuple(Merge>{}, Merge>{}), + make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + // LDS mem // be careful of LDS alignment constexpr auto in_e_b_block_desc = - make_ConstantTensorDescriptor_packed(Sequence{}); + make_native_tensor_descriptor_packed(Sequence{}); // input blockwise copy - // slice a merged tensor, reorder and copy to a normal tensor - // this copy operator already has blockwise offset built-in auto blockwise_in_copy = - BlockwiseGenericTensorSliceCopy_v2_deprecated( + BlockwiseGenericTensorSliceCopy_v4( {0, b_block_data_on_global}, {0, 0}); // weight tensor - // tensor descriptor in device memory, src of blockwise copy - constexpr auto wei_e_k_global_desc = - wei_k_c_y_x_global_desc.Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{}); + // global mem + constexpr auto wei_e_k_global_desc = reorder_tensor_descriptor_given_upper2lower( + unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3), Sequence<1, 0>{}); - // tensor descriptor in LDS, dst of blockwise copy + // LDS // be careful of LDS alignment - constexpr auto wei_e_k_block_desc = make_ConstantTensorDescriptor_aligned( + constexpr auto wei_e_k_block_desc = make_native_tensor_descriptor_aligned( Sequence{}, Number{}); @@ -165,23 +173,21 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer static_assert(wei_e_k_block_desc.GetStride(I0) % GemmDataPerReadA == 0, "GemmDataPerReadA alignment requirement is not satisfied"); - // operator for blockwise copy of weight into LDS - // slice a tensor, and copy it into another tensor - // this copy operator already have blockwise offset built-in + // weight blockwise copy auto blockwise_wei_copy = - BlockwiseGenericTensorSliceCopy_v2_deprecated( + BlockwiseGenericTensorSliceCopy_v4( {0, k_block_data_on_global}, {0, 0}); // GEMM definition @@ -247,14 +253,12 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer // zero out threadwise output threadwise_matrix_set_zero(c_k0k1_b0b1_thread_mtx_desc, p_out_thread); - const Float* p_wei_block_on_global = p_wei_global; - // LDS double buffer: preload data into LDS { - blockwise_in_copy.template Run(p_in_global, - p_in_block_double); - blockwise_wei_copy.template Run(p_wei_global, - p_wei_block_double); + blockwise_in_copy.template Run(p_in_global, + p_in_block_double); + blockwise_wei_copy.template Run(p_wei_global, + p_wei_block_double); } // LDS double buffer: main body @@ -285,9 +289,9 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( + blockwise_in_copy.template RunLoadThreadBuffer( p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( + blockwise_wei_copy.template RunLoadThreadBuffer( p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data @@ -311,9 +315,9 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer __syncthreads(); // LDS doubel 
buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( + blockwise_in_copy.template RunLoadThreadBuffer( p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( + blockwise_wei_copy.template RunLoadThreadBuffer( p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data @@ -336,15 +340,6 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer // copy output: register to global memory { - constexpr index_t K1 = GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster; - constexpr index_t B1 = GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster; - - // define tensor descriptor for threadwise copy - // output global descriptor, for calculating origin of thread tensor - // in global memory - constexpr auto out_k_b_global_desc = make_ConstantMergedTensorDescriptor( - out_n_k_h_w_global_desc, Sequence<1>{}, Sequence<0, 2, 3>{}); - // calculate origin of thread output tensor on global memory // blockwise GEMM c matrix starting index const auto c_thread_mtx_on_block = @@ -356,46 +351,51 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer const index_t b_thread_data_on_global = b_block_data_on_global + c_thread_mtx_on_block.col; - // This is a hack, because slicing a merged dimension is not supported yet. - // This should be replaced with logic above, once slicing a merged dimension support - // become available - // dst descriptor - constexpr auto out_k0_k1_b_global_desc = - make_ConstantMergedTensorDescriptor(out_n_k_h_w_global_desc.Fold(I1, Number{}), - Sequence<1>{}, - Sequence<2>{}, - Sequence<0, 3, 4>{}); - - // src descriptor - constexpr auto out_k0_k1_b_thread_desc = make_ConstantTensorDescriptor_packed( - Sequence{}); - - using OutThreadCopySliceLengths = - Sequence; - - auto threadwise_out_copy = ThreadwiseGenericTensorSliceCopy_v2r1_deprecated< - decltype(out_k0_k1_b_thread_desc), - decltype(out_k0_k1_b_global_desc), - OutThreadCopySliceLengths, - arithmetic_sequence_gen<0, 3, 1>::type, - arithmetic_sequence_gen<0, 3, 1>::type, - 2, - 2, + // src descriptor + constexpr auto out_k0_k1_b0_b1_thread_desc = make_native_tensor_descriptor_packed( + Sequence{}); + + // dst descriptor + constexpr index_t K1 = GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster; + constexpr index_t B1 = GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster; + + constexpr index_t K0 = K / K1; + constexpr index_t B0 = B / B1; + + constexpr auto out_k_b_global_desc = transform_tensor_descriptor( + out_n_k_ho_wo_global_desc, + make_tuple(PassThrough{}, Merge>{}), + make_tuple(Sequence<1>{}, Sequence<0, 2, 3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + constexpr auto out_k0_k1_b0_b1_global_desc = transform_tensor_descriptor( + out_k_b_global_desc, + make_tuple(UnMerge>{}, UnMerge>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{})); + + // output threadwise copy + ThreadwiseGenericTensorSliceCopy_v4r2< + decltype(out_k0_k1_b0_b1_thread_desc), + decltype(out_k0_k1_b0_b1_global_desc), + decltype(out_k0_k1_b0_b1_thread_desc.GetLengths()), + arithmetic_sequence_gen<0, 4, 1>::type, + 3, OutThreadCopyDataPerAccess_B, - OutThreadCopyDataPerAccess_B>({0, 0, 0}, + OutThreadCopyDataPerAccess_B>({0, 0, 0, 0}, {k_thread_data_on_global / K1, k_thread_data_on_global % K1, - b_thread_data_on_global}); - - for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat) - { - threadwise_out_copy - .template Run(p_out_thread, 
- p_out_global); - - threadwise_out_copy.MoveSrcSliceWindow(Sequence<0, 0, GemmNPerThreadSubC>{}, True); - threadwise_out_copy.MoveDstSliceWindow(Sequence<0, 0, B1>{}, True); - } + b_thread_data_on_global / B1, + b_thread_data_on_global % B1}) +#if 1 + .template Run +#else // tweaking + .template Run_optimized_dst_address_calculation +#endif + (p_out_thread, p_out_global); } } }; diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp similarity index 56% rename from composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp rename to composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp index e93258682..8b3f8445d 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp @@ -1,27 +1,25 @@ -#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R4_NCHW_KCYX_NKHW_PADDED_LDS_DOUBLE_BUFFER_HPP -#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R4_NCHW_KCYX_NKHW_PADDED_LDS_DOUBLE_BUFFER_HPP +#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R4_NCHW_KCYX_NKHW_HPP_LDS_DOUBLE_BUFFER_DEPRECATRD_HPP +#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R4_NCHW_KCYX_NKHW_HPP_LDS_DOUBLE_BUFFER_DEPRECATRD_HPP #include "common_header.hpp" -#include "tensor_descriptor.hpp" -#include "tensor_descriptor_helper.hpp" +#include "ConstantTensorDescriptor_deprecated.hpp" +#include "ConstantMergedTensorDescriptor_deprecated.hpp" #include "ConstantMatrixDescriptor.hpp" -#include "blockwise_generic_tensor_slice_copy.hpp" -#include "threadwise_generic_tensor_slice_copy.hpp" +#include "blockwise_generic_tensor_slice_copy_deprecated.hpp" #include "blockwise_gemm.hpp" +#include "threadwise_generic_tensor_slice_copy_deprecated.hpp" namespace ck { // B = merge(N, Ho, Wo) template -struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer +struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer_deprecated { __device__ void Run(const Float* const __restrict__ p_in_global, const Float* const __restrict__ p_wei_global, @@ -58,27 +56,23 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; + constexpr auto I5 = Number<5>{}; constexpr auto True = integral_constant{}; - constexpr auto in_n_c_hi_wi_global_desc = - make_native_tensor_descriptor(InGlobalDesc::GetLengths(), InGlobalDesc::GetStrides()); - constexpr auto wei_k_c_y_x_global_desc = - make_native_tensor_descriptor(WeiGlobalDesc::GetLengths(), WeiGlobalDesc::GetStrides()); - constexpr auto out_n_k_ho_wo_global_desc = - make_native_tensor_descriptor(OutGlobalDesc::GetLengths(), OutGlobalDesc::GetStrides()); + constexpr auto in_n_c_h_w_global_desc = InGlobalDesc{}; + constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{}; + constexpr auto out_n_k_h_w_global_desc = OutGlobalDesc{}; - constexpr index_t N = in_n_c_hi_wi_global_desc.GetLength(I0); - constexpr index_t C = in_n_c_hi_wi_global_desc.GetLength(I1); - constexpr index_t Hi = 
in_n_c_hi_wi_global_desc.GetLength(I2); - constexpr index_t Wi = in_n_c_hi_wi_global_desc.GetLength(I3); + constexpr index_t N = in_n_c_h_w_global_desc.GetLengths()[0]; + constexpr index_t C = in_n_c_h_w_global_desc.GetLengths()[1]; - constexpr index_t K = out_n_k_ho_wo_global_desc.GetLength(I1); - constexpr index_t Ho = out_n_k_ho_wo_global_desc.GetLength(I2); - constexpr index_t Wo = out_n_k_ho_wo_global_desc.GetLength(I3); + constexpr index_t K = out_n_k_h_w_global_desc.GetLengths()[1]; + constexpr index_t Ho = out_n_k_h_w_global_desc.GetLengths()[2]; + constexpr index_t Wo = out_n_k_h_w_global_desc.GetLengths()[3]; - constexpr index_t Y = wei_k_c_y_x_global_desc.GetLength(I2); - constexpr index_t X = wei_k_c_y_x_global_desc.GetLength(I3); + constexpr index_t Y = wei_k_c_y_x_global_desc.GetLengths()[2]; + constexpr index_t X = wei_k_c_y_x_global_desc.GetLengths()[3]; constexpr index_t ConvStrideH = ConvStrides{}[0]; constexpr index_t ConvStrideW = ConvStrides{}[1]; @@ -103,67 +97,65 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf constexpr index_t BBlockWork = B / BPerBlock; constexpr auto block_work_desc = - make_cluster_descriptor(Sequence{}); + make_ConstantTensorDescriptor_packed(Sequence{}); - const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id()); + const auto block_work_multi_id = + block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id()); - const index_t k_block_data_on_global = block_work_id[0] * KPerBlock; - const index_t b_block_data_on_global = block_work_id[1] * BPerBlock; + const index_t k_block_data_on_global = block_work_multi_id[0] * KPerBlock; + const index_t b_block_data_on_global = block_work_multi_id[1] * BPerBlock; // input tensor - // global mem - constexpr auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor( - in_n_c_hi_wi_global_desc, - make_tuple( - PassThrough{}, PassThrough{}, Pad, LeftPads, RightPads>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{})); - - constexpr auto in_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor( - in_n_c_hip_wip_global_desc, - make_tuple(PassThrough{}, - PassThrough{}, - Embed, Sequence>{}, - Embed, Sequence>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); - - constexpr auto in_e_b_global_desc = transform_tensor_descriptor( - in_n_c_y_ho_x_wo_global_desc, - make_tuple(Merge>{}, Merge>{}), - make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - // LDS mem + // tensor descriptor in device memory [N, Ho, Wo] + constexpr auto in_n_ho_wo_global_desc = + in_n_c_h_w_global_desc.Extract(I0, I2, I3) + .StridedSlice(I1, Number{}, Number{}) + .StridedSlice(I2, Number{}, Number{}); + + // batch descritpor for device memory + constexpr auto in_c_y_x_global_desc = + in_n_c_h_w_global_desc.StridedSlice(I2, Number{}, Number{}) + .StridedSlice(I3, Number{}, Number{}) + .Extract(Sequence<1, 2, 3>{}); + + // merged tensor descriptor in device memory [E, B], src of blockwise copy + constexpr auto in_e_b_global_desc = + make_ConstantMergedTensorDescriptor(in_c_y_x_global_desc.Embed(in_n_ho_wo_global_desc), + Sequence<0, 1, 2>{}, + Sequence<3, 4, 5>{}); + + // memory layout descriptor in LDS [E, B], dst of blockwise copy // be careful of LDS alignment constexpr auto in_e_b_block_desc = - make_native_tensor_descriptor_packed(Sequence{}); 
+ make_ConstantTensorDescriptor_packed(Sequence{}); // input blockwise copy + // slice a merged tensor, reorder and copy to a normal tensor + // this copy operator already has blockwise offset built-in auto blockwise_in_copy = - BlockwiseGenericTensorSliceCopy_v4( + BlockwiseGenericTensorSliceCopy_v2_deprecated( {0, b_block_data_on_global}, {0, 0}); // weight tensor - // global mem - constexpr auto wei_e_k_global_desc = reorder_tensor_descriptor_given_upper2lower( - unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3), Sequence<1, 0>{}); + // tensor descriptor in device memory, src of blockwise copy + constexpr auto wei_e_k_global_desc = + wei_k_c_y_x_global_desc.Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{}); - // LDS + // tensor descriptor in LDS, dst of blockwise copy // be careful of LDS alignment - constexpr auto wei_e_k_block_desc = make_native_tensor_descriptor_aligned( + constexpr auto wei_e_k_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); @@ -173,21 +165,23 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf static_assert(wei_e_k_block_desc.GetStride(I0) % GemmDataPerReadA == 0, "GemmDataPerReadA alignment requirement is not satisfied"); - // weight blockwise copy + // operator for blockwise copy of weight into LDS + // slice a tensor, and copy it into another tensor + // this copy operator already have blockwise offset built-in auto blockwise_wei_copy = - BlockwiseGenericTensorSliceCopy_v4( + BlockwiseGenericTensorSliceCopy_v2_deprecated( {0, k_block_data_on_global}, {0, 0}); // GEMM definition @@ -253,12 +247,14 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf // zero out threadwise output threadwise_matrix_set_zero(c_k0k1_b0b1_thread_mtx_desc, p_out_thread); + const Float* p_wei_block_on_global = p_wei_global; + // LDS double buffer: preload data into LDS { - blockwise_in_copy.template Run(p_in_global, - p_in_block_double); - blockwise_wei_copy.template Run(p_wei_global, - p_wei_block_double); + blockwise_in_copy.template Run(p_in_global, + p_in_block_double); + blockwise_wei_copy.template Run(p_wei_global, + p_wei_block_double); } // LDS double buffer: main body @@ -289,9 +285,9 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( + blockwise_in_copy.template RunLoadThreadBuffer( p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( + blockwise_wei_copy.template RunLoadThreadBuffer( p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data @@ -315,9 +311,9 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( + blockwise_in_copy.template RunLoadThreadBuffer( p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( + blockwise_wei_copy.template RunLoadThreadBuffer( p_wei_global, p_wei_thread_buffer); // LDS double buffer: GEMM on current data @@ -340,6 +336,15 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf // copy output: register to global memory { + constexpr index_t K1 = GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster; + constexpr index_t B1 = GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster; + + // define tensor descriptor for 
threadwise copy + // output global descriptor, for calculating origin of thread tensor + // in global memory + constexpr auto out_k_b_global_desc = make_ConstantMergedTensorDescriptor( + out_n_k_h_w_global_desc, Sequence<1>{}, Sequence<0, 2, 3>{}); + // calculate origin of thread output tensor on global memory // blockwise GEMM c matrix starting index const auto c_thread_mtx_on_block = @@ -351,51 +356,46 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf const index_t b_thread_data_on_global = b_block_data_on_global + c_thread_mtx_on_block.col; - // src descriptor - constexpr auto out_k0_k1_b0_b1_thread_desc = make_native_tensor_descriptor_packed( - Sequence{}); - - // dst descriptor - constexpr index_t K1 = GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster; - constexpr index_t B1 = GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster; - - constexpr index_t K0 = K / K1; - constexpr index_t B0 = B / B1; - - constexpr auto out_k_b_global_desc = transform_tensor_descriptor( - out_n_k_ho_wo_global_desc, - make_tuple(PassThrough{}, Merge>{}), - make_tuple(Sequence<1>{}, Sequence<0, 2, 3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - constexpr auto out_k0_k1_b0_b1_global_desc = transform_tensor_descriptor( - out_k_b_global_desc, - make_tuple(UnMerge>{}, UnMerge>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{})); - - // output threadwise copy - ThreadwiseGenericTensorSliceCopy_v4r2< - decltype(out_k0_k1_b0_b1_thread_desc), - decltype(out_k0_k1_b0_b1_global_desc), - decltype(out_k0_k1_b0_b1_thread_desc.GetLengths()), - arithmetic_sequence_gen<0, 4, 1>::type, - 3, + // This is a hack, because slicing a merged dimension is not supported yet. + // This should be replaced with logic above, once slicing a merged dimension support + // become available + // dst descriptor + constexpr auto out_k0_k1_b_global_desc = + make_ConstantMergedTensorDescriptor(out_n_k_h_w_global_desc.Fold(I1, Number{}), + Sequence<1>{}, + Sequence<2>{}, + Sequence<0, 3, 4>{}); + + // src descriptor + constexpr auto out_k0_k1_b_thread_desc = make_ConstantTensorDescriptor_packed( + Sequence{}); + + using OutThreadCopySliceLengths = + Sequence; + + auto threadwise_out_copy = ThreadwiseGenericTensorSliceCopy_v2r1_deprecated< + decltype(out_k0_k1_b_thread_desc), + decltype(out_k0_k1_b_global_desc), + OutThreadCopySliceLengths, + arithmetic_sequence_gen<0, 3, 1>::type, + arithmetic_sequence_gen<0, 3, 1>::type, + 2, + 2, OutThreadCopyDataPerAccess_B, - OutThreadCopyDataPerAccess_B>({0, 0, 0, 0}, + OutThreadCopyDataPerAccess_B>({0, 0, 0}, {k_thread_data_on_global / K1, k_thread_data_on_global % K1, - b_thread_data_on_global / B1, - b_thread_data_on_global % B1}) -#if 1 - .template Run -#else // tweaking - .template Run_optimized_dst_address_calculation -#endif - (p_out_thread, p_out_global); + b_thread_data_on_global}); + + for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat) + { + threadwise_out_copy + .template Run(p_out_thread, + p_out_global); + + threadwise_out_copy.MoveSrcSliceWindow(Sequence<0, 0, GemmNPerThreadSubC>{}, True); + threadwise_out_copy.MoveDstSliceWindow(Sequence<0, 0, B1>{}, True); + } } } }; diff --git a/composable_kernel/include/utility/float_type.amd.hpp.in b/composable_kernel/include/utility/float_type.amd.hpp.in index 337d12fa3..06368305d 100644 --- a/composable_kernel/include/utility/float_type.amd.hpp.in +++ b/composable_kernel/include/utility/float_type.amd.hpp.in @@ -1,8 +1,6 @@ #ifndef 
CK_FLOAT_TYPE_AMD_HPP #define CK_FLOAT_TYPE_AMD_HPP -#include "bfloat16_dev.hpp" - namespace ck { // For some reason, HIP compiler need this definition to generate optimal ISA diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp index 56420cfb7..d361db801 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp @@ -5,12 +5,14 @@ #include "gridwise_convolution_kernel_wrapper.hpp" #include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp" -template +template void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, const Tensor& in_nchw, WeiDesc, @@ -19,6 +21,8 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, Tensor& out_nkhw, ConvStrides, ConvDilations, + LeftPads, + RightPads, ck::index_t nrepeat) { using namespace ck; @@ -28,9 +32,12 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - constexpr auto in_nchw_desc = InDesc{}; - constexpr auto wei_kcyx_desc = WeiDesc{}; - constexpr auto out_nkhw_desc = OutDesc{}; + constexpr auto in_nchw_desc = + make_native_tensor_descriptor(InDesc::GetLengths(), InDesc::GetStrides()); + constexpr auto wei_kcyx_desc = + make_native_tensor_descriptor(WeiDesc::GetLengths(), WeiDesc::GetStrides()); + constexpr auto out_nkhw_desc = + make_native_tensor_descriptor(OutDesc::GetLengths(), OutDesc::GetStrides()); constexpr index_t N = out_nkhw_desc.GetLength(I0); constexpr index_t K = out_nkhw_desc.GetLength(I1); @@ -47,7 +54,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); #if 1 - // BlockSize = 256, blockwise-GEMM 128x128, each thread hold 64 data + // BlockSize = 256, each thread hold 64 data constexpr index_t BlockSize = 256; constexpr index_t BPerBlock = 16; @@ -84,7 +91,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; #elif 0 - // BlockSize = 64, blockwise-GEMM 64x64, each thread hold 64 data + // BlockSize = 64, each thread hold 64 data constexpr index_t BlockSize = 64; constexpr index_t BPerBlock = 8; @@ -120,7 +127,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; -#elif 1 +#elif 0 // BlockSize = 256, blockwise-GEMM 64x128, each thread hold 32 data constexpr index_t BlockSize = 256; @@ -170,42 +177,48 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); constexpr auto gridwise_conv = - GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer< - GridSize, - BlockSize, - T, - decltype(in_nchw_desc), - decltype(wei_kcyx_desc), - decltype(out_nkhw_desc), - ConvStrides, - ConvDilations, - BPerBlock, - KPerBlock, - EPerBlock, - GemmNRepeat, - GemmMPerThreadSubC, - GemmNPerThreadSubC, - GemmMLevel0Cluster, - GemmNLevel0Cluster, - GemmMLevel1Cluster, - GemmNLevel1Cluster, - GemmKPerThreadLoop, - GemmDataPerReadA, - GemmDataPerReadB, - InBlockCopySubLengths_E_N1_B_N2, - InBlockCopyClusterLengths_E_N1_B_N2, - InBlockCopyThreadClusterArrangeOrder, - InBlockCopySrcAccessOrder, - 
InBlockCopyDstAccessOrder, - InBlockCopySrcDataPerRead_B, - InBlockCopyDstDataPerWrite_N2, - WeiBlockCopySubLengths_E_K, - WeiBlockCopyClusterLengths_E_K, - WeiBlockCopyThreadClusterArrangeOrder, - WeiBlockCopySrcAccessOrder, - WeiBlockCopyDstAccessOrder, - WeiBlockCopySrcDataPerRead_E, - WeiBlockCopyDstDataPerWrite_K>{}; +#if 0 + GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded +#else + GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer +#endif + {}; for(index_t i = 0; i < nrepeat; ++i) { diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated.hpp similarity index 76% rename from driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp rename to driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated.hpp index b4dd0558f..5a47feb6e 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated.hpp @@ -3,27 +3,23 @@ #include "device.hpp" #include "tensor.hpp" #include "gridwise_convolution_kernel_wrapper.hpp" -#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp" - -template -void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc, - const Tensor& in_nchw, - WeiDesc, - const Tensor& wei_kcyx, - OutDesc, - Tensor& out_nkhw, - ConvStrides, - ConvDilations, - LeftPads, - RightPads, - ck::index_t nrepeat) +#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp" + +template +void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(InDesc, + const Tensor& in_nchw, + WeiDesc, + const Tensor& wei_kcyx, + OutDesc, + Tensor& out_nkhw, + ConvStrides, + ConvDilations, + ck::index_t nrepeat) { using namespace ck; @@ -32,12 +28,9 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc, constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - constexpr auto in_nchw_desc = - make_native_tensor_descriptor(InDesc::GetLengths(), InDesc::GetStrides()); - constexpr auto wei_kcyx_desc = - make_native_tensor_descriptor(WeiDesc::GetLengths(), WeiDesc::GetStrides()); - constexpr auto out_nkhw_desc = - make_native_tensor_descriptor(OutDesc::GetLengths(), OutDesc::GetStrides()); + constexpr auto in_nchw_desc = InDesc{}; + constexpr auto wei_kcyx_desc = WeiDesc{}; + constexpr auto out_nkhw_desc = OutDesc{}; constexpr index_t N = out_nkhw_desc.GetLength(I0); constexpr index_t K = out_nkhw_desc.GetLength(I1); @@ -54,7 +47,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc, out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); #if 1 - // BlockSize = 256, each thread hold 64 data + // BlockSize = 256, blockwise-GEMM 128x128, each thread hold 64 data constexpr index_t BlockSize = 256; constexpr index_t BPerBlock = 16; @@ -91,7 +84,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc, constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; #elif 0 - // BlockSize = 64, each thread hold 64 data + // BlockSize = 64, blockwise-GEMM 64x64, each thread hold 64 data constexpr index_t BlockSize = 64; constexpr index_t BPerBlock = 8; @@ -127,7 +120,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc, constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; constexpr index_t 
WeiBlockCopyDstDataPerWrite_K = 1; -#elif 0 +#elif 1 // BlockSize = 256, blockwise-GEMM 64x128, each thread hold 32 data constexpr index_t BlockSize = 256; @@ -177,48 +170,42 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc, printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); constexpr auto gridwise_conv = -#if 0 - GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded -#else - GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer -#endif - {}; + GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated< + GridSize, + BlockSize, + T, + decltype(in_nchw_desc), + decltype(wei_kcyx_desc), + decltype(out_nkhw_desc), + ConvStrides, + ConvDilations, + BPerBlock, + KPerBlock, + EPerBlock, + GemmNRepeat, + GemmMPerThreadSubC, + GemmNPerThreadSubC, + GemmMLevel0Cluster, + GemmNLevel0Cluster, + GemmMLevel1Cluster, + GemmNLevel1Cluster, + GemmKPerThreadLoop, + GemmDataPerReadA, + GemmDataPerReadB, + InBlockCopySubLengths_E_N1_B_N2, + InBlockCopyClusterLengths_E_N1_B_N2, + InBlockCopyThreadClusterArrangeOrder, + InBlockCopySrcAccessOrder, + InBlockCopyDstAccessOrder, + InBlockCopySrcDataPerRead_B, + InBlockCopyDstDataPerWrite_N2, + WeiBlockCopySubLengths_E_K, + WeiBlockCopyClusterLengths_E_K, + WeiBlockCopyThreadClusterArrangeOrder, + WeiBlockCopySrcAccessOrder, + WeiBlockCopyDstAccessOrder, + WeiBlockCopySrcDataPerRead_E, + WeiBlockCopyDstDataPerWrite_K>{}; for(index_t i = 0; i < nrepeat; ++i) { diff --git a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp index 98e5bf489..df8073917 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp @@ -5,14 +5,14 @@ #include "gridwise_convolution_kernel_wrapper.hpp" #include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp" -using namespace ck; - template + class ConvDilations, + class LeftPads, + class RightPads> void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc, const Tensor& in_nchw, WeiDesc, @@ -21,8 +21,12 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc, Tensor& out_nkhw, ConvStrides, ConvDilations, + LeftPads, + RightPads, ck::index_t nrepeat) { + using namespace ck; + constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; @@ -164,7 +168,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc, constexpr auto gridwise_conv = #if 0 - GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw + GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded #else GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer #endif @@ -176,6 +180,8 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc, decltype(out_nkhw_desc), ConvStrides, ConvDilations, + LeftPads, + RightPads, BPerBlock, KPerBlock, EPerBlock, diff --git a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_deprecated.hpp similarity index 92% rename from driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp rename to driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_deprecated.hpp index 86ba43d7d..cb51bfc1d 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp +++ 
b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_deprecated.hpp @@ -3,30 +3,26 @@ #include "device.hpp" #include "tensor.hpp" #include "gridwise_convolution_kernel_wrapper.hpp" -#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp" +#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp" + +using namespace ck; template -void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded(InDesc, - const Tensor& in_nchw, - WeiDesc, - const Tensor& wei_kcyx, - OutDesc, - Tensor& out_nkhw, - ConvStrides, - ConvDilations, - LeftPads, - RightPads, - ck::index_t nrepeat) + class ConvDilations> +void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_deprecated(InDesc, + const Tensor& in_nchw, + WeiDesc, + const Tensor& wei_kcyx, + OutDesc, + Tensor& out_nkhw, + ConvStrides, + ConvDilations, + ck::index_t nrepeat) { - using namespace ck; - constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; @@ -168,9 +164,9 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded(InDesc, constexpr auto gridwise_conv = #if 0 - GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded + GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw #else - GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer + GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer_deprecated #endif ; using RightPads = Sequence<2, 2>; -#elif 0 +#elif 1 // 7x1 filter, 3x0 pad, 17x17 input constexpr index_t N = 128; constexpr index_t C = 128; @@ -341,7 +341,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<3, 0>; using RightPads = Sequence<3, 0>; -#elif 1 +#elif 0 // 1x7 filter, 0x3 pad, 17x17 input constexpr index_t N = 128; constexpr index_t C = 128; @@ -439,6 +439,16 @@ int main(int argc, char* argv[]) device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw( (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); #elif 0 + device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(in_nchw_desc, + in_nchw, + wei_kcyx_desc, + wei_kcyx, + out_nkhw_desc, + out_nkhw_device, + ConvStrides{}, + ConvDilations{}, + nrepeat); +#elif 1 device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc, in_nchw, wei_kcyx_desc, @@ -447,19 +457,9 @@ int main(int argc, char* argv[]) out_nkhw_device, ConvStrides{}, ConvDilations{}, + LeftPads{}, + RightPads{}, nrepeat); -#elif 1 - device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(in_nchw_desc, - in_nchw, - wei_kcyx_desc, - wei_kcyx, - out_nkhw_desc, - out_nkhw_device, - ConvStrides{}, - ConvDilations{}, - LeftPads{}, - RightPads{}, - nrepeat); #elif 0 device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(in_nchw_desc, in_nchw, @@ -481,6 +481,16 @@ int main(int argc, char* argv[]) ConvDilations{}, nrepeat); #elif 0 + device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_deprecated(in_nchw_desc, + in_nchw, + wei_kcyx_desc, + wei_kcyx, + out_nkhw_desc, + out_nkhw_device, + ConvStrides{}, + ConvDilations{}, + nrepeat); +#elif 1 device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc, in_nchw, wei_kcyx_desc, @@ -489,19 +499,9 @@ int main(int argc, char* argv[]) out_nkhw_device, ConvStrides{}, ConvDilations{}, + LeftPads{}, + RightPads{}, nrepeat); -#elif 1 - device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded(in_nchw_desc, - in_nchw, - wei_kcyx_desc, - wei_kcyx, - out_nkhw_desc, - out_nkhw_device, - ConvStrides{}, - ConvDilations{}, - 
LeftPads{}, - RightPads{}, - nrepeat); #endif if(do_verification) From 15fd8d22dd4d8ac7c3c9a6375f6e8707b37bb392 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 7 Oct 2019 10:31:24 -0500 Subject: [PATCH 10/20] update amd build script --- driver/src/driver.cpp | 4 ++-- script/compile-hip.sh | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index f11183750..006d13bc6 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -326,7 +326,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<2, 2>; using RightPads = Sequence<2, 2>; -#elif 1 +#elif 0 // 7x1 filter, 3x0 pad, 17x17 input constexpr index_t N = 128; constexpr index_t C = 128; @@ -341,7 +341,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<3, 0>; using RightPads = Sequence<3, 0>; -#elif 0 +#elif 1 // 1x7 filter, 0x3 pad, 17x17 input constexpr index_t N = 128; constexpr index_t C = 128; diff --git a/script/compile-hip.sh b/script/compile-hip.sh index 5a61bc138..bae4d677e 100755 --- a/script/compile-hip.sh +++ b/script/compile-hip.sh @@ -1,8 +1,8 @@ #!/bin/bash + export KMOPTLLC="-mattr=+enable-ds128 -amdgpu-enable-global-sgpr-addr" export KMDUMPISA=1 export KMDUMPLLVM=1 -#export KMOPTLLC="-mattr=+enable-ds128" - export KMOPTLLC="-mattr=+enable-ds128 -amdgpu-enable-global-sgpr-addr" + export KMDUMPDIR=$PWD -make -j driver -/opt/rocm/hcc/bin/llvm-objdump -mcpu=gfx906 -source -line-numbers driver/dump-gfx906.isabin > driver/dump-gfx906.isabin.asm + make -j driver +#/opt/rocm/hcc/bin/llvm-objdump -mcpu=gfx906 -source -line-numbers driver/dump-gfx906.isabin > driver/dump-gfx906.isabin.asm From 0afc27e9d13072ecc8324eaf7063dafeeb064b32 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 7 Oct 2019 10:47:50 -0500 Subject: [PATCH 11/20] miopen_integration --- .../kernel_algorithm/convolution_common.hpp | 14 ++ ..._v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 72 ++++-- .../print_tensor_descriptor.hpp | 173 +++++++++++++ .../tensor_description/tensor_coordinate.hpp | 4 +- .../tensor_coordinate_deprecated.hpp | 14 +- .../tensor_coordinate_helper.hpp | 4 +- .../tensor_descriptor_helper.hpp | 236 +++--------------- ...e_generic_tensor_slice_copy_deprecated.hpp | 16 +- .../threadwise_generic_tensor_slice_copy.hpp | 8 - ...e_generic_tensor_slice_copy_deprecated.hpp | 62 ++--- .../include/utility/amd_inline_asm.hpp | 29 +++ .../include/utility/amd_intrinsic.hpp | 48 ++-- .../include/utility/config.amd.hpp.in | 21 +- .../include/utility/float_type.hpp | 108 ++++++++ .../include/utility/print_array.hpp | 177 +++++++++++++ .../include/utility/print_sequence.hpp | 46 ++++ .../include/utility/vector_type.hpp | 141 ++++++++++- ...tion_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp | 83 +++--- driver/src/driver.cpp | 6 +- 19 files changed, 887 insertions(+), 375 deletions(-) create mode 100644 composable_kernel/include/kernel_algorithm/convolution_common.hpp create mode 100644 composable_kernel/include/tensor_description/print_tensor_descriptor.hpp create mode 100644 composable_kernel/include/utility/float_type.hpp create mode 100644 composable_kernel/include/utility/print_array.hpp create mode 100644 composable_kernel/include/utility/print_sequence.hpp diff --git a/composable_kernel/include/kernel_algorithm/convolution_common.hpp b/composable_kernel/include/kernel_algorithm/convolution_common.hpp new file mode 100644 index 000000000..4bcb3347a --- /dev/null +++ b/composable_kernel/include/kernel_algorithm/convolution_common.hpp @@ -0,0 +1,14 @@ +#ifndef 
CK_CONVOLUTION_COMMON_HPP +#define CK_CONVOLUTION_COMMON_HPP + +namespace ck { + +enum ConvolutionDirection +{ + Forward, + BackwardData, + BackwardWeight +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp index 724a042c9..f3a98d773 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -8,13 +8,14 @@ #include "blockwise_generic_tensor_slice_copy.hpp" #include "threadwise_generic_tensor_slice_copy.hpp" #include "blockwise_gemm.hpp" +#include "convolution_common.hpp" namespace ck { -// define B = merge(N0, Ho, Wo) template struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer { + template + struct make_wei_e_k_global_desc; + + template <> + struct make_wei_e_k_global_desc + { + template + __device__ constexpr auto operator()(WeiDesc) const + { + constexpr auto I1 = Number<1>{}; + constexpr auto I3 = Number<3>{}; + + return reorder_tensor_descriptor_given_upper2lower( + unfold_tensor_descriptor(WeiDesc{}, I1, I3), Sequence<1, 0>{}); + } + }; + + template <> + struct make_wei_e_k_global_desc + { + template + __device__ constexpr auto operator()(WeiDesc) const + { + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto wei_k_c_y_x_global_desc = WeiDesc{}; + + constexpr index_t K = wei_k_c_y_x_global_desc.GetLength(I0); + constexpr index_t C = wei_k_c_y_x_global_desc.GetLength(I1); + constexpr index_t Y = wei_k_c_y_x_global_desc.GetLength(I2); + constexpr index_t X = wei_k_c_y_x_global_desc.GetLength(I3); + + return transform_tensor_descriptor( + unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I2, I3), + make_tuple(Merge>{}, PassThrough{}), + make_tuple(Sequence<1, 2>{}, Sequence<0>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + }; + __device__ void Run(const Float* const __restrict__ p_in_global, const Float* const __restrict__ p_wei_global, Float* const __restrict__ p_out_global) const { + static_assert(ConvDirection == ConvolutionDirection::Forward || + ConvDirection == ConvolutionDirection::BackwardWeight, + "wrong! 
this kernel only support convolution forward and backward-weight"); + // this is a mess // TODO: find more elegent way of specifying (or calculating) performance parameters constexpr index_t N1 = GemmNRepeat; @@ -181,9 +230,11 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0}); // weight tensor - // tensor descriptor in device memory, src of blockwise copy - constexpr auto wei_e_k_global_desc = reorder_tensor_descriptor_given_upper2lower( - unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3), Sequence<1, 0>{}); + // Tensor descriptor in device memory, src of blockwise copy + // It is constructed differently, depending on whether forward or backward weight + // convolution + constexpr auto wei_e_k_global_desc = + make_wei_e_k_global_desc{}(wei_k_c_y_x_global_desc); // tensor descriptor in LDS, dst of blockwise copy // be careful of LDS alignment @@ -274,7 +325,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer __shared__ Float p_wei_block_double[2 * wei_block_space]; // register allocation for output - Float p_out_thread[c_k0k2_n1n2_thread_mtx_desc.GetElementSpace()]; + AccDataType p_out_thread[c_k0k2_n1n2_thread_mtx_desc.GetElementSpace()]; // zero out threadwise output threadwise_matrix_set_zero(c_k0k2_n1n2_thread_mtx_desc, p_out_thread); @@ -424,15 +475,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer 0, b_thread_data_on_global, 0}) -#if 1 - .template Run -#else // tweaking - .template Run_optimized_dst_address_calculation -#endif - (p_out_thread, p_out_global); + .template Run( + p_out_thread, p_out_global); } } }; diff --git a/composable_kernel/include/tensor_description/print_tensor_descriptor.hpp b/composable_kernel/include/tensor_description/print_tensor_descriptor.hpp new file mode 100644 index 000000000..89174e27b --- /dev/null +++ b/composable_kernel/include/tensor_description/print_tensor_descriptor.hpp @@ -0,0 +1,173 @@ +#ifndef CK_PRINT_TENSOR_DESCRIPTOR_HPP +#define CK_PRINT_TENSOR_DESCRIPTOR_HPP + +#include "common_header.hpp" +#include "tensor_descriptor.hpp" + +namespace ck { + +template +__host__ __device__ void +print_tensor_descriptor(const char* s, const NativeTensorDescriptor& desc) +{ + print_tensor_descriptor_impl(s, desc.GetLengths(), desc.GetStrides()); +} + +template +__host__ __device__ void print_tensor_descriptor(const char* s, + const TransformedTensorDescriptor& desc) +{ + print_tensor_descriptor_impl(s, desc.GetLengths()); +} + +template +__host__ __device__ void +print_tensor_descriptor_impl(const char* s, Sequence, Sequence) +{ + constexpr index_t nDim = sizeof...(Lengths); + + static_assert(nDim > 0 && nDim <= 12, "wrong!"); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u}, strides {%u}\n", s, nDim, Lengths..., Strides...); + }); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u}, strides {%u %u}\n", s, nDim, Lengths..., Strides...); + }); + + static_if{}([&](auto) { + printf( + "%s dim %u, lengths {%u %u %u}, strides {%u %u %u}\n", s, nDim, Lengths..., Strides...); + }); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u}, strides {%u %u %u %u}\n", + s, + nDim, + Lengths..., + Strides...); + }); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u %u}, strides {%u %u %u %u %u}\n", + s, + nDim, + Lengths..., + Strides...); + }); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u %u %u}, strides {%u %u %u %u %u %u}\n", + s, + 
nDim, + Lengths..., + Strides...); + }); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u}\n", + s, + nDim, + Lengths..., + Strides...); + }); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u}\n", + s, + nDim, + Lengths..., + Strides...); + }); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u " + "%u}\n", + s, + nDim, + Lengths..., + Strides...); + }); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u " + "%u %u %u}\n", + s, + nDim, + Lengths..., + Strides...); + }); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u " + "%u %u " + "%u %u %u}\n", + s, + nDim, + Lengths..., + Strides...); + }); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u " + "%u %u %u %u " + "%u %u %u}\n", + s, + nDim, + Lengths..., + Strides...); + }); +} + +template +__host__ __device__ void print_tensor_descriptor_impl(const char* s, Sequence) +{ + constexpr index_t nDim = sizeof...(Lengths); + + static_assert(nDim > 0 && nDim <= 12, "wrong!"); + + static_if{}([&](auto) { printf("%s dim %u, lengths {%u}\n", s, nDim, Lengths...); }); + + static_if{}( + [&](auto) { printf("%s dim %u, lengths {%u %u}\n", s, nDim, Lengths...); }); + + static_if{}( + [&](auto) { printf("%s dim %u, lengths {%u %u %u}\n", s, nDim, Lengths...); }); + + static_if{}( + [&](auto) { printf("%s dim %u, lengths {%u %u %u %u}\n", s, nDim, Lengths...); }); + + static_if{}( + [&](auto) { printf("%s dim %u, lengths {%u %u %u %u %u}\n", s, nDim, Lengths...); }); + + static_if{}( + [&](auto) { printf("%s dim %u, lengths {%u %u %u %u %u %u}, \n", s, nDim, Lengths...); }); + + static_if{}( + [&](auto) { printf("%s dim %u, lengths {%u %u %u %u %u %u %u}\n", s, nDim, Lengths...); }); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...); + }); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...); + }); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...); + }); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...); + }); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...); + }); +} + +} // namespace ck + +#endif diff --git a/composable_kernel/include/tensor_description/tensor_coordinate.hpp b/composable_kernel/include/tensor_description/tensor_coordinate.hpp index ae7e58778..5ce5bc700 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate.hpp @@ -1,5 +1,5 @@ -#ifndef CK_TENSOR_COORDINATE_V2_HPP -#define CK_TENSOR_COORDINATE_V2_HPP +#ifndef CK_TENSOR_COORDINATE_HPP +#define CK_TENSOR_COORDINATE_HPP #include "common_header.hpp" #include "dimension.hpp" diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp index aaddc1251..69659445a 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp +++ 
b/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp @@ -97,16 +97,17 @@ struct NormalTensorCoordinate_deprecated // TensorDesc is ConstantMergedTensorDescriptor_deprecated template -struct MergedTensorCoordinate +struct MergedTensorCoordinate_deprecated { - using type = MergedTensorCoordinate; + using type = MergedTensorCoordinate_deprecated; using tensor_desc_type = TensorDesc; static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension(); static constexpr index_t nOriginalDim = tensor_desc_type::GetOriginalTensorDescriptor().GetNumOfDimension(); - __host__ __device__ constexpr MergedTensorCoordinate(Array tensor_index) + __host__ + __device__ constexpr MergedTensorCoordinate_deprecated(Array tensor_index) : mOriginalIndex{tensor_desc_type::GetOriginalMultiIndexFromMultiIndex(tensor_index)} { // partial offset on each dimension @@ -127,8 +128,8 @@ struct MergedTensorCoordinate } template - __host__ __device__ constexpr MergedTensorCoordinate(Xs... xs) - : MergedTensorCoordinate(Array{xs...}) + __host__ __device__ constexpr MergedTensorCoordinate_deprecated(Xs... xs) + : MergedTensorCoordinate_deprecated(Array{xs...}) { } @@ -335,7 +336,8 @@ struct TensorCoordinate_deprecated __host__ __device__ static constexpr auto MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor_deprecated) { - return MergedTensorCoordinate>(); + return MergedTensorCoordinate_deprecated< + ConstantMergedTensorDescriptor_deprecated>(); } public: diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_helper.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_helper.hpp index 93cb077c2..2cacb329c 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate_helper.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate_helper.hpp @@ -1,13 +1,13 @@ #ifndef CK_TENSOR_COORDINATE_HELPER_HPP #define CK_TENSOR_COORDINATE_HELPER_HPP -#include "tensor_coordiante_v2.hpp" +#include "tensor_coordiante_hpp" namespace ck { template __host__ __device__ constexpr auto -make_tensor_coordinate_v2(TensorDesc, MultiIndex idx) +make_tensor_coordinate(TensorDesc, MultiIndex idx) { return typename TensorCoordinate::type(idx); } diff --git a/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp b/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp index 65fe69850..d7ef38672 100644 --- a/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp +++ b/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp @@ -63,10 +63,11 @@ template -__host__ __device__ constexpr auto reorder_tensor_descriptor_impl(LowerTensorDescriptor, - Sequence, - Sequence, - Sequence) +__host__ __device__ constexpr auto + reorder_transformed_tensor_descriptor_impl(LowerTensorDescriptor, + Sequence, + Sequence, + Sequence) { return TransformedTensorDescriptor...>, @@ -74,17 +75,40 @@ __host__ __device__ constexpr auto reorder_tensor_descriptor_impl(LowerTensorDes Tuple...>>{}; } -template +// reorder a NativeTensorDescriptor +template +__host__ __device__ constexpr auto + reorder_tensor_descriptor_given_lower2upper(NativeTensorDescriptor, MapLower2Upper) +{ + static_assert(is_valid_sequence_map{}, + "wrong! 
MapLower2Upper is not a valid map"); + + constexpr auto old_desc = NativeTensorDescriptor{}; + + static_assert(old_desc.GetNumOfDimension() == MapLower2Upper::Size(), "wrong!"); + + constexpr auto new_lengths = old_desc.GetLengths().ReorderGivenOld2New(MapLower2Upper{}); + constexpr auto new_strides = old_desc.GetStrides().ReorderGivenOld2New(MapLower2Upper{}); + + return make_native_tensor_descriptor(new_lengths, new_strides); +} + +// reorder a TransformedTensorDescriptor +template __host__ __device__ constexpr auto - reorder_tensor_descriptor_given_lower2upper(LowerTensorDescriptor, MapLower2Upper) + reorder_tensor_descriptor_given_lower2upper(TransformedTensorDescriptor, MapLower2Upper) { static_assert(is_valid_sequence_map{}, "wrong! MapLower2Upper is not a valid map"); - return reorder_tensor_descriptor_impl( - LowerTensorDescriptor{}, - LowerTensorDescriptor::GetLengths(), - typename arithmetic_sequence_gen<0, LowerTensorDescriptor::GetNumOfDimension(), 1>::type{}, + constexpr auto low_desc = TransformedTensorDescriptor{}; + + static_assert(low_desc.GetNumOfDimension() == MapLower2Upper::Size(), "wrong!"); + + return reorder_transformed_tensor_descriptor_impl( + low_desc, + low_desc.GetLengths(), + typename arithmetic_sequence_gen<0, low_desc.GetNumOfDimension(), 1>::type{}, MapLower2Upper{}); } @@ -97,7 +121,7 @@ __host__ __device__ constexpr auto } template -__host__ __device__ constexpr bool AreDimensionsUnfoldable(Lengths, Strides) +__host__ __device__ constexpr bool are_dimensions_unfoldable(Lengths, Strides) { static_assert(Lengths::Size() == Strides::Size(), "wrong!"); @@ -129,7 +153,7 @@ __host__ __device__ constexpr auto unfold_tensor_descriptor(NativeTensorDescript constexpr auto right = typename arithmetic_sequence_gen::type{}; // sanity-checknfoldable - static_assert(AreDimensionsUnfoldable(desc.GetLengths(middle), desc.GetStrides(middle)), + static_assert(are_dimensions_unfoldable(desc.GetLengths(middle), desc.GetStrides(middle)), "wrong! 
not unfoldable"); // unfolded length, stride @@ -148,30 +172,6 @@ __host__ __device__ constexpr auto unfold_tensor_descriptor(NativeTensorDescript return make_native_tensor_descriptor(new_lengths, new_strides); } -#if 0 -// not implemented -template -__host__ __device__ constexpr auto - pad_tensor_descriptor(LowerTensorDescriptor, PadLowerDimensionIds, LeftPads, RightPads) -{ - constexpr index_t nDim = LowerTensorDescriptor::GetNumOfDimension(); - - constexpr auto non_pad_low_dim_ids = xxx; - - return transform_tensor_descriptor( - LowerTensorDescriptor{}, - make_tuple(Pad{}) - .PushBack(PassThrough...), - make_tuple(PadLowerDimensionIds{}).PushBack(xxxx), - sequence_to_tuple(typename arithmetic_sequence_gen<0, nDim, 1> i::type{})); -} -#endif - // a cluster map 1d index to N-d index template struct ClusterDescriptor @@ -205,169 +205,7 @@ template ::type{}) { - return ClusterDescriptor{}; -} - -template -__host__ __device__ void -print_tensor_descriptor(const char* s, const NativeTensorDescriptor& desc) -{ - print_tensor_descriptor_impl(s, desc.GetLengths(), desc.GetStrides()); -} - -template -__host__ __device__ void print_tensor_descriptor(const char* s, - const TransformedTensorDescriptor& desc) -{ - print_tensor_descriptor_impl(s, desc.GetLengths()); -} - -template -__host__ __device__ void -print_tensor_descriptor_impl(const char* s, Sequence, Sequence) -{ - constexpr index_t nDim = sizeof...(Lengths); - - static_assert(nDim > 0 && nDim <= 12, "wrong!"); - - static_if{}([&](auto) { - printf("%s dim %u, lengths {%u}, strides {%u}\n", s, nDim, Lengths..., Strides...); - }); - - static_if{}([&](auto) { - printf("%s dim %u, lengths {%u %u}, strides {%u %u}\n", s, nDim, Lengths..., Strides...); - }); - - static_if{}([&](auto) { - printf( - "%s dim %u, lengths {%u %u %u}, strides {%u %u %u}\n", s, nDim, Lengths..., Strides...); - }); - - static_if{}([&](auto) { - printf("%s dim %u, lengths {%u %u %u %u}, strides {%u %u %u %u}\n", - s, - nDim, - Lengths..., - Strides...); - }); - - static_if{}([&](auto) { - printf("%s dim %u, lengths {%u %u %u %u %u}, strides {%u %u %u %u %u}\n", - s, - nDim, - Lengths..., - Strides...); - }); - - static_if{}([&](auto) { - printf("%s dim %u, lengths {%u %u %u %u %u %u}, strides {%u %u %u %u %u %u}\n", - s, - nDim, - Lengths..., - Strides...); - }); - - static_if{}([&](auto) { - printf("%s dim %u, lengths {%u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u}\n", - s, - nDim, - Lengths..., - Strides...); - }); - - static_if{}([&](auto) { - printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u}\n", - s, - nDim, - Lengths..., - Strides...); - }); - - static_if{}([&](auto) { - printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u " - "%u}\n", - s, - nDim, - Lengths..., - Strides...); - }); - - static_if{}([&](auto) { - printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u " - "%u %u %u}\n", - s, - nDim, - Lengths..., - Strides...); - }); - - static_if{}([&](auto) { - printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u " - "%u %u " - "%u %u %u}\n", - s, - nDim, - Lengths..., - Strides...); - }); - - static_if{}([&](auto) { - printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u " - "%u %u %u %u " - "%u %u %u}\n", - s, - nDim, - Lengths..., - Strides...); - }); -} - -template -__host__ __device__ void print_tensor_descriptor_impl(const char* s, Sequence) -{ - constexpr index_t nDim = 
sizeof...(Lengths); - - static_assert(nDim > 0 && nDim <= 12, "wrong!"); - - static_if{}([&](auto) { printf("%s dim %u, lengths {%u}\n", s, nDim, Lengths...); }); - - static_if{}( - [&](auto) { printf("%s dim %u, lengths {%u %u}\n", s, nDim, Lengths...); }); - - static_if{}( - [&](auto) { printf("%s dim %u, lengths {%u %u %u}\n", s, nDim, Lengths...); }); - - static_if{}( - [&](auto) { printf("%s dim %u, lengths {%u %u %u %u}\n", s, nDim, Lengths...); }); - - static_if{}( - [&](auto) { printf("%s dim %u, lengths {%u %u %u %u %u}\n", s, nDim, Lengths...); }); - - static_if{}( - [&](auto) { printf("%s dim %u, lengths {%u %u %u %u %u %u}, \n", s, nDim, Lengths...); }); - - static_if{}( - [&](auto) { printf("%s dim %u, lengths {%u %u %u %u %u %u %u}\n", s, nDim, Lengths...); }); - - static_if{}([&](auto) { - printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...); - }); - - static_if{}([&](auto) { - printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...); - }); - - static_if{}([&](auto) { - printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...); - }); - - static_if{}([&](auto) { - printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...); - }); - - static_if{}([&](auto) { - printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u %u %u}\n", s, nDim, Lengths...); - }); + return ClusterDescriptor{}; } } // namespace ck diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp index c922384a9..399a47407 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp @@ -193,14 +193,14 @@ struct BlockwiseGenericTensorSliceCopy_v1_deprecated return make_ConstantTensorDescriptor_packed(SubLengths{} * repeat_lengths); } - __device__ static constexpr index_t GetRegisterBufferSize() + __device__ static constexpr index_t GetThreadBufferSize() { return GetRegisterBufferDescriptor().GetElementSpace(); } template - __device__ void RunLoadRegisterBuffer(const TData* __restrict__ p_src, - TData* __restrict__ p_buffer) const + __device__ void RunLoadThreadBuffer(const TData* __restrict__ p_src, + TData* __restrict__ p_buffer) const { constexpr auto thread_sub_tensor_lengths = SubLengths{}; @@ -255,8 +255,8 @@ struct BlockwiseGenericTensorSliceCopy_v1_deprecated } template - __device__ void RunStoreRegisterBuffer(const TData* __restrict__ p_buffer, - TData* __restrict__ p_dst) const + __device__ void RunStoreThreadBuffer(const TData* __restrict__ p_buffer, + TData* __restrict__ p_dst) const { constexpr auto thread_sub_tensor_lengths = SubLengths{}; @@ -312,10 +312,10 @@ struct BlockwiseGenericTensorSliceCopy_v1_deprecated template __device__ void Run(const TData* __restrict__ p_src, TData* __restrict__ p_dst) const { - TData p_buffer[GetRegisterBufferSize()]; + TData p_buffer[GetThreadBufferSize()]; - RunLoadRegisterBuffer(p_src, p_buffer); - RunStoreRegisterBuffer(p_buffer, p_dst); + RunLoadThreadBuffer(p_src, p_buffer); + RunStoreThreadBuffer(p_buffer, p_dst); } // When moving the slicing windows along a merged dimension, if the strides of the diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp 
b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index 6a61c2c05..378473e1f 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -6,14 +6,6 @@ #include "tensor_descriptor_helper.hpp" #include "tensor_coordinate.hpp" -#ifndef CK_USE_AMD_INTRINSIC -#define CK_USE_AMD_INTRINSIC 1 -#endif - -#ifndef CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC -#define CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC 1 -#endif - namespace ck { // This version use multi-index transformation diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp index 0310addd3..c70929f3f 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp @@ -80,11 +80,11 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2_deprecated mDstSliceOrigin = dst_slice_origin; } - template - __device__ void Run(const TData* p_src, TData* p_dst) const + template + __device__ void Run(const SrcData* p_src, DstData* p_dst) const { - using src_vector_t = typename vector_type::MemoryType; - using dst_vector_t = typename vector_type::MemoryType; + using src_vector_t = typename vector_type::MemoryType; + using dst_vector_t = typename vector_type::MemoryType; constexpr auto vector_access_dim = Number{}; @@ -96,46 +96,6 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2_deprecated constexpr auto long_vector_access_lengths = SliceLengths::Modify( vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size); -#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 - static_ford{}([&]( - auto long_vector_access_id) { - - // data id w.r.t slicing-window - constexpr auto long_vector_data_begin_id = long_vector_access_id.Modify( - vector_access_dim, long_vector_access_id[vector_access_dim] * long_vector_size); - - // buffer to hold a long-vector - TData p_long_vector[long_vector_size]; - - // load data from src to the long-vector buffer - static_for<0, long_vector_size / src_data_per_access, 1>{}([&](auto i) { - constexpr auto scalar_id = typename uniform_sequence_gen::type{}.Modify( - vector_access_dim, i * src_data_per_access); - - const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex( - mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id)); - - constexpr index_t buffer_offset = i * src_data_per_access; - - *reinterpret_cast(&p_long_vector[buffer_offset]) = - *reinterpret_cast(&p_src[src_offset]); - }); - - // store data from the long-vector buffer to dst - static_for<0, long_vector_size / dst_data_per_access, 1>{}([&](auto i) { - constexpr auto scalar_id = typename uniform_sequence_gen::type{}.Modify( - vector_access_dim, i * dst_data_per_access); - - constexpr index_t buffer_offset = i * dst_data_per_access; - - const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex( - mDstSliceOrigin + (long_vector_data_begin_id + scalar_id)); - - *reinterpret_cast(&p_dst[dst_offset]) = - *reinterpret_cast(&p_long_vector[buffer_offset]); - }); - }); -#else ford{}( [&](auto long_vector_access_id) { @@ -145,7 +105,8 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2_deprecated long_vector_size * long_vector_access_id[vector_access_dim]; // buffer to hold a long-vector - 
TData p_long_vector[long_vector_size]; + SrcData p_src_long_vector[long_vector_size]; + DstData p_dst_long_vector[long_vector_size]; // load data from src to the long-vector buffer for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i) @@ -158,10 +119,16 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2_deprecated const index_t buffer_offset = i * src_data_per_access; - *reinterpret_cast(&p_long_vector[buffer_offset]) = + *reinterpret_cast(&p_src_long_vector[buffer_offset]) = *reinterpret_cast(&p_src[src_offset]); } + // type conversion + for(index_t i = 0; i < long_vector_size; ++i) + { + p_dst_long_vector[i] = type_convert{}(p_src_long_vector[i]); + } + // store data from the long-vector buffer to dst for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i) { @@ -174,10 +141,9 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2_deprecated mDstSliceOrigin + (long_vector_data_begin_id + scalar_id)); *reinterpret_cast(&p_dst[dst_offset]) = - *reinterpret_cast(&p_long_vector[buffer_offset]); + *reinterpret_cast(&p_dst_long_vector[buffer_offset]); } }); -#endif } private: diff --git a/composable_kernel/include/utility/amd_inline_asm.hpp b/composable_kernel/include/utility/amd_inline_asm.hpp index c764b27d2..006659710 100644 --- a/composable_kernel/include/utility/amd_inline_asm.hpp +++ b/composable_kernel/include/utility/amd_inline_asm.hpp @@ -77,6 +77,35 @@ __device__ void __outer_product_1x2(half4_t a, half4_t b0, half4_t b1, float& c0 "1"(c1)); // 3rd Src Acc registers for 2 half2 registers } +// outer-product: c[i,j] += inner_product(a[i], b[j]) +__device__ void __outer_product_1x4(half2_t a, + half2_t b0, + half2_t b1, + half2_t b2, + half2_t b3, + float& c0, + float& c1, + float& c2, + float& c3) +{ + asm volatile("\n \ + v_dot2_f32_f16 %0, %4, %5 %0\n \ + v_dot2_f32_f16 %1, %4, %6 %1\n \ + v_dot2_f32_f16 %2, %4, %7 %2\n \ + v_dot2_f32_f16 %3, %4, %8 %3\n \ + " + : "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3) // Dest registers + : "v"(a), // 1st Src register for 1 half2 registers + "v"(b0), // 2nd Src register + "v"(b1), + "v"(b2), + "v"(b3), + "0"(c0), // 3rd Src register + "1"(c1), + "2"(c2), + "3"(c3)); +} + // outer-product: c[i,j] += inner_product(a[i], b[j]) __device__ void __outer_product_1x4(half4_t a, half4_t b0, diff --git a/composable_kernel/include/utility/amd_intrinsic.hpp b/composable_kernel/include/utility/amd_intrinsic.hpp index a5bbd8782..d161edd98 100644 --- a/composable_kernel/include/utility/amd_intrinsic.hpp +++ b/composable_kernel/include/utility/amd_intrinsic.hpp @@ -70,9 +70,9 @@ __device__ float __buffer_load(const float* p_src_block, // fill in byte 0 - 1 *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); // fill in byte 2 - reinterpret_cast(&src_block_setting)[2] = -1; + reinterpret_cast(&src_block_setting)[2] = -1; // fill in byte 3 - reinterpret_cast(&src_block_setting)[3] = 0x00027000; + reinterpret_cast(&src_block_setting)[3] = 0x00027000; asm volatile("\n \ buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \ @@ -92,9 +92,9 @@ __device__ float __buffer_load(const float* p_src_block, // fill in byte 0 - 1 *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); // fill in byte 2 - reinterpret_cast(&src_block_setting)[2] = -1; + reinterpret_cast(&src_block_setting)[2] = -1; // fill in byte 3 - reinterpret_cast(&src_block_setting)[3] = 0x00027000; + reinterpret_cast(&src_block_setting)[3] = 0x00027000; dst = __llvm_amdgcn_buffer_load( src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false); @@ 
-118,9 +118,9 @@ __device__ float2_t __buffer_load(const float* p_src_block, // fill in byte 0 - 1 *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); // fill in byte 2 - reinterpret_cast(&src_block_setting)[2] = -1; + reinterpret_cast(&src_block_setting)[2] = -1; // fill in byte 3 - reinterpret_cast(&src_block_setting)[3] = 0x00027000; + reinterpret_cast(&src_block_setting)[3] = 0x00027000; asm volatile("\n \ buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \ @@ -140,9 +140,9 @@ __device__ float2_t __buffer_load(const float* p_src_block, // fill in byte 0 - 1 *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); // fill in byte 2 - reinterpret_cast(&src_block_setting)[2] = -1; + reinterpret_cast(&src_block_setting)[2] = -1; // fill in byte 3 - reinterpret_cast(&src_block_setting)[3] = 0x00027000; + reinterpret_cast(&src_block_setting)[3] = 0x00027000; dst = __llvm_amdgcn_buffer_loadx2( src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false); @@ -166,9 +166,9 @@ __device__ float4_t __buffer_load(const float* p_src_block, // fill in byte 0 - 1 *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); // fill in byte 2 - reinterpret_cast(&src_block_setting)[2] = -1; + reinterpret_cast(&src_block_setting)[2] = -1; // fill in byte 3 - reinterpret_cast(&src_block_setting)[3] = 0x00027000; + reinterpret_cast(&src_block_setting)[3] = 0x00027000; asm volatile("\n \ buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \ @@ -188,9 +188,9 @@ __device__ float4_t __buffer_load(const float* p_src_block, // fill in byte 0 - 1 *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); // fill in byte 2 - reinterpret_cast(&src_block_setting)[2] = -1; + reinterpret_cast(&src_block_setting)[2] = -1; // fill in byte 3 - reinterpret_cast(&src_block_setting)[3] = 0x00027000; + reinterpret_cast(&src_block_setting)[3] = 0x00027000; dst = __llvm_amdgcn_buffer_loadx4( src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false); @@ -213,9 +213,9 @@ __device__ void __buffer_store(const float& src, // fill in byte 0 - 1 *reinterpret_cast(&dst_block_setting) = p_dst_block; // fill in byte 2 - reinterpret_cast(&dst_block_setting)[2] = -1; + reinterpret_cast(&dst_block_setting)[2] = -1; // fill in byte 3 - reinterpret_cast(&dst_block_setting)[3] = 0x00027000; + reinterpret_cast(&dst_block_setting)[3] = 0x00027000; asm volatile("\n \ buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \ @@ -233,9 +233,9 @@ __device__ void __buffer_store(const float& src, // fill in byte 0 - 1 *reinterpret_cast(&dst_block_setting) = p_dst_block; // fill in byte 2 - reinterpret_cast(&dst_block_setting)[2] = -1; + reinterpret_cast(&dst_block_setting)[2] = -1; // fill in byte 3 - reinterpret_cast(&dst_block_setting)[3] = 0x00027000; + reinterpret_cast(&dst_block_setting)[3] = 0x00027000; __llvm_amdgcn_buffer_store( src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false); @@ -256,9 +256,9 @@ __device__ void __buffer_store(const float2_t& src, // fill in byte 0 - 1 *reinterpret_cast(&dst_block_setting) = p_dst_block; // fill in byte 2 - reinterpret_cast(&dst_block_setting)[2] = -1; + reinterpret_cast(&dst_block_setting)[2] = -1; // fill in byte 3 - reinterpret_cast(&dst_block_setting)[3] = 0x00027000; + reinterpret_cast(&dst_block_setting)[3] = 0x00027000; asm volatile("\n \ buffer_store_dwordx2 %1, %2, %0, %3 offen offset:0 \n \ @@ -276,9 +276,9 @@ __device__ void __buffer_store(const float2_t& src, // fill in byte 0 - 
1 *reinterpret_cast(&dst_block_setting) = p_dst_block; // fill in byte 2 - reinterpret_cast(&dst_block_setting)[2] = -1; + reinterpret_cast(&dst_block_setting)[2] = -1; // fill in byte 3 - reinterpret_cast(&dst_block_setting)[3] = 0x00027000; + reinterpret_cast(&dst_block_setting)[3] = 0x00027000; __llvm_amdgcn_buffer_storex2( src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false); @@ -299,9 +299,9 @@ __device__ void __buffer_store(const float4_t& src, // fill in byte 0 - 1 *reinterpret_cast(&dst_block_setting) = p_dst_block; // fill in byte 2 - reinterpret_cast(&dst_block_setting)[2] = -1; + reinterpret_cast(&dst_block_setting)[2] = -1; // fill in byte 3 - reinterpret_cast(&dst_block_setting)[3] = 0x00027000; + reinterpret_cast(&dst_block_setting)[3] = 0x00027000; asm volatile("\n \ buffer_store_dwordx4 %1, %2, %0, %3 offen offset:0 \n \ @@ -319,9 +319,9 @@ __device__ void __buffer_store(const float4_t& src, // fill in byte 0 - 1 *reinterpret_cast(&dst_block_setting) = p_dst_block; // fill in byte 2 - reinterpret_cast(&dst_block_setting)[2] = -1; + reinterpret_cast(&dst_block_setting)[2] = -1; // fill in byte 3 - reinterpret_cast(&dst_block_setting)[3] = 0x00027000; + reinterpret_cast(&dst_block_setting)[3] = 0x00027000; __llvm_amdgcn_buffer_storex4( src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false); diff --git a/composable_kernel/include/utility/config.amd.hpp.in b/composable_kernel/include/utility/config.amd.hpp.in index 1da362b81..beb9e083b 100644 --- a/composable_kernel/include/utility/config.amd.hpp.in +++ b/composable_kernel/include/utility/config.amd.hpp.in @@ -20,15 +20,6 @@ #define CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM 1 #endif -// AMD XDLOPS -#ifndef CK_USE_AMD_XDLOPS -#define CK_USE_AMD_XDLOPS 1 -#endif - -#ifndef CK_USE_AMD_XDLOPS_INLINE_ASM -#define CK_USE_AMD_XDLOPS_INLINE_ASM 1 -#endif - // AMD llvm intrinsic #ifndef CK_USE_AMD_INTRINSIC #define CK_USE_AMD_INTRINSIC 1 @@ -38,10 +29,18 @@ #define CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC 1 #endif +// AMD XDLOPS +#ifndef CK_USE_AMD_XDLOPS +#define CK_USE_AMD_XDLOPS 1 +#endif + +#ifndef CK_USE_AMD_XDLOPS_INLINE_ASM +#define CK_USE_AMD_XDLOPS_INLINE_ASM 1 +#endif + // experimental implementation #define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 1 -#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1 -#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0 +#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0 diff --git a/composable_kernel/include/utility/float_type.hpp b/composable_kernel/include/utility/float_type.hpp new file mode 100644 index 000000000..06368305d --- /dev/null +++ b/composable_kernel/include/utility/float_type.hpp @@ -0,0 +1,108 @@ +#ifndef CK_FLOAT_TYPE_AMD_HPP +#define CK_FLOAT_TYPE_AMD_HPP + +namespace ck { + +// For some reason, HIP compiler need this definition to generate optimal ISA +// float +typedef float float2_t __attribute__((ext_vector_type(2))); +typedef float float4_t __attribute__((ext_vector_type(4))); +typedef float float32_t __attribute__((ext_vector_type(32))); + +// float16 +typedef _Float16 half2_t __attribute__((ext_vector_type(2))); +typedef _Float16 half4_t __attribute__((ext_vector_type(4))); + +// bfloat16 +typedef ushort ushort2_t 
__attribute__((ext_vector_type(2))); +typedef ushort ushort4_t __attribute__((ext_vector_type(4))); + +// data type conversion +template +struct type_convert +{ + template + __device__ T operator()(X x) const + { + return static_cast(x); + } +}; + +template <> +template <> +__device__ float type_convert::operator()(ushort x) const +{ + return bfloat16_to_float(x); +} + +template <> +template <> +__device__ ushort type_convert::operator()(float x) const +{ + return float_to_bfloat16(x); +} + +template +struct inner_product_with_conversion +{ + static constexpr auto convert = type_convert(); + + __device__ T operator()(float a, float b) const { return convert(a) * convert(b); } + + __device__ T operator()(half2_t a, half2_t b) const + { + const half* p_a_half = reinterpret_cast(&a); + const half* p_b_half = reinterpret_cast(&b); + + T acc = 0; + for(index_t v = 0; v < 2; ++v) + { + acc += convert(p_a_half[v]) * convert(p_b_half[v]); + } + + return acc; + } + + __device__ T operator()(half4_t a, half4_t b) const + { + const half* p_a_half = reinterpret_cast(&a); + const half* p_b_half = reinterpret_cast(&b); + + T acc = 0; + for(index_t v = 0; v < 4; ++v) + { + acc += convert(p_a_half[v]) * convert(p_b_half[v]); + } + return acc; + } + + __device__ T operator()(ushort2_t a, ushort2_t b) const + { + const ushort* p_a_bfloat16 = reinterpret_cast(&a); + const ushort* p_b_bfloat16 = reinterpret_cast(&b); + + T acc = 0; + for(index_t v = 0; v < 2; ++v) + { + acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]); + } + + return acc; + } + + __device__ T operator()(ushort4_t a, ushort4_t b) const + { + const ushort* p_a_bfloat16 = reinterpret_cast(&a); + const ushort* p_b_bfloat16 = reinterpret_cast(&b); + + T acc = 0; + for(index_t v = 0; v < 4; ++v) + { + acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]); + } + return acc; + } +}; + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/print_array.hpp b/composable_kernel/include/utility/print_array.hpp new file mode 100644 index 000000000..34769af2f --- /dev/null +++ b/composable_kernel/include/utility/print_array.hpp @@ -0,0 +1,177 @@ +#ifndef CK_ARRAY_HELPER_HPP +#define CK_ARRAY_HELPER_HPP + +#include "array.hpp" + +namespace ck { + +template +__host__ __device__ void print_array(const char* s, Array a) +{ + constexpr index_t nsize = a.GetSize(); + + static_assert(nsize > 0 && nsize <= 10, "wrong!"); + + static_if{}([&](auto) { printf("%s size %u, {%u}\n", s, nsize, a[0]); }); + + static_if{}([&](auto) { printf("%s size %u, {%u %u}\n", s, nsize, a[0], a[1]); }); + + static_if{}( + [&](auto) { printf("%s size %u, {%u %u %u}\n", s, nsize, a[0], a[1], a[2]); }); + + static_if{}( + [&](auto) { printf("%s size %u, {%u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3]); }); + + static_if{}([&](auto) { + printf("%s size %u, {%u %u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3], a[4]); + }); + + static_if{}([&](auto) { + printf("%s size %u, {%u %u %u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3], a[4], a[5]); + }); + + static_if{}([&](auto) { + printf("%s size %u, {%u %u %u %u %u %u %u}\n", + s, + nsize, + a[0], + a[1], + a[2], + a[3], + a[4], + a[5], + a[6]); + }); + + static_if{}([&](auto) { + printf("%s size %u, {%u %u %u %u %u %u %u %u}\n", + s, + nsize, + a[0], + a[1], + a[2], + a[3], + a[4], + a[5], + a[6], + a[7]); + }); + + static_if{}([&](auto) { + printf("%s size %u, {%u %u %u %u %u %u %u %u %u}\n", + s, + nsize, + a[0], + a[1], + a[2], + a[3], + a[4], + a[5], + a[6], + a[7], + a[8]); + }); + + 
static_if{}([&](auto) { + printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n", + s, + nsize, + a[0], + a[1], + a[2], + a[3], + a[4], + a[5], + a[6], + a[7], + a[8], + a[9]); + }); +} + +template +__host__ __device__ void print_array(const char* s, Array a) +{ + constexpr index_t nsize = a.GetSize(); + + static_assert(nsize > 0 && nsize <= 10, "wrong!"); + + static_if{}([&](auto) { printf("%s size %d, {%d}\n", s, nsize, a[0]); }); + + static_if{}([&](auto) { printf("%s size %d, {%d %d}\n", s, nsize, a[0], a[1]); }); + + static_if{}( + [&](auto) { printf("%s size %d, {%d %d %d}\n", s, nsize, a[0], a[1], a[2]); }); + + static_if{}( + [&](auto) { printf("%s size %d, {%d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3]); }); + + static_if{}([&](auto) { + printf("%s size %d, {%d %d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3], a[4]); + }); + + static_if{}([&](auto) { + printf("%s size %d, {%d %d %d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3], a[4], a[5]); + }); + + static_if{}([&](auto) { + printf("%s size %d, {%d %d %d %d %d %d %d}\n", + s, + nsize, + a[0], + a[1], + a[2], + a[3], + a[4], + a[5], + a[6]); + }); + + static_if{}([&](auto) { + printf("%s size %d, {%d %d %d %d %d %d %d %d}\n", + s, + nsize, + a[0], + a[1], + a[2], + a[3], + a[4], + a[5], + a[6], + a[7]); + }); + + static_if{}([&](auto) { + printf("%s size %d, {%d %d %d %d %d %d %d %d %d}\n", + s, + nsize, + a[0], + a[1], + a[2], + a[3], + a[4], + a[5], + a[6], + a[7], + a[8]); + }); + + static_if{}([&](auto) { + printf("%s size %d, {%d %d %d %d %d %d %d %d %d %d}\n", + s, + nsize, + a[0], + a[1], + a[2], + a[3], + a[4], + a[5], + a[6], + a[7], + a[8], + a[9]); + }); +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/print_sequence.hpp b/composable_kernel/include/utility/print_sequence.hpp new file mode 100644 index 000000000..71abfea1f --- /dev/null +++ b/composable_kernel/include/utility/print_sequence.hpp @@ -0,0 +1,46 @@ +#ifndef CK_SEQUENCE_HELPER_HPP +#define CK_SEQUENCE_HELPER_HPP + +#include "sequence.hpp" + +namespace ck { + +template +__host__ __device__ void print_sequence(const char* s, Sequence) +{ + constexpr index_t nsize = Sequence::Size(); + + static_assert(nsize <= 10, "wrong!"); + + static_if{}([&](auto) { printf("%s size %u, {}\n", s, nsize, Xs...); }); + + static_if{}([&](auto) { printf("%s size %u, {%u}\n", s, nsize, Xs...); }); + + static_if{}([&](auto) { printf("%s size %u, {%u %u}\n", s, nsize, Xs...); }); + + static_if{}([&](auto) { printf("%s size %u, {%u %u %u}\n", s, nsize, Xs...); }); + + static_if{}([&](auto) { printf("%s size %u, {%u %u %u %u}\n", s, nsize, Xs...); }); + + static_if{}( + [&](auto) { printf("%s size %u, {%u %u %u %u %u}\n", s, nsize, Xs...); }); + + static_if{}( + [&](auto) { printf("%s size %u, {%u %u %u %u %u %u}\n", s, nsize, Xs...); }); + + static_if{}( + [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u}\n", s, nsize, Xs...); }); + + static_if{}( + [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); }); + + static_if{}( + [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); }); + + static_if{}( + [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); }); +} + +} // namespace ck + +#endif diff --git a/composable_kernel/include/utility/vector_type.hpp b/composable_kernel/include/utility/vector_type.hpp index 01c6539b2..e9b3fe36d 100644 --- a/composable_kernel/include/utility/vector_type.hpp +++ b/composable_kernel/include/utility/vector_type.hpp @@ -9,6 
+9,10 @@ namespace ck { template struct vector_type { + typedef struct + { + T scalar[N]; + } MemoryType; }; template <> @@ -29,7 +33,7 @@ struct vector_type { using MemoryType = float2_t; - union Data + union DataType { MemoryType vector; float scalar[2]; @@ -44,7 +48,7 @@ struct vector_type __host__ __device__ static MemoryType Pack(float s0, float s1) { - Data data; + DataType data; data.scalar[0] = s0; data.scalar[1] = s1; return data.vector; @@ -56,6 +60,8 @@ struct vector_type { using MemoryType = float4_t; + __host__ __device__ static constexpr index_t GetSize() { return 4; } + template __host__ __device__ static void SetScalar(MemoryType& v, float s, Number) { @@ -65,23 +71,142 @@ struct vector_type }; template <> -struct vector_type +struct vector_type { - using MemoryType = const float; + using MemoryType = half; + + template + __host__ __device__ static void SetScalar(MemoryType& v, half s, Number) + { + static_assert(I < 1, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } }; template <> -struct vector_type +struct vector_type { - using MemoryType = const float2_t; + using MemoryType = half2_t; + + union DataType + { + MemoryType vector; + half scalar[2]; + }; + + template + __host__ __device__ static void SetScalar(MemoryType& v, half s, Number) + { + static_assert(I < 2, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } + + __host__ __device__ static MemoryType Pack(half s0, half s1) + { + DataType data; + data.scalar[0] = s0; + data.scalar[1] = s1; + return data.vector; + } }; template <> -struct vector_type +struct vector_type { - using MemoryType = const float4_t; + using MemoryType = half4_t; + + union DataType + { + MemoryType vector; + half scalar[4]; + }; + + template + __host__ __device__ static void SetScalar(MemoryType& v, half s, Number) + { + static_assert(I < 4, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } + + __host__ __device__ static MemoryType Pack(half s0, half s1, half s2, half s3) + { + DataType data; + data.scalar[0] = s0; + data.scalar[1] = s1; + data.scalar[2] = s2; + data.scalar[3] = s3; + return data.vector; + } }; +template <> +struct vector_type +{ + using MemoryType = ushort; + + template + __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number) + { + static_assert(I < 1, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } +}; + +template <> +struct vector_type +{ + using MemoryType = ushort2_t; + + union DataType + { + MemoryType vector; + ushort scalar[2]; + }; + + template + __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number) + { + static_assert(I < 2, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } + + __host__ __device__ static MemoryType Pack(ushort s0, ushort s1) + { + DataType data; + data.scalar[0] = s0; + data.scalar[1] = s1; + return data.vector; + } +}; + +template <> +struct vector_type +{ + using MemoryType = ushort4_t; + + union DataType + { + MemoryType vector; + ushort scalar[4]; + }; + + template + __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number) + { + static_assert(I < 4, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } + + __host__ __device__ static MemoryType Pack(ushort s0, ushort s1, ushort s2, ushort s3) + { + DataType data; + data.scalar[0] = s0; + data.scalar[1] = s1; + data.scalar[2] = s2; + data.scalar[3] = s3; + return data.vector; + } +}; } // namespace ck #endif diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp index d361db801..ccff9e725 100644 
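A minimal usage sketch for the vector_type specializations above, assuming float_type.hpp and vector_type.hpp are included and that the half/half2_t types resolve as elsewhere in this series (pack_and_patch is only an illustrative name, not part of the patch). Pack builds a half2_t from two fp16 scalars, and SetScalar overwrites one lane selected by a compile-time Number<> index:

    namespace ck {

    // sketch only: pack two fp16 scalars, then rewrite lane 1 in place
    __device__ half2_t pack_and_patch(half a, half b, half c)
    {
        half2_t v = vector_type<half, 2>::Pack(a, b);        // v = {a, b}
        vector_type<half, 2>::SetScalar(v, c, Number<1>{});  // v = {a, c}; lane index is bounds-checked by static_assert
        return v;
    }

    } // namespace ck

The ushort2_t/ushort4_t (bfloat16) specializations expose the same Pack/SetScalar surface, which is what lets the copy and GEMM paths stay agnostic of the element type.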
--- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp @@ -3,6 +3,7 @@ #include "device.hpp" #include "tensor.hpp" #include "gridwise_convolution_kernel_wrapper.hpp" +#include "convolution_common.hpp" #include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp" template {}; + GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer< + GridSize, + BlockSize, + T, + T, + decltype(in_nchw_desc), + decltype(wei_kcyx_desc), + decltype(out_nkhw_desc), + ConvStrides, + ConvDilations, + LeftPads, + RightPads, + ConvolutionDirection::Forward, + BPerBlock, + KPerBlock, + EPerBlock, + GemmNRepeat, + GemmMPerThreadSubC, + GemmNPerThreadSubC, + GemmMLevel0Cluster, + GemmNLevel0Cluster, + GemmMLevel1Cluster, + GemmNLevel1Cluster, + GemmKPerThreadLoop, + GemmDataPerReadA, + GemmDataPerReadB, + InBlockCopySubLengths_E_N1_B_N2, + InBlockCopyClusterLengths_E_N1_B_N2, + InBlockCopyThreadClusterArrangeOrder, + InBlockCopySrcAccessOrder, + InBlockCopyDstAccessOrder, + InBlockCopySrcDataPerRead_B, + InBlockCopyDstDataPerWrite_N2, + WeiBlockCopySubLengths_E_K, + WeiBlockCopyClusterLengths_E_K, + WeiBlockCopyThreadClusterArrangeOrder, + WeiBlockCopySrcAccessOrder, + WeiBlockCopyDstAccessOrder, + WeiBlockCopySrcDataPerRead_E, + WeiBlockCopyDstDataPerWrite_K>{}; for(index_t i = 0; i < nrepeat; ++i) { diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index 006d13bc6..957134842 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -295,7 +295,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>; -#elif 0 +#elif 1 // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81% constexpr index_t N = 128; @@ -341,7 +341,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<3, 0>; using RightPads = Sequence<3, 0>; -#elif 1 +#elif 0 // 1x7 filter, 0x3 pad, 17x17 input constexpr index_t N = 128; constexpr index_t C = 128; @@ -438,7 +438,7 @@ int main(int argc, char* argv[]) #elif 0 device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw( (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); -#elif 0 +#elif 1 device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(in_nchw_desc, in_nchw, wei_kcyx_desc, From 906b384018d980cb13257852f03614404dcf1529 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 7 Oct 2019 11:04:58 -0500 Subject: [PATCH 12/20] refactor for nvidia build --- ..._v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 88 ++++---- ...kcyx_nkhw_lds_double_buffer_deprecated.hpp | 1 - .../ConstantMatrixDescriptor.hpp | 2 +- .../tensor_description/tensor_coordinate.hpp | 4 +- .../tensor_coordinate_deprecated.hpp | 4 +- .../tensor_descriptor_helper.hpp | 12 +- .../include/utility/common_header.hpp | 3 +- .../include/utility/float_type.amd.hpp.in | 202 +++++++++++++++++ .../include/utility/float_type.hpp | 108 --------- .../include/utility/float_type.nvidia.hpp.in | 106 +++++++++ .../include/utility/vector_type.hpp | 212 ------------------ driver/src/driver.cpp | 2 +- 12 files changed, 365 insertions(+), 379 deletions(-) delete mode 100644 composable_kernel/include/utility/float_type.hpp delete mode 100644 composable_kernel/include/utility/vector_type.hpp diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp 
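Each driver case toggled above pins a full problem description (N, C, Hi, Wi, K, filter size, strides, dilations, pads) and states the expected output size in its comment. The relationship is the usual convolution output-size formula; a small standalone check, not part of the patch:

#include <cstdio>

// ho = (hi + left_pad + right_pad - dilation*(y - 1) - 1) / stride + 1
constexpr int conv_out_size(int in, int left_pad, int right_pad, int filter, int dilation, int stride)
{
    return (in + left_pad + right_pad - dilation * (filter - 1) - 1) / stride + 1;
}

int main()
{
    // the "3x3 filter, 2x2 stride, 35x35 input, 17x17 output" case enabled in driver.cpp
    printf("ho = %d\n", conv_out_size(35, 0, 0, 3, 1, 2)); // prints ho = 17
    return 0;
}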
b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp index f3a98d773..09d275913 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -12,6 +12,49 @@ namespace ck { +template +struct make_wei_e_k_global_desc_v4r1; + +template <> +struct make_wei_e_k_global_desc_v4r1 +{ + template + __device__ constexpr auto operator()(WeiDesc) const + { + constexpr auto I1 = Number<1>{}; + constexpr auto I3 = Number<3>{}; + + return reorder_tensor_descriptor_given_upper2lower( + unfold_tensor_descriptor(WeiDesc{}, I1, I3), Sequence<1, 0>{}); + } +}; + +template <> +struct make_wei_e_k_global_desc_v4r1 +{ + template + __device__ constexpr auto operator()(WeiDesc) const + { + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto wei_k_c_y_x_global_desc = WeiDesc{}; + + constexpr index_t K = wei_k_c_y_x_global_desc.GetLength(I0); + constexpr index_t C = wei_k_c_y_x_global_desc.GetLength(I1); + constexpr index_t Y = wei_k_c_y_x_global_desc.GetLength(I2); + constexpr index_t X = wei_k_c_y_x_global_desc.GetLength(I3); + + return transform_tensor_descriptor( + unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I2, I3), + make_tuple(Merge>{}, PassThrough{}), + make_tuple(Sequence<1, 2>{}, Sequence<0>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } +}; + template struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer { - template - struct make_wei_e_k_global_desc; - - template <> - struct make_wei_e_k_global_desc - { - template - __device__ constexpr auto operator()(WeiDesc) const - { - constexpr auto I1 = Number<1>{}; - constexpr auto I3 = Number<3>{}; - - return reorder_tensor_descriptor_given_upper2lower( - unfold_tensor_descriptor(WeiDesc{}, I1, I3), Sequence<1, 0>{}); - } - }; - - template <> - struct make_wei_e_k_global_desc - { - template - __device__ constexpr auto operator()(WeiDesc) const - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - constexpr auto wei_k_c_y_x_global_desc = WeiDesc{}; - - constexpr index_t K = wei_k_c_y_x_global_desc.GetLength(I0); - constexpr index_t C = wei_k_c_y_x_global_desc.GetLength(I1); - constexpr index_t Y = wei_k_c_y_x_global_desc.GetLength(I2); - constexpr index_t X = wei_k_c_y_x_global_desc.GetLength(I3); - - return transform_tensor_descriptor( - unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I2, I3), - make_tuple(Merge>{}, PassThrough{}), - make_tuple(Sequence<1, 2>{}, Sequence<0>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - }; - __device__ void Run(const Float* const __restrict__ p_in_global, const Float* const __restrict__ p_wei_global, Float* const __restrict__ p_out_global) const @@ -234,7 +234,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer // It is constructed differently, depending on whether forward or backward weight // convolution constexpr auto wei_e_k_global_desc = - make_wei_e_k_global_desc{}(wei_k_c_y_x_global_desc); + make_wei_e_k_global_desc_v4r1{}(wei_k_c_y_x_global_desc); // tensor descriptor in LDS, dst of blockwise copy // be careful of LDS alignment diff --git 
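Hoisting make_wei_e_k_global_desc_v4r1 out of the gridwise struct does not change what it computes: both specializations build a 2-D E x K view of the packed K-C-Y-X weight tensor, with E = C*Y*X in the forward direction. In plain index arithmetic the forward mapping looks like the sketch below (hypothetical helpers, assuming a fully packed row-major KCYX layout; the real code expresses this through unfold/reorder descriptor transforms):

// Offset of weight element (k, c, y, x) in a packed KCYX tensor.
constexpr int kcyx_offset(int k, int c, int y, int x, int C, int Y, int X)
{
    return ((k * C + c) * Y + y) * X + x;
}

// Offset of the same element seen through the E x K view, with e = (c*Y + y)*X + x.
constexpr int e_k_offset(int e, int k, int C, int Y, int X)
{
    return k * (C * Y * X) + e;
}

int main()
{
    constexpr int C = 3, Y = 2, X = 2;
    constexpr int k = 4, c = 1, y = 1, x = 0;
    constexpr int e = (c * Y + y) * X + x;
    static_assert(kcyx_offset(k, c, y, x, C, Y, X) == e_k_offset(e, k, C, Y, X),
                  "the E x K view addresses the same memory");
    return 0;
}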
a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp index 267e8e0a6..db92631a3 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp @@ -67,7 +67,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_dep constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - constexpr auto I5 = Number<5>{}; constexpr auto True = integral_constant{}; diff --git a/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp b/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp index 0ebd9dc4a..e2a5836ed 100644 --- a/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp +++ b/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp @@ -60,7 +60,7 @@ __host__ __device__ constexpr auto template __host__ __device__ constexpr auto - make_ConstantMatrixDescriptor(ConstantTensorDescriptor_deprecated) +make_ConstantMatrixDescriptor(ConstantTensorDescriptor_deprecated) { using TDesc = ConstantTensorDescriptor_deprecated; static_assert(TDesc::GetNumOfDimension() == 2, "wrong"); diff --git a/composable_kernel/include/tensor_description/tensor_coordinate.hpp b/composable_kernel/include/tensor_description/tensor_coordinate.hpp index 5ce5bc700..4b3a60c67 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate.hpp @@ -215,7 +215,7 @@ struct TensorCoordinate private: template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(NativeTensorDescriptor) + MakeDummyTensorCoordinate(NativeTensorDescriptor) { return NativeTensorCoordinate>( make_zero_array()); @@ -223,7 +223,7 @@ struct TensorCoordinate template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(TransformedTensorDescriptor) + MakeDummyTensorCoordinate(TransformedTensorDescriptor) { return TransformedTensorCoordinate>( make_zero_array()); diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp index 69659445a..da02abdd5 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp @@ -327,14 +327,14 @@ struct TensorCoordinate_deprecated private: template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantTensorDescriptor_deprecated) + MakeDummyTensorCoordinate(ConstantTensorDescriptor_deprecated) { return NormalTensorCoordinate_deprecated>(); } template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor_deprecated) + MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor_deprecated) { return MergedTensorCoordinate_deprecated< ConstantMergedTensorDescriptor_deprecated>(); diff --git a/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp b/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp index d7ef38672..1597e4c57 
100644 --- a/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp +++ b/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp @@ -64,10 +64,10 @@ template __host__ __device__ constexpr auto - reorder_transformed_tensor_descriptor_impl(LowerTensorDescriptor, - Sequence, - Sequence, - Sequence) +reorder_transformed_tensor_descriptor_impl(LowerTensorDescriptor, + Sequence, + Sequence, + Sequence) { return TransformedTensorDescriptor...>, @@ -78,7 +78,7 @@ __host__ __device__ constexpr auto // reorder a NativeTensorDescriptor template __host__ __device__ constexpr auto - reorder_tensor_descriptor_given_lower2upper(NativeTensorDescriptor, MapLower2Upper) +reorder_tensor_descriptor_given_lower2upper(NativeTensorDescriptor, MapLower2Upper) { static_assert(is_valid_sequence_map{}, "wrong! MapLower2Upper is not a valid map"); @@ -96,7 +96,7 @@ __host__ __device__ constexpr auto // reorder a TransformedTensorDescriptor template __host__ __device__ constexpr auto - reorder_tensor_descriptor_given_lower2upper(TransformedTensorDescriptor, MapLower2Upper) +reorder_tensor_descriptor_given_lower2upper(TransformedTensorDescriptor, MapLower2Upper) { static_assert(is_valid_sequence_map{}, "wrong! MapLower2Upper is not a valid map"); diff --git a/composable_kernel/include/utility/common_header.hpp b/composable_kernel/include/utility/common_header.hpp index 441eecae9..20584c335 100644 --- a/composable_kernel/include/utility/common_header.hpp +++ b/composable_kernel/include/utility/common_header.hpp @@ -2,14 +2,13 @@ #define CK_COMMON_HEADER_HPP #include "config.hpp" -#include "float_type.hpp" #include "utility.hpp" #include "integral_constant.hpp" #include "number.hpp" +#include "float_type.hpp" #include "type.hpp" #include "tuple.hpp" #include "math.hpp" -#include "vector_type.hpp" #include "sequence.hpp" #include "sequence_helper.hpp" #include "array.hpp" diff --git a/composable_kernel/include/utility/float_type.amd.hpp.in b/composable_kernel/include/utility/float_type.amd.hpp.in index 06368305d..537d17daf 100644 --- a/composable_kernel/include/utility/float_type.amd.hpp.in +++ b/composable_kernel/include/utility/float_type.amd.hpp.in @@ -17,6 +17,208 @@ typedef _Float16 half4_t __attribute__((ext_vector_type(4))); typedef ushort ushort2_t __attribute__((ext_vector_type(2))); typedef ushort ushort4_t __attribute__((ext_vector_type(4))); +template +struct vector_type +{ + typedef struct + { + T scalar[N]; + } MemoryType; +}; + +template <> +struct vector_type +{ + using MemoryType = float; + + template + __host__ __device__ static void SetScalar(MemoryType& v, float s, Number) + { + static_assert(I < 1, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } +}; + +template <> +struct vector_type +{ + using MemoryType = float2_t; + + union DataType + { + MemoryType vector; + float scalar[2]; + }; + + template + __host__ __device__ static void SetScalar(MemoryType& v, float s, Number) + { + static_assert(I < 2, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } + + __host__ __device__ static MemoryType Pack(float s0, float s1) + { + DataType data; + data.scalar[0] = s0; + data.scalar[1] = s1; + return data.vector; + } +}; + +template <> +struct vector_type +{ + using MemoryType = float4_t; + + __host__ __device__ static constexpr index_t GetSize() { return 4; } + + template + __host__ __device__ static void SetScalar(MemoryType& v, float s, Number) + { + static_assert(I < 4, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } +}; + +template <> +struct vector_type +{ + using 
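SetScalar in the specializations above takes the element index as a Number<I> so that an out-of-range index is rejected by static_assert at compile time rather than at run time. A standalone analogue using std::integral_constant in place of ck::Number (illustration only):

#include <cstdio>
#include <type_traits>

typedef float float4_t __attribute__((ext_vector_type(4)));

template <int I>
void set_scalar(float4_t& v, float s, std::integral_constant<int, I>)
{
    static_assert(I < 4, "wrong");           // index checked at compile time
    *(reinterpret_cast<float*>(&v) + I) = s; // same element-write pattern as the patch
}

int main()
{
    float4_t v = {0, 0, 0, 0};
    set_scalar(v, 3.0f, std::integral_constant<int, 2>{});
    printf("%f\n", v.z); // prints 3.000000
    return 0;
}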
MemoryType = half; + + template + __host__ __device__ static void SetScalar(MemoryType& v, half s, Number) + { + static_assert(I < 1, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } +}; + +template <> +struct vector_type +{ + using MemoryType = half2_t; + + union DataType + { + MemoryType vector; + half scalar[2]; + }; + + template + __host__ __device__ static void SetScalar(MemoryType& v, half s, Number) + { + static_assert(I < 2, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } + + __host__ __device__ static MemoryType Pack(half s0, half s1) + { + DataType data; + data.scalar[0] = s0; + data.scalar[1] = s1; + return data.vector; + } +}; + +template <> +struct vector_type +{ + using MemoryType = half4_t; + + union DataType + { + MemoryType vector; + half scalar[4]; + }; + + template + __host__ __device__ static void SetScalar(MemoryType& v, half s, Number) + { + static_assert(I < 4, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } + + __host__ __device__ static MemoryType Pack(half s0, half s1, half s2, half s3) + { + DataType data; + data.scalar[0] = s0; + data.scalar[1] = s1; + data.scalar[2] = s2; + data.scalar[3] = s3; + return data.vector; + } +}; + +template <> +struct vector_type +{ + using MemoryType = ushort; + + template + __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number) + { + static_assert(I < 1, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } +}; + +template <> +struct vector_type +{ + using MemoryType = ushort2_t; + + union DataType + { + MemoryType vector; + ushort scalar[2]; + }; + + template + __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number) + { + static_assert(I < 2, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } + + __host__ __device__ static MemoryType Pack(ushort s0, ushort s1) + { + DataType data; + data.scalar[0] = s0; + data.scalar[1] = s1; + return data.vector; + } +}; + +template <> +struct vector_type +{ + using MemoryType = ushort4_t; + + union DataType + { + MemoryType vector; + ushort scalar[4]; + }; + + template + __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number) + { + static_assert(I < 4, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } + + __host__ __device__ static MemoryType Pack(ushort s0, ushort s1, ushort s2, ushort s3) + { + DataType data; + data.scalar[0] = s0; + data.scalar[1] = s1; + data.scalar[2] = s2; + data.scalar[3] = s3; + return data.vector; + } +}; + // data type conversion template struct type_convert diff --git a/composable_kernel/include/utility/float_type.hpp b/composable_kernel/include/utility/float_type.hpp deleted file mode 100644 index 06368305d..000000000 --- a/composable_kernel/include/utility/float_type.hpp +++ /dev/null @@ -1,108 +0,0 @@ -#ifndef CK_FLOAT_TYPE_AMD_HPP -#define CK_FLOAT_TYPE_AMD_HPP - -namespace ck { - -// For some reason, HIP compiler need this definition to generate optimal ISA -// float -typedef float float2_t __attribute__((ext_vector_type(2))); -typedef float float4_t __attribute__((ext_vector_type(4))); -typedef float float32_t __attribute__((ext_vector_type(32))); - -// float16 -typedef _Float16 half2_t __attribute__((ext_vector_type(2))); -typedef _Float16 half4_t __attribute__((ext_vector_type(4))); - -// bfloat16 -typedef ushort ushort2_t __attribute__((ext_vector_type(2))); -typedef ushort ushort4_t __attribute__((ext_vector_type(4))); - -// data type conversion -template -struct type_convert -{ - template - __device__ T operator()(X x) const - { - return static_cast(x); - } -}; - -template <> -template <> -__device__ 
float type_convert::operator()(ushort x) const -{ - return bfloat16_to_float(x); -} - -template <> -template <> -__device__ ushort type_convert::operator()(float x) const -{ - return float_to_bfloat16(x); -} - -template -struct inner_product_with_conversion -{ - static constexpr auto convert = type_convert(); - - __device__ T operator()(float a, float b) const { return convert(a) * convert(b); } - - __device__ T operator()(half2_t a, half2_t b) const - { - const half* p_a_half = reinterpret_cast(&a); - const half* p_b_half = reinterpret_cast(&b); - - T acc = 0; - for(index_t v = 0; v < 2; ++v) - { - acc += convert(p_a_half[v]) * convert(p_b_half[v]); - } - - return acc; - } - - __device__ T operator()(half4_t a, half4_t b) const - { - const half* p_a_half = reinterpret_cast(&a); - const half* p_b_half = reinterpret_cast(&b); - - T acc = 0; - for(index_t v = 0; v < 4; ++v) - { - acc += convert(p_a_half[v]) * convert(p_b_half[v]); - } - return acc; - } - - __device__ T operator()(ushort2_t a, ushort2_t b) const - { - const ushort* p_a_bfloat16 = reinterpret_cast(&a); - const ushort* p_b_bfloat16 = reinterpret_cast(&b); - - T acc = 0; - for(index_t v = 0; v < 2; ++v) - { - acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]); - } - - return acc; - } - - __device__ T operator()(ushort4_t a, ushort4_t b) const - { - const ushort* p_a_bfloat16 = reinterpret_cast(&a); - const ushort* p_b_bfloat16 = reinterpret_cast(&b); - - T acc = 0; - for(index_t v = 0; v < 4; ++v) - { - acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]); - } - return acc; - } -}; - -} // namespace ck -#endif diff --git a/composable_kernel/include/utility/float_type.nvidia.hpp.in b/composable_kernel/include/utility/float_type.nvidia.hpp.in index fbb93a437..8be8c704a 100644 --- a/composable_kernel/include/utility/float_type.nvidia.hpp.in +++ b/composable_kernel/include/utility/float_type.nvidia.hpp.in @@ -1,6 +1,8 @@ #ifndef CK_FLOAT_TYPE_NVIDIA_HPP #define CK_FLOAT_TYPE_NVIDIA_HPP +#include "number.hpp" + namespace ck { // For some reason, CUDA need this definition, otherwise @@ -14,6 +16,110 @@ using float4_t = float4; // float16 using half2_t = half2; +template +struct vector_type +{ + typedef struct + { + T scalar[N]; + } MemoryType; +}; + +template <> +struct vector_type +{ + using MemoryType = float; + + template + __host__ __device__ static void SetScalar(MemoryType& v, float s, Number) + { + static_assert(I < 1, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } +}; + +template <> +struct vector_type +{ + using MemoryType = float2_t; + + union DataType + { + MemoryType vector; + float scalar[2]; + }; + + template + __host__ __device__ static void SetScalar(MemoryType& v, float s, Number) + { + static_assert(I < 2, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } + + __host__ __device__ static MemoryType Pack(float s0, float s1) + { + DataType data; + data.scalar[0] = s0; + data.scalar[1] = s1; + return data.vector; + } +}; + +template <> +struct vector_type +{ + using MemoryType = float4_t; + + __host__ __device__ static constexpr index_t GetSize() { return 4; } + + template + __host__ __device__ static void SetScalar(MemoryType& v, float s, Number) + { + static_assert(I < 4, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } +}; + +template <> +struct vector_type +{ + using MemoryType = half; + + template + __host__ __device__ static void SetScalar(MemoryType& v, half s, Number) + { + static_assert(I < 1, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } +}; + +template <> +struct vector_type +{ + using 
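The deleted float_type.hpp above stores bfloat16 values in plain ushort and converts through bfloat16_to_float / float_to_bfloat16 before multiplying in inner_product_with_conversion. For reference, a minimal host-side sketch of that convention (truncating conversion; hypothetical function names, shown only to make the ushort-as-bfloat16 storage concrete):

#include <cstdint>
#include <cstdio>
#include <cstring>

// bfloat16 is the upper 16 bits of an IEEE-754 binary32 value.
inline float bfloat16_to_float_sketch(uint16_t x)
{
    uint32_t bits = static_cast<uint32_t>(x) << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

// Truncating float -> bfloat16 (production code usually adds rounding).
inline uint16_t float_to_bfloat16_sketch(float f)
{
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    return static_cast<uint16_t>(bits >> 16);
}

int main()
{
    uint16_t b = float_to_bfloat16_sketch(1.5f);
    printf("%f\n", bfloat16_to_float_sketch(b)); // 1.5 is exactly representable in bfloat16
    return 0;
}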
MemoryType = half2_t; + + union DataType + { + MemoryType vector; + half scalar[2]; + }; + + template + __host__ __device__ static void SetScalar(MemoryType& v, half s, Number) + { + static_assert(I < 2, "wrong"); + *(reinterpret_cast(&v) + I) = s; + } + + __host__ __device__ static MemoryType Pack(half s0, half s1) + { + DataType data; + data.scalar[0] = s0; + data.scalar[1] = s1; + return data.vector; + } +}; + // data type conversion template struct type_convert diff --git a/composable_kernel/include/utility/vector_type.hpp b/composable_kernel/include/utility/vector_type.hpp deleted file mode 100644 index e9b3fe36d..000000000 --- a/composable_kernel/include/utility/vector_type.hpp +++ /dev/null @@ -1,212 +0,0 @@ -#ifndef CK_VECTOR_TYPE_HPP -#define CK_VECTOR_TYPE_HPP - -#include "config.hpp" -#include "integral_constant.hpp" - -namespace ck { - -template -struct vector_type -{ - typedef struct - { - T scalar[N]; - } MemoryType; -}; - -template <> -struct vector_type -{ - using MemoryType = float; - - template - __host__ __device__ static void SetScalar(MemoryType& v, float s, Number) - { - static_assert(I < 1, "wrong"); - *(reinterpret_cast(&v) + I) = s; - } -}; - -template <> -struct vector_type -{ - using MemoryType = float2_t; - - union DataType - { - MemoryType vector; - float scalar[2]; - }; - - template - __host__ __device__ static void SetScalar(MemoryType& v, float s, Number) - { - static_assert(I < 2, "wrong"); - *(reinterpret_cast(&v) + I) = s; - } - - __host__ __device__ static MemoryType Pack(float s0, float s1) - { - DataType data; - data.scalar[0] = s0; - data.scalar[1] = s1; - return data.vector; - } -}; - -template <> -struct vector_type -{ - using MemoryType = float4_t; - - __host__ __device__ static constexpr index_t GetSize() { return 4; } - - template - __host__ __device__ static void SetScalar(MemoryType& v, float s, Number) - { - static_assert(I < 4, "wrong"); - *(reinterpret_cast(&v) + I) = s; - } -}; - -template <> -struct vector_type -{ - using MemoryType = half; - - template - __host__ __device__ static void SetScalar(MemoryType& v, half s, Number) - { - static_assert(I < 1, "wrong"); - *(reinterpret_cast(&v) + I) = s; - } -}; - -template <> -struct vector_type -{ - using MemoryType = half2_t; - - union DataType - { - MemoryType vector; - half scalar[2]; - }; - - template - __host__ __device__ static void SetScalar(MemoryType& v, half s, Number) - { - static_assert(I < 2, "wrong"); - *(reinterpret_cast(&v) + I) = s; - } - - __host__ __device__ static MemoryType Pack(half s0, half s1) - { - DataType data; - data.scalar[0] = s0; - data.scalar[1] = s1; - return data.vector; - } -}; - -template <> -struct vector_type -{ - using MemoryType = half4_t; - - union DataType - { - MemoryType vector; - half scalar[4]; - }; - - template - __host__ __device__ static void SetScalar(MemoryType& v, half s, Number) - { - static_assert(I < 4, "wrong"); - *(reinterpret_cast(&v) + I) = s; - } - - __host__ __device__ static MemoryType Pack(half s0, half s1, half s2, half s3) - { - DataType data; - data.scalar[0] = s0; - data.scalar[1] = s1; - data.scalar[2] = s2; - data.scalar[3] = s3; - return data.vector; - } -}; - -template <> -struct vector_type -{ - using MemoryType = ushort; - - template - __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number) - { - static_assert(I < 1, "wrong"); - *(reinterpret_cast(&v) + I) = s; - } -}; - -template <> -struct vector_type -{ - using MemoryType = ushort2_t; - - union DataType - { - MemoryType vector; - ushort scalar[2]; - 
}; - - template - __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number) - { - static_assert(I < 2, "wrong"); - *(reinterpret_cast(&v) + I) = s; - } - - __host__ __device__ static MemoryType Pack(ushort s0, ushort s1) - { - DataType data; - data.scalar[0] = s0; - data.scalar[1] = s1; - return data.vector; - } -}; - -template <> -struct vector_type -{ - using MemoryType = ushort4_t; - - union DataType - { - MemoryType vector; - ushort scalar[4]; - }; - - template - __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number) - { - static_assert(I < 4, "wrong"); - *(reinterpret_cast(&v) + I) = s; - } - - __host__ __device__ static MemoryType Pack(ushort s0, ushort s1, ushort s2, ushort s3) - { - DataType data; - data.scalar[0] = s0; - data.scalar[1] = s1; - data.scalar[2] = s2; - data.scalar[3] = s3; - return data.vector; - } -}; -} // namespace ck - -#endif diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index 957134842..1a819e112 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -438,7 +438,7 @@ int main(int argc, char* argv[]) #elif 0 device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw( (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); -#elif 1 +#elif 0 device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(in_nchw_desc, in_nchw, wei_kcyx_desc, From 18a2c5cb87b92103281e660220f3c581c8b78145 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 7 Oct 2019 15:37:33 -0500 Subject: [PATCH 13/20] refactor --- .../ConstantMatrixDescriptor.hpp | 2 +- .../tensor_description/tensor_coordinate.hpp | 4 +- .../tensor_coordinate_deprecated.hpp | 4 +- .../tensor_descriptor_helper.hpp | 12 +- .../include/utility/amd_inline_asm.hpp | 2 +- .../include/utility/amd_intrinsic.hpp | 184 +++++++++++------- .../include/utility/array_helper.hpp | 177 ----------------- .../include/utility/common_header.hpp | 2 - .../include/utility/config.amd.hpp.in | 2 +- .../include/utility/print_array.hpp | 4 +- .../include/utility/print_sequence.hpp | 4 +- .../include/utility/sequence_helper.hpp | 46 ----- driver/src/driver.cpp | 2 + 13 files changed, 129 insertions(+), 316 deletions(-) delete mode 100644 composable_kernel/include/utility/array_helper.hpp delete mode 100644 composable_kernel/include/utility/sequence_helper.hpp diff --git a/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp b/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp index e2a5836ed..0ebd9dc4a 100644 --- a/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp +++ b/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp @@ -60,7 +60,7 @@ __host__ __device__ constexpr auto template __host__ __device__ constexpr auto -make_ConstantMatrixDescriptor(ConstantTensorDescriptor_deprecated) + make_ConstantMatrixDescriptor(ConstantTensorDescriptor_deprecated) { using TDesc = ConstantTensorDescriptor_deprecated; static_assert(TDesc::GetNumOfDimension() == 2, "wrong"); diff --git a/composable_kernel/include/tensor_description/tensor_coordinate.hpp b/composable_kernel/include/tensor_description/tensor_coordinate.hpp index 4b3a60c67..5ce5bc700 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate.hpp @@ -215,7 +215,7 @@ struct TensorCoordinate private: template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(NativeTensorDescriptor) + 
MakeDummyTensorCoordinate(NativeTensorDescriptor) { return NativeTensorCoordinate>( make_zero_array()); @@ -223,7 +223,7 @@ struct TensorCoordinate template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(TransformedTensorDescriptor) + MakeDummyTensorCoordinate(TransformedTensorDescriptor) { return TransformedTensorCoordinate>( make_zero_array()); diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp index da02abdd5..69659445a 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp @@ -327,14 +327,14 @@ struct TensorCoordinate_deprecated private: template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantTensorDescriptor_deprecated) + MakeDummyTensorCoordinate(ConstantTensorDescriptor_deprecated) { return NormalTensorCoordinate_deprecated>(); } template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor_deprecated) + MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor_deprecated) { return MergedTensorCoordinate_deprecated< ConstantMergedTensorDescriptor_deprecated>(); diff --git a/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp b/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp index 1597e4c57..d7ef38672 100644 --- a/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp +++ b/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp @@ -64,10 +64,10 @@ template __host__ __device__ constexpr auto -reorder_transformed_tensor_descriptor_impl(LowerTensorDescriptor, - Sequence, - Sequence, - Sequence) + reorder_transformed_tensor_descriptor_impl(LowerTensorDescriptor, + Sequence, + Sequence, + Sequence) { return TransformedTensorDescriptor...>, @@ -78,7 +78,7 @@ reorder_transformed_tensor_descriptor_impl(LowerTensorDescriptor, // reorder a NativeTensorDescriptor template __host__ __device__ constexpr auto -reorder_tensor_descriptor_given_lower2upper(NativeTensorDescriptor, MapLower2Upper) + reorder_tensor_descriptor_given_lower2upper(NativeTensorDescriptor, MapLower2Upper) { static_assert(is_valid_sequence_map{}, "wrong! MapLower2Upper is not a valid map"); @@ -96,7 +96,7 @@ reorder_tensor_descriptor_given_lower2upper(NativeTensorDescriptor, MapLo // reorder a TransformedTensorDescriptor template __host__ __device__ constexpr auto -reorder_tensor_descriptor_given_lower2upper(TransformedTensorDescriptor, MapLower2Upper) + reorder_tensor_descriptor_given_lower2upper(TransformedTensorDescriptor, MapLower2Upper) { static_assert(is_valid_sequence_map{}, "wrong! 
MapLower2Upper is not a valid map"); diff --git a/composable_kernel/include/utility/amd_inline_asm.hpp b/composable_kernel/include/utility/amd_inline_asm.hpp index 006659710..28eaf1f44 100644 --- a/composable_kernel/include/utility/amd_inline_asm.hpp +++ b/composable_kernel/include/utility/amd_inline_asm.hpp @@ -1,7 +1,7 @@ #ifndef CK_AMD_INLINE_ASM_HPP #define CK_AMD_INLINE_ASM_HPP -#include "vector_type.hpp" +#include "float_type.hpp" namespace ck { diff --git a/composable_kernel/include/utility/amd_intrinsic.hpp b/composable_kernel/include/utility/amd_intrinsic.hpp index d161edd98..2575cbc40 100644 --- a/composable_kernel/include/utility/amd_intrinsic.hpp +++ b/composable_kernel/include/utility/amd_intrinsic.hpp @@ -1,10 +1,19 @@ #ifndef CK_AMD_INTRINSIC_HPP #define CK_AMD_INTRINSIC_HPP -#include "vector_type.hpp" +#include "float_type.hpp" namespace ck { +// for buffer_load and buffer_store +template +union BufferLoadStoreDwordConfig +{ + int32x4_t data; + T* address[2]; + int32_t range[4]; +}; + __device__ float __llvm_amdgcn_buffer_load(int32x4_t rsrc, index_t vindex, index_t offset, @@ -66,20 +75,22 @@ __device__ float __buffer_load(const float* p_src_block, index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); index_t src_const_addr_offset = src_const_data_offset * sizeof(float); - int32x4_t src_block_setting{0}; + BufferLoadStoreDwordConfig src_block_config; + // fill in byte 0 - 1 - *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); + src_block_config.address[0] = const_cast(p_src_block); // fill in byte 2 - reinterpret_cast(&src_block_setting)[2] = -1; + src_block_config.range[2] = -1; // fill in byte 3 - reinterpret_cast(&src_block_setting)[3] = 0x00027000; + src_block_config.range[3] = 0x00027000; - asm volatile("\n \ + asm volatile( + "\n \ buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \ s_waitcnt 0 \n \ " - : "=v"(dst) - : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset)); + : "=v"(dst) + : "v"(src_thread_addr_offset), "s"(src_block_config.data), "s"(src_const_addr_offset)); return dst; #else @@ -88,16 +99,17 @@ __device__ float __buffer_load(const float* p_src_block, index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); index_t src_const_addr_offset = src_const_data_offset * sizeof(float); - int32x4_t src_block_setting{0}; + BufferLoadStoreDwordConfig src_block_config; + // fill in byte 0 - 1 - *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); + src_block_config.address[0] = const_cast(p_src_block); // fill in byte 2 - reinterpret_cast(&src_block_setting)[2] = -1; + src_block_config.range[2] = -1; // fill in byte 3 - reinterpret_cast(&src_block_setting)[3] = 0x00027000; + src_block_config.range[3] = 0x00027000; dst = __llvm_amdgcn_buffer_load( - src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false); + src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false); return dst; #endif @@ -114,20 +126,22 @@ __device__ float2_t __buffer_load(const float* p_src_block, index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); index_t src_const_addr_offset = src_const_data_offset * sizeof(float); - int32x4_t src_block_setting{0}; + BufferLoadStoreDwordConfig src_block_config; + // fill in byte 0 - 1 - *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); + src_block_config.address[0] = const_cast(p_src_block); // fill in byte 2 - reinterpret_cast(&src_block_setting)[2] = -1; + 
src_block_config.range[2] = -1; // fill in byte 3 - reinterpret_cast(&src_block_setting)[3] = 0x00027000; + src_block_config.range[3] = 0x00027000; - asm volatile("\n \ + asm volatile( + "\n \ buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \ s_waitcnt 0 \n \ " - : "=v"(dst) - : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset)); + : "=v"(dst) + : "v"(src_thread_addr_offset), "s"(src_block_config.data), "s"(src_const_addr_offset)); return dst; #else @@ -136,16 +150,17 @@ __device__ float2_t __buffer_load(const float* p_src_block, index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); index_t src_const_addr_offset = src_const_data_offset * sizeof(float); - int32x4_t src_block_setting{0}; + BufferLoadStoreDwordConfig src_block_config; + // fill in byte 0 - 1 - *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); + src_block_config.address[0] = const_cast(p_src_block); // fill in byte 2 - reinterpret_cast(&src_block_setting)[2] = -1; + src_block_config.range[2] = -1; // fill in byte 3 - reinterpret_cast(&src_block_setting)[3] = 0x00027000; + src_block_config.range[3] = 0x00027000; dst = __llvm_amdgcn_buffer_loadx2( - src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false); + src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false); return dst; #endif @@ -162,38 +177,41 @@ __device__ float4_t __buffer_load(const float* p_src_block, index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); index_t src_const_addr_offset = src_const_data_offset * sizeof(float); - int32x4_t src_block_setting{0}; + BufferLoadStoreDwordConfig src_block_config; + // fill in byte 0 - 1 - *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); + src_block_config.address[0] = const_cast(p_src_block); // fill in byte 2 - reinterpret_cast(&src_block_setting)[2] = -1; + src_block_config.range[2] = -1; // fill in byte 3 - reinterpret_cast(&src_block_setting)[3] = 0x00027000; + src_block_config.range[3] = 0x00027000; - asm volatile("\n \ + asm volatile( + "\n \ buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \ s_waitcnt 0 \n \ " - : "=v"(dst) - : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset)); + : "=v"(dst) + : "v"(src_thread_addr_offset), "s"(src_block_config.data), "s"(src_const_addr_offset)); return dst; -#elif 1 +#else float4_t dst; index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); index_t src_const_addr_offset = src_const_data_offset * sizeof(float); - int32x4_t src_block_setting{0}; + BufferLoadStoreDwordConfig src_block_config; + // fill in byte 0 - 1 - *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); + src_block_config.address[0] = const_cast(p_src_block); // fill in byte 2 - reinterpret_cast(&src_block_setting)[2] = -1; + src_block_config.range[2] = -1; // fill in byte 3 - reinterpret_cast(&src_block_setting)[3] = 0x00027000; + src_block_config.range[3] = 0x00027000; dst = __llvm_amdgcn_buffer_loadx4( - src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false); + src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false); return dst; #endif @@ -209,19 +227,20 @@ __device__ void __buffer_store(const float& src, index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); - int32x4_t dst_block_setting{0}; + BufferLoadStoreDwordConfig dst_block_config; + // fill 
in byte 0 - 1 - *reinterpret_cast(&dst_block_setting) = p_dst_block; + dst_block_config.address[0] = p_dst_block; // fill in byte 2 - reinterpret_cast(&dst_block_setting)[2] = -1; + dst_block_config.range[2] = -1; // fill in byte 3 - reinterpret_cast(&dst_block_setting)[3] = 0x00027000; + dst_block_config.range[3] = 0x00027000; asm volatile("\n \ buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \ " : - : "s"(dst_block_setting), + : "s"(dst_block_config.data), "v"(src), "v"(dst_thread_addr_offset), "s"(dst_const_addr_offset)); @@ -229,16 +248,21 @@ __device__ void __buffer_store(const float& src, index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); - int32x4_t dst_block_setting{0}; + BufferLoadStoreDwordConfig dst_block_config; + // fill in byte 0 - 1 - *reinterpret_cast(&dst_block_setting) = p_dst_block; + dst_block_config.address[0] = p_dst_block; // fill in byte 2 - reinterpret_cast(&dst_block_setting)[2] = -1; + dst_block_config.range[2] = -1; // fill in byte 3 - reinterpret_cast(&dst_block_setting)[3] = 0x00027000; - - __llvm_amdgcn_buffer_store( - src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false); + dst_block_config.range[3] = 0x00027000; + + __llvm_amdgcn_buffer_store(src, + dst_block_config.data, + 0, + dst_thread_addr_offset + dst_const_addr_offset, + false, + false); #endif } @@ -252,19 +276,20 @@ __device__ void __buffer_store(const float2_t& src, index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); - int32x4_t dst_block_setting{0}; + BufferLoadStoreDwordConfig dst_block_config; + // fill in byte 0 - 1 - *reinterpret_cast(&dst_block_setting) = p_dst_block; + dst_block_config.address[0] = p_dst_block; // fill in byte 2 - reinterpret_cast(&dst_block_setting)[2] = -1; + dst_block_config.range[2] = -1; // fill in byte 3 - reinterpret_cast(&dst_block_setting)[3] = 0x00027000; + dst_block_config.range[3] = 0x00027000; asm volatile("\n \ buffer_store_dwordx2 %1, %2, %0, %3 offen offset:0 \n \ " : - : "s"(dst_block_setting), + : "s"(dst_block_config.data), "v"(src), "v"(dst_thread_addr_offset), "s"(dst_const_addr_offset)); @@ -272,16 +297,21 @@ __device__ void __buffer_store(const float2_t& src, index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); - int32x4_t dst_block_setting{0}; + BufferLoadStoreDwordConfig dst_block_config; + // fill in byte 0 - 1 - *reinterpret_cast(&dst_block_setting) = p_dst_block; + dst_block_config.address[0] = p_dst_block; // fill in byte 2 - reinterpret_cast(&dst_block_setting)[2] = -1; + dst_block_config.range[2] = -1; // fill in byte 3 - reinterpret_cast(&dst_block_setting)[3] = 0x00027000; - - __llvm_amdgcn_buffer_storex2( - src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false); + dst_block_config.range[3] = 0x00027000; + + __llvm_amdgcn_buffer_storex2(src, + dst_block_config.data, + 0, + dst_thread_addr_offset + dst_const_addr_offset, + false, + false); #endif } @@ -295,19 +325,20 @@ __device__ void __buffer_store(const float4_t& src, index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); - int32x4_t dst_block_setting{0}; + BufferLoadStoreDwordConfig dst_block_config; + // fill in byte 0 - 1 - 
*reinterpret_cast(&dst_block_setting) = p_dst_block; + dst_block_config.address[0] = p_dst_block; // fill in byte 2 - reinterpret_cast(&dst_block_setting)[2] = -1; + dst_block_config.range[2] = -1; // fill in byte 3 - reinterpret_cast(&dst_block_setting)[3] = 0x00027000; + dst_block_config.range[3] = 0x00027000; asm volatile("\n \ buffer_store_dwordx4 %1, %2, %0, %3 offen offset:0 \n \ " : - : "s"(dst_block_setting), + : "s"(dst_block_config.data), "v"(src), "v"(dst_thread_addr_offset), "s"(dst_const_addr_offset)); @@ -315,16 +346,21 @@ __device__ void __buffer_store(const float4_t& src, index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); - int32x4_t dst_block_setting{0}; + BufferLoadStoreDwordConfig dst_block_config; + // fill in byte 0 - 1 - *reinterpret_cast(&dst_block_setting) = p_dst_block; + dst_block_config.address[0] = p_dst_block; // fill in byte 2 - reinterpret_cast(&dst_block_setting)[2] = -1; + dst_block_config.range[2] = -1; // fill in byte 3 - reinterpret_cast(&dst_block_setting)[3] = 0x00027000; - - __llvm_amdgcn_buffer_storex4( - src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false); + dst_block_config.range[3] = 0x00027000; + + __llvm_amdgcn_buffer_storex4(src, + dst_block_config.data, + 0, + dst_thread_addr_offset + dst_const_addr_offset, + false, + false); #endif } diff --git a/composable_kernel/include/utility/array_helper.hpp b/composable_kernel/include/utility/array_helper.hpp deleted file mode 100644 index 34769af2f..000000000 --- a/composable_kernel/include/utility/array_helper.hpp +++ /dev/null @@ -1,177 +0,0 @@ -#ifndef CK_ARRAY_HELPER_HPP -#define CK_ARRAY_HELPER_HPP - -#include "array.hpp" - -namespace ck { - -template -__host__ __device__ void print_array(const char* s, Array a) -{ - constexpr index_t nsize = a.GetSize(); - - static_assert(nsize > 0 && nsize <= 10, "wrong!"); - - static_if{}([&](auto) { printf("%s size %u, {%u}\n", s, nsize, a[0]); }); - - static_if{}([&](auto) { printf("%s size %u, {%u %u}\n", s, nsize, a[0], a[1]); }); - - static_if{}( - [&](auto) { printf("%s size %u, {%u %u %u}\n", s, nsize, a[0], a[1], a[2]); }); - - static_if{}( - [&](auto) { printf("%s size %u, {%u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3]); }); - - static_if{}([&](auto) { - printf("%s size %u, {%u %u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3], a[4]); - }); - - static_if{}([&](auto) { - printf("%s size %u, {%u %u %u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3], a[4], a[5]); - }); - - static_if{}([&](auto) { - printf("%s size %u, {%u %u %u %u %u %u %u}\n", - s, - nsize, - a[0], - a[1], - a[2], - a[3], - a[4], - a[5], - a[6]); - }); - - static_if{}([&](auto) { - printf("%s size %u, {%u %u %u %u %u %u %u %u}\n", - s, - nsize, - a[0], - a[1], - a[2], - a[3], - a[4], - a[5], - a[6], - a[7]); - }); - - static_if{}([&](auto) { - printf("%s size %u, {%u %u %u %u %u %u %u %u %u}\n", - s, - nsize, - a[0], - a[1], - a[2], - a[3], - a[4], - a[5], - a[6], - a[7], - a[8]); - }); - - static_if{}([&](auto) { - printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n", - s, - nsize, - a[0], - a[1], - a[2], - a[3], - a[4], - a[5], - a[6], - a[7], - a[8], - a[9]); - }); -} - -template -__host__ __device__ void print_array(const char* s, Array a) -{ - constexpr index_t nsize = a.GetSize(); - - static_assert(nsize > 0 && nsize <= 10, "wrong!"); - - static_if{}([&](auto) { printf("%s size %d, {%d}\n", s, nsize, a[0]); }); - - static_if{}([&](auto) { printf("%s 
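All of the __buffer_load / __buffer_store variants above fill the same 128-bit buffer resource descriptor; the BufferLoadStoreDwordConfig union simply replaces the earlier reinterpret_cast sequences with named views of that storage. A host-side sketch of the layout being assembled (assuming the usual convention for these descriptors: 64-bit base address in dwords 0-1, num_records in dword 2, config bits in dword 3; illustration only, not a device-usable descriptor):

#include <cstdint>
#include <cstdio>

// The 128-bit descriptor viewed three ways, mirroring the union in the patch.
union BufferResourceSketch
{
    uint32_t dword[4];    // raw dwords handed to buffer_load/buffer_store
    const float* base[2]; // dwords 0-1: base address (on a 64-bit target)
    int32_t range[4];     // dword 2: num_records, dword 3: stride/format config
};

int main()
{
    static const float some_buffer[16] = {};

    BufferResourceSketch srd = {};
    srd.base[0]  = some_buffer;  // bytes 0-7: base pointer of the block
    srd.range[2] = -1;           // bytes 8-11: num_records = -1, effectively no range limit
    srd.range[3] = 0x00027000;   // bytes 12-15: the config word used by the patch
    printf("config dword = 0x%08x\n", srd.dword[3]);
    return 0;
}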
size %d, {%d %d}\n", s, nsize, a[0], a[1]); }); - - static_if{}( - [&](auto) { printf("%s size %d, {%d %d %d}\n", s, nsize, a[0], a[1], a[2]); }); - - static_if{}( - [&](auto) { printf("%s size %d, {%d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3]); }); - - static_if{}([&](auto) { - printf("%s size %d, {%d %d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3], a[4]); - }); - - static_if{}([&](auto) { - printf("%s size %d, {%d %d %d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3], a[4], a[5]); - }); - - static_if{}([&](auto) { - printf("%s size %d, {%d %d %d %d %d %d %d}\n", - s, - nsize, - a[0], - a[1], - a[2], - a[3], - a[4], - a[5], - a[6]); - }); - - static_if{}([&](auto) { - printf("%s size %d, {%d %d %d %d %d %d %d %d}\n", - s, - nsize, - a[0], - a[1], - a[2], - a[3], - a[4], - a[5], - a[6], - a[7]); - }); - - static_if{}([&](auto) { - printf("%s size %d, {%d %d %d %d %d %d %d %d %d}\n", - s, - nsize, - a[0], - a[1], - a[2], - a[3], - a[4], - a[5], - a[6], - a[7], - a[8]); - }); - - static_if{}([&](auto) { - printf("%s size %d, {%d %d %d %d %d %d %d %d %d %d}\n", - s, - nsize, - a[0], - a[1], - a[2], - a[3], - a[4], - a[5], - a[6], - a[7], - a[8], - a[9]); - }); -} - -} // namespace ck -#endif diff --git a/composable_kernel/include/utility/common_header.hpp b/composable_kernel/include/utility/common_header.hpp index 20584c335..e01ec6efc 100644 --- a/composable_kernel/include/utility/common_header.hpp +++ b/composable_kernel/include/utility/common_header.hpp @@ -10,9 +10,7 @@ #include "tuple.hpp" #include "math.hpp" #include "sequence.hpp" -#include "sequence_helper.hpp" #include "array.hpp" -#include "array_helper.hpp" #include "functional.hpp" #include "functional2.hpp" #include "functional3.hpp" diff --git a/composable_kernel/include/utility/config.amd.hpp.in b/composable_kernel/include/utility/config.amd.hpp.in index beb9e083b..7800f5293 100644 --- a/composable_kernel/include/utility/config.amd.hpp.in +++ b/composable_kernel/include/utility/config.amd.hpp.in @@ -31,7 +31,7 @@ // AMD XDLOPS #ifndef CK_USE_AMD_XDLOPS -#define CK_USE_AMD_XDLOPS 1 +#define CK_USE_AMD_XDLOPS 0 #endif #ifndef CK_USE_AMD_XDLOPS_INLINE_ASM diff --git a/composable_kernel/include/utility/print_array.hpp b/composable_kernel/include/utility/print_array.hpp index 34769af2f..b53bbb90f 100644 --- a/composable_kernel/include/utility/print_array.hpp +++ b/composable_kernel/include/utility/print_array.hpp @@ -1,5 +1,5 @@ -#ifndef CK_ARRAY_HELPER_HPP -#define CK_ARRAY_HELPER_HPP +#ifndef CK_PRINT_ARRAY_HPP +#define CK_PRINT_ARRAY_HPP #include "array.hpp" diff --git a/composable_kernel/include/utility/print_sequence.hpp b/composable_kernel/include/utility/print_sequence.hpp index 71abfea1f..463f9d097 100644 --- a/composable_kernel/include/utility/print_sequence.hpp +++ b/composable_kernel/include/utility/print_sequence.hpp @@ -1,5 +1,5 @@ -#ifndef CK_SEQUENCE_HELPER_HPP -#define CK_SEQUENCE_HELPER_HPP +#ifndef CK_PRINT_SEQUENCE_HPP +#define CK_PRINT_SEQUENCE_HPP #include "sequence.hpp" diff --git a/composable_kernel/include/utility/sequence_helper.hpp b/composable_kernel/include/utility/sequence_helper.hpp deleted file mode 100644 index 71abfea1f..000000000 --- a/composable_kernel/include/utility/sequence_helper.hpp +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef CK_SEQUENCE_HELPER_HPP -#define CK_SEQUENCE_HELPER_HPP - -#include "sequence.hpp" - -namespace ck { - -template -__host__ __device__ void print_sequence(const char* s, Sequence) -{ - constexpr index_t nsize = Sequence::Size(); - - static_assert(nsize <= 10, "wrong!"); - 
- static_if{}([&](auto) { printf("%s size %u, {}\n", s, nsize, Xs...); }); - - static_if{}([&](auto) { printf("%s size %u, {%u}\n", s, nsize, Xs...); }); - - static_if{}([&](auto) { printf("%s size %u, {%u %u}\n", s, nsize, Xs...); }); - - static_if{}([&](auto) { printf("%s size %u, {%u %u %u}\n", s, nsize, Xs...); }); - - static_if{}([&](auto) { printf("%s size %u, {%u %u %u %u}\n", s, nsize, Xs...); }); - - static_if{}( - [&](auto) { printf("%s size %u, {%u %u %u %u %u}\n", s, nsize, Xs...); }); - - static_if{}( - [&](auto) { printf("%s size %u, {%u %u %u %u %u %u}\n", s, nsize, Xs...); }); - - static_if{}( - [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u}\n", s, nsize, Xs...); }); - - static_if{}( - [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); }); - - static_if{}( - [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); }); - - static_if{}( - [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); }); -} - -} // namespace ck - -#endif diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index 1a819e112..3a0eedc64 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -5,6 +5,8 @@ #include #include "config.hpp" #include "ConstantTensorDescriptor_deprecated.hpp" +#include "print_array.hpp" +#include "print_sequence.hpp" #include "device.hpp" #include "conv_common.hpp" #include "host_conv.hpp" From 093306c16292a09b2222c6a588fa9fe64a1be257 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Wed, 9 Oct 2019 20:32:52 -0500 Subject: [PATCH 14/20] bring in more miopen changes --- ..._v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 89 +++--- .../ConstantTensorDescriptor_deprecated.hpp | 10 +- .../tensor_description/tensor_coordinate.hpp | 21 +- .../tensor_description/tensor_descriptor.hpp | 22 +- .../blockwise_generic_tensor_slice_copy.hpp | 56 ++-- .../threadwise_generic_tensor_slice_copy.hpp | 31 +- ...e_generic_tensor_slice_copy_deprecated.hpp | 4 +- .../include/utility/amd_buffer_addressing.hpp | 284 ++++++++++++++++++ .../include/utility/common_header.hpp | 4 +- .../include/utility/config.amd.hpp.in | 14 +- 10 files changed, 429 insertions(+), 106 deletions(-) create mode 100644 composable_kernel/include/utility/amd_buffer_addressing.hpp diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp index 09d275913..18c7f9a39 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -158,7 +158,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer "be violated"); // divide block work by [K, B] - static_assert(K % KPerBlock == 0 && B % BPerBlock == 0 && E % (2 * EPerBlock) == 0, + static_assert(K % KPerBlock == 0 && B % BPerBlock == 0 && E % EPerBlock == 0, "wrong! 
cannot divide work evenly among block"); constexpr index_t KBlockWork = K / KPerBlock; @@ -173,7 +173,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer const index_t b_block_data_on_global = block_work_id[1] * BPerBlock; // input tensor - // global memory + // global tensor in global memory constexpr auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor( in_n_c_hi_wi_global_desc, make_tuple( @@ -190,6 +190,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}, Sequence<4, 5>{}, Sequence<6, 7>{})); + // global tensor in global memory, src of blockwise copy constexpr auto in_e_n1_b_n2_global_desc = transform_tensor_descriptor( in_n0_n1_n2_c_y_ho_x_wo_global_desc, make_tuple(Merge>{}, @@ -199,7 +200,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer make_tuple(Sequence<3, 4, 6>{}, Sequence<1>{}, Sequence<0, 5, 7>{}, Sequence<2>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); - // memory layout descriptor in LDS [E, N1, B, N2], dst of blockwise copy + // block tensor in LDS memory, dst of blockwise copy // be careful of LDS alignment constexpr auto in_e_n1_b_n2_block_desc = make_native_tensor_descriptor_aligned( Sequence{}, Number{}); @@ -210,9 +211,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer static_assert(in_e_n1_b_n2_block_desc.GetStride(I1) % GemmDataPerReadB == 0, "GemmDataPerReadB alignment requirement is not satisfied"); - // input blockwise copy - // slice a merged tensor, reorder and copy to a normal tensor - // this copy operator already has blockwise offset built-in + // input tensor blockwise copy auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v4{}(wei_k_c_y_x_global_desc); - // tensor descriptor in LDS, dst of blockwise copy + // block tensor in LDS memory, dst of blockwise copy // be careful of LDS alignment constexpr auto wei_e_k_block_desc = make_native_tensor_descriptor_aligned( Sequence{}, @@ -248,9 +247,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer static_assert(wei_e_k_block_desc.GetStride(I0) % GemmDataPerReadA == 0, "GemmDataPerReadA alignment requirement is not satisfied"); - // operator for blockwise copy of weight into LDS - // slice a tensor, and copy it into another tensor - // this copy operator already have blockwise offset built-in + // weight tensor blockwise copy auto blockwise_wei_copy = BlockwiseGenericTensorSliceCopy_v4{}, True); - blockwise_wei_copy.MoveSrcSliceWindow(Sequence{}, True); + if(has_two_iteration_left) // if has 2 iteration left + { + Float p_in_thread_buffer[blockwise_in_copy.GetThreadBufferSize()]; + Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()]; - __syncthreads(); + blockwise_in_copy.MoveSrcSliceWindow(Sequence{}, True); + blockwise_wei_copy.MoveSrcSliceWindow(Sequence{}, True); - // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( - p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( - p_wei_global, p_wei_thread_buffer); + __syncthreads(); - // LDS double buffer: GEMM on current data - blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread); + // LDS double buffer: load last data from device mem + blockwise_in_copy.template RunLoadThreadBuffer( + p_in_global, p_in_thread_buffer); + 
blockwise_wei_copy.template RunLoadThreadBuffer( + p_wei_global, p_wei_thread_buffer); + + // LDS double buffer: GEMM on 2nd-last data + blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread); - // LDS double buffer: store next data to LDS - blockwise_in_copy.RunStoreThreadBuffer(p_in_thread_buffer, - p_in_block_double + in_block_space); - blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer, - p_wei_block_double + wei_block_space); + // LDS double buffer: store last data to LDS + blockwise_in_copy.RunStoreThreadBuffer(p_in_thread_buffer, + p_in_block_double + in_block_space); + blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer, + p_wei_block_double + wei_block_space); + + __syncthreads(); - // odd iteration - __syncthreads(); + // LDS double buffer: GEMM on last data + blockwise_gemm.Run(p_wei_block_double + wei_block_space, + p_in_block_double + in_block_space, + p_out_thread); + } + else // if has 1 iteration left + { + __syncthreads(); - // LDS double buffer: GEMM on current data - blockwise_gemm.Run(p_wei_block_double + wei_block_space, - p_in_block_double + in_block_space, - p_out_thread); + // LDS double buffer: GEMM on last data + blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread); + } } // copy output: register to global memory @@ -420,12 +427,12 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer constexpr index_t K1 = GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster; constexpr index_t K0 = K / K1; - // define tensor descriptor for threadwise copy - // output memory layout descriptor in register, src of threadwise copy + // define output tensor descriptor for threadwise copy + // thread output tensor, src of threadwise copy constexpr auto out_k0_k1_n1_b_n2_thread_desc = make_native_tensor_descriptor_packed( Sequence{}); - // output memory layout descriptor in device memory + // global output tensor constexpr auto out_n0_n1_n2_k0_k1_ho_wo_global_desc = transform_tensor_descriptor( out_n_k_ho_wo_global_desc, make_tuple(UnMerge>{}, @@ -435,7 +442,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}, Sequence<6>{})); - // output merged global tensor descriptor, dst of threadwise copy + // global output tensor, dst of threadwise copy constexpr auto out_k0_k1_n1_b_n2_global_desc = transform_tensor_descriptor( out_n0_n1_n2_k0_k1_ho_wo_global_desc, make_tuple(PassThrough{}, diff --git a/composable_kernel/include/tensor_description/ConstantTensorDescriptor_deprecated.hpp b/composable_kernel/include/tensor_description/ConstantTensorDescriptor_deprecated.hpp index d14696414..d745f69f8 100644 --- a/composable_kernel/include/tensor_description/ConstantTensorDescriptor_deprecated.hpp +++ b/composable_kernel/include/tensor_description/ConstantTensorDescriptor_deprecated.hpp @@ -6,7 +6,7 @@ namespace ck { template -__host__ __device__ constexpr auto calculate_tensor_strides_packed_old(Lengths) +__host__ __device__ constexpr auto calculate_tensor_strides_packed_deprecated(Lengths) { return reverse_inclusive_scan_sequence( Lengths{}.PopFront(), math::multiplies{}, Number<1>{}) @@ -19,7 +19,7 @@ __host__ __device__ constexpr auto calculate_tensor_strides_aligned_old(Lengths, constexpr index_t L_back_align = Align * math::integer_divide_ceiler{}(Lengths{}.Back(), Align); - return calculate_tensor_strides_packed_old( + return 
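The reworked epilogue above goes hand in hand with relaxing the earlier divisibility requirement from E % (2 * EPerBlock) to E % EPerBlock: after the paired main loop, either one or two slices of work remain, and each case is now handled explicitly. A schematic of the whole double-buffered schedule in plain C++ (hypothetical load/store/gemm stand-ins; the real kernel overlaps these through LDS and __syncthreads()):

#include <cstdio>

void load(int i)         { printf("load  slice %d into registers\n", i); }
void store(int i, int b) { printf("store slice %d into LDS buffer %d\n", i, b); }
void gemm(int i, int b)  { printf("gemm  on slice %d from LDS buffer %d\n", i, b); }

// While the GEMM consumes one LDS buffer, the next slice is loaded and
// written into the other buffer; the tail handles the last one or two slices.
void run(int num_slices)
{
    load(0);
    store(0, 0); // preload the first slice

    int i = 0;
    for(; i + 2 < num_slices; ++i)
    {
        load(i + 1);
        gemm(i, i % 2);
        store(i + 1, (i + 1) % 2);
    }

    if(i + 2 == num_slices) // two slices of work left
    {
        load(i + 1);
        gemm(i, i % 2);
        store(i + 1, (i + 1) % 2);
        gemm(i + 1, (i + 1) % 2);
    }
    else // one slice left
    {
        gemm(i, i % 2);
    }
}

int main() { run(5); return 0; }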
calculate_tensor_strides_packed_deprecated( Lengths{}.Modify(Number{}, Number{})); } @@ -186,7 +186,7 @@ struct ConstantTensorDescriptor_deprecated { Array multi_id; - using PackedStrides = decltype(calculate_tensor_strides_packed_old(GetLengths())); + using PackedStrides = decltype(calculate_tensor_strides_packed_deprecated(GetLengths())); // calculate index in each of the dimensions in the order of their dimension static_for<0, nDim - 1, 1>{}(lambda_GetMultiIndexFrom1dIndex(id, multi_id)); @@ -467,7 +467,7 @@ struct ConstantTensorDescriptor_deprecated __host__ __device__ static constexpr auto Pack() { - using packed_strides = decltype(calculate_tensor_strides_packed_old(Lengths{})); + using packed_strides = decltype(calculate_tensor_strides_packed_deprecated(Lengths{})); return ConstantTensorDescriptor_deprecated{}; } @@ -491,7 +491,7 @@ struct ConstantTensorDescriptor_deprecated template __host__ __device__ constexpr auto make_ConstantTensorDescriptor_packed(Lengths) { - using Strides = decltype(calculate_tensor_strides_packed_old(Lengths{})); + using Strides = decltype(calculate_tensor_strides_packed_deprecated(Lengths{})); return ConstantTensorDescriptor_deprecated{}; } diff --git a/composable_kernel/include/tensor_description/tensor_coordinate.hpp b/composable_kernel/include/tensor_description/tensor_coordinate.hpp index 5ce5bc700..66dda13c4 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate.hpp @@ -8,9 +8,24 @@ namespace ck { +// A "tensor cooridnate" is an opaque object that represents a "point of location" inside a tensor +// At the bare minimun, user should be able to query the following information from a tensor +// coordinate: +// 1. Tensor descriptor +// 2. Location, represented in the form of multi-index +// 3. Location, represented in the form of the offset to the origin of the tensor +// 4. If the location is inside invalid area or not, i.e. the padding area of an implicitly padded +// tensor is considered invalid, because the padding area doesn't have any physical memory +// allocation +// A tensor cooridnate also provides following functionality: +// 1. Given step size in each dimension, update itself, or return a new tensor cooridnate, so user +// can freely move the "point of location" inside the tensor + +// wrapper class for NativeTensorCoordinate and TransformedTensorCoordinate template struct TensorCoordinate; +// tensor coordinate for native tensor template struct NativeTensorCoordinate { @@ -78,12 +93,10 @@ struct NativeTensorCoordinate return coord; } -#if 0 // tweaking __host__ __device__ static constexpr index_t CalculateOffsetDiff(const Index& idx_diff) { return tensor_desc_type::CalculateOffsetDiff(idx_diff); } -#endif __host__ __device__ static constexpr bool IsUpperIndexMappedToValidOffset() { return true; } @@ -96,6 +109,7 @@ struct NativeTensorCoordinate index_t mOffset; }; +// tensor coordinate for transformed tensor template struct TransformedTensorCoordinate { @@ -177,10 +191,10 @@ struct TransformedTensorCoordinate return coord_up; } -#if 0 // tweaking // Calculate offset diff without updating tensor-coordinate // If idx_up_diff is know at compile time, and has only non-zero entries on linear dimensions, // then all calculation can be done at compile-time. 
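The coordinate interface described above boils down to bookkeeping that a NativeTensorCoordinate does for a plain lengths/strides tensor: keep a multi-index and the matching linear offset, update both when stepped, and report whether the index is still inside the tensor. A self-contained sketch of only that arithmetic (CK's actual classes are templated on the descriptor and have different constructors):

#include <array>
#include <cstdio>

// Illustration of the coordinate bookkeeping for a 4-D packed tensor:
// multi-index, linear offset (dot product with strides), step update, and a
// validity check against the lengths.
int main()
{
    const std::array<int, 4> lengths = {2, 3, 4, 5};
    const std::array<int, 4> strides = {60, 20, 5, 1}; // packed strides

    std::array<int, 4> idx = {1, 2, 0, 3};

    auto offset = [&](const std::array<int, 4>& i) {
        int o = 0;
        for(int d = 0; d < 4; ++d)
            o += i[d] * strides[d];
        return o;
    };

    int off = offset(idx); // 60 + 40 + 0 + 3 = 103

    // moving the coordinate only needs the offset of the step itself
    const std::array<int, 4> step = {0, 0, 1, -2};
    for(int d = 0; d < 4; ++d)
        idx[d] += step[d];
    off += offset(step); // +3 -> 106

    // validity: every index entry inside [0, length)
    bool valid = true;
    for(int d = 0; d < 4; ++d)
        valid = valid && idx[d] >= 0 && idx[d] < lengths[d];

    printf("offset = %d, valid = %d\n", off, valid); // offset = 106, valid = 1
    return 0;
}

The fact that the step contributes only an offset difference is what CalculateOffsetDiff exploits when the step is known at compile time.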
+ // TODO: this function is not compiled to expected ISA __host__ __device__ constexpr index_t CalculateOffsetDiff(const UpperIndex& idx_up_diff) const { // For transformation of multi-index difference, not all transformation functions need to @@ -191,7 +205,6 @@ struct TransformedTensorCoordinate return GetLowerCoordinate().CalculateOffsetDiff(idx_low_diff); } -#endif __host__ __device__ constexpr bool IsUpperIndexMappedToValidOffset() const { diff --git a/composable_kernel/include/tensor_description/tensor_descriptor.hpp b/composable_kernel/include/tensor_description/tensor_descriptor.hpp index e202f73e9..dec7e2b8d 100644 --- a/composable_kernel/include/tensor_description/tensor_descriptor.hpp +++ b/composable_kernel/include/tensor_description/tensor_descriptor.hpp @@ -7,6 +7,8 @@ namespace ck { +// tensor descriptor for "native tensor" +// A "native tensor" is a "true" tensor that can be represented by Lengths and Strides template struct NativeTensorDescriptor { @@ -113,12 +115,10 @@ struct NativeTensorDescriptor __host__ __device__ static constexpr auto GetNonLinearDimensions() { return Sequence<>{}; } -#if 0 __host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups() { return Tuple<>{}; } -#endif __host__ __device__ static constexpr bool IsUpperIndexMappedToValidOffset(const Index& /* idx */) @@ -127,14 +127,11 @@ struct NativeTensorDescriptor } }; -// LowerTensorDescriptor -// Transforms: Tuple -// LowerDimensionIds: Tuple> -// UpperDimensionIds: Tuple> -template +// Tensor descriptor for "transformed tensor" +template + typename LowDimensionIds, // Tuple> + typename UpDimensionIds> // Tuple> struct TransformedTensorDescriptor { using type = TransformedTensorDescriptor; @@ -412,6 +409,7 @@ struct TransformedTensorDescriptor { #if 0 // create tuple of linear dimension masks, for all transformations + // TODO: this doesn't compile, because transform_tuples() complain about constexpr constexpr auto tuple_of_linear_dimension_mask = transform_tuples(lambda_get_linear_dimension_mask_of_single_tranform{}, Transforms{}, @@ -419,7 +417,7 @@ struct TransformedTensorDescriptor UpDimensionIds{}); #else // create tuple of linear dimension masks, for all transformations - // TODO: this is a hack, transform_tuples() doesn't compile, complain about constexpr + // TODO: this is a hack constexpr auto tuple_of_linear_dimension_mask = dummy_transform_tuples_impl( lambda_get_linear_dimension_mask_of_single_tranform{}, Transforms{}, @@ -465,7 +463,7 @@ struct TransformedTensorDescriptor #if 0 __host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups() { - // not implemented + // TODO: not implemented } #endif diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index 15faeaebf..38ec363a7 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -73,18 +73,22 @@ struct BlockwiseGenericTensorSliceCopy_v4 __device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src, ThreadBufferData* p_thread_buffer) const { -#if 1 - mThreadwiseLoad.template Run(p_block_src, p_thread_buffer); -#else // tweaking - mThreadwiseLoad.template Run_optimized_src_address_calculation( - p_block_src, p_thread_buffer); -#endif + if(mThreadwiseStore.HasWorkingOptimizedAddressCalculation()) + { + mThreadwiseLoad + .template 
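To make the native/transformed split concrete: a native descriptor is pure lengths-and-strides arithmetic, while a transform such as Merge exposes several lower dimensions as one upper dimension by div/mod index splitting. The sketch below (plain ints, hypothetical lengths) shows a single merge over a packed [C, Y, X] block; when the lower dimensions are not packed, this div/mod carry logic is roughly what makes a merged dimension one of the "non-linear" dimensions tracked by the masks above.

#include <cstdio>

// One Merge transform in isolation: upper index e over E = C*Y*X maps onto
// lower indices (c, y, x), and the native part maps those onto an offset.
int main()
{
    const int C = 2, Y = 3, X = 3;
    const int stride_c = 9, stride_y = 3, stride_x = 1; // packed [C, Y, X]

    const int E = C * Y * X;
    for(int e = 0; e < E; ++e)
    {
        int c = e / (Y * X);       // split the merged index ...
        int y = (e % (Y * X)) / X;
        int x = e % X;

        int offset = c * stride_c + y * stride_y + x * stride_x; // native part

        if(offset != e) // packed case: merged index equals the offset
            printf("unexpected mismatch at e=%d\n", e);
    }
    printf("merge over a packed [C, Y, X] block is offset-preserving\n");
    return 0;
}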
Run_optimized_src_address_calculation( + p_block_src, p_thread_buffer); + } + else + { + mThreadwiseLoad.template Run(p_block_src, p_thread_buffer); + } } template (p_thread_buffer, p_block_dst); -#else // tweaking - mThreadwiseStore.template Run_optimized_dst_address_calculation( - p_thread_buffer, p_block_dst); -#endif + if(mThreadwiseStore.HasWorkingOptimizedAddressCalculation()) + { + mThreadwiseStore + .template Run_optimized_dst_address_calculation( + p_thread_buffer, p_block_dst); + } + else + { + mThreadwiseStore.template Run(p_thread_buffer, p_block_dst); + } } template {}([&](auto) { -#if CK_USE_AMD_INTRINSIC && CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC +#if CK_USE_AMD_BUFFER_ADDRESSING *reinterpret_cast(&p_src_long_vector[buffer_offset]) = __buffer_load( p_src, src_coord.GetOffset(), 0); @@ -160,7 +160,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 if(dst_coord.IsUpperIndexMappedToValidOffset()) { static_if{}([&](auto) { -#if CK_USE_AMD_INTRINSIC && CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC +#if CK_USE_AMD_BUFFER_ADDRESSING __buffer_store( *reinterpret_cast(&p_dst_long_vector[buffer_offset]), p_dst, @@ -194,6 +194,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // Will do padding check on src data: Read 0 if src data is in padding area. // Will do padding check on dst data: No write if dst data is in paddin area. // This version is optimized for address calculation of src tensor + // TODO: this function is not compiled to expected ISA template {}([&](auto) { -#if CK_USE_AMD_INTRINSIC && CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC +#if CK_USE_AMD_BUFFER_ADDRESSING *reinterpret_cast(&p_src_long_vector[buffer_offset]) = __buffer_load( p_src, src_nonlinear_coord.GetOffset(), src_linear_offset); @@ -352,6 +353,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // Will do padding check on src data: Read 0 if src data is in padding area. // Will do padding check on dst data: No write if dst data is in paddin area. // This version is optimized for address calculation of dst tensor + // TODO: this function is not compiled to expected ISA template {}([&](auto) { -#if CK_USE_AMD_INTRINSIC && CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC +#if CK_USE_AMD_BUFFER_ADDRESSING __buffer_store( *reinterpret_cast(&p_dst_long_vector[buffer_offset]), p_dst, @@ -506,6 +508,15 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 }); } + __device__ static constexpr bool HasWorkingOptimizedAddressCalculation() + { +#if CK_EXPERIMENTAL_THREADWISE_COPY_V4R2_USE_OPTIMIZED_ADDRESS_CACLULATION // tweaking + return true; +#else + return false; +#endif + } + template __device__ void MoveSrcSliceWindow(const T& step_sizes_, integral_constant) diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp index c70929f3f..7d85b3838 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp @@ -331,7 +331,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1_deprecated // algorithm) // 3. src_merged_offset can be runtime value (no assumption imposed) static_if{}([&](auto) { -#if CK_USE_AMD_INTRINSIC && CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC +#if CK_USE_AMD_BUFFER_ADDRESSING vector_data = __buffer_load( p_src, src_merged_offset, src_normal_offset); #else @@ -440,7 +440,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1_deprecated // algorithm) // 3. 
dst_merged_offset can be runtime value (no assumption imposed) static_if{}([&](auto) { -#if CK_USE_AMD_INTRINSIC && CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC +#if CK_USE_AMD_BUFFER_ADDRESSING __buffer_store( vector_data, p_dst, dst_merged_offset, dst_normal_offset); #else diff --git a/composable_kernel/include/utility/amd_buffer_addressing.hpp b/composable_kernel/include/utility/amd_buffer_addressing.hpp new file mode 100644 index 000000000..4bb6f2693 --- /dev/null +++ b/composable_kernel/include/utility/amd_buffer_addressing.hpp @@ -0,0 +1,284 @@ +#ifndef CK_AMD_BUFFER_ADDRESSING_HPP +#define CK_AMD_BUFFER_ADDRESSING_HPP + +#include "float_type.hpp" + +namespace ck { + +// For 128bit SGPRs in buffer_load and buffer_store instructions +// https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions +template +union BufferLoadStoreDwordConfig +{ + int32x4_t data; + T* address[2]; + int32_t range[4]; +}; + +__device__ float __llvm_amdgcn_buffer_load(int32x4_t rsrc, + index_t vindex, + index_t offset, + bool glc, + bool slc) __asm("llvm.amdgcn.buffer.load"); + +__device__ float2_t __llvm_amdgcn_buffer_loadx2(int32x4_t rsrc, + index_t vindex, + index_t offset, + bool glc, + bool slc) __asm("llvm.amdgcn.buffer.load.dwordx2"); + +__device__ float4_t __llvm_amdgcn_buffer_loadx4(int32x4_t rsrc, + index_t vindex, + index_t offset, + bool glc, + bool slc) __asm("llvm.amdgcn.buffer.load.dwordx4"); + +__device__ void __llvm_amdgcn_buffer_store(float vdata, + int32x4_t rsrc, + index_t vindex, + index_t offset, + bool glc, + bool slc) __asm("llvm.amdgcn.buffer.store"); + +__device__ void __llvm_amdgcn_buffer_storex2(float2_t vdata, + int32x4_t rsrc, + index_t vindex, + index_t offset, + bool glc, + bool slc) __asm("llvm.amdgcn.buffer.store.dwordx2"); + +__device__ void __llvm_amdgcn_buffer_storex4(float4_t vdata, + int32x4_t rsrc, + index_t vindex, + index_t offset, + bool glc, + bool slc) __asm("llvm.amdgcn.buffer.store.dwordx4"); + +template +__device__ typename vector_type::MemoryType +__buffer_load(const T* p_src_block, index_t src_thread_data_offset, index_t src_const_data_offset); + +template +__device__ void __buffer_store(const typename vector_type::MemoryType& src, + T* p_dst_block, + index_t dst_thread_data_offset, + index_t dst_const_data_offset); + +template <> +__device__ float __buffer_load(const float* p_src_block, + index_t src_thread_data_offset, + index_t src_const_data_offset) +{ + float dst; + + index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); + index_t src_const_addr_offset = src_const_data_offset * sizeof(float); + + BufferLoadStoreDwordConfig src_block_config; + + // fill in byte 0 - 1 + src_block_config.address[0] = const_cast(p_src_block); + // fill in byte 2 + src_block_config.range[2] = -1; + // fill in byte 3 + src_block_config.range[3] = 0x00027000; + +#if CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC + dst = __llvm_amdgcn_buffer_load( + src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false); +#else + asm volatile( + "\n \ + buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \ + s_waitcnt 0 \n \ + " + : "=v"(dst) + : "v"(src_thread_addr_offset), "s"(src_block_config.data), "s"(src_const_addr_offset)); +#endif + + return dst; +} + +template <> +__device__ float2_t __buffer_load(const float* p_src_block, + index_t src_thread_data_offset, + index_t src_const_data_offset) +{ + float2_t dst; + + index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); + index_t 
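The union above assembles the 128-bit buffer resource descriptor consumed by buffer_load/buffer_store: per the comments and constants in this file, dwords 0-1 take the base pointer, dword 2 the number of records (-1 here), and dword 3 the format/configuration bits 0x00027000. A host-side sketch of the same layout, with plain C++ stand-ins for int32x4_t and the device pointer:

#include <cstdint>
#include <cstdio>

// Illustration of how the buffer resource words are filled; member names and
// types are simplified stand-ins for BufferLoadStoreDwordConfig.
union BufferResource
{
    int32_t dword[4];
    const float* address[2]; // the pointer occupies dwords 0 and 1 on a 64-bit target
};

int main()
{
    static float data[16];

    BufferResource rsrc;
    rsrc.address[0] = data;       // dwords 0-1: base pointer
    rsrc.dword[2]   = -1;         // dword 2: buffer size / num records
    rsrc.dword[3]   = 0x00027000; // dword 3: stride/format configuration

    printf("dword2 = %d, dword3 = 0x%08x\n", rsrc.dword[2], (unsigned)rsrc.dword[3]);
    return 0;
}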
src_const_addr_offset = src_const_data_offset * sizeof(float); + + BufferLoadStoreDwordConfig src_block_config; + + // fill in byte 0 - 1 + src_block_config.address[0] = const_cast(p_src_block); + // fill in byte 2 + src_block_config.range[2] = -1; + // fill in byte 3 + src_block_config.range[3] = 0x00027000; + +#if CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC + dst = __llvm_amdgcn_buffer_loadx2( + src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false); +#else + asm volatile( + "\n \ + buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \ + s_waitcnt 0 \n \ + " + : "=v"(dst) + : "v"(src_thread_addr_offset), "s"(src_block_config.data), "s"(src_const_addr_offset)); +#endif + + return dst; +} + +template <> +__device__ float4_t __buffer_load(const float* p_src_block, + index_t src_thread_data_offset, + index_t src_const_data_offset) +{ + float4_t dst; + + index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); + index_t src_const_addr_offset = src_const_data_offset * sizeof(float); + + BufferLoadStoreDwordConfig src_block_config; + + // fill in byte 0 - 1 + src_block_config.address[0] = const_cast(p_src_block); + // fill in byte 2 + src_block_config.range[2] = -1; + // fill in byte 3 + src_block_config.range[3] = 0x00027000; + +#if CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC + dst = __llvm_amdgcn_buffer_loadx4( + src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false); +#else + asm volatile( + "\n \ + buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \ + s_waitcnt 0 \n \ + " + : "=v"(dst) + : "v"(src_thread_addr_offset), "s"(src_block_config.data), "s"(src_const_addr_offset)); +#endif + + return dst; +} + +template <> +__device__ void __buffer_store(const float& src, + float* p_dst_block, + index_t dst_thread_data_offset, + index_t dst_const_data_offset) +{ + index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); + index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); + + BufferLoadStoreDwordConfig dst_block_config; + + // fill in byte 0 - 1 + dst_block_config.address[0] = p_dst_block; + // fill in byte 2 + dst_block_config.range[2] = -1; + // fill in byte 3 + dst_block_config.range[3] = 0x00027000; + +#if CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC + __llvm_amdgcn_buffer_store(src, + dst_block_config.data, + 0, + dst_thread_addr_offset + dst_const_addr_offset, + false, + false); +#else + asm volatile("\n \ + buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \ + " + : + : "s"(dst_block_config.data), + "v"(src), + "v"(dst_thread_addr_offset), + "s"(dst_const_addr_offset)); +#endif +} + +template <> +__device__ void __buffer_store(const float2_t& src, + float* p_dst_block, + index_t dst_thread_data_offset, + index_t dst_const_data_offset) +{ + index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); + index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); + + BufferLoadStoreDwordConfig dst_block_config; + + // fill in byte 0 - 1 + dst_block_config.address[0] = p_dst_block; + // fill in byte 2 + dst_block_config.range[2] = -1; + // fill in byte 3 + dst_block_config.range[3] = 0x00027000; + +#if CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC + __llvm_amdgcn_buffer_storex2(src, + dst_block_config.data, + 0, + dst_thread_addr_offset + dst_const_addr_offset, + false, + false); +#else + asm volatile("\n \ + buffer_store_dwordx2 %1, %2, %0, %3 offen offset:0 \n \ + " + : + : "s"(dst_block_config.data), + "v"(src), + "v"(dst_thread_addr_offset), + 
"s"(dst_const_addr_offset)); +#endif +} + +template <> +__device__ void __buffer_store(const float4_t& src, + float* p_dst_block, + index_t dst_thread_data_offset, + index_t dst_const_data_offset) +{ + index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); + index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); + + BufferLoadStoreDwordConfig dst_block_config; + + // fill in byte 0 - 1 + dst_block_config.address[0] = p_dst_block; + // fill in byte 2 + dst_block_config.range[2] = -1; + // fill in byte 3 + dst_block_config.range[3] = 0x00027000; + +#if CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC + __llvm_amdgcn_buffer_storex4(src, + dst_block_config.data, + 0, + dst_thread_addr_offset + dst_const_addr_offset, + false, + false); +#else + asm volatile("\n \ + buffer_store_dwordx4 %1, %2, %0, %3 offen offset:0 \n \ + " + : + : "s"(dst_block_config.data), + "v"(src), + "v"(dst_thread_addr_offset), + "s"(dst_const_addr_offset)); +#endif +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/common_header.hpp b/composable_kernel/include/utility/common_header.hpp index e01ec6efc..588efca08 100644 --- a/composable_kernel/include/utility/common_header.hpp +++ b/composable_kernel/include/utility/common_header.hpp @@ -20,8 +20,8 @@ #include "amd_inline_asm.hpp" #endif -#if CK_USE_AMD_INTRINSIC -#include "amd_intrinsic.hpp" +#if CK_USE_AMD_BUFFER_ADDRESSING +#include "amd_buffer_addressing.hpp" #endif #endif diff --git a/composable_kernel/include/utility/config.amd.hpp.in b/composable_kernel/include/utility/config.amd.hpp.in index 7800f5293..3e19b5676 100644 --- a/composable_kernel/include/utility/config.amd.hpp.in +++ b/composable_kernel/include/utility/config.amd.hpp.in @@ -20,18 +20,18 @@ #define CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM 1 #endif -// AMD llvm intrinsic -#ifndef CK_USE_AMD_INTRINSIC -#define CK_USE_AMD_INTRINSIC 1 +// AMD buffer addressing +#ifndef CK_USE_AMD_BUFFER_ADDRESSING +#define CK_USE_AMD_BUFFER_ADDRESSING 1 #endif -#ifndef CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC -#define CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC 1 +#ifndef CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC +#define CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC 1 #endif // AMD XDLOPS #ifndef CK_USE_AMD_XDLOPS -#define CK_USE_AMD_XDLOPS 0 +#define CK_USE_AMD_XDLOPS 1 #endif #ifndef CK_USE_AMD_XDLOPS_INLINE_ASM @@ -40,6 +40,8 @@ // experimental implementation #define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 1 +#define CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF 0 +#define CK_EXPERIMENTAL_THREADWISE_COPY_V4R2_USE_OPTIMIZED_ADDRESS_CACLULATION 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0 From 0e5a67f14746aadd95dcae24f215bc03a89536ca Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Wed, 9 Oct 2019 21:21:57 -0500 Subject: [PATCH 15/20] refactor --- ..._v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 46 ++++---- .../blockwise_generic_tensor_slice_copy.hpp | 100 +++++++++++++----- .../threadwise_generic_tensor_slice_copy.hpp | 18 +++- driver/src/driver.cpp | 4 +- 4 files changed, 115 insertions(+), 53 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp 
b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp index 18c7f9a39..289c8621b 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -100,6 +100,18 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer const Float* const __restrict__ p_wei_global, Float* const __restrict__ p_out_global) const { + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto True = integral_constant{}; + + constexpr auto generic_address_space = + integral_constant{}; + constexpr auto global_address_space = + integral_constant{}; + static_assert(ConvDirection == ConvolutionDirection::Forward || ConvDirection == ConvolutionDirection::BackwardWeight, "wrong! this kernel only support convolution forward and backward-weight"); @@ -114,13 +126,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer 0, "wrong!"); - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - constexpr auto True = integral_constant{}; - constexpr auto in_n_c_hi_wi_global_desc = InGlobalDesc{}; constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{}; constexpr auto out_n_k_ho_wo_global_desc = OutGlobalDesc{}; @@ -329,10 +334,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer // LDS double buffer: preload data into LDS { - blockwise_in_copy.template Run(p_in_global, - p_in_block_double); - blockwise_wei_copy.template Run(p_wei_global, - p_wei_block_double); + blockwise_in_copy.Run( + p_in_global, p_in_block_double, global_address_space, generic_address_space); + blockwise_wei_copy.Run( + p_wei_global, p_wei_block_double, global_address_space, generic_address_space); } // LDS double buffer: main body @@ -363,10 +368,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( - p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( - p_wei_global, p_wei_thread_buffer); + blockwise_in_copy.RunLoadThreadBuffer( + p_in_global, p_in_thread_buffer, global_address_space, generic_address_space); + blockwise_wei_copy.RunLoadThreadBuffer( + p_wei_global, p_wei_thread_buffer, global_address_space, generic_address_space); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread); @@ -392,10 +397,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer __syncthreads(); // LDS double buffer: load last data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( - p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( - p_wei_global, p_wei_thread_buffer); + blockwise_in_copy.RunLoadThreadBuffer( + p_in_global, p_in_thread_buffer, global_address_space, generic_address_space); + blockwise_wei_copy.RunLoadThreadBuffer( + p_wei_global, p_wei_thread_buffer, global_address_space, generic_address_space); // LDS double buffer: GEMM on 2nd-last data blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread); @@ -482,8 +487,7 @@ 
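The address spaces are now passed as integral_constant tags (generic_address_space, global_address_space), so the copy functions receive them as deduced compile-time template parameters instead of explicit template arguments at every call site. A stripped-down sketch of that tag-dispatch idiom; std::integral_constant and the enumerator spellings (generic, global) are stand-ins for the CK equivalents:

#include <cstdio>
#include <type_traits>

enum class AddressSpace { generic, global };

// The tag arguments carry no runtime data; they only pin Src and Dst at
// compile time so the body can branch (or specialize) on them.
template <AddressSpace Src, AddressSpace Dst>
void run_copy(const float* src, float* dst,
              std::integral_constant<AddressSpace, Src>,
              std::integral_constant<AddressSpace, Dst>)
{
    // a real implementation would pick buffer_load/buffer_store when Src or
    // Dst is AddressSpace::global; here we only report the choice
    printf("src global: %d, dst global: %d\n",
           Src == AddressSpace::global, Dst == AddressSpace::global);
    *dst = *src;
}

int main()
{
    constexpr auto generic_address_space =
        std::integral_constant<AddressSpace, AddressSpace::generic>{};
    constexpr auto global_address_space =
        std::integral_constant<AddressSpace, AddressSpace::global>{};

    float a = 1.0f, b = 0.0f;
    run_copy(&a, &b, global_address_space, generic_address_space);
    printf("b = %f\n", b);
    return 0;
}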
struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer 0, b_thread_data_on_global, 0}) - .template Run( - p_out_thread, p_out_global); + .Run(p_out_thread, p_out_global, generic_address_space, global_address_space); } } }; diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index 38ec363a7..34560977c 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -68,10 +68,13 @@ struct BlockwiseGenericTensorSliceCopy_v4 template - __device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src, - ThreadBufferData* p_thread_buffer) const + AddressSpace BlockSrcAddressSpace, + AddressSpace ThreadBufferAddressSpace> + __device__ void + RunLoadThreadBuffer(const BlockSrcData* p_block_src, + ThreadBufferData* p_thread_buffer, + integral_constant, + integral_constant) const { if(mThreadwiseStore.HasWorkingOptimizedAddressCalculation()) { @@ -84,19 +87,36 @@ struct BlockwiseGenericTensorSliceCopy_v4 } else { - mThreadwiseLoad.template Run(p_block_src, p_thread_buffer); + constexpr auto block_src_address_space = + integral_constant{}; + constexpr auto thread_buffer_address_space = + integral_constant{}; + + mThreadwiseLoad.Run( + p_block_src, p_thread_buffer, block_src_address_space, thread_buffer_address_space); } } + template + __device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src, + ThreadBufferData* p_thread_buffer) const + { + constexpr auto generic_address_space = + integral_constant{}; + + RunLoadThreadBuffer( + p_block_src, p_thread_buffer, generic_address_space, generic_address_space); + } + template - __device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer, - BlockDstData* p_block_dst) const + AddressSpace ThreadBufferAddressSpace, + AddressSpace BlockDstAddressSpace> + __device__ void + RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer, + BlockDstData* p_block_dst, + integral_constant, + integral_constant) const { if(mThreadwiseStore.HasWorkingOptimizedAddressCalculation()) { @@ -109,31 +129,57 @@ struct BlockwiseGenericTensorSliceCopy_v4 } else { - mThreadwiseStore.template Run(p_thread_buffer, p_block_dst); + constexpr auto thread_buffer_address_space = + integral_constant{}; + constexpr auto block_dst_address_space = + integral_constant{}; + + mThreadwiseStore.Run( + p_thread_buffer, p_block_dst, thread_buffer_address_space, block_dst_address_space); } } + template + __device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer, + BlockDstData* p_block_dst) const + { + constexpr auto generic_address_space = + integral_constant{}; + + RunStoreThreadBuffer( + p_thread_buffer, p_block_dst, generic_address_space, generic_address_space); + } + template - __device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst) const + AddressSpace BlockSrcAddressSpace, + AddressSpace BlockDstAddressSpace> + __device__ void + Run(const BlockSrcData* p_block_src, + BlockDstData* p_block_dst, + integral_constant block_src_address_space, + integral_constant block_dst_address_space) const { BlockSrcData p_thread_buffer[GetThreadBufferSize()]; - RunLoadThreadBuffer(p_block_src, p_thread_buffer); + constexpr auto generic_address_space = + integral_constant{}; + + RunLoadThreadBuffer( + p_block_src, p_thread_buffer, block_src_address_space, 
generic_address_space); // if there is type conversion, it's done during store - RunStoreThreadBuffer(p_thread_buffer, p_block_dst); + RunStoreThreadBuffer( + p_thread_buffer, p_block_dst, generic_address_space, block_dst_address_space); + } + + template + __device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst) const + { + constexpr auto generic_address_space = + integral_constant{}; + + Rnun(p_block_src, p_block_dst, generic_address_space, generic_address_space); } template diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index 8d5b035e9..0cf6d4b4c 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -68,9 +68,12 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // Will do padding check on dst data: No write if dst data is in paddin area. template - __device__ void Run(const SrcData* p_src, DstData* p_dst) const + AddressSpace SrcAddressSpace, + AddressSpace DstAddressSpace> + __device__ void Run(const SrcData* p_src, + DstData* p_dst, + integral_constant, + integral_constant) const { using src_vector_t = typename vector_type::MemoryType; using dst_vector_t = typename vector_type::MemoryType; @@ -180,6 +183,15 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 }); } + template + __device__ void Run(const SrcData* p_src, DstData* p_dst) const + { + constexpr auto generic_address_space = + integral_constant{}; + + Run(p_src, p_dst, generic_address_space, generic_address_space); + } + // Modify Length to 1, if Mask is set to false // Used for isolating linear dimension from non-linear dimensions template diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index 3a0eedc64..251a38124 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -297,7 +297,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>; -#elif 1 +#elif 0 // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81% constexpr index_t N = 128; @@ -343,7 +343,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<3, 0>; using RightPads = Sequence<3, 0>; -#elif 0 +#elif 1 // 1x7 filter, 0x3 pad, 17x17 input constexpr index_t N = 128; constexpr index_t C = 128; From 85bed32ec82a2aa21f41b5429657468c7c0c807b Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Wed, 9 Oct 2019 21:52:22 -0500 Subject: [PATCH 16/20] refactor --- ...kcyx_nkhw_lds_double_buffer_deprecated.hpp | 49 ++- ...e_generic_tensor_slice_copy_deprecated.hpp | 106 +++-- ...e_generic_tensor_slice_copy_deprecated.hpp | 18 +- .../include/utility/amd_intrinsic.hpp | 368 ------------------ ...it_gemm_v4r1_nchw_kcyx_nkhw_deprecated.hpp | 2 + driver/src/driver.cpp | 6 +- 6 files changed, 144 insertions(+), 405 deletions(-) delete mode 100644 composable_kernel/include/utility/amd_intrinsic.hpp diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp index db92631a3..3e5935dc5 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp +++ 
b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp @@ -8,18 +8,51 @@ #include "blockwise_generic_tensor_slice_copy_deprecated.hpp" #include "blockwise_gemm.hpp" #include "threadwise_generic_tensor_slice_copy_deprecated.hpp" +#include "convolution_common.hpp" namespace ck { +template +struct make_wei_e_k_global_desc_v4r1_deprecated; + +template <> +struct make_wei_e_k_global_desc_v4r1_deprecated +{ + template + __device__ constexpr auto operator()(WeiDesc) const + { + constexpr auto I1 = Number<1>{}; + constexpr auto I3 = Number<3>{}; + + return WeiDesc::Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{}); + } +}; + +template <> +struct make_wei_e_k_global_desc_v4r1_deprecated +{ + template + __device__ constexpr auto operator()(WeiDesc) const + { + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + return make_ConstantMergedTensorDescriptor( + WeiDesc::Unfold(I2, I3), Sequence<1, 2>{}, Sequence<0>{}); + } +}; + // define B = merge(N0, Ho, Wo) template ({0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0}); // weight tensor - // tensor descriptor in device memory, src of blockwise copy + // Iensor descriptor in device memory, src of blockwise copy + // It is constructed differently, depending on whether forward or backward weight + // convolution constexpr auto wei_e_k_global_desc = - wei_k_c_y_x_global_desc.Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{}); + make_wei_e_k_global_desc_v4r1_deprecated{}(wei_k_c_y_x_global_desc); // tensor descriptor in LDS, dst of blockwise copy // be careful of LDS alignment @@ -256,7 +295,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_dep __shared__ Float p_wei_block_double[2 * wei_block_space]; // register allocation for output - Float p_out_thread[c_k0k1_n1n2_thread_mtx_desc.GetElementSpace()]; + AccDataType p_out_thread[c_k0k1_n1n2_thread_mtx_desc.GetElementSpace()]; // zero out threadwise output threadwise_matrix_set_zero(c_k0k1_n1n2_thread_mtx_desc, p_out_thread); @@ -394,11 +433,11 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_dep 0, b_thread_data_on_global, 0}) - .template Run( + .template Run( p_out_thread, p_out_global); } } }; } // namespace ck -#endif // CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R1_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_DEPRECATED_HPP +#endif // CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER_DEPRECATED_HPP diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp index 399a47407..2272ab017 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp @@ -475,42 +475,96 @@ struct BlockwiseGenericTensorSliceCopy_v2_deprecated return ThreadBufferDesc::GetElementSpace(); } - template - __device__ void RunLoadThreadBuffer(const SrcData* p_block_src, DstData* p_thread_buffer) const + template + __device__ void + RunLoadThreadBuffer(const BlockSrcData* p_block_src, + ThreadBufferData* p_thread_buffer, + integral_constant, + integral_constant) const + { + constexpr auto block_src_address_space = + integral_constant{}; + constexpr auto thread_buffer_address_space = + integral_constant{}; + + mThreadwiseLoad.Run( + p_block_src, p_thread_buffer, 
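make_wei_e_k_global_desc_v4r1_deprecated dispatches on ConvolutionDirection by class-template specialization: the Forward case unfolds the K-C-Y-X descriptor over dims 1..3 and reorders to [E, K], while BackwardWeight unfolds dims 2..3 and merges into [E, K]. A toy version of the same specialization pattern, with the descriptors reduced to labels and the enum declared locally:

#include <cstdio>

enum class ConvolutionDirection { Forward, BackwardWeight };

// Primary template declared, one specialization per direction supplies the
// actual construction (here just a description string).
template <ConvolutionDirection Dir>
struct make_wei_e_k_desc;

template <>
struct make_wei_e_k_desc<ConvolutionDirection::Forward>
{
    const char* operator()() const { return "unfold dims 1..3, reorder to [E, K]"; }
};

template <>
struct make_wei_e_k_desc<ConvolutionDirection::BackwardWeight>
{
    const char* operator()() const { return "unfold dims 2..3, merge to [E, K]"; }
};

int main()
{
    printf("Forward:        %s\n", make_wei_e_k_desc<ConvolutionDirection::Forward>{}());
    printf("BackwardWeight: %s\n", make_wei_e_k_desc<ConvolutionDirection::BackwardWeight>{}());
    return 0;
}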
block_src_address_space, thread_buffer_address_space); + } + + template + __device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src, + ThreadBufferData* p_thread_buffer) const + { + constexpr auto generic_address_space = + integral_constant{}; + + RunLoadThreadBuffer( + p_block_src, p_thread_buffer, generic_address_space, generic_address_space); + } + + template + __device__ void + RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer, + BlockDstData* p_block_dst, + integral_constant, + integral_constant) const { - mThreadwiseLoad - .template Run( - p_block_src, p_thread_buffer); + constexpr auto thread_buffer_address_space = + integral_constant{}; + constexpr auto block_dst_address_space = + integral_constant{}; + + mThreadwiseStore.Run( + p_thread_buffer, p_block_dst, thread_buffer_address_space, block_dst_address_space); } - template - __device__ void RunStoreThreadBuffer(const SrcData* p_thread_buffer, DstData* p_block_dst) const + template + __device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer, + BlockDstData* p_block_dst) const { - mThreadwiseStore - .template Run( - p_thread_buffer, p_block_dst); + constexpr auto generic_address_space = + integral_constant{}; + + RunStoreThreadBuffer( + p_thread_buffer, p_block_dst, generic_address_space, generic_address_space); } - template - __device__ void Run(const SrcData* p_block_src, DstData* p_block_dst) const + template + __device__ void + Run(const BlockSrcData* p_block_src, + BlockDstData* p_block_dst, + integral_constant block_src_address_space, + integral_constant block_dst_address_space) const { - SrcData p_thread_buffer[GetThreadBufferSize()]; + BlockSrcData p_thread_buffer[GetThreadBufferSize()]; + + constexpr auto generic_address_space = + integral_constant{}; - RunLoadThreadBuffer( - p_block_src, p_thread_buffer); + RunLoadThreadBuffer( + p_block_src, p_thread_buffer, block_src_address_space, generic_address_space); // if there is type conversion, it's done during store - RunStoreThreadBuffer( - p_thread_buffer, p_block_dst); + RunStoreThreadBuffer( + p_thread_buffer, p_block_dst, generic_address_space, block_dst_address_space); + } + + template + __device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst) const + { + constexpr auto generic_address_space = + integral_constant{}; + + Rnun(p_block_src, p_block_dst, generic_address_space, generic_address_space); } template diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp index 7d85b3838..ceee79ca6 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp @@ -256,9 +256,12 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1_deprecated template - __device__ void Run(const SrcData* p_src, DstData* p_dst) const + AddressSpace SrcAddressSpace, + AddressSpace DstAddressSpace> + __device__ void Run(const SrcData* p_src, + DstData* p_dst, + integral_constant, + integral_constant) const { constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{}); @@ -457,6 +460,15 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1_deprecated } } + template + __device__ void Run(const SrcData* p_src, DstData* p_dst) const + { + constexpr auto generic_address_space = + integral_constant{}; + + Run(p_src, p_dst, generic_address_space, 
generic_address_space); + } + // T can be Sequence or Array template __device__ void MoveSrcSliceWindow(T step_sizes, integral_constant) diff --git a/composable_kernel/include/utility/amd_intrinsic.hpp b/composable_kernel/include/utility/amd_intrinsic.hpp deleted file mode 100644 index 2575cbc40..000000000 --- a/composable_kernel/include/utility/amd_intrinsic.hpp +++ /dev/null @@ -1,368 +0,0 @@ -#ifndef CK_AMD_INTRINSIC_HPP -#define CK_AMD_INTRINSIC_HPP - -#include "float_type.hpp" - -namespace ck { - -// for buffer_load and buffer_store -template -union BufferLoadStoreDwordConfig -{ - int32x4_t data; - T* address[2]; - int32_t range[4]; -}; - -__device__ float __llvm_amdgcn_buffer_load(int32x4_t rsrc, - index_t vindex, - index_t offset, - bool glc, - bool slc) __asm("llvm.amdgcn.buffer.load"); - -__device__ float2_t __llvm_amdgcn_buffer_loadx2(int32x4_t rsrc, - index_t vindex, - index_t offset, - bool glc, - bool slc) __asm("llvm.amdgcn.buffer.load.dwordx2"); - -__device__ float4_t __llvm_amdgcn_buffer_loadx4(int32x4_t rsrc, - index_t vindex, - index_t offset, - bool glc, - bool slc) __asm("llvm.amdgcn.buffer.load.dwordx4"); - -__device__ void __llvm_amdgcn_buffer_store(float vdata, - int32x4_t rsrc, - index_t vindex, - index_t offset, - bool glc, - bool slc) __asm("llvm.amdgcn.buffer.store"); - -__device__ void __llvm_amdgcn_buffer_storex2(float2_t vdata, - int32x4_t rsrc, - index_t vindex, - index_t offset, - bool glc, - bool slc) __asm("llvm.amdgcn.buffer.store.dwordx2"); - -__device__ void __llvm_amdgcn_buffer_storex4(float4_t vdata, - int32x4_t rsrc, - index_t vindex, - index_t offset, - bool glc, - bool slc) __asm("llvm.amdgcn.buffer.store.dwordx4"); - -// buffer_load and buffer_store -template -__device__ typename vector_type::MemoryType -__buffer_load(const T* p_src_block, index_t src_thread_data_offset, index_t src_const_data_offset); - -template -__device__ void __buffer_store(const typename vector_type::MemoryType& src, - T* p_dst_block, - index_t dst_thread_data_offset, - index_t dst_const_data_offset); - -template <> -__device__ float __buffer_load(const float* p_src_block, - index_t src_thread_data_offset, - index_t src_const_data_offset) -{ -#if 0 - float dst; - - index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); - index_t src_const_addr_offset = src_const_data_offset * sizeof(float); - - BufferLoadStoreDwordConfig src_block_config; - - // fill in byte 0 - 1 - src_block_config.address[0] = const_cast(p_src_block); - // fill in byte 2 - src_block_config.range[2] = -1; - // fill in byte 3 - src_block_config.range[3] = 0x00027000; - - asm volatile( - "\n \ - buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \ - s_waitcnt 0 \n \ - " - : "=v"(dst) - : "v"(src_thread_addr_offset), "s"(src_block_config.data), "s"(src_const_addr_offset)); - - return dst; -#else - float dst; - - index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); - index_t src_const_addr_offset = src_const_data_offset * sizeof(float); - - BufferLoadStoreDwordConfig src_block_config; - - // fill in byte 0 - 1 - src_block_config.address[0] = const_cast(p_src_block); - // fill in byte 2 - src_block_config.range[2] = -1; - // fill in byte 3 - src_block_config.range[3] = 0x00027000; - - dst = __llvm_amdgcn_buffer_load( - src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false); - - return dst; -#endif -} - -template <> -__device__ float2_t __buffer_load(const float* p_src_block, - index_t src_thread_data_offset, - index_t src_const_data_offset) 
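The two-argument Run overloads added throughout these copy classes simply forward to the tagged four-argument version with generic/generic address-space tags, so pre-existing call sites keep compiling unchanged. A compact sketch of that forwarding pattern (again with std::integral_constant and a locally declared enum as stand-ins):

#include <cstdio>
#include <type_traits>

enum class AddressSpace { generic, global };

struct Copier
{
    // tagged version: address spaces are compile-time parameters
    template <AddressSpace Src, AddressSpace Dst>
    void Run(const float* p_src, float* p_dst,
             std::integral_constant<AddressSpace, Src>,
             std::integral_constant<AddressSpace, Dst>) const
    {
        *p_dst = *p_src;
        printf("copy with Src=%d Dst=%d\n", (int)Src, (int)Dst);
    }

    // legacy signature: default both tags to generic and forward
    void Run(const float* p_src, float* p_dst) const
    {
        constexpr auto generic =
            std::integral_constant<AddressSpace, AddressSpace::generic>{};
        Run(p_src, p_dst, generic, generic);
    }
};

int main()
{
    float a = 2.0f, b = 0.0f;
    Copier{}.Run(&a, &b); // old call style still works
    printf("b = %f\n", b);
    return 0;
}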
-{ -#if 0 - float2_t dst; - - index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); - index_t src_const_addr_offset = src_const_data_offset * sizeof(float); - - BufferLoadStoreDwordConfig src_block_config; - - // fill in byte 0 - 1 - src_block_config.address[0] = const_cast(p_src_block); - // fill in byte 2 - src_block_config.range[2] = -1; - // fill in byte 3 - src_block_config.range[3] = 0x00027000; - - asm volatile( - "\n \ - buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \ - s_waitcnt 0 \n \ - " - : "=v"(dst) - : "v"(src_thread_addr_offset), "s"(src_block_config.data), "s"(src_const_addr_offset)); - - return dst; -#else - float2_t dst; - - index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); - index_t src_const_addr_offset = src_const_data_offset * sizeof(float); - - BufferLoadStoreDwordConfig src_block_config; - - // fill in byte 0 - 1 - src_block_config.address[0] = const_cast(p_src_block); - // fill in byte 2 - src_block_config.range[2] = -1; - // fill in byte 3 - src_block_config.range[3] = 0x00027000; - - dst = __llvm_amdgcn_buffer_loadx2( - src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false); - - return dst; -#endif -} - -template <> -__device__ float4_t __buffer_load(const float* p_src_block, - index_t src_thread_data_offset, - index_t src_const_data_offset) -{ -#if 0 - float4_t dst; - - index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); - index_t src_const_addr_offset = src_const_data_offset * sizeof(float); - - BufferLoadStoreDwordConfig src_block_config; - - // fill in byte 0 - 1 - src_block_config.address[0] = const_cast(p_src_block); - // fill in byte 2 - src_block_config.range[2] = -1; - // fill in byte 3 - src_block_config.range[3] = 0x00027000; - - asm volatile( - "\n \ - buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \ - s_waitcnt 0 \n \ - " - : "=v"(dst) - : "v"(src_thread_addr_offset), "s"(src_block_config.data), "s"(src_const_addr_offset)); - - return dst; -#else - float4_t dst; - - index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); - index_t src_const_addr_offset = src_const_data_offset * sizeof(float); - - BufferLoadStoreDwordConfig src_block_config; - - // fill in byte 0 - 1 - src_block_config.address[0] = const_cast(p_src_block); - // fill in byte 2 - src_block_config.range[2] = -1; - // fill in byte 3 - src_block_config.range[3] = 0x00027000; - - dst = __llvm_amdgcn_buffer_loadx4( - src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false); - - return dst; -#endif -} - -template <> -__device__ void __buffer_store(const float& src, - float* p_dst_block, - index_t dst_thread_data_offset, - index_t dst_const_data_offset) -{ -#if 0 - index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); - index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); - - BufferLoadStoreDwordConfig dst_block_config; - - // fill in byte 0 - 1 - dst_block_config.address[0] = p_dst_block; - // fill in byte 2 - dst_block_config.range[2] = -1; - // fill in byte 3 - dst_block_config.range[3] = 0x00027000; - - asm volatile("\n \ - buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \ - " - : - : "s"(dst_block_config.data), - "v"(src), - "v"(dst_thread_addr_offset), - "s"(dst_const_addr_offset)); -#else - index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); - index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); - - BufferLoadStoreDwordConfig dst_block_config; - - // 
fill in byte 0 - 1 - dst_block_config.address[0] = p_dst_block; - // fill in byte 2 - dst_block_config.range[2] = -1; - // fill in byte 3 - dst_block_config.range[3] = 0x00027000; - - __llvm_amdgcn_buffer_store(src, - dst_block_config.data, - 0, - dst_thread_addr_offset + dst_const_addr_offset, - false, - false); -#endif -} - -template <> -__device__ void __buffer_store(const float2_t& src, - float* p_dst_block, - index_t dst_thread_data_offset, - index_t dst_const_data_offset) -{ -#if 0 - index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); - index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); - - BufferLoadStoreDwordConfig dst_block_config; - - // fill in byte 0 - 1 - dst_block_config.address[0] = p_dst_block; - // fill in byte 2 - dst_block_config.range[2] = -1; - // fill in byte 3 - dst_block_config.range[3] = 0x00027000; - - asm volatile("\n \ - buffer_store_dwordx2 %1, %2, %0, %3 offen offset:0 \n \ - " - : - : "s"(dst_block_config.data), - "v"(src), - "v"(dst_thread_addr_offset), - "s"(dst_const_addr_offset)); -#else - index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); - index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); - - BufferLoadStoreDwordConfig dst_block_config; - - // fill in byte 0 - 1 - dst_block_config.address[0] = p_dst_block; - // fill in byte 2 - dst_block_config.range[2] = -1; - // fill in byte 3 - dst_block_config.range[3] = 0x00027000; - - __llvm_amdgcn_buffer_storex2(src, - dst_block_config.data, - 0, - dst_thread_addr_offset + dst_const_addr_offset, - false, - false); -#endif -} - -template <> -__device__ void __buffer_store(const float4_t& src, - float* p_dst_block, - index_t dst_thread_data_offset, - index_t dst_const_data_offset) -{ -#if 0 - index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); - index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); - - BufferLoadStoreDwordConfig dst_block_config; - - // fill in byte 0 - 1 - dst_block_config.address[0] = p_dst_block; - // fill in byte 2 - dst_block_config.range[2] = -1; - // fill in byte 3 - dst_block_config.range[3] = 0x00027000; - - asm volatile("\n \ - buffer_store_dwordx4 %1, %2, %0, %3 offen offset:0 \n \ - " - : - : "s"(dst_block_config.data), - "v"(src), - "v"(dst_thread_addr_offset), - "s"(dst_const_addr_offset)); -#else - index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); - index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); - - BufferLoadStoreDwordConfig dst_block_config; - - // fill in byte 0 - 1 - dst_block_config.address[0] = p_dst_block; - // fill in byte 2 - dst_block_config.range[2] = -1; - // fill in byte 3 - dst_block_config.range[3] = 0x00027000; - - __llvm_amdgcn_buffer_storex4(src, - dst_block_config.data, - 0, - dst_thread_addr_offset + dst_const_addr_offset, - false, - false); -#endif -} - -} // namespace ck -#endif diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated.hpp index 5a47feb6e..626dd77dd 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated.hpp @@ -174,11 +174,13 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(InDesc, GridSize, BlockSize, T, + T, decltype(in_nchw_desc), decltype(wei_kcyx_desc), decltype(out_nkhw_desc), ConvStrides, ConvDilations, + 
ConvolutionDirection::Forward, BPerBlock, KPerBlock, EPerBlock, diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index 251a38124..67fa14db5 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -297,7 +297,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>; -#elif 0 +#elif 1 // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81% constexpr index_t N = 128; @@ -343,7 +343,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<3, 0>; using RightPads = Sequence<3, 0>; -#elif 1 +#elif 0 // 1x7 filter, 0x3 pad, 17x17 input constexpr index_t N = 128; constexpr index_t C = 128; @@ -482,7 +482,7 @@ int main(int argc, char* argv[]) ConvStrides{}, ConvDilations{}, nrepeat); -#elif 0 +#elif 1 device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_deprecated(in_nchw_desc, in_nchw, wei_kcyx_desc, From 89f2cb4a9a8a1db1d8757b7efe2cc7332793734d Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Wed, 9 Oct 2019 22:10:52 -0500 Subject: [PATCH 17/20] refactor --- ...kcyx_nkhw_lds_double_buffer_deprecated.hpp | 46 ++++++++++--------- .../blockwise_generic_tensor_slice_copy.hpp | 36 ++++++--------- .../threadwise_generic_tensor_slice_copy.hpp | 22 +++++---- 3 files changed, 53 insertions(+), 51 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp index 3e5935dc5..b5fde21c9 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp @@ -86,6 +86,18 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_dep const Float* const __restrict__ p_wei_global, Float* const __restrict__ p_out_global) const { + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto True = integral_constant{}; + + constexpr auto generic_address_space = + integral_constant{}; + constexpr auto global_address_space = + integral_constant{}; + static_assert(ConvDirection == ConvolutionDirection::Forward || ConvDirection == ConvolutionDirection::BackwardWeight, "wrong! 
this kernel only support convolution forward and backward-weight"); @@ -100,13 +112,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_dep 0, "wrong!"); - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - constexpr auto True = integral_constant{}; - constexpr auto in_n_c_h_w_global_desc = InGlobalDesc{}; constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{}; constexpr auto out_n_k_h_w_global_desc = OutGlobalDesc{}; @@ -302,10 +307,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_dep // LDS double buffer: preload data into LDS { - blockwise_in_copy.template Run(p_in_global, - p_in_block_double); - blockwise_wei_copy.template Run(p_wei_global, - p_wei_block_double); + blockwise_in_copy.Run( + p_in_global, p_in_block_double, global_address_space, generic_address_space); + blockwise_wei_copy.Run( + p_wei_global, p_wei_block_double, global_address_space, generic_address_space); } // LDS double buffer: main body @@ -336,10 +341,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_dep __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( - p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( - p_wei_global, p_wei_thread_buffer); + blockwise_in_copy.RunLoadThreadBuffer( + p_in_global, p_in_thread_buffer, global_address_space, generic_address_space); + blockwise_wei_copy.RunLoadThreadBuffer( + p_wei_global, p_wei_thread_buffer, global_address_space, generic_address_space); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread); @@ -362,10 +367,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_dep __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( - p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( - p_wei_global, p_wei_thread_buffer); + blockwise_in_copy.RunLoadThreadBuffer( + p_in_global, p_in_thread_buffer, global_address_space, generic_address_space); + blockwise_wei_copy.RunLoadThreadBuffer( + p_wei_global, p_wei_thread_buffer, global_address_space, generic_address_space); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread); @@ -433,8 +438,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_dep 0, b_thread_data_on_global, 0}) - .template Run( - p_out_thread, p_out_global); + .Run(p_out_thread, p_out_global, generic_address_space, global_address_space); } } }; diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index 34560977c..b50e27ed6 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -76,22 +76,18 @@ struct BlockwiseGenericTensorSliceCopy_v4 integral_constant, integral_constant) const { + constexpr auto block_src_address_space = + integral_constant{}; + constexpr auto thread_buffer_address_space = + integral_constant{}; + if(mThreadwiseStore.HasWorkingOptimizedAddressCalculation()) { - mThreadwiseLoad - .template Run_optimized_src_address_calculation( - p_block_src, 
p_thread_buffer); + mThreadwiseLoad.Run_optimized_src_address_calculation( + p_block_src, p_thread_buffer, block_src_address_space, thread_buffer_address_space); } else { - constexpr auto block_src_address_space = - integral_constant{}; - constexpr auto thread_buffer_address_space = - integral_constant{}; - mThreadwiseLoad.Run( p_block_src, p_thread_buffer, block_src_address_space, thread_buffer_address_space); } @@ -118,22 +114,18 @@ struct BlockwiseGenericTensorSliceCopy_v4 integral_constant, integral_constant) const { + constexpr auto thread_buffer_address_space = + integral_constant{}; + constexpr auto block_dst_address_space = + integral_constant{}; + if(mThreadwiseStore.HasWorkingOptimizedAddressCalculation()) { - mThreadwiseStore - .template Run_optimized_dst_address_calculation( - p_thread_buffer, p_block_dst); + mThreadwiseStore.Run_optimized_dst_address_calculation( + p_thread_buffer, p_block_dst, thread_buffer_address_space, block_dst_address_space); } else { - constexpr auto thread_buffer_address_space = - integral_constant{}; - constexpr auto block_dst_address_space = - integral_constant{}; - mThreadwiseStore.Run( p_thread_buffer, p_block_dst, thread_buffer_address_space, block_dst_address_space); } diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index 0cf6d4b4c..db70cbee0 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -209,10 +209,13 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // TODO: this function is not compiled to expected ISA template - __device__ void Run_optimized_src_address_calculation(const SrcData* p_src, - DstData* p_dst) const + AddressSpace SrcAddressSpace, + AddressSpace DstAddressSpace> + __device__ void + Run_optimized_src_address_calculation(const SrcData* p_src, + DstData* p_dst, + integral_constant, + integral_constant) const { using src_vector_t = typename vector_type::MemoryType; using dst_vector_t = typename vector_type::MemoryType; @@ -368,10 +371,13 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // TODO: this function is not compiled to expected ISA template - __device__ void Run_optimized_dst_address_calculation(const SrcData* p_src, - DstData* p_dst) const + AddressSpace SrcAddressSpace, + AddressSpace DstAddressSpace> + __device__ void + Run_optimized_dst_address_calculation(const SrcData* p_src, + DstData* p_dst, + integral_constant, + integral_constant) const { using src_vector_t = typename vector_type::MemoryType; using dst_vector_t = typename vector_type::MemoryType; From 871607a9ddbf5c7b493a6ef2a7d42c44701659dd Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 10 Oct 2019 02:39:20 -0500 Subject: [PATCH 18/20] nvidia build --- .../blockwise_generic_tensor_slice_copy.hpp | 12 ++++++++++-- .../include/utility/config.nvidia.hpp.in | 7 ++++--- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index b50e27ed6..8939ae337 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -81,7 +81,11 @@ struct BlockwiseGenericTensorSliceCopy_v4 constexpr auto 
thread_buffer_address_space = integral_constant{}; - if(mThreadwiseStore.HasWorkingOptimizedAddressCalculation()) + constexpr bool has_optimized_address_calculation = + decltype(mThreadwiseStore)::HasWorkingOptimizedAddressCalculation(); + + // TODO: threadwise copy is still being tweaked + if(has_optimized_address_calculation) { mThreadwiseLoad.Run_optimized_src_address_calculation( p_block_src, p_thread_buffer, block_src_address_space, thread_buffer_address_space); @@ -119,7 +123,11 @@ struct BlockwiseGenericTensorSliceCopy_v4 constexpr auto block_dst_address_space = integral_constant{}; - if(mThreadwiseStore.HasWorkingOptimizedAddressCalculation()) + constexpr bool has_optimized_address_calculation = + decltype(mThreadwiseStore)::HasWorkingOptimizedAddressCalculation(); + + // TODO: threadwise copy is still being tweaked + if(has_optimized_address_calculation) { mThreadwiseStore.Run_optimized_dst_address_calculation( p_thread_buffer, p_block_dst, thread_buffer_address_space, block_dst_address_space); diff --git a/composable_kernel/include/utility/config.nvidia.hpp.in b/composable_kernel/include/utility/config.nvidia.hpp.in index 6e9198893..7c549cda5 100644 --- a/composable_kernel/include/utility/config.nvidia.hpp.in +++ b/composable_kernel/include/utility/config.nvidia.hpp.in @@ -15,15 +15,16 @@ // disable AMD inline asm and intrinsic #define CK_USE_AMD_INLINE_ASM 0 #define CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM 0 +#define CK_USE_AMD_BUFFER_ADDRESSING 0 +#define CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC 0 #define CK_USE_AMD_XDLOPS 0 #define CK_USE_AMD_XDLOPS_INLINE_ASM 0 -#define CK_USE_AMD_INTRINSIC 0 -#define CK_BUFFER_LOAD_STORE_USE_AMD_INTRINSIC 0 // experimental implementation #define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 0 +#define CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF 0 +#define CK_EXPERIMENTAL_THREADWISE_COPY_V4R2_USE_OPTIMIZED_ADDRESS_CACLULATION 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0 -#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0 From f489a603d73a83d05dc7082079755db7214655e3 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 10 Oct 2019 15:45:36 -0500 Subject: [PATCH 19/20] refactor --- ..._v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp | 90 +++++++++++-------- .../threadwise_generic_tensor_slice_copy.hpp | 14 ++- ...e_generic_tensor_slice_copy_deprecated.hpp | 8 +- driver/src/driver.cpp | 4 +- 4 files changed, 63 insertions(+), 53 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp index a547db7e3..30984136d 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -61,6 +61,11 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer constexpr auto True = integral_constant{}; + constexpr auto generic_address_space = + integral_constant{}; + constexpr auto global_address_space = + integral_constant{}; + constexpr auto in_n_c_hi_wi_global_desc = 
make_native_tensor_descriptor(InGlobalDesc::GetLengths(), InGlobalDesc::GetStrides()); constexpr auto wei_k_c_y_x_global_desc = @@ -96,7 +101,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer "be violated"); // divide block work by [K, B] - static_assert(K % KPerBlock == 0 && B % BPerBlock == 0 && E % (2 * EPerBlock) == 0, + static_assert(K % KPerBlock == 0 && B % BPerBlock == 0 && E % EPerBlock == 0, "wrong! cannot divide work evenly among block"); constexpr index_t KBlockWork = K / KPerBlock; @@ -255,10 +260,10 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer // LDS double buffer: preload data into LDS { - blockwise_in_copy.template Run(p_in_global, - p_in_block_double); - blockwise_wei_copy.template Run(p_wei_global, - p_wei_block_double); + blockwise_in_copy.Run( + p_in_global, p_in_block_double, global_address_space, generic_address_space); + blockwise_wei_copy.Run( + p_wei_global, p_wei_block_double, global_address_space, generic_address_space); } // LDS double buffer: main body @@ -289,10 +294,10 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( - p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( - p_wei_global, p_wei_thread_buffer); + blockwise_in_copy.RunLoadThreadBuffer( + p_in_global, p_in_thread_buffer, global_address_space, generic_address_space); + blockwise_wei_copy.RunLoadThreadBuffer( + p_wei_global, p_wei_thread_buffer, global_address_space, generic_address_space); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread); @@ -305,37 +310,47 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer // LDS double buffer: tail { - Float p_in_thread_buffer[blockwise_in_copy.GetThreadBufferSize()]; - Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()]; + constexpr bool has_two_iteration_left = (E % (2 * EPerBlock) == 0); - // even iteration - blockwise_in_copy.MoveSrcSliceWindow(Sequence{}, True); - blockwise_wei_copy.MoveSrcSliceWindow(Sequence{}, True); + if(has_two_iteration_left) // if has 2 iteration left + { + Float p_in_thread_buffer[blockwise_in_copy.GetThreadBufferSize()]; + Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()]; - __syncthreads(); + blockwise_in_copy.MoveSrcSliceWindow(Sequence{}, True); + blockwise_wei_copy.MoveSrcSliceWindow(Sequence{}, True); - // LDS doubel buffer: load next data from device mem - blockwise_in_copy.template RunLoadThreadBuffer( - p_in_global, p_in_thread_buffer); - blockwise_wei_copy.template RunLoadThreadBuffer( - p_wei_global, p_wei_thread_buffer); + __syncthreads(); - // LDS double buffer: GEMM on current data - blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread); + // LDS double buffer: load last data from device mem + blockwise_in_copy.RunLoadThreadBuffer( + p_in_global, p_in_thread_buffer, global_address_space, generic_address_space); + blockwise_wei_copy.RunLoadThreadBuffer( + p_wei_global, p_wei_thread_buffer, global_address_space, generic_address_space); - // LDS double buffer: store next data to LDS - blockwise_in_copy.RunStoreThreadBuffer(p_in_thread_buffer, - p_in_block_double + in_block_space); - blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer, - p_wei_block_double + wei_block_space); + // LDS double buffer: GEMM on 2nd-last data + 
blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread); - // odd iteration - __syncthreads(); + // LDS double buffer: store last data to LDS + blockwise_in_copy.RunStoreThreadBuffer(p_in_thread_buffer, + p_in_block_double + in_block_space); + blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer, + p_wei_block_double + wei_block_space); - // LDS double buffer: GEMM on current data - blockwise_gemm.Run(p_wei_block_double + wei_block_space, - p_in_block_double + in_block_space, - p_out_thread); + __syncthreads(); + + // LDS double buffer: GEMM on current data + blockwise_gemm.Run(p_wei_block_double + wei_block_space, + p_in_block_double + in_block_space, + p_out_thread); + } + else // if has 1 iteration left + { + __syncthreads(); + + // LDS double buffer: GEMM on last data + blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread); + } } // copy output: register to global memory @@ -388,14 +403,11 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer b_thread_data_on_global / B1, b_thread_data_on_global % B1}) #if 1 - .template Run + .Run(p_out_thread, p_out_global, generic_address_space, global_address_space); #else // tweaking - .template Run_optimized_dst_address_calculation + .Run_optimized_dst_address_calculation( + p_out_thread, p_out_global, generic_address_space, global_address_space); #endif - (p_out_thread, p_out_global); } } }; diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index db70cbee0..1e3095d72 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -117,15 +117,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // Check src vector's padding situation, only check the first data in this src // vector. It's user's responsiblity to make sure all data in the src vector - // has - // the same padding situation + // has the same padding situation if(src_coord.IsUpperIndexMappedToValidOffset()) { - static_if{}([&](auto) { + static_if{}([&](auto fwd) { #if CK_USE_AMD_BUFFER_ADDRESSING *reinterpret_cast(&p_src_long_vector[buffer_offset]) = __buffer_load( - p_src, src_coord.GetOffset(), 0); + fwd(p_src), src_coord.GetOffset(), 0); #else *reinterpret_cast(&p_src_long_vector[buffer_offset]) = *reinterpret_cast(&p_src[src_coord.GetOffset()]); @@ -158,15 +157,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // Check dst vector's padding situation, only check the first data in this dst // vector. 
It's user's responsiblity to make sure all data in the dst vector - // has - // the same padding situation + // has the same padding situation if(dst_coord.IsUpperIndexMappedToValidOffset()) { - static_if{}([&](auto) { + static_if{}([&](auto fwd) { #if CK_USE_AMD_BUFFER_ADDRESSING __buffer_store( *reinterpret_cast(&p_dst_long_vector[buffer_offset]), - p_dst, + fwd(p_dst), dst_coord.GetOffset(), 0); #else diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp index ceee79ca6..f28ac1892 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp @@ -333,10 +333,10 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1_deprecated // 2. src_normal_offset must be calculatd at compile time (guaranteed by // algorithm) // 3. src_merged_offset can be runtime value (no assumption imposed) - static_if{}([&](auto) { + static_if{}([&](auto fwd) { #if CK_USE_AMD_BUFFER_ADDRESSING vector_data = __buffer_load( - p_src, src_merged_offset, src_normal_offset); + fwd(p_src), src_merged_offset, src_normal_offset); #else vector_data = *reinterpret_cast( &p_src[src_normal_offset + src_merged_offset]); @@ -442,10 +442,10 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1_deprecated // 2. dst_normal_offset must be calculatd at compile time (guaranteed by // algorithm) // 3. dst_merged_offset can be runtime value (no assumption imposed) - static_if{}([&](auto) { + static_if{}([&](auto fwd) { #if CK_USE_AMD_BUFFER_ADDRESSING __buffer_store( - vector_data, p_dst, dst_merged_offset, dst_normal_offset); + vector_data, fwd(p_dst), dst_merged_offset, dst_normal_offset); #else *reinterpret_cast( &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data; diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index 67fa14db5..dccad8a5e 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -450,7 +450,7 @@ int main(int argc, char* argv[]) ConvStrides{}, ConvDilations{}, nrepeat); -#elif 1 +#elif 0 device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc, in_nchw, wei_kcyx_desc, @@ -482,7 +482,7 @@ int main(int argc, char* argv[]) ConvStrides{}, ConvDilations{}, nrepeat); -#elif 1 +#elif 0 device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_deprecated(in_nchw_desc, in_nchw, wei_kcyx_desc, From b03aabf11527d0e26665a334b64c1b4e65a1b65f Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Fri, 11 Oct 2019 11:29:15 -0500 Subject: [PATCH 20/20] rename, fix type --- ...icit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 10 +++++----- .../blockwise_generic_tensor_slice_copy.hpp | 2 +- .../blockwise_generic_tensor_slice_copy_deprecated.hpp | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp index 289c8621b..95fbeb290 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -293,14 +293,14 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer // 
c_thread_mtx definition: this is a mess // TODO:: more elegent way of defining c_thread_mtx - constexpr auto c_k0k2_n1n2_thread_mtx_desc = make_ConstantMatrixDescriptor_packed( - Number{}, Number{}); + constexpr auto c_k0k1_n1n2_thread_mtx_desc = make_ConstantMatrixDescriptor_packed( + Number{}, Number{}); const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2< BlockSize, decltype(a_e_k_block_mtx_desc), decltype(b_e_n1bn2_block_mtx_desc), - decltype(c_k0k2_n1n2_thread_mtx_desc), + decltype(c_k0k1_n1n2_thread_mtx_desc), GemmMPerThreadSubC, GemmNPerThreadSubC, GemmMLevel0Cluster, @@ -327,10 +327,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer __shared__ Float p_wei_block_double[2 * wei_block_space]; // register allocation for output - AccDataType p_out_thread[c_k0k2_n1n2_thread_mtx_desc.GetElementSpace()]; + AccDataType p_out_thread[c_k0k1_n1n2_thread_mtx_desc.GetElementSpace()]; // zero out threadwise output - threadwise_matrix_set_zero(c_k0k2_n1n2_thread_mtx_desc, p_out_thread); + threadwise_matrix_set_zero(c_k0k1_n1n2_thread_mtx_desc, p_out_thread); // LDS double buffer: preload data into LDS { diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index 8939ae337..d31b3902d 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -179,7 +179,7 @@ struct BlockwiseGenericTensorSliceCopy_v4 constexpr auto generic_address_space = integral_constant{}; - Rnun(p_block_src, p_block_dst, generic_address_space, generic_address_space); + Run(p_block_src, p_block_dst, generic_address_space, generic_address_space); } template diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp index 2272ab017..c434e82f0 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp @@ -564,7 +564,7 @@ struct BlockwiseGenericTensorSliceCopy_v2_deprecated constexpr auto generic_address_space = integral_constant{}; - Rnun(p_block_src, p_block_dst, generic_address_space, generic_address_space); + Run(p_block_src, p_block_dst, generic_address_space, generic_address_space); } template