asroy · asroy · Sep 30, 2019 · Sep 30, 2019
diff --git a/...el_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/...el_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
@@ -265,10 +265,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
 
         // LDS double buffer: preload data into LDS
         {
-            blockwise_in_copy.template Run<Float, address_space_t::global>(p_in_global,
-                                                                           p_in_block_double);
-            blockwise_wei_copy.template Run<Float, address_space_t::global>(p_wei_global,
-                                                                            p_wei_block_double);
+            blockwise_in_copy.template Run<Float, Float, address_space_t::global>(
+                p_in_global, p_in_block_double);
+            blockwise_wei_copy.template Run<Float, Float, address_space_t::global>(
+                p_wei_global, p_wei_block_double);
         }
 
         // LDS double buffer: main body
@@ -299,10 +299,12 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                 __syncthreads();
 
                 // LDS doubel buffer: load next data from device mem
-                blockwise_in_copy.template RunLoadThreadBuffer<Float, address_space_t::global>(
-                    p_in_global, p_in_thread_buffer);
-                blockwise_wei_copy.template RunLoadThreadBuffer<Float, address_space_t::global>(
-                    p_wei_global, p_wei_thread_buffer);
+                blockwise_in_copy
+                    .template RunLoadThreadBuffer<Float, Float, address_space_t::global>(
+                        p_in_global, p_in_thread_buffer);
+                blockwise_wei_copy
+                    .template RunLoadThreadBuffer<Float, Float, address_space_t::global>(
+                        p_wei_global, p_wei_thread_buffer);
 
                 // LDS double buffer: GEMM on current data
                 blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread);
@@ -325,9 +327,9 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             __syncthreads();
 
             // LDS doubel buffer: load next data from device mem
-            blockwise_in_copy.template RunLoadThreadBuffer<Float, address_space_t::global>(
+            blockwise_in_copy.template RunLoadThreadBuffer<Float, Float, address_space_t::global>(
                 p_in_global, p_in_thread_buffer);
-            blockwise_wei_copy.template RunLoadThreadBuffer<Float, address_space_t::global>(
+            blockwise_wei_copy.template RunLoadThreadBuffer<Float, Float, address_space_t::global>(
                 p_wei_global, p_wei_thread_buffer);
 
             // LDS double buffer: GEMM on current data
@@ -396,7 +398,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                     0,
                     b_thread_data_on_global,
                     0})
-                .template Run<Float, address_space_t::generic, address_space_t::global>(
+                .template Run<Float, Float, address_space_t::generic, address_space_t::global>(
                     p_out_thread, p_out_global);
         }
     }

diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
@@ -120,6 +120,8 @@ struct BlockwiseGenericTensorSliceCopy_v4
                             BlockSrcData,
                             BlockSrcAddressSpace,
                             address_space_t::generic>(p_block_src, p_thread_buffer);
+
+        // if there is type conversion, it's done during store
         RunStoreThreadBuffer<BlockSrcData,
                              BlockDstData,
                              address_space_t::generic,

diff --git a/...osable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp b/...osable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp
@@ -478,35 +478,42 @@ struct BlockwiseGenericTensorSliceCopy_v2
         return ThreadBufferDesc::GetElementSpace();
     }
 
-    template <typename TData,
+    template <typename SrcData,
+              typename DstData,
               address_space_t BlockSrcAddressSpace     = address_space_t::generic,
               address_space_t ThreadBufferAddressSpace = address_space_t::generic>
-    __device__ void RunLoadThreadBuffer(const TData* p_block_src, TData* p_thread_buffer) const
+    __device__ void RunLoadThreadBuffer(const SrcData* p_block_src, DstData* p_thread_buffer) const
     {
-        mThreadwiseLoad.template Run<TData, BlockSrcAddressSpace, ThreadBufferAddressSpace>(
-            p_block_src, p_thread_buffer);
+        mThreadwiseLoad
+            .template Run<SrcData, DstData, BlockSrcAddressSpace, ThreadBufferAddressSpace>(
+                p_block_src, p_thread_buffer);
     }
 
-    template <typename TData,
+    template <typename SrcData,
+              typename DstData,
               address_space_t ThreadBufferAddressSpace = address_space_t::generic,
               address_space_t BlockDstAddressSpace     = address_space_t::generic>
-    __device__ void RunStoreThreadBuffer(const TData* p_thread_buffer, TData* p_block_dst) const
+    __device__ void RunStoreThreadBuffer(const SrcData* p_thread_buffer, DstData* p_block_dst) const
     {
-        mThreadwiseStore.template Run<TData, ThreadBufferAddressSpace, BlockDstAddressSpace>(
-            p_thread_buffer, p_block_dst);
+        mThreadwiseStore
+            .template Run<SrcData, DstData, ThreadBufferAddressSpace, BlockDstAddressSpace>(
+                p_thread_buffer, p_block_dst);
     }
 
-    template <typename TData,
+    template <typename SrcData,
+              typename DstData,
               address_space_t BlockSrcAddressSpace = address_space_t::generic,
               address_space_t BlockDstAddressSpace = address_space_t::generic>
-    __device__ void Run(const TData* p_block_src, TData* p_block_dst) const
+    __device__ void Run(const SrcData* p_block_src, DstData* p_block_dst) const
     {
-        TData p_thread_buffer[GetThreadBufferSize()];
+        SrcData p_thread_buffer[GetThreadBufferSize()];
 
-        RunLoadThreadBuffer<TData, BlockSrcAddressSpace, address_space_t::generic>(p_block_src,
-                                                                                   p_thread_buffer);
-        RunStoreThreadBuffer<TData, address_space_t::generic, BlockDstAddressSpace>(p_thread_buffer,
-                                                                                    p_block_dst);
+        RunLoadThreadBuffer<SrcData, SrcData, BlockSrcAddressSpace, address_space_t::generic>(
+            p_block_src, p_thread_buffer);
+
+        // if there is type conversion, it's done during store
+        RunStoreThreadBuffer<SrcData, DstData, address_space_t::generic, BlockDstAddressSpace>(
+            p_thread_buffer, p_block_dst);
     }
 
     template <typename T, bool PositiveDirection>