Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -265,10 +265,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer

// LDS double buffer: preload data into LDS
{
blockwise_in_copy.template Run<Float, address_space_t::global>(p_in_global,
p_in_block_double);
blockwise_wei_copy.template Run<Float, address_space_t::global>(p_wei_global,
p_wei_block_double);
blockwise_in_copy.template Run<Float, Float, address_space_t::global>(
p_in_global, p_in_block_double);
blockwise_wei_copy.template Run<Float, Float, address_space_t::global>(
p_wei_global, p_wei_block_double);
}

// LDS double buffer: main body
Expand Down Expand Up @@ -299,10 +299,12 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
__syncthreads();

// LDS doubel buffer: load next data from device mem
blockwise_in_copy.template RunLoadThreadBuffer<Float, address_space_t::global>(
p_in_global, p_in_thread_buffer);
blockwise_wei_copy.template RunLoadThreadBuffer<Float, address_space_t::global>(
p_wei_global, p_wei_thread_buffer);
blockwise_in_copy
.template RunLoadThreadBuffer<Float, Float, address_space_t::global>(
p_in_global, p_in_thread_buffer);
blockwise_wei_copy
.template RunLoadThreadBuffer<Float, Float, address_space_t::global>(
p_wei_global, p_wei_thread_buffer);

// LDS double buffer: GEMM on current data
blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread);
Expand All @@ -325,9 +327,9 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
__syncthreads();

// LDS doubel buffer: load next data from device mem
blockwise_in_copy.template RunLoadThreadBuffer<Float, address_space_t::global>(
blockwise_in_copy.template RunLoadThreadBuffer<Float, Float, address_space_t::global>(
p_in_global, p_in_thread_buffer);
blockwise_wei_copy.template RunLoadThreadBuffer<Float, address_space_t::global>(
blockwise_wei_copy.template RunLoadThreadBuffer<Float, Float, address_space_t::global>(
p_wei_global, p_wei_thread_buffer);

// LDS double buffer: GEMM on current data
Expand Down Expand Up @@ -396,7 +398,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
0,
b_thread_data_on_global,
0})
.template Run<Float, address_space_t::generic, address_space_t::global>(
.template Run<Float, Float, address_space_t::generic, address_space_t::global>(
p_out_thread, p_out_global);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,8 @@ struct BlockwiseGenericTensorSliceCopy_v4
BlockSrcData,
BlockSrcAddressSpace,
address_space_t::generic>(p_block_src, p_thread_buffer);

// if there is type conversion, it's done during store
RunStoreThreadBuffer<BlockSrcData,
BlockDstData,
address_space_t::generic,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -478,35 +478,42 @@ struct BlockwiseGenericTensorSliceCopy_v2
return ThreadBufferDesc::GetElementSpace();
}

template <typename TData,
template <typename SrcData,
typename DstData,
address_space_t BlockSrcAddressSpace = address_space_t::generic,
address_space_t ThreadBufferAddressSpace = address_space_t::generic>
__device__ void RunLoadThreadBuffer(const TData* p_block_src, TData* p_thread_buffer) const
__device__ void RunLoadThreadBuffer(const SrcData* p_block_src, DstData* p_thread_buffer) const
{
mThreadwiseLoad.template Run<TData, BlockSrcAddressSpace, ThreadBufferAddressSpace>(
p_block_src, p_thread_buffer);
mThreadwiseLoad
.template Run<SrcData, DstData, BlockSrcAddressSpace, ThreadBufferAddressSpace>(
p_block_src, p_thread_buffer);
}

template <typename TData,
template <typename SrcData,
typename DstData,
address_space_t ThreadBufferAddressSpace = address_space_t::generic,
address_space_t BlockDstAddressSpace = address_space_t::generic>
__device__ void RunStoreThreadBuffer(const TData* p_thread_buffer, TData* p_block_dst) const
__device__ void RunStoreThreadBuffer(const SrcData* p_thread_buffer, DstData* p_block_dst) const
{
mThreadwiseStore.template Run<TData, ThreadBufferAddressSpace, BlockDstAddressSpace>(
p_thread_buffer, p_block_dst);
mThreadwiseStore
.template Run<SrcData, DstData, ThreadBufferAddressSpace, BlockDstAddressSpace>(
p_thread_buffer, p_block_dst);
}

template <typename TData,
template <typename SrcData,
typename DstData,
address_space_t BlockSrcAddressSpace = address_space_t::generic,
address_space_t BlockDstAddressSpace = address_space_t::generic>
__device__ void Run(const TData* p_block_src, TData* p_block_dst) const
__device__ void Run(const SrcData* p_block_src, DstData* p_block_dst) const
{
TData p_thread_buffer[GetThreadBufferSize()];
SrcData p_thread_buffer[GetThreadBufferSize()];

RunLoadThreadBuffer<TData, BlockSrcAddressSpace, address_space_t::generic>(p_block_src,
p_thread_buffer);
RunStoreThreadBuffer<TData, address_space_t::generic, BlockDstAddressSpace>(p_thread_buffer,
p_block_dst);
RunLoadThreadBuffer<SrcData, SrcData, BlockSrcAddressSpace, address_space_t::generic>(
p_block_src, p_thread_buffer);

// if there is type conversion, it's done during store
RunStoreThreadBuffer<SrcData, DstData, address_space_t::generic, BlockDstAddressSpace>(
p_thread_buffer, p_block_dst);
}

template <typename T, bool PositiveDirection>
Expand Down
Loading