Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,14 +47,17 @@ include_directories(BEFORE
${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description
${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation
${PROJECT_SOURCE_DIR}/composable_kernel/include/kernel_algorithm
${PROJECT_SOURCE_DIR}/external/include
${PROJECT_SOURCE_DIR}/driver/include
${PROJECT_BINARY_DIR}/composable_kernel/include/utility
)

if(DEVICE_BACKEND STREQUAL "AMD")
configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/config_amd.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/config.hpp")
configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/config.amd.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/config.hpp")
configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/float_type.amd.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/float_type.hpp")
elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/config_nvidia.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/config.hpp")
configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/config.nvidia.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/config.hpp")
configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/float_type.nvidia.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/float_type.hpp")
endif()

add_subdirectory(driver)
14 changes: 14 additions & 0 deletions composable_kernel/include/kernel_algorithm/convolution_common.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#ifndef CK_CONVOLUTION_COMMON_HPP
#define CK_CONVOLUTION_COMMON_HPP

namespace ck {

enum ConvolutionDirection
{
Forward,
BackwardData,
BackwardWeight
};

} // namespace ck
#endif
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#define CK_GRIDWISE_CONVOLUTION_DIRECT_V2_NCHW_KCYX_NKHW

#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantTensorDescriptor_deprecated.hpp"
#include "blockwise_2d_tensor_op.hpp"
#include "blockwise_4d_tensor_op.hpp"
#include "threadwise_tensor_slice_copy.hpp"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R1_CHWN_CYXK_KHWN

#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantTensorDescriptor_deprecated.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_4d_tensor_op.hpp"
#include "blockwise_2d_tensor_op.hpp"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R2_CHWN_CYXK_KHWN

#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantTensorDescriptor_deprecated.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_2d_tensor_op.hpp"
#include "blockwise_3d_tensor_op.hpp"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_CHWN_CYXK_KHWN_HPP

#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantTensorDescriptor_deprecated.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_generic_tensor_slice_copy.hpp"
#include "threadwise_generic_tensor_slice_copy.hpp"
Expand Down Expand Up @@ -125,38 +125,38 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn

// blockwise copy
// input: format is [C, Hi, Wi, N]
auto blockwise_in_copy =
BlockwiseGenericTensorSliceCopy_v1<BlockSize,
decltype(in_c_h_w_n_global_desc),
decltype(in_c_h_w_n_block_desc),
decltype(in_c_h_w_n_block_desc.GetLengths()),
InBlockCopySubLengths_CHWN,
InBlockCopyClusterLengths_CHWN,
Sequence<0, 1, 2, 3>,
Sequence<0, 1, 2, 3>,
Sequence<0, 1, 2, 3>,
3,
3,
InBlockCopyDataPerAccess_N,
InBlockCopyDataPerAccess_N>({0, 0, 0, 0},
{0, 0, 0, 0});
auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1_deprecated<
BlockSize,
decltype(in_c_h_w_n_global_desc),
decltype(in_c_h_w_n_block_desc),
decltype(in_c_h_w_n_block_desc.GetLengths()),
InBlockCopySubLengths_CHWN,
InBlockCopyClusterLengths_CHWN,
Sequence<0, 1, 2, 3>,
Sequence<0, 1, 2, 3>,
Sequence<0, 1, 2, 3>,
3,
3,
InBlockCopyDataPerAccess_N,
InBlockCopyDataPerAccess_N>({0, 0, 0, 0}, {0, 0, 0, 0});

// blockwise wei copy
// format is [CPerBlock, X * KPerBlock]
const auto blockwise_wei_copy =
BlockwiseGenericTensorSliceCopy_v1<BlockSize,
decltype(wei_c_k_global_desc),
decltype(wei_c_k_block_desc),
decltype(wei_c_k_block_desc.GetLengths()),
WeiBlockCopySubLengths_CK,
WeiBlockCopyClusterLengths_CK,
Sequence<0, 1>,
Sequence<0, 1>,
Sequence<0, 1>,
1,
1,
WeiBlockCopyDataPerAccess_K,
WeiBlockCopyDataPerAccess_K>({0, 0}, {0, 0});
BlockwiseGenericTensorSliceCopy_v1_deprecated<BlockSize,
decltype(wei_c_k_global_desc),
decltype(wei_c_k_block_desc),
decltype(wei_c_k_block_desc.GetLengths()),
WeiBlockCopySubLengths_CK,
WeiBlockCopyClusterLengths_CK,
Sequence<0, 1>,
Sequence<0, 1>,
Sequence<0, 1>,
1,
1,
WeiBlockCopyDataPerAccess_K,
WeiBlockCopyDataPerAccess_K>({0, 0},
{0, 0});

// a series of blockwise batched GEMM
// C_matrix += transpose(A_matrix) * B_matrix
Expand Down Expand Up @@ -318,14 +318,15 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
n_block_data_begin + n_thread_data_begin);

#if 1
ThreadwiseGenericTensorSliceCopy_v1r2<decltype(out_10d_thread_desc),
decltype(out_10d_global_desc),
decltype(out_10d_thread_desc.GetLengths()),
arithmetic_sequence_gen<0, 10, 1>::type,
9,
OutThreadCopyDataPerAccess_N,
OutThreadCopyDataPerAccess_N>(
make_zero_array<index_t, 10>(), make_zero_array<index_t, 10>())
ThreadwiseGenericTensorSliceCopy_v1r2_deprecated<
decltype(out_10d_thread_desc),
decltype(out_10d_global_desc),
decltype(out_10d_thread_desc.GetLengths()),
arithmetic_sequence_gen<0, 10, 1>::type,
9,
OutThreadCopyDataPerAccess_N,
OutThreadCopyDataPerAccess_N>(make_zero_array<index_t, 10>(),
make_zero_array<index_t, 10>())
.Run(p_out_thread, p_out_thread_on_global);
#elif 0
ThreadwiseGenericTensorSliceCopy_v1r1<decltype(out_10d_thread_desc),
Expand Down Expand Up @@ -388,14 +389,15 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
n_block_data_begin + n_thread_data_begin);

#if 1
ThreadwiseGenericTensorSliceCopy_v1r2<decltype(out_10d_thread_desc),
decltype(out_10d_global_desc),
decltype(out_10d_thread_desc.GetLengths()),
arithmetic_sequence_gen<0, 10, 1>::type,
9,
OutThreadCopyDataPerAccess_N,
OutThreadCopyDataPerAccess_N>(
make_zero_array<index_t, 10>(), make_zero_array<index_t, 10>())
ThreadwiseGenericTensorSliceCopy_v1r2_deprecated<
decltype(out_10d_thread_desc),
decltype(out_10d_global_desc),
decltype(out_10d_thread_desc.GetLengths()),
arithmetic_sequence_gen<0, 10, 1>::type,
9,
OutThreadCopyDataPerAccess_N,
OutThreadCopyDataPerAccess_N>(make_zero_array<index_t, 10>(),
make_zero_array<index_t, 10>())
.Run(p_out_thread, p_out_thread_on_global);
#elif 0
ThreadwiseGenericTensorSliceCopy_v1r1<decltype(out_10d_thread_desc),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_CHWN_CYXK_KHWN_LDS_DOUBLE_BUFFER_HPP

#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantTensorDescriptor_deprecated.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_generic_tensor_slice_copy.hpp"
#include "threadwise_generic_tensor_slice_copy.hpp"
Expand Down Expand Up @@ -127,9 +127,9 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
// input: format is [C, Hi, Wi, N]
auto blockwise_in_copy =
#if 0
BlockwiseGenericTensorSliceCopy_v1
BlockwiseGenericTensorSliceCopy_v1_deprecated
#else
BlockwiseGenericTensorSliceCopy_v2
BlockwiseGenericTensorSliceCopy_v2_deprecated
#endif
<BlockSize,
decltype(in_c_h_w_n_global_desc),
Expand All @@ -149,9 +149,9 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
// format is [CPerBlock, X * KPerBlock]
const auto blockwise_wei_copy =
#if 0
BlockwiseGenericTensorSliceCopy_v1
BlockwiseGenericTensorSliceCopy_v1_deprecated
#else
BlockwiseGenericTensorSliceCopy_v2
BlockwiseGenericTensorSliceCopy_v2_deprecated
#endif
<BlockSize,
decltype(wei_c_k_global_desc),
Expand Down Expand Up @@ -406,14 +406,15 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
n_block_data_begin + n_thread_data_begin);

#if 1
ThreadwiseGenericTensorSliceCopy_v1r2<decltype(out_10d_thread_desc),
decltype(out_10d_global_desc),
decltype(out_10d_thread_desc.GetLengths()),
arithmetic_sequence_gen<0, 10, 1>::type,
9,
OutThreadCopyDataPerAccess_N,
OutThreadCopyDataPerAccess_N>(
make_zero_array<index_t, 10>(), make_zero_array<index_t, 10>())
ThreadwiseGenericTensorSliceCopy_v1r2_deprecated<
decltype(out_10d_thread_desc),
decltype(out_10d_global_desc),
decltype(out_10d_thread_desc.GetLengths()),
arithmetic_sequence_gen<0, 10, 1>::type,
9,
OutThreadCopyDataPerAccess_N,
OutThreadCopyDataPerAccess_N>(make_zero_array<index_t, 10>(),
make_zero_array<index_t, 10>())
.Run(p_out_thread, p_out_thread_on_global);
#elif 0
ThreadwiseGenericTensorSliceCopy_v1r1<decltype(out_10d_thread_desc),
Expand Down Expand Up @@ -476,14 +477,15 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
n_block_data_begin + n_thread_data_begin);

#if 1
ThreadwiseGenericTensorSliceCopy_v1r2<decltype(out_10d_thread_desc),
decltype(out_10d_global_desc),
decltype(out_10d_thread_desc.GetLengths()),
arithmetic_sequence_gen<0, 10, 1>::type,
9,
OutThreadCopyDataPerAccess_N,
OutThreadCopyDataPerAccess_N>(
make_zero_array<index_t, 10>(), make_zero_array<index_t, 10>())
ThreadwiseGenericTensorSliceCopy_v1r2_deprecated<
decltype(out_10d_thread_desc),
decltype(out_10d_global_desc),
decltype(out_10d_thread_desc.GetLengths()),
arithmetic_sequence_gen<0, 10, 1>::type,
9,
OutThreadCopyDataPerAccess_N,
OutThreadCopyDataPerAccess_N>(make_zero_array<index_t, 10>(),
make_zero_array<index_t, 10>())
.Run(p_out_thread, p_out_thread_on_global);
#elif 0
ThreadwiseGenericTensorSliceCopy_v1r1<decltype(out_10d_thread_desc),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_CHWN_CYXK_KHWN_PADDED_HPP

#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantTensorDescriptor_deprecated.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_NCHW_CYXK_NKHW

#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantTensorDescriptor_deprecated.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_2d_tensor_op.hpp"
#include "blockwise_tensor_slice_copy.hpp"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_NCHW_CYXK_NKHW_LDS_DOUBLE_BUFFER

#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantTensorDescriptor_deprecated.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_2d_tensor_op.hpp"
#include "blockwise_tensor_slice_copy.hpp"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V2_CHWN_CYXK_KHWN

#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantTensorDescriptor_deprecated.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_4d_tensor_op.hpp"
#include "blockwise_2d_tensor_op.hpp"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V2_CHWN_CYXK_KHWN_LDS_DOUBLE_BUFFER

#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantTensorDescriptor_deprecated.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_4d_tensor_op.hpp"
#include "blockwise_2d_tensor_op.hpp"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V3_NCHW_CYXK_NKHW

#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "ConstantTensorDescriptor_deprecated.hpp"
#include "ConstantMergedTensorDescriptor_deprecated.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_generic_tensor_slice_copy.hpp"
#include "blockwise_gemm.hpp"
Expand Down Expand Up @@ -128,7 +128,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
// input blockwise copy
// slice a merged tensor, reorder and copy to a normal tensor
// this copy operator already has blockwise offset built-in
auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1<
auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1_deprecated<
BlockSize,
Float,
decltype(in_c_n1_b_n2_global_merged_desc),
Expand All @@ -155,20 +155,19 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
// operator for blockwise copy of weight into LDS
// slice a tensor, and copy it into another tensor
// this copy operator already have blockwise offset built-in
auto blockwise_wei_copy =
BlockwiseGenericTensorSliceCopy_v1<BlockSize,
Float,
decltype(wei_c_k_global_desc),
decltype(wei_c_k_block_desc),
decltype(wei_c_k_block_desc.GetLengths()),
WeiBlockCopySubLengths_C_K,
WeiBlockCopyClusterLengths_C_K,
Sequence<0, 1>, // thread_arrange_order [C, K]
Sequence<0, 1>, // src_access_order [C, K]
Sequence<0, 1>, // dst_access_order [C, K]
WeiBlockCopyDataPerAccess_K,
WeiBlockCopyDataPerAccess_K>(
{0, k_block_data_on_global}, {0, 0});
auto blockwise_wei_copy = BlockwiseGenericTensorSliceCopy_v1_deprecated<
BlockSize,
Float,
decltype(wei_c_k_global_desc),
decltype(wei_c_k_block_desc),
decltype(wei_c_k_block_desc.GetLengths()),
WeiBlockCopySubLengths_C_K,
WeiBlockCopyClusterLengths_C_K,
Sequence<0, 1>, // thread_arrange_order [C, K]
Sequence<0, 1>, // src_access_order [C, K]
Sequence<0, 1>, // dst_access_order [C, K]
WeiBlockCopyDataPerAccess_K,
WeiBlockCopyDataPerAccess_K>({0, k_block_data_on_global}, {0, 0});

// GEMM definition
// c_mtx += transpose(a_mtx) * b_mtx
Expand Down
Loading