From 2c4e5def5f85815f369214bcdbd42b82abe8aeec Mon Sep 17 00:00:00 2001 From: "Yan, Zhiwei" Date: Thu, 21 Dec 2023 06:39:38 +0000 Subject: [PATCH] Remove mkl related func in BlasImpl.h --- aten/src/ATen/native/mkldnn/xpu/BlasImpl.h | 627 --------------------- 1 file changed, 627 deletions(-) diff --git a/aten/src/ATen/native/mkldnn/xpu/BlasImpl.h b/aten/src/ATen/native/mkldnn/xpu/BlasImpl.h index 2bd3967886a5..099bbfb31c48 100644 --- a/aten/src/ATen/native/mkldnn/xpu/BlasImpl.h +++ b/aten/src/ATen/native/mkldnn/xpu/BlasImpl.h @@ -5,7 +5,6 @@ #include // #include -// #include #include // #include "comm/ATDispatch.h" @@ -13,9 +12,7 @@ #include -// using namespace dnnl; // using namespace xpu::dpcpp; -// using namespace xpu::oneDNN; namespace at { namespace xpu { @@ -40,455 +37,6 @@ static inline bool check_broadcast( return true; } -#ifdef USE_ONEMKL -template -static void gemm_batch( - sycl::queue& queue, - oneapi::mkl::transpose transa, - oneapi::mkl::transpose transb, - int64_t m, - int64_t n, - int64_t k, - scalar_t alpha, - scalar_t* a, - int64_t lda, - int64_t stride_a, - scalar_t* b, - int64_t ldb, - int64_t stride_b, - scalar_t beta, - scalar_t* c, - int64_t ldc, - int64_t stride_c, - int64_t batch_size) { - DPCPP_ONEMKL_SUBMIT( - queue, - oneapi::mkl::blas::column_major::gemm_batch, - queue, - transa, - transb, - m, - n, - k, - alpha, - a, - lda, - stride_a, - b, - ldb, - stride_b, - beta, - c, - ldc, - stride_c, - batch_size); -} - -template <> -void gemm_batch>( - sycl::queue& queue, - oneapi::mkl::transpose transa, - oneapi::mkl::transpose transb, - int64_t m, - int64_t n, - int64_t k, - c10::complex alpha, - c10::complex* a, - int64_t lda, - int64_t stride_a, - c10::complex* b, - int64_t ldb, - int64_t stride_b, - c10::complex beta, - c10::complex* c, - int64_t ldc, - int64_t stride_c, - int64_t batch_size) { - DPCPP_ONEMKL_SUBMIT( - queue, - oneapi::mkl::blas::column_major::gemm_batch, - queue, - transa, - transb, - m, - n, - k, - *reinterpret_cast*>(&alpha), - reinterpret_cast*>(a), - lda, - stride_a, - reinterpret_cast*>(b), - ldb, - stride_b, - *reinterpret_cast*>(&beta), - reinterpret_cast*>(c), - ldc, - stride_c, - batch_size); -} - -template <> -void gemm_batch>( - sycl::queue& queue, - oneapi::mkl::transpose transa, - oneapi::mkl::transpose transb, - int64_t m, - int64_t n, - int64_t k, - c10::complex alpha, - c10::complex* a, - int64_t lda, - int64_t stride_a, - c10::complex* b, - int64_t ldb, - int64_t stride_b, - c10::complex beta, - c10::complex* c, - int64_t ldc, - int64_t stride_c, - int64_t batch_size) { - DPCPP_ONEMKL_SUBMIT( - queue, - oneapi::mkl::blas::column_major::gemm_batch, - queue, - transa, - transb, - m, - n, - k, - *reinterpret_cast*>(&alpha), - reinterpret_cast*>(a), - lda, - stride_a, - reinterpret_cast*>(b), - ldb, - stride_b, - *reinterpret_cast*>(&beta), - reinterpret_cast*>(c), - ldc, - stride_c, - batch_size); -} -#endif - -static void mkl_baddbmm( - Tensor& result, - const Tensor& self, - Tensor batch1, - Tensor batch2, - const Scalar& beta, - const Scalar& alpha) { -#ifdef USE_ONEMKL - // colum major - TORCH_CHECK(batch1.dim() == 3, "batch1 must be a 3D tensor"); - TORCH_CHECK(batch2.dim() == 3, "batch2 must be a 3D tensor"); - - auto batch1_sizes = batch1.sizes(); - auto batch2_sizes = batch2.sizes(); - auto batch1_strides = batch1.strides(); - auto batch2_strides = batch2.strides(); - - TORCH_CHECK( - batch2_sizes[0] == batch1_sizes[0] && batch2_sizes[1] == batch1_sizes[2], - "Expected size for first two dimensions of batch2 tensor to be: [", 
- batch1_sizes[0], - ", ", - batch1_sizes[2], - "] but got: [", - batch2_sizes[0], - ", ", - batch2_sizes[1], - "]."); - - if (beta.toComplexDouble() != 0.0 && !self.is_same(result)) { - auto b_self = expand_size( - self, {batch1.size(0), batch1.size(1), batch2.size(2)}, "mkl_matmul"); - result.resize_as_(*b_self).copy_(*b_self); - } else { - // For mkl_baddbmm, have to convert it to contiguous format(only update meta - // data, and don't copy memory) for such kind of tensor below: E.g.: the - // tensor whose size is [10, 12, 50], and stride is [50, 500, 1], where - // oneMKL lib cannot handle this kind of stride. Because stridec from oneMKL - // strided style API means step size for each sample in the same batch. - // However, for mkl_matmul, the stridec is always c.numel(), because we only - // have 1 sample when we do addmm. - result.resize_( - {batch1.size(0), batch1.size(1), batch2.size(2)}, - at::MemoryFormat::Contiguous); - } - - const auto result_strides = result.strides(); - const auto result_sizes = result.sizes(); - - if (result.numel() == 0) { - return; - } else if (batch1_sizes[2] == 0) { - if (beta.to>() == 0.0) { - result.zero_(); - } - } - - bool transpose_c = false; - Tensor c; - - if ((result_strides[1] == 1) && - ((result_sizes[2] == 1) || - (result_strides[2] >= std::max(1, result_sizes[1])))) { - // colum major - transpose_c = false; - c = result.resolve_conj(); - } else if ( - (result_strides[2] == 1) && - (result_sizes[1] == 1 || - (result_strides[1] >= std::max(1, result_sizes[2])))) { - // row major - std::swap(batch1, batch2); - std::swap(batch1_sizes, batch2_sizes); - std::swap(batch1_strides, batch2_strides); - transpose_c = true; - c = result.resolve_conj(); - } else { - transpose_c = false; - c = result.resolve_conj().transpose(1, 2).contiguous().transpose_(1, 2); - } - - const int64_t m = result_sizes[transpose_c ? 2 : 1]; - const int64_t n = result_sizes[transpose_c ? 1 : 2]; - const int64_t k = batch1_sizes[transpose_c ? 1 : 2]; - - // Cast batch1 as matrix a - bool transpose_a = false; - Tensor a; - /* Need lda >= max(1, (transpose_a ? k : m)) */ - if (batch1_strides[transpose_c ? 2 : 1] == 1 && - batch1_strides[transpose_c ? 1 : 2] >= std::max(int64_t{1}, m)) { - transpose_a = false; - a = batch1.resolve_conj(); - } else if ( - batch1_strides[transpose_c ? 1 : 2] == 1 && - batch1_strides[transpose_c ? 2 : 1] >= std::max(int64_t{1}, k)) { - transpose_a = true; - a = batch1; - } else { - transpose_a = !transpose_c; - a = batch1.clone(at::MemoryFormat::Contiguous); - } - - // Cast batch2 as matrix b - bool transpose_b = false; - Tensor b; - /* Need ldm2_ >= max(1, (transpose_m2 == 'n' ? k : n)) */ - if (batch2_strides[transpose_c ? 2 : 1] == 1 && - batch2_strides[transpose_c ? 1 : 2] >= std::max(int64_t{1}, k)) { - transpose_b = false; - b = batch2.resolve_conj(); - } else if ( - batch2_strides[transpose_c ? 1 : 2] == 1 && - batch2_strides[transpose_c ? 2 : 1] >= std::max(int64_t{1}, n)) { - transpose_b = true; - b = batch2; - } else { - transpose_b = !transpose_c; - b = batch2.clone(at::MemoryFormat::Contiguous); - } - - const int64_t lda = a.strides()[(transpose_a == transpose_c) ? 2 : 1]; - const int64_t ldb = b.strides()[(transpose_b == transpose_c) ? 2 : 1]; - // for the corner case: result tensor with size [b, m, 1], stride [m, 1, 1] - // we cannot use stride to get its leading dimension, whose value should be m. - int64_t ldc; - if (c.strides()[1] == c.strides()[2] == 1) { - ldc = c.sizes()[transpose_c ? 
2 : 1]; - } else { - ldc = c.strides()[transpose_c ? 1 : 2]; - } - - const int64_t stridea = a.strides()[0]; - const int64_t strideb = b.strides()[0]; - const int64_t stridec = c.strides()[0]; - int64_t num_batch = c.sizes()[0]; - - // Always ensure the conjugation for c is resolved since there's no way to - // specify c's conjugation in the gemm call - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!c.is_conj()); - - auto& dpcpp_queue = dpcppGetCurrentQueue(); - IPEX_DISPATCH_FLOATING_AND_COMPLEX_TYPES( - result.scalar_type(), "mkl_baddbmm", [&] { - gemm_batch( - dpcpp_queue, - transpose_a ? a.is_conj() ? oneapi::mkl::transpose::C - : oneapi::mkl::transpose::T - : oneapi::mkl::transpose::N, // nontrans = 0, trans = 1, - // conjtrans = 3, - transpose_b ? b.is_conj() ? oneapi::mkl::transpose::C - : oneapi::mkl::transpose::T - : oneapi::mkl::transpose::N, - m, - n, - k, - alpha.to(), - a.data_ptr(), - lda, - stridea, - b.data_ptr(), - ldb, - strideb, - beta.to(), - c.data_ptr(), - ldc, - stridec, - num_batch); - }); - - if (!result.is_same(c)) { - result.copy_(c); - } -#endif -} - -static void mkl_matmul( - Tensor& result, - const Tensor& self, - Tensor m1, - Tensor m2, - Scalar beta, - Scalar alpha) { -#ifdef USE_ONEMKL - auto m1_strides = m1.strides(); - auto m1_sizes = m1.sizes(); - auto m2_strides = m2.strides(); - auto m2_sizes = m2.sizes(); - - if (beta.toComplexDouble() != 0.0 && !self.is_same(result)) { - auto b_self = expand_size(self, {m1_sizes[0], m2_sizes[1]}, "mkl_matmul"); - result.resize_as_(*b_self).copy_(*b_self); - } else { - result.resize_({m1_sizes[0], m2_sizes[1]}); - } - - const auto result_strides = result.strides(); - const auto result_sizes = result.sizes(); - - if (result.numel() == 0) { - return; - } - - bool transpose_c = false; - Tensor c; - - // Cast result as matrix a - if (result_strides[0] == 1 && - (result_sizes[1] == 1 || - result_strides[1] >= std::max(int64_t{1}, result_sizes[0]))) { - transpose_c = false; - c = result.resolve_conj(); - } else if ( - result_strides[1] == 1 && - (result_sizes[0] == 1 || - result_strides[0] >= std::max(int64_t{1}, result_sizes[1]))) { - std::swap(m1, m2); - std::swap(m1_sizes, m2_sizes); - std::swap(m1_strides, m2_strides); - transpose_c = true; - c = result.resolve_conj(); - } else { - transpose_c = false; - // make c FORTRAN contiguous - c = result.resolve_conj().transpose(0, 1).contiguous().transpose_(0, 1); - } - - const int64_t m = result_sizes[transpose_c ? 1 : 0]; - const int64_t n = result_sizes[transpose_c ? 0 : 1]; - const int64_t k = m1_sizes[transpose_c ? 0 : 1]; - - // Cast m1 as matrix a - bool transpose_a = false; - Tensor a; - /* Need lda >= max(1, (transpose_a ? k : m)) */ - if (m1_strides[transpose_c ? 1 : 0] == 1 && - m1_strides[transpose_c ? 0 : 1] >= std::max(int64_t{1}, m)) { - transpose_a = false; - a = m1.resolve_conj(); - } else if ( - m1_strides[transpose_c ? 0 : 1] == 1 && - m1_strides[transpose_c ? 1 : 0] >= std::max(int64_t{1}, k)) { - transpose_a = true; - a = m1; - } else { - transpose_a = !transpose_c; - a = m1.clone(at::MemoryFormat::Contiguous); - } - - // Cast m2 as matrix b - bool transpose_b = false; - Tensor b; - /* Need ldm2_ >= max(1, (transpose_m2 == 'n' ? k : n)) */ - if (m2_strides[transpose_c ? 1 : 0] == 1 && - m2_strides[transpose_c ? 0 : 1] >= std::max(int64_t{1}, k)) { - transpose_b = false; - b = m2.resolve_conj(); - } else if ( - m2_strides[transpose_c ? 0 : 1] == 1 && - m2_strides[transpose_c ? 
1 : 0] >= std::max(int64_t{1}, n)) { - transpose_b = true; - b = m2; - } else { - transpose_b = !transpose_c; - b = m2.clone(at::MemoryFormat::Contiguous); - } - - const int64_t lda = a.strides()[(transpose_a == transpose_c) ? 1 : 0]; - const int64_t ldb = b.strides()[(transpose_b == transpose_c) ? 1 : 0]; - // for the corner case: result tensor with size [m, 1], stride [1, 1] - // we cannot use stride to get its leading dimension, whose value should be m. - int64_t ldc; - if (1 == c.strides()[0] == c.strides()[1]) { - ldc = c.sizes()[transpose_c ? 1 : 0]; - } else { - ldc = c.strides()[transpose_c ? 0 : 1]; - } - - // Always ensure the conjugation for c is resolved since there's no way to - // specify c's conjugation in the gemm call - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!c.is_conj()); - - auto& dpcpp_queue = dpcppGetCurrentQueue(); - // use colum major - IPEX_DISPATCH_FLOATING_AND_COMPLEX_TYPES( - result.scalar_type(), "mkl_matmul", [&] { - gemm_batch( - dpcpp_queue, - transpose_a ? a.is_conj() ? oneapi::mkl::transpose::C - : oneapi::mkl::transpose::T - : oneapi::mkl::transpose::N, // nontrans = 0, trans = 1, - // conjtrans = 3, - transpose_b ? b.is_conj() ? oneapi::mkl::transpose::C - : oneapi::mkl::transpose::T - : oneapi::mkl::transpose::N, - m, - n, - k, - alpha.to(), - a.data_ptr(), - lda, - a.numel(), - b.data_ptr(), - ldb, - b.numel(), - beta.to(), - c.data_ptr(), - ldc, - c.numel(), - 1); - }); - - if (!c.is_same(result)) { - result.copy_(c); - } -#endif -} - /***** The helper function to get post binary(or sum) for onednn_matmul ***** In onednn, it supports: result = BinaryOP(alpha * (m1 @ m2 + bias), beta * binary). Since the inputs/outputs shapes of Matmul are complicated, @@ -972,181 +520,6 @@ static Tensor& matmul_fusion_variants( } } -// Matmul_fusion_variants for Meta backend(only query shape) -static Tensor& matmul_fusion_variants_meta( - Tensor& output, - const Tensor& tensor1, - const Tensor& tensor2, - bool trans, - xpu::onednn::Attr& attr, - bool& is_fused, - Tensor bias = at::Tensor()) { - const auto dim_tensor1 = tensor1.dim(); - const auto dim_tensor2 = tensor2.dim(); - // This is checked up here to simplify the logic below - // Note that the strings are just evaluated on failure, so almost always we - // just evaluate the condition and move on - TORCH_CHECK( - dim_tensor1 != 0 && dim_tensor2 != 0, - "both arguments to matmul need to be at least 1D, but they are ", - dim_tensor1, - "D and ", - dim_tensor2, - "D"); - - bool should_fold_tensor1 = should_fold(tensor1, dim_tensor2); - bool should_fold_tensor2 = should_fold(tensor2, dim_tensor1); - - if (dim_tensor1 == 1 && dim_tensor2 == 1) { - // case1: - // original size: [6] x [6] -> [] - is_fused = true; - output = output.defined() ? output.view({1, 1}) - : at::empty({1, 1}, tensor1.options()); - } else if (dim_tensor1 == 2 && dim_tensor2 == 1) { - // case2: - // original sizes: [4, 2] x [2] -> [4] - // onednn sizes: [4, 2] x [2, 1] -> [4, 1] - DimVector output_shape({tensor1.size(0)}); - DimVector result_shape({tensor1.size(0), 1}); - output = output.defined() ? 
output.view(result_shape) - : at::empty(result_shape, tensor1.options()); - Tensor t2 = tensor2.view({tensor2.size(0), 1}); - } else if (dim_tensor1 == 1 && dim_tensor2 == 2) { - // case3: - // original sizes: [2] x [2, 6] -> [6] - // onednn sizes: [1, 2] x [2, 6] -> [1, 6] - DimVector output_shape({tensor2.size(1)}); - if (!trans) - output_shape[0] = tensor2.size(0); - Tensor t1 = tensor1.unsqueeze(0); - DimVector result_shape({1, output_shape[0]}); - output = output.defined() ? output.view(result_shape) - : at::empty(result_shape, tensor1.options()); - } else if (dim_tensor1 == 2 && dim_tensor2 == 2) { - // case4: - // original sizes: [4, 2] x [2, 6] -> [4, 6] - // onednn sizes: [4, 2] x [2, 6] -> [4, 6] - DimVector output_shape({tensor1.size(0), tensor2.size(1)}); - if (!trans) - output_shape[1] = tensor2.size(0); - - output = - output.defined() ? output : at::empty(output_shape, tensor1.options()); - - } else if (should_fold_tensor1) { - // dim_tensor1 >=3 && (dim_tensor2 == 1 || dim_tensor2 == 2) - // case5-1: - // original sizes: [3, 4, 2] x [2, 6] -> [3, 4, 6] - // onednn sizes: [12, 2] x [2, 6] -> [12, 6] - // case5-2: - // original sizes: [3, 4, 2] x [2] -> [3, 4] - // onednn sizes: [12, 2] x [2, 1] -> [12, 1] - const auto t1_own = MaybeOwned::borrowed(tensor1); - const auto t2_own = MaybeOwned::borrowed(tensor2); - - const auto sizes_1 = t1_own->sizes(); - auto output_shape = DimVector(sizes_1.begin(), sizes_1.end() - 1); - const auto folded_dim1 = c10::multiply_integers(output_shape); - const auto t1 = t1_own->reshape({folded_dim1, sizes_1.back()}); - const auto t2_is_matrix = t2_own->dim() == 2; - Tensor t2 = t2_is_matrix ? *t2_own : t2_own->view({t2_own->size(0), 1}); - if (trans) - output_shape.push_back(t2.size(1)); - else - output_shape.push_back(t2.size(0)); - DimVector result_shape({t1.size(0), output_shape[output_shape.size() - 1]}); - output = output.defined() ? output.view(result_shape) - : at::empty(result_shape, tensor1.options()); - } else if (should_fold_tensor2) { - // dim_tensor2 >=3 && (dim_tensor1 == 1 || dim_tensor1 == 2) - // case6-1: - // original sizes: [2] x [3, 2, 4] = [3, 4] - // onednn sizes: [12, 2] x [2, 1] = [12, 1] - // or - // original sizes: [2] x [2, 3, 2, 4] = [2, 3, 4] - // onednn sizes: [24, 2] x [2, 1] = [24, 1] - - // case6-2: - // original sizes: [6, 2] x [3, 2, 4] = [3, 6, 4] - // onednn sizes: [12, 2] x [2, 6] = [12, 6] - // or - // original sizes: [6, 2] x [2, 3, 2, 4] = [2, 3, 6, 4] - // onednn sizes: [24, 2] x [2, 6] = [24, 6] - - const auto t1_own = trans - ? MaybeOwned::owned(tensor2.mT()) - : MaybeOwned::owned(tensor2.transpose(-1, -2).mT()); - trans = true; - const auto t2_own = dim_tensor1 == 2 - ? MaybeOwned::owned(tensor1.t()) - : MaybeOwned::borrowed(tensor1); - - const auto sizes_1 = t1_own->sizes(); - auto output_shape = DimVector(sizes_1.begin(), sizes_1.end() - 1); - const auto folded_dim1 = c10::multiply_integers(output_shape); - const auto t1 = t1_own->reshape({folded_dim1, sizes_1.back()}); - const auto t2_is_matrix = t2_own->dim() == 2; - Tensor t2 = t2_is_matrix ? *t2_own : t2_own->view({t2_own->size(0), 1}); - output_shape.push_back(t2.size(1)); - DimVector result_shape({t1.size(0), t2.size(1)}); - output = output.defined() ? 
output.view(result_shape)
-                              : at::empty(result_shape, tensor1.options());
-
-  } else {
-    // dim_tensor1 >= 3 || dim_tensor2 >= 3
-    // case7-1:
-    // original sizes: [3, 4, 2] x [3, 2, 6] = [3, 4, 6]
-    // onednn sizes: [3, 4, 2] x [3, 2, 6] = [3, 4, 6]
-    // case7-2:
-    // original sizes: [5, 1, 4, 2] x [3, 2, 6] = [5, 3, 4, 6]
-    // onednn sizes: [15, 4, 2] x [15, 2, 6] = [15, 4, 6]
-    const auto t2_own = trans
-        ? MaybeOwned<Tensor>::borrowed(tensor2)
-        : MaybeOwned<Tensor>::owned(tensor2.transpose(-1, -2));
-    trans = true;
-
-    const int64_t n = dim_tensor1 > 1 ? tensor1.sizes().cend()[-2] : 1LL;
-    const int64_t m1 = tensor1.sizes().back();
-    const IntArrayRef batch_tensor1(
-        tensor1.sizes().data(), std::max<int64_t>(dim_tensor1 - 2, 0LL));
-    const int64_t m2 =
-        dim_tensor2 > 1 ? t2_own->sizes().cend()[-2] : t2_own->sizes().back();
-    const int64_t p = dim_tensor2 > 1 ? t2_own->sizes().back() : 1LL;
-    const IntArrayRef batch_tensor2(
-        t2_own->sizes().data(), std::max<int64_t>(dim_tensor2 - 2, 0LL));
-    auto output_shape = infer_size_dimvector(batch_tensor1, batch_tensor2);
-
-    const auto tensor1_expand_size = [&output_shape, n, m1] {
-      DimVector ret(output_shape);
-      ret.append({n, m1});
-      return ret;
-    }();
-    const auto tensor2_expand_size = [&output_shape, m2, p] {
-      DimVector ret(output_shape);
-      ret.append({m2, p});
-      return ret;
-    }();
-    const int64_t expand_batch_product = c10::multiply_integers(output_shape);
-
-    // flatten expanded batches
-    const auto tensor1_expanded = tensor1.expand(tensor1_expand_size)
-                                      .reshape({expand_batch_product, n, m1});
-    const auto tensor2_expanded = t2_own->expand(tensor2_expand_size)
-                                      .reshape({expand_batch_product, m2, p});
-    if (dim_tensor1 > 1) {
-      output_shape.push_back(n);
-    }
-    if (dim_tensor2 > 1) {
-      output_shape.push_back(p);
-    }
-    DimVector result_shape({expand_batch_product, n, p});
-    output = output.defined() ? output.view(result_shape)
-                              : at::empty(result_shape, tensor1.options());
-  }
-  return output;
-}
-
 } // namespace impl
 } // namespace xpu
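
For reference only (not part of the change itself): the deleted gemm_batch<> helpers were thin wrappers that forwarded their arguments, via DPCPP_ONEMKL_SUBMIT, to oneMKL's strided batch GEMM. A minimal standalone sketch of that underlying call is shown below; it assumes a SYCL 2020 compiler with oneMKL available, and the device selection, problem sizes, and fill values are made up for illustration.

#include <cstdint>
#include <oneapi/mkl.hpp>
#include <sycl/sycl.hpp>

int main() {
  sycl::queue q{sycl::default_selector_v};

  // Column-major strided batch GEMM: C[i] = alpha * A[i] * B[i] + beta * C[i]
  constexpr std::int64_t m = 4, n = 6, k = 2, batch = 3;
  float* a = sycl::malloc_shared<float>(batch * m * k, q);
  float* b = sycl::malloc_shared<float>(batch * k * n, q);
  float* c = sycl::malloc_shared<float>(batch * m * n, q);
  for (std::int64_t i = 0; i < batch * m * k; ++i) a[i] = 1.0f;
  for (std::int64_t i = 0; i < batch * k * n; ++i) b[i] = 1.0f;

  oneapi::mkl::blas::column_major::gemm_batch(
      q,
      oneapi::mkl::transpose::nontrans,
      oneapi::mkl::transpose::nontrans,
      m, n, k,
      /*alpha=*/1.0f,
      a, /*lda=*/m, /*stride_a=*/m * k,
      b, /*ldb=*/k, /*stride_b=*/k * n,
      /*beta=*/0.0f,
      c, /*ldc=*/m, /*stride_c=*/m * n,
      batch)
      .wait();

  sycl::free(a, q);
  sycl::free(b, q);
  sycl::free(c, q);
  return 0;
}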
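
Similarly, the case5/case6 comments in the removed matmul_fusion_variants_meta describe how a batched operand is folded into a single 2D GEMM (e.g. [3, 4, 2] x [2, 6] computed as [12, 2] x [2, 6] and viewed back as [3, 4, 6]). A small equivalence check of that folding, written against the public libtorch API rather than the removed helper, looks like this:

#include <iostream>
#include <torch/torch.h>

int main() {
  // Shapes taken from the case5 comment above; the check itself is illustrative.
  auto t1 = torch::randn({3, 4, 2});
  auto t2 = torch::randn({2, 6});
  auto folded = t1.reshape({12, 2}).mm(t2).reshape({3, 4, 6});
  std::cout << torch::allclose(torch::matmul(t1, t2), folded) << std::endl;
  return 0;
}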