pooling.cc
/*********************************************************
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
************************************************************/
#include "pooling.h"
#include <cmath>
namespace singa {
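// PoolingHandle stores the pooling hyper-parameters (kernel, stride, padding)
// and the input/output shapes shared by the CPU (DNNL) and GPU (cuDNN) paths.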
PoolingHandle::PoolingHandle(const Tensor &input,
const std::vector<int> &kernel_size,
const std::vector<int> &stride,
const std::vector<int> &padding,
const bool is_max) {
kernel_h = kernel_size[0];
kernel_w = kernel_size[1];
pad_h = padding[0];
pad_w = padding[1];
stride_h = stride[0];
stride_w = stride[1];
batchsize = input.shape(0);
channels = input.shape(1);
height = input.shape(2);
width = input.shape(3);
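  // Output spatial size follows the standard pooling formula:
  //   pooled = floor((in + 2 * pad - kernel) / stride) + 1
  // (the division below is integer division, so std::floor is effectively a no-op).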
pooled_height = 1;
if (stride_h > 0)
pooled_height =
std::floor(((height + 2 * pad_h - kernel_h) / stride_h)) + 1;
pooled_width = std::floor(((width + 2 * pad_w - kernel_w) / stride_w)) + 1;
is_max_pooling = is_max;
#ifdef USE_DNNL
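  // For CPU tensors, build the DNNL memory descriptors and the forward/backward
  // pooling primitive descriptors once, so the forward/backward calls only bind
  // memory and execute. ws_mem holds the workspace (the argmax indices that
  // max-pooling backward needs).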
if (input.device()->lang() == kCpp) {
auto x_dims =
dnnl::memory::dims(input.shape().begin(), input.shape().end());
auto y_dims =
dnnl::memory::dims({batchsize, channels, pooled_height, pooled_width});
auto s_dims = dnnl::memory::dims(stride.begin(), stride.end());
auto k_dims = dnnl::memory::dims(kernel_size.begin(), kernel_size.end());
auto p_dims = dnnl::memory::dims(padding.begin(), padding.end());
auto dtype_ = dnnl::memory::data_type::f32;
auto format_tag_ = get_dnnl_format_tag(input);
x_md = dnnl::memory::desc({x_dims}, dtype_, format_tag_);
y_md = dnnl::memory::desc({y_dims}, dtype_, format_tag_);
// allow max or avg (follow cudnn implementation convention)
auto pooling_algo = dnnl::algorithm::pooling_avg_exclude_padding;
if (is_max_pooling) pooling_algo = dnnl::algorithm::pooling_max;
auto pool_fwd_d = dnnl::pooling_forward::desc(
dnnl::prop_kind::forward_training, pooling_algo, x_md, y_md, s_dims,
k_dims, p_dims, p_dims);
auto pool_bwd_d = dnnl::pooling_backward::desc(
pooling_algo, x_md, y_md, s_dims, k_dims, p_dims, p_dims);
auto eng = input.device()->context(0)->dnnl_engine;
pool_fwd_pd = dnnl::pooling_forward::primitive_desc(pool_fwd_d, eng);
pool_bwd_pd =
dnnl::pooling_backward::primitive_desc(pool_bwd_d, eng, pool_fwd_pd);
auto ws_md = pool_fwd_pd.workspace_desc();
ws_mem = dnnl::memory(ws_md, eng);
}
#endif // USE_DNNL
}
PoolingHandle::~PoolingHandle() {}
#ifdef USE_DNNL
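// CPU forward pass: binds x and the freshly allocated y to the descriptors
// prepared in PoolingHandle, then schedules the DNNL forward pooling primitive
// on the device's execution queue.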
Tensor CpuPoolingForward(const PoolingHandle &ph, const Tensor &x) {
CHECK_EQ(x.device()->lang(), kCpp);
Tensor y({(unsigned long)ph.batchsize, (unsigned long)ph.channels,
(unsigned long)ph.pooled_height, (unsigned long)ph.pooled_width},
x.device(), x.data_type());
y.device()->Exec(
[y, x, &ph](Context *ctx) mutable {
auto eng = ctx->dnnl_engine;
using namespace dnnl;
memory x_mem(ph.x_md, eng, x.block()->mutable_data());
memory y_mem(ph.y_md, eng, y.block()->mutable_data());
pooling_forward(ph.pool_fwd_pd)
.execute(ctx->dnnl_stream, {{DNNL_ARG_SRC, x_mem},
{DNNL_ARG_DST, y_mem},
{DNNL_ARG_WORKSPACE, ph.ws_mem}});
ctx->dnnl_stream.wait();
},
{x.block()}, {y.block()}, "CpuPoolingForward");
return y;
}
Tensor CpuPoolingBackward(const PoolingHandle &ph, const Tensor &grad,
const Tensor &x, const Tensor &y) {
CHECK_EQ(x.device()->lang(), kCpp);
CHECK_EQ(grad.device()->lang(), kCpp);
CHECK_EQ(y.device()->lang(), kCpp);
Tensor in_grad;
in_grad.ResetLike(x);
in_grad.device()->Exec(
[x, y, in_grad, grad, &ph](Context *ctx) mutable {
auto eng = ctx->dnnl_engine;
using namespace dnnl;
memory dx_mem(ph.x_md, eng, in_grad.block()->mutable_data());
memory dy_mem(ph.y_md, eng, grad.block()->mutable_data());
pooling_backward(ph.pool_bwd_pd)
.execute(ctx->dnnl_stream, {{DNNL_ARG_DIFF_DST, dy_mem},
{DNNL_ARG_DIFF_SRC, dx_mem},
{DNNL_ARG_WORKSPACE, ph.ws_mem}});
ctx->dnnl_stream.wait();
},
{x.block(), y.block(), grad.block()}, {in_grad.block()}, "CpuPoolingBackward");
return in_grad;
}
#endif // USE_DNNL
#ifdef USE_CUDNN
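// CudnnPoolingHandle extends PoolingHandle with cuDNN tensor and pooling
// descriptors built from the same hyper-parameters, using NCHW layout.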
CudnnPoolingHandle::CudnnPoolingHandle(const Tensor &input,
const std::vector<int> &kernel_size,
const std::vector<int> &stride,
const std::vector<int> &padding,
const bool is_max)
: PoolingHandle(input, kernel_size, stride, padding, is_max) {
// nan_prop = CUDNN_NOT_PROPAGATE_NAN;
DataType dtype = input.data_type();
CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc));
CUDNN_CHECK(cudnnCreatePoolingDescriptor(&pool_desc));
CUDNN_CHECK(cudnnSetTensor4dDescriptor(x_desc, CUDNN_TENSOR_NCHW,
GetCudnnDataType(dtype), batchsize,
channels, height, width));
// LOG(ERROR) << batchsize << " " << channels << " " << pooled_height << " "
// << pooled_width;
CUDNN_CHECK(cudnnSetTensor4dDescriptor(
y_desc, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), batchsize, channels,
pooled_height, pooled_width));
auto pool_method = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
if (is_max) pool_method = CUDNN_POOLING_MAX;
CUDNN_CHECK(cudnnSetPooling2dDescriptor(pool_desc, pool_method, nan_prop,
kernel_h, kernel_w, pad_h, pad_w,
stride_h, stride_w));
}
CudnnPoolingHandle::~CudnnPoolingHandle() {
if (pool_desc != nullptr)
CUDNN_CHECK(cudnnDestroyPoolingDescriptor(pool_desc));
if (x_desc != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(x_desc));
if (y_desc != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(y_desc));
}
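// GPU forward pass: allocates the pooled output tensor and schedules
// cudnnPoolingForward on the device's cuDNN handle.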
Tensor GpuPoolingForward(const CudnnPoolingHandle &cph, const Tensor &x) {
CHECK_EQ(x.device()->lang(), kCuda);
CHECK_EQ(x.nDim(), 4u);
Tensor output = Tensor(
Shape({cph.batchsize, cph.channels, cph.pooled_height, cph.pooled_width}),
x.device(), x.data_type());
output.device()->Exec(
[output, x, &cph](Context *ctx) mutable {
float alpha = 1.0f, beta = 0.0f;
cudnnPoolingForward(ctx->cudnn_handle, cph.pool_desc, &alpha,
cph.x_desc, x.block()->data(), &beta, cph.y_desc,
output.block()->mutable_data());
},
{x.block()}, {output.block()}, "GpuPoolingForward");
return output;
}
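// GPU backward pass: cudnnPoolingBackward needs the original input x, the
// pooled output y and the output gradient dy to produce dx.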
Tensor GpuPoolingBackward(const CudnnPoolingHandle &cph, const Tensor &dy,
const Tensor &x, const Tensor &y) {
CHECK_EQ(dy.device()->lang(), kCuda);
CHECK_EQ(dy.nDim(), 4u);
Tensor dx;
dx.ResetLike(x);
dx.device()->Exec(
[dx, dy, x, y, &cph](Context *ctx) mutable {
float alpha = 1.0f, beta = 0.0f;
cudnnPoolingBackward(ctx->cudnn_handle, cph.pool_desc, &alpha,
cph.y_desc, y.block()->data(), cph.y_desc,
dy.block()->data(), cph.x_desc, x.block()->data(),
&beta, cph.x_desc, dx.block()->mutable_data());
},
{dy.block(), y.block(), x.block()}, {dx.block()}, "GpuPoolingBackward");
return dx;
}
#endif // USE_CUDNN
} // namespace singa
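// A minimal usage sketch (an assumption for illustration, not part of this
// file): given a 4-D NCHW float Tensor `x` on a CPU device with DNNL enabled,
// and an upstream gradient `dy` with the pooled shape, the CPU path would be
// driven roughly like this:
//
//   std::vector<int> kernel{2, 2}, stride{2, 2}, padding{0, 0};
//   singa::PoolingHandle ph(x, kernel, stride, padding, /*is_max=*/true);
//   singa::Tensor y = singa::CpuPoolingForward(ph, x);
//   singa::Tensor dx = singa::CpuPoolingBackward(ph, dy, x, y);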